From b0c3013f2ea2c82a43248e43a0abfaebd5bb105a Mon Sep 17 00:00:00 2001
From: "zhou.weiguo"
Date: Wed, 24 Apr 2024 16:28:18 +0800
Subject: [PATCH 001/166] ggml: add Qualcomm QNN (Qualcomm Neural Network, aka Qualcomm AI Engine Direct) backend

---
 ggml-qnn.cpp | 4874 ++++++++++++++++++++++++++++++++++++++++++++++++++
 ggml-qnn.h   |   55 +
 ggml.c       |    3 +-
 llama.cpp    |   30 +-
 4 files changed, 4960 insertions(+), 2 deletions(-)
 create mode 100644 ggml-qnn.cpp
 create mode 100644 ggml-qnn.h

diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp
new file mode 100644
index 0000000000000..5d698f184c25d
--- /dev/null
+++ b/ggml-qnn.cpp
@@ -0,0 +1,4874 @@
+/*
+ * MIT license
+ * Copyright (C) 2024 GGML Authors
+ * SPDX-License-Identifier: MIT
+ *
+ * this is the implementation of the ggml QNN (Qualcomm Neural Network, aka Qualcomm AI Engine Direct) backend
+ *
+ * status:
+ *
+ * 1. core implementation (the data path works as expected with whisper.cpp using the QNN CPU/GPU backend on a Qualcomm SoC based low-end phone)
+ *
+ * 2. core implementation (the data path works as expected with whisper.cpp using the QNN HTP (aka DSP) backend on a Qualcomm SoC based high-end phone)
+ *
+ * 3. core implementation (the data path works as expected with llama.cpp using the QNN CPU/GPU/HTP (aka DSP) backend on a Qualcomm SoC based high-end phone)
+ *
+ * 4. GGML_OP_MUL_MAT & GGML_OP_MUL & GGML_OP_ADD using the QNN API have been completed
+ *
+ * todo:
+ *
+ * 1. other GGML OPs are not yet implemented with the QNN API
+ *
+ * 2. only FP32 / FP16 are supported, and the input and output tensors must be of the same data type
+ *
+ * 3. QNN's RPC feature (which is useful for the QNN HTP (aka DSP) backend) is not used
+ *
+ * 4. multiple QNN backends (CPU/GPU/DSP) are not supported simultaneously
+ *
+ * 5. multithreading does not work with the QNN GPU/HTP (aka DSP) backend
+ *
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "QnnTypes.h"
+#include "QnnCommon.h"
+#include "QnnContext.h"
+#include "QnnBackend.h"
+#include "QnnGraph.h"
+#include "QnnProperty.h"
+#include "QnnTensor.h"
+#include "QnnInterface.h"
+#include "Saver/QnnSaver.h"
+#include "System/QnnSystemInterface.h"
+#include "HTP/QnnHtpDevice.h"
+
+#include "ggml-qnn.h"
+
+#include "ggml-backend-impl.h"
+
+
+// =================================================================================================
+//
+//  forward/external/helper declaration
+//
+// =================================================================================================
+class qnn_instance;
+
+//TODO: should be removed because this is a workaround during the development stage
+extern "C" void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
+
+#if (defined __ANDROID__) || (defined ANDROID) //Qualcomm's QNN can also run on Windows on ARM (aka WoA)
+extern "C" int __android_log_print(int prio, const char * tag, const char * fmt, ...)
+__attribute__((__format__(printf, 3, 4))); +#endif + +static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...); + + + +// ================================================================================================= +// +// self-defined macro / data structure +// +// ================================================================================================= +#define RPCMEM_DEFAULT_FLAGS 1 +#define RPCMEM_HEAP_ID_SYSTEM 25 + +#define GGML_DUMP_TENSOR(tensor) ggml_tensor_dump(tensor, #tensor) + +#define GGML_QNN_LOGBUF_LEN 4096 +#define GGML_QNN_MAX_BUFFERS 128 +#define MATRIX_ROW_PADDING 512 + +#define BUF_MAJOR_MASK 0xFF000000 +#define BUF_CONTROL_BASE 0xEE000000 + +#define GGML_QNN_DEBUG 1 + +#define QNN_LOG_ERROR(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define QNN_LOG_WARN(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define QNN_LOG_INFO(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) + +#if GGML_QNN_DEBUG +#define QNN_LOG_DEBUG(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#else +#define QNN_LOG_DEBUG(...) +#endif + + +#define VALIDATE(value, status) \ + do { \ + status = value; \ + if (status != QNN_SUCCESS) { \ + QNN_LOG_WARN("%s expected QNN_SUCCESS\n", #value); \ + return status; \ + } \ + } while (0) + +#define VALIDATE_TENSOR_VERSION(tensor, err) VALIDATE(validate_tensor_version(tensor), err) + +#define VALIDATE_OP_CONFIG_VERSION(op, err) VALIDATE(validate_opconfig_version(op), err) + +#define QNN_VER_PTR(x) (&((x).v1)) +#define QNN_OP_CFG_VALID(opConfig) ((opConfig).version == QNN_OPCONFIG_VERSION_1) + +#define QNN_OP_CFG_GET_NAME(opConfig) get_qnn_oponfig_name(opConfig) +#define QNN_OP_CFG_GET_PACKAGE_NAME(opConfig) get_qnn_opconfig_packagename(opConfig) +#define QNN_OP_CFG_GET_TYPE_NAME(opConfig) get_qnn_opconfig_typename(opConfig) +#define QNN_OP_CFG_GET_NUM_PARAMS(opConfig) get_qnn_opconfig_numparams(opConfig) +#define QNN_OP_CFG_GET_PARAMS(opConfig) get_qnn_opconfig_params(opConfig) +#define QNN_OP_CFG_GET_NUM_INPUTS(opConfig) get_qnn_opconfig_numinputs(opConfig) +#define QNN_OP_CFG_GET_INPUTS(opConfig) get_qnn_opconfig_inputs(opConfig) +#define QNN_OP_CFG_GET_NUM_OUTPUTS(opConfig) get_qnn_opconfig_numoutputs(opConfig) +#define QNN_OP_CFG_GET_OUTPUTS(opConfig) get_qnn_opconfig_outputs(opConfig) + +#define QNN_OP_CFG_SET_NAME(opConfig, value) set_qnn_opconfig_name(opConfig, value) +#define QNN_OP_CFG_SET_PACKAGE_NAME(opConfig, value) set_qnn_opconfig_packagename(opConfig, value) +#define QNN_OP_CFG_SET_TYPE_NAME(opConfig, value) set_qnn_opconfig_typename(opConfig, value) + +#define QNN_OP_CFG_SET_PARAMS(opConfig, numOfParams, params) \ + set_qnn_opconfig_params(opConfig, numOfParams, params) + +#define QNN_OP_CFG_SET_INPUTS(opConfig, numOfInputs, inputTensors) \ + set_qnn_opconfig_inputs(opConfig, numOfInputs, inputTensors) + +#define QNN_OP_CFG_SET_OUTPUTS(opConfig, numOfOutputs, outputTensors) \ + set_qnn_opconfig_outputs(opConfig, numOfOutputs, outputTensors) + +#define QNN_TENSOR_GET_ID(tensor) get_qnn_tensorid(tensor) +#define QNN_TENSOR_GET_NAME(tensor) get_qnn_tensorname(tensor) +#define QNN_TENSOR_GET_TYPE(tensor) get_qnn_tensortype(tensor) +#define QNN_TENSOR_GET_DATA_FORMAT(tensor) get_qnn_tensor_dataformat(tensor) +#define QNN_TENSOR_GET_DATA_TYPE(tensor) 
get_qnn_tensor_datatype(tensor) +#define QNN_TENSOR_GET_QUANT_PARAMS(tensor) get_qnn_tensor_quantparams(tensor) +#define QNN_TENSOR_GET_RANK(tensor) get_qnn_tensor_rank(tensor) +#define QNN_TENSOR_GET_DIMENSIONS(tensor) get_qnn_tensor_dimensions(tensor) +#define QNN_TENSOR_GET_MEM_TYPE(tensor) get_qnn_tensor_memtype(tensor) +#define QNN_TENSOR_GET_CLIENT_BUF(tensor) get_qnn_tensor_clientbuf(tensor) +#define QNN_TENSOR_GET_MEM_HANDLE(tensor) get_qnn_tensor_memhandle(tensor) + +#define QNN_TENSOR_SET_ID(tensor, value) set_qnn_tensor_id(tensor, value) +#define QNN_TENSOR_SET_NAME(tensor, value) set_qnn_tensor_name(tensor, value) +#define QNN_TENSOR_SET_TYPE(tensor, value) set_qnn_tensor_type(tensor, value) +#define QNN_TENSOR_SET_DATA_FORMAT(tensor, value) set_qnn_tensor_dataformat(tensor, value) +#define QNN_TENSOR_SET_DATA_TYPE(tensor, value) set_qnn_tensor_datatype(tensor, value) +#define QNN_TENSOR_SET_QUANT_PARAMS(tensor, value) set_qnn_tensor_quantparams(tensor, value) +#define QNN_TENSOR_SET_RANK(tensor, value) set_qnn_tensor_rank(tensor, value) +#define QNN_TENSOR_SET_DIMENSIONS(tensor, value) set_qnn_tensor_dimensions(tensor, value) +#define QNN_TENSOR_SET_MEM_TYPE(tensor, value) set_qnn_tensor_memtype(tensor, value) +#define QNN_TENSOR_SET_CLIENT_BUF(tensor, value) set_qnn_tensor_clientbuf(tensor, value) +#define QNN_TENSOR_SET_MEM_HANDLE(tensor, value) set_qnn_tensor_memhandle(tensor, value) + + + +using pfn_rpc_mem_init = void (*)(void); +using pfn_rpc_mem_deinit = void (*)(void); +using pfn_rpc_mem_alloc = void *(*)(int, uint32_t, int); +using pfn_rpc_mem_free = void (*)(void *); +using pfn_rpc_mem_to_fd = int (*)(void *); + +using _pfn_QnnSaver_initialize = decltype(QnnSaver_initialize); +using _pfn_QnnInterface_getProviders = decltype(QnnInterface_getProviders); +using _pfn_QnnSystemInterface_getProviders = decltype(QnnSystemInterface_getProviders); + + + +typedef struct qnn_buf_s qnn_buf_t; +typedef struct qnn_buf_s qnn_buf_buffer_t; +typedef struct buf_element_s buf_element_t; +typedef void (*ggml_qnn_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +typedef void (*ggml_qnn_func_common_t)(const ggml_op ggmlop, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); + +enum class ggml_qnn_profile_level { + profile_off = 0, + profile_basic = 1, + profile_detail = 2 +}; + + +struct buf_element_s { + buf_element_t * next; + + unsigned char * mem; + unsigned char * content; /* start of raw content in mem */ + + uint32_t size ; /* size of content */ + int32_t max_size; /* size of pre-allocated memory pointed to by mem */ + uint32_t type; + void (*free_buffer) (buf_element_t * buf); + void * source; /* CPU, GPU, DSP, ... 
*/ + int id; +} ; + + +struct qnn_buf_s { + buf_element_t * first, * last; + + size_t qnn_buf_size; + uint32_t qnn_buf_data_size; + void * qnn_buf_empty_cb_data; + const char * name; + + pthread_mutex_t mutex; + pthread_cond_t not_empty; + + void (*put) (qnn_buf_t * fifo, buf_element_t * buf); + + buf_element_t *(*get) (qnn_buf_t * fifo); + + void (*clear) (qnn_buf_t * fifo) ; + + int (*size) (qnn_buf_t * fifo); + + int (*num_free) (qnn_buf_t * fifo); + + uint32_t (*data_size) (qnn_buf_t * fifo); + + void (*destroy) (qnn_buf_t * fifo); + + buf_element_t * (*buffer_alloc) (qnn_buf_t * self); + + buf_element_t * (*buffer_try_alloc) (qnn_buf_t * self); + + buf_element_t * buffer_pool_top; + pthread_mutex_t buffer_pool_mutex; + pthread_cond_t buffer_pool_cond_not_empty; + int buffer_pool_num_free; + int buffer_pool_capacity; + int buffer_pool_buf_size; + void * buffer_pool_base; /* used to free mem pool */ +} ; + + +struct ggml_backend_qnn_context { + int device; + int threads; + char name[GGML_MAX_NAME]; + char lib[GGML_MAX_NAME]; + qnn_instance * instance; + qnn_buf_t * buffer_pool; + struct ggml_backend * backend; + QNN_INTERFACE_VER_TYPE raw_interface; + QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; +} ; + + +// ================================================================================================= +// +// static global variables +// +// ================================================================================================= +//TODO: should be removed for support multi QNN backend simultaneously +static ggml_backend_t g_qnn_backend = nullptr; + +//TODO: should be removed for support multi QNN backend simultaneously +static int g_current_device = 3; // 3 is the default ggml backend + +static bool GGML_OP_HAS_INIT [GGML_OP_COUNT] = { 0 }; +static bool GGML_OP_HAS_FINALIZE[GGML_OP_COUNT] = { 0 }; +static void ggml_setup_op_has_task_pass(void) { + { // INIT + bool * p = GGML_OP_HAS_INIT; + + p[GGML_OP_ACC ] = true; + p[GGML_OP_MUL_MAT ] = true; + p[GGML_OP_MUL_MAT_ID ] = true; + p[GGML_OP_OUT_PROD ] = true; + p[GGML_OP_SET ] = true; + p[GGML_OP_GET_ROWS_BACK ] = true; + p[GGML_OP_DIAG_MASK_INF ] = true; + p[GGML_OP_DIAG_MASK_ZERO ] = true; + p[GGML_OP_CONV_TRANSPOSE_1D ] = true; + p[GGML_OP_CONV_TRANSPOSE_2D ] = true; + p[GGML_OP_FLASH_ATTN_BACK ] = true; + p[GGML_OP_CROSS_ENTROPY_LOSS ] = true; + p[GGML_OP_ADD_REL_POS ] = true; + } + + { // FINALIZE + bool * p = GGML_OP_HAS_FINALIZE; + + p[GGML_OP_CROSS_ENTROPY_LOSS ] = true; + } +} + + +//QNN cDSP and HTA backend would not be used currently, just focus on QNN CPU/GPU/HTP(aka DSP) backend currently +static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { + [QNN_CPU] = {.device = 0, .threads = 1, .name = "qnn-cpu", .lib = "libQnnCpu.so", .instance = nullptr, .buffer_pool = nullptr, .backend = nullptr, .raw_interface = nullptr, .raw_system_interface = nullptr}, + [QNN_GPU] = {.device = 1, .threads = 1, .name = "qnn-gpu", .lib = "libQnnGpu.so", .instance = nullptr, .buffer_pool = nullptr, .backend = nullptr, .raw_interface = nullptr, .raw_system_interface = nullptr}, + [QNN_HTP] = {.device = 2, .threads = 1, .name = "qnn-htp(aka dsp)", .lib = "libQnnHtp.so", .instance = nullptr, .buffer_pool = nullptr, .backend = nullptr, .raw_interface = nullptr, .raw_system_interface = nullptr}, +}; + + + +// ================================================================================================= +// +// internal helper functions +// +// 
================================================================================================= +static inline int validate_tensor_version(Qnn_Tensor_t tensor) { + if (tensor.version != QNN_TENSOR_VERSION_1) { + QNN_LOG_WARN("validate_tensor_version() tensor %s, got unsupported version %d\n", + tensor.v1.name, + tensor.version); + return 1; + } + return 0; +} + + +static inline int validate_opconfig_version(Qnn_OpConfig_t opConfig) { + if (opConfig.version != QNN_OPCONFIG_VERSION_1) { + QNN_LOG_WARN("validate_opconfig_version() op %s, got unsupported version %d\n", + opConfig.v1.name, + opConfig.version); + return 1; + } + return 0; +} + + +static inline const char * get_qnn_oponfig_name(const Qnn_OpConfig_t & opConfig) { + if (opConfig.version == QNN_OPCONFIG_VERSION_1) { + return opConfig.v1.name; + } + return nullptr; +} + + +static inline const char * get_qnn_oponfig_name(const Qnn_OpConfig_t * opConfig) { + return get_qnn_oponfig_name(*opConfig); +} + + +static inline const char * get_qnn_opconfig_packagename(const Qnn_OpConfig_t & opConfig) { + if (opConfig.version == QNN_OPCONFIG_VERSION_1) { + return opConfig.v1.packageName; + } + return nullptr; +} + + +static inline const char * get_qnn_opconfig_packagename(const Qnn_OpConfig_t * opConfig) { + return get_qnn_opconfig_packagename(*opConfig); +} + + +static inline const char * get_qnn_opconfig_typename(const Qnn_OpConfig_t & opConfig) { + if (opConfig.version == QNN_OPCONFIG_VERSION_1) { + return opConfig.v1.typeName; + } + return nullptr; +} + + +static inline const char * get_qnn_opconfig_typename(const Qnn_OpConfig_t * opConfig) { + return get_qnn_opconfig_typename(*opConfig); +} + + +static inline uint32_t get_qnn_opconfig_numparams(const Qnn_OpConfig_t & opConfig) { + if (opConfig.version == QNN_OPCONFIG_VERSION_1) { + return opConfig.v1.numOfParams; + } + return 0u; +} + + +static inline uint32_t get_qnn_opconfig_numparams(const Qnn_OpConfig_t * opConfig) { + return get_qnn_opconfig_numparams(*opConfig); +} + + +static inline const Qnn_Param_t * get_qnn_opconfig_params(const Qnn_OpConfig_t & opConfig) { + if (opConfig.version == QNN_OPCONFIG_VERSION_1) { + return opConfig.v1.params; + } + return nullptr; +} + + +static inline const Qnn_Param_t * get_qnn_opconfig_params(const Qnn_OpConfig_t * opConfig) { + return get_qnn_opconfig_params(*opConfig); +} + + +static inline uint32_t get_qnn_opconfig_numinputs(const Qnn_OpConfig_t & opConfig) { + if (opConfig.version == QNN_OPCONFIG_VERSION_1) { + return opConfig.v1.numOfInputs; + } + return 0u; +} + + +static inline uint32_t get_qnn_opconfig_numinputs(const Qnn_OpConfig_t * opConfig) { + return get_qnn_opconfig_numinputs(*opConfig); +} + + +static inline const Qnn_Tensor_t * get_qnn_opconfig_inputs(const Qnn_OpConfig_t & opConfig) { + if (opConfig.version == QNN_OPCONFIG_VERSION_1) { + return opConfig.v1.inputTensors; + } + return nullptr; +} + + +static inline const Qnn_Tensor_t * get_qnn_opconfig_inputs(const Qnn_OpConfig_t * opConfig) { + return get_qnn_opconfig_inputs(*opConfig); +} + + +static inline uint32_t get_qnn_opconfig_numoutputs(const Qnn_OpConfig_t & opConfig) { + if (opConfig.version == QNN_OPCONFIG_VERSION_1) { + return opConfig.v1.numOfOutputs; + } + return 0u; +} + + +static inline uint32_t get_qnn_opconfig_numoutputs(const Qnn_OpConfig_t * opConfig) { + return get_qnn_opconfig_numoutputs(*opConfig); +} + + +static inline const Qnn_Tensor_t * get_qnn_opconfig_outputs(const Qnn_OpConfig_t & opConfig) { + if (opConfig.version == QNN_OPCONFIG_VERSION_1) { + return 
opConfig.v1.outputTensors; + } + return nullptr; +} + + +static inline const Qnn_Tensor_t * get_qnn_opconfig_outputs(const Qnn_OpConfig_t * opConfig) { + return get_qnn_opconfig_outputs(*opConfig); +} + + +static inline void set_qnn_opconfig_name(Qnn_OpConfig_t & opConfig, const char * name) { + if (opConfig.version == QNN_OPCONFIG_VERSION_1) { + opConfig.v1.name = name; + } +} + + +static inline void set_qnn_opconfig_name(Qnn_OpConfig_t * opConfig, const char * name) { + set_qnn_opconfig_name(*opConfig, name); +} + + +static inline void set_qnn_opconfig_packagename(Qnn_OpConfig_t & opConfig, const char * packageName) { + if (opConfig.version == QNN_OPCONFIG_VERSION_1) { + opConfig.v1.packageName = packageName; + } +} + + +static inline void set_qnn_opconfig_packagename(Qnn_OpConfig_t * opConfig, const char * packageName) { + set_qnn_opconfig_packagename(*opConfig, packageName); +} + + +static inline void set_qnn_opconfig_typename(Qnn_OpConfig_t & opConfig, const char * typeName) { + if (opConfig.version == QNN_OPCONFIG_VERSION_1) { + opConfig.v1.typeName = typeName; + } +} + + +static inline void set_qnn_opconfig_typename(Qnn_OpConfig_t * opConfig, const char * typeName) { + set_qnn_opconfig_typename(*opConfig, typeName); +} + + +static inline void set_qnn_opconfig_params(Qnn_OpConfig_t & opConfig, + uint32_t numOfParams, + Qnn_Param_t * params) { + if (opConfig.version == QNN_OPCONFIG_VERSION_1) { + opConfig.v1.numOfParams = numOfParams; + opConfig.v1.params = params; + } +} + + +static inline void set_qnn_opconfig_params(Qnn_OpConfig_t * opConfig, + uint32_t numOfParams, + Qnn_Param_t * params) { + set_qnn_opconfig_params(*opConfig, numOfParams, params); +} + + +static inline void set_qnn_opconfig_inputs(Qnn_OpConfig_t & opConfig, + uint32_t numOfInputs, + Qnn_Tensor_t * inputTensors) { + if (opConfig.version == QNN_OPCONFIG_VERSION_1) { + opConfig.v1.numOfInputs = numOfInputs; + opConfig.v1.inputTensors = inputTensors; + } +} + + +static inline void set_qnn_opconfig_inputs(Qnn_OpConfig_t * opConfig, + uint32_t numOfInputs, + Qnn_Tensor_t * inputTensors) { + set_qnn_opconfig_inputs(*opConfig, numOfInputs, inputTensors); +} + + +static inline void set_qnn_opconfig_outputs(Qnn_OpConfig_t & opConfig, + uint32_t numOfOutputs, + Qnn_Tensor_t * outputTensors) { + if (opConfig.version == QNN_OPCONFIG_VERSION_1) { + opConfig.v1.numOfOutputs = numOfOutputs; + opConfig.v1.outputTensors = outputTensors; + } +} + + +static inline void set_qnn_opconfig_outputs(Qnn_OpConfig_t * opConfig, + uint32_t numOfOutputs, + Qnn_Tensor_t * outputTensors) { + set_qnn_opconfig_outputs(*opConfig, numOfOutputs, outputTensors); +} + + +static inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.id; + } + return 0u; +} + + +static inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t * tensor) { return get_qnn_tensorid(*tensor); } + + +static inline const char * get_qnn_tensorname(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.name; + } + return nullptr; +} + + +static inline const char * get_qnn_tensorname(const Qnn_Tensor_t * tensor) { + return get_qnn_tensorname(*tensor); +} + + +static inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.type; + } + return QNN_TENSOR_TYPE_UNDEFINED; +} + + +static inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t * tensor) { + return 
get_qnn_tensortype(*tensor); +} + + +static inline Qnn_TensorDataFormat_t get_qnn_tensor_dataformat(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dataFormat; + } + return QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER; +} + + +static inline Qnn_TensorDataFormat_t get_qnn_tensor_dataformat(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_dataformat(*tensor); +} + + +static inline Qnn_DataType_t get_qnn_tensor_datatype(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dataType; + } + return QNN_DATATYPE_UNDEFINED; +} + + +static inline Qnn_DataType_t get_qnn_tensor_datatype(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_datatype(*tensor); +} + + +static inline Qnn_QuantizeParams_t get_qnn_tensor_quantparams(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.quantizeParams; + } + return QNN_QUANTIZE_PARAMS_INIT; +} + + +static inline Qnn_QuantizeParams_t get_qnn_tensor_quantparams(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_quantparams(*tensor); +} + + +static inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.rank; + } + return 0u; +} + + +static inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t * tensor) { return get_qnn_tensor_rank(*tensor); } + + +static inline uint32_t * get_qnn_tensor_dimensions(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dimensions; + } + return nullptr; +} + + +static inline uint32_t * get_qnn_tensor_dimensions(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_dimensions(*tensor); +} + + +static inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.memType; + } + return QNN_TENSORMEMTYPE_UNDEFINED; +} + + +static inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_memtype(*tensor); +} + + +static inline Qnn_ClientBuffer_t get_qnn_tensor_clientbuf(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.clientBuf; + } + return QNN_CLIENT_BUFFER_INIT; +} + + +static inline Qnn_ClientBuffer_t get_qnn_tensor_clientbuf(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_clientbuf(*tensor); +} + + +static inline Qnn_MemHandle_t get_qnn_tensor_memhandle(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.memHandle; + } + return nullptr; +} + + +static inline Qnn_MemHandle_t get_qnn_tensor_memhandle(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_memhandle(*tensor); +} + + +static inline void set_qnn_tensor_id(Qnn_Tensor_t & tensor, uint32_t id) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.id = id; + } +} + + +static inline void set_qnn_tensor_id(Qnn_Tensor_t * tensor, uint32_t id) { set_qnn_tensor_id(*tensor, id); } + + +static inline void set_qnn_tensor_name(Qnn_Tensor_t & tensor, const char * name) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.name = name; + } +} + + +static inline void set_qnn_tensor_name(Qnn_Tensor_t * tensor, const char * name) { + set_qnn_tensor_name(*tensor, name); +} + + +static inline void set_qnn_tensor_type(Qnn_Tensor_t & tensor, Qnn_TensorType_t type) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.type = type; + } +} + + +static inline void 
set_qnn_tensor_type(Qnn_Tensor_t * tensor, Qnn_TensorType_t type) { + set_qnn_tensor_type(*tensor, type); +} + + +static inline void set_qnn_tensor_dataformat(Qnn_Tensor_t & tensor, Qnn_TensorDataFormat_t format) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.dataFormat = format; + } +} + + +static inline void set_qnn_tensor_dataformat(Qnn_Tensor_t * tensor, Qnn_TensorDataFormat_t format) { + set_qnn_tensor_dataformat(*tensor, format); +} + + +static inline void set_qnn_tensor_datatype(Qnn_Tensor_t & tensor, Qnn_DataType_t dataType) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.dataType = dataType; + } +} + + +static inline void set_qnn_tensor_datatype(Qnn_Tensor_t * tensor, Qnn_DataType_t dataType) { + set_qnn_tensor_datatype(*tensor, dataType); +} + + +static inline void set_qnn_tensor_quantparams(Qnn_Tensor_t & tensor, Qnn_QuantizeParams_t params) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.quantizeParams = params; + } +} + + +static inline void set_qnn_tensor_quantparams(Qnn_Tensor_t * tensor, Qnn_QuantizeParams_t params) { + set_qnn_tensor_quantparams(*tensor, params); +} + + +static inline void set_qnn_tensor_rank(Qnn_Tensor_t & tensor, uint32_t rank) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.rank = rank; + } +} + + +static inline void set_qnn_tensor_rank(Qnn_Tensor_t * tensor, uint32_t rank) { + set_qnn_tensor_rank(*tensor, rank); +} + + +static inline void set_qnn_tensor_dimensions(Qnn_Tensor_t & tensor, uint32_t * dims) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.dimensions = dims; + } +} + + +static inline void set_qnn_tensor_dimensions(Qnn_Tensor_t * tensor, uint32_t * dims) { + set_qnn_tensor_dimensions(*tensor, dims); +} + + +static inline void set_qnn_tensor_memtype(Qnn_Tensor_t & tensor, Qnn_TensorMemType_t memType) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.memType = memType; + } +} + + +static inline void set_qnn_tensor_memtype(Qnn_Tensor_t * tensor, Qnn_TensorMemType_t memType) { + set_qnn_tensor_memtype(*tensor, memType); +} + + +static inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t & tensor, Qnn_ClientBuffer_t clientBuf) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.clientBuf = clientBuf; + } +} + + +static inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t * tensor, Qnn_ClientBuffer_t clientBuf) { + set_qnn_tensor_clientbuf(*tensor, clientBuf); +} + + +static inline void set_qnn_tensor_memhandle(Qnn_Tensor_t & tensor, Qnn_MemHandle_t handle) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.memHandle = handle; + } +} + + +static inline void set_qnn_tensor_memhandle(Qnn_Tensor_t * tensor, Qnn_MemHandle_t handle) { + set_qnn_tensor_memhandle(*tensor, handle); +} + + + +static size_t memscpy(void * dst, size_t dstSize, const void * src, size_t copySize) { + if (!dst || !src || !dstSize || !copySize) + return 0; + + size_t minSize = dstSize < copySize ? 
dstSize : copySize; + + memcpy(dst, src, minSize); + + return minSize; +} + + +static char * ggml_qnn_strndup(const char * source, size_t maxlen) { + return ::strndup(source, maxlen); +} + + +static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { + int err = 0; + VALIDATE_TENSOR_VERSION(src, err); + + dst.version = src.version; + QNN_TENSOR_SET_NAME( + dst, ggml_qnn_strndup(QNN_TENSOR_GET_NAME(src), std::string(QNN_TENSOR_GET_NAME(src)).size())); + if (QNN_TENSOR_GET_NAME(dst) == nullptr) { + return 1; + } + QNN_TENSOR_SET_ID(dst, QNN_TENSOR_GET_ID(src)); + QNN_TENSOR_SET_TYPE(dst, QNN_TENSOR_GET_TYPE(src)); + QNN_TENSOR_SET_DATA_FORMAT(dst, QNN_TENSOR_GET_DATA_FORMAT(src)); + QNN_TENSOR_SET_DATA_TYPE(dst, QNN_TENSOR_GET_DATA_TYPE(src)); + QNN_TENSOR_SET_MEM_TYPE(dst, QNN_TENSOR_GET_MEM_TYPE(src)); + + // Only metadata (i.e. non-static data) is copied from source to destination. The union still + // must be initialized so that the clientBuf/memHandle do not contain garbage data + if (QNN_TENSOR_GET_MEM_TYPE(src) == QNN_TENSORMEMTYPE_RAW) { + Qnn_ClientBuffer_t clientBuf = {nullptr, 0}; + QNN_TENSOR_SET_CLIENT_BUF(dst, clientBuf); + } else if (QNN_TENSOR_GET_MEM_TYPE(src) == QNN_TENSORMEMTYPE_MEMHANDLE) { + QNN_TENSOR_SET_MEM_HANDLE(dst, nullptr); + } else { + return 1; + } + + Qnn_QuantizeParams_t srcQParam = QNN_TENSOR_GET_QUANT_PARAMS(src); + Qnn_QuantizationEncoding_t encoding = srcQParam.quantizationEncoding; + if (encoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) { + // need to allocate and copy memory for scaleOffset as it is a pointer array + Qnn_QuantizeParams_t srcQParamCpy = srcQParam; + Qnn_AxisScaleOffset_t &axisScaleOffset = srcQParamCpy.axisScaleOffsetEncoding; + Qnn_ScaleOffset_t **scaleOffset = &axisScaleOffset.scaleOffset; + size_t scaleOffsetSize = axisScaleOffset.numScaleOffsets * sizeof(Qnn_ScaleOffset_t); + *scaleOffset = (Qnn_ScaleOffset_t *)malloc(scaleOffsetSize); + memscpy(*scaleOffset, + scaleOffsetSize, + srcQParam.axisScaleOffsetEncoding.scaleOffset, + scaleOffsetSize); + QNN_TENSOR_SET_QUANT_PARAMS(dst, srcQParamCpy); + } else if (encoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) { + // need to allocate and copy memory for scaleOffset as it is a pointer array + Qnn_QuantizeParams_t srcQParamCpy = srcQParam; + Qnn_BwAxisScaleOffset_t &bwAxisScaleOffset = srcQParamCpy.bwAxisScaleOffsetEncoding; + size_t scaleSize = bwAxisScaleOffset.numElements * sizeof(float); + float **scales = &bwAxisScaleOffset.scales; + int32_t **offsets = &bwAxisScaleOffset.offsets; + *scales = (float *)malloc(scaleSize); + memscpy(*scales, scaleSize, srcQParam.bwAxisScaleOffsetEncoding.scales, scaleSize); + + // Only copy offsets if present, nullptr implies all offsets are 0 + if (bwAxisScaleOffset.offsets != nullptr) { + size_t offsetSize = bwAxisScaleOffset.numElements * sizeof(int32_t); + *offsets = (int32_t *)malloc(offsetSize); + memscpy(*offsets, offsetSize, srcQParam.bwAxisScaleOffsetEncoding.offsets, offsetSize); + } + QNN_TENSOR_SET_QUANT_PARAMS(dst, srcQParamCpy); + } else { + QNN_TENSOR_SET_QUANT_PARAMS(dst, srcQParam); + } + + // need to allocate and copy memory for all the pointer members + uint32_t rank = QNN_TENSOR_GET_RANK(src); + QNN_TENSOR_SET_RANK(dst, rank); + size_t dim_size = rank * sizeof(uint32_t); + uint32_t * dimensions = (uint32_t *)malloc(dim_size); + if (dimensions == nullptr) { + QNN_LOG_WARN("deep_copy_qnn_tensors() allocation error while copying tensor %s\n", QNN_TENSOR_GET_NAME(src)); + return 1; + } + 
memscpy(dimensions, dim_size, QNN_TENSOR_GET_DIMENSIONS(src), dim_size); + QNN_TENSOR_SET_DIMENSIONS(dst, dimensions); + + return err; +} + + +static int free_qnn_tensor(Qnn_Tensor_t & tensor) { + int err = 0; + VALIDATE_TENSOR_VERSION(tensor, err); + + if (nullptr == QNN_TENSOR_GET_NAME(tensor)) { + QNN_LOG_INFO("it should not happen, pls check"); + } else { + //QNN_LOG_DEBUG("QNN tensor name %s", QNN_TENSOR_GET_NAME(tensor)); + free((void *) QNN_TENSOR_GET_NAME(tensor)); + } + if (nullptr == QNN_TENSOR_GET_DIMENSIONS(tensor)) { + QNN_LOG_INFO("it should not happen, pls check"); + } else { + //TODO:why crash in here? why pointer changed with mul_mat? + //memory leak after comment above line + //free(QNN_TENSOR_GET_DIMENSIONS(tensor)); + } + + return err; +} + + +static int free_qnn_tensors(Qnn_Tensor_t *& tensors, uint32_t numTensors) { + int err = 0; + + // free all pointer allocations in struct + for (size_t i = 0; i < numTensors; i++) { + free_qnn_tensor(tensors[i]); + } + free(tensors); + + return err; +} + + +static float ggml_tensor_sum_elements(const ggml_tensor * tensor) { + double sum = 0; + float value = 0; + std::ostringstream tmposs; + if (tensor->type == GGML_TYPE_F32) { + for (int h = 0; h < tensor->ne[3]; h++) { + for (int i = 0; i < tensor->ne[2]; i++) { + for (int j = 0; j < tensor->ne[1]; j++) { + for (int k = 0; k < tensor->ne[0]; k++) { + value = ((float *) tensor->data)[h * tensor->ne[2] + i * tensor->ne[1] + j * tensor->ne[0] + k]; + sum += value; + //QNN_LOG_DEBUG("[%d][%d][%d][%d]%.2f \t", h, i, j, k, value); + tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value << "\t"; + } + if (strlen(tmposs.str().c_str()) > 4000) { + + } else { + QNN_LOG_DEBUG("%s", tmposs.str().c_str()); + } + tmposs.clear(); + tmposs.str(""); + QNN_LOG_DEBUG("\n"); + } + } + } + } + QNN_LOG_DEBUG("\n"); + return sum; +} + + +static void ggml_dump_tensor(const ggml_tensor * tensor, const char * name) { + QNN_LOG_DEBUG("dump ggml tensor %s\n", name); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", name, + tensor->type, ggml_type_name(tensor->type), + tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]); + float sum = ggml_tensor_sum_elements(tensor); + + //QNN_LOG_DEBUG("\n"); + //QNN_LOG_DEBUG("Sum of tensor %s is %6.2f\n", name, sum); +} + + +static uint32_t ggml_get_tensor_rank(const ggml_tensor * tensor) { + uint32_t rank = 0; + for (int i = 0; i < GGML_MAX_DIMS; i++) { + if ((0 != tensor->ne[i]) && (1 != tensor->ne[i])) { + rank++; + } + } + return rank; +} + + +//TODO: +//ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684 +static Qnn_DataType_t qnn_datatype_from_ggml_datatype(enum ggml_type ggmltype) { + switch (ggmltype) { + case GGML_TYPE_Q4_0: + return QNN_DATATYPE_UFIXED_POINT_4; + case GGML_TYPE_Q4_1: + return QNN_DATATYPE_SFIXED_POINT_4; + case GGML_TYPE_Q8_0: + return QNN_DATATYPE_UFIXED_POINT_8; + case GGML_TYPE_Q8_1: + return QNN_DATATYPE_SFIXED_POINT_8; + case GGML_TYPE_F16: + return QNN_DATATYPE_FLOAT_16; + case GGML_TYPE_F32: + return QNN_DATATYPE_FLOAT_32; + + } + return QNN_DATATYPE_FLOAT_32; +} + + +//TODO: +static const char * qnn_opname_from_ggmlop(enum ggml_op ggmlop) { + switch (ggmlop) { + case GGML_OP_ADD: + return QNN_OP_ELEMENT_WISE_ADD; + case GGML_OP_MUL: + return QNN_OP_ELEMENT_WISE_MULTIPLY; + case GGML_OP_MUL_MAT: + return QNN_OP_MAT_MUL; + } + + return nullptr; +} + + +static uint32_t 
ggml_get_tensor_data_size(const ggml_tensor * tensor) { + /* + size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); + size_t n_dims = ggml_get_tensor_rank(tensor); + for (int i = 1; i < n_dims; i++) { + data_size *= tensor->ne[i]; + } + + return data_size; + */ + return ggml_nbytes(tensor); +} + + +template +Fn load_qnn_functionpointers(void * handle, const char * function_name) { + return reinterpret_cast(dlsym(handle, function_name)); +} + + +static void qnn_xfree(void * ptr) { + if (nullptr != ptr) { + free(ptr); + ptr = nullptr; + } +} + + +static void * qnn_xmalloc(size_t size) { + void * ptr; + + if (!size) + size++; + + if ((ptr = calloc(1, size)) == nullptr) { + QNN_LOG_WARN("malloc(%d) failed: %s\n",size, strerror(errno)); + return nullptr; + } + + return ptr; +} + + +static void * qnn_xmalloc_aligned(size_t alignment, size_t size, void ** base) { + char * ptr; + + *base = ptr = static_cast(qnn_xmalloc(size + alignment)); + + while ((size_t) ptr % alignment) + ptr++; + + return ptr; +} + + +static void buffer_pool_free (buf_element_t * element) { + qnn_buf_t * self = (qnn_buf_t *) element->source; + + pthread_mutex_lock(&self->buffer_pool_mutex); + + element->next = self->buffer_pool_top; + self->buffer_pool_top = element; + + self->buffer_pool_num_free++; + if (self->buffer_pool_num_free > self->buffer_pool_capacity) { + QNN_LOG_DEBUG("TOO MANY FREE\n"); + } + + pthread_cond_signal (&self->buffer_pool_cond_not_empty); + + pthread_mutex_unlock (&self->buffer_pool_mutex); +} + + +static buf_element_t * buffer_pool_alloc (qnn_buf_t * self) { + buf_element_t * buf = nullptr; + int i; + + pthread_mutex_lock (&self->buffer_pool_mutex); + + while (self->buffer_pool_num_free < 2) { + pthread_cond_wait (&self->buffer_pool_cond_not_empty, &self->buffer_pool_mutex); + } + + buf = self->buffer_pool_top; + self->buffer_pool_top = self->buffer_pool_top->next; + self->buffer_pool_num_free--; + + buf->content = buf->mem; + buf->size = 0; + buf->type = 0; + + pthread_mutex_unlock (&self->buffer_pool_mutex); + + return buf; +} + + +static buf_element_t * buffer_pool_try_alloc (qnn_buf_t * self) { + buf_element_t * buf = nullptr; + + pthread_mutex_lock (&self->buffer_pool_mutex); + + if (self->buffer_pool_top) { + buf = self->buffer_pool_top; + self->buffer_pool_top = self->buffer_pool_top->next; + self->buffer_pool_num_free--; + } else { + buf = nullptr; + } + + pthread_mutex_unlock (&self->buffer_pool_mutex); + + if (buf) { + buf->content = buf->mem; + buf->size = 0; + } + + return buf; +} + + +static void qnn_buf_buffer_put(qnn_buf_t * fifo, buf_element_t * element) { + pthread_mutex_lock (&fifo->mutex); + + if (fifo->last) + fifo->last->next = element; + else + fifo->first = element; + + fifo->last = element; + element->next = nullptr; + fifo->qnn_buf_size++; + fifo->qnn_buf_data_size += element->size; + + LOGJ("put:index %d, fifo->size is %d, self->buffer_pool_num_free %d\n", element->id, fifo->qnn_buf_size, fifo->buffer_pool_num_free); + pthread_cond_signal (&fifo->not_empty); + + pthread_mutex_unlock (&fifo->mutex); +} + + +static buf_element_t * qnn_buf_buffer_get (qnn_buf_t * fifo) { + buf_element_t * buf = nullptr; + + pthread_mutex_lock (&fifo->mutex); +#if 0 + while (fifo->first == nullptr) { + pthread_cond_wait (&fifo->not_empty, &fifo->mutex); + } +#else + if (fifo->first == nullptr) { + pthread_mutex_unlock (&fifo->mutex); + return nullptr; + } +#endif + + buf = fifo->first; + + fifo->first = fifo->first->next; + if (fifo->first==nullptr) + fifo->last = nullptr; + + 
fifo->qnn_buf_size--; + fifo->qnn_buf_data_size -= buf->size; + + pthread_mutex_unlock (&fifo->mutex); + + return buf; +} + + +static void qnn_buf_buffer_clear (qnn_buf_t * fifo) { + buf_element_t * buf, * next, * prev; + + pthread_mutex_lock (&fifo->mutex); + + buf = fifo->first; + prev = nullptr; + + while (buf != nullptr) { + next = buf->next; + if ((buf->type & BUF_MAJOR_MASK) != BUF_CONTROL_BASE) { + if (prev) + prev->next = next; + else + fifo->first = next; + + if (!next) + fifo->last = prev; + + fifo->qnn_buf_size--; + fifo->qnn_buf_data_size -= buf->size; + + buf->free_buffer(buf); + } else { + prev = buf; + } + + buf = next; + } + + QNN_LOG_DEBUG("free buffers after clear: %d\n", fifo->buffer_pool_num_free); + pthread_mutex_unlock (&fifo->mutex); +} + + +static int qnn_buf_buffer_size (qnn_buf_t * self) { + int size = 0; + + pthread_mutex_lock(&self->mutex); + size = self->qnn_buf_size; + pthread_mutex_unlock(&self->mutex); + + return size; +} + + +static uint32_t qnn_buf_buffer_data_size (qnn_buf_t * self) { + uint32_t data_size; + + pthread_mutex_lock(&self->mutex); + data_size = self->qnn_buf_data_size; + pthread_mutex_unlock(&self->mutex); + + return data_size; +} + + +static int qnn_buf_buffer_num_free (qnn_buf_t * self) { + int buffer_pool_num_free = 0; + + pthread_mutex_lock(&self->mutex); + buffer_pool_num_free = self->buffer_pool_num_free; + pthread_mutex_unlock(&self->mutex); + + return buffer_pool_num_free; +} + + +static void qnn_buf_buffer_dispose (qnn_buf_t * self) { + buf_element_t * buf, * next; + int received = 0; + + self->clear( self ); + buf = self->buffer_pool_top; + + while (buf != nullptr) { + next = buf->next; + qnn_xfree(buf); + received++; + + buf = next; + } + + while (received < self->buffer_pool_capacity) { + buf = self->get(self); + qnn_xfree(buf); + received++; + } + + qnn_xfree(self->buffer_pool_base); + pthread_mutex_destroy(&self->mutex); + pthread_cond_destroy(&self->not_empty); + pthread_mutex_destroy(&self->buffer_pool_mutex); + pthread_cond_destroy(&self->buffer_pool_cond_not_empty); + qnn_xfree((void *)self->name); + qnn_xfree (self); +} + + +static qnn_buf_t * qnn_buf_new(const char * name, int num_buffers, uint32_t buf_size) { + int i = 0; + int alignment = 4; + qnn_buf_t * self = nullptr; + uint8_t * multi_buffer = nullptr; + + self = (qnn_buf_t*)qnn_xmalloc(sizeof(qnn_buf_t)); + if (nullptr == self) { + QNN_LOG_WARN("malloc memory failed\n"); + return nullptr; + } + + self->name = strdup(name); + self->first = nullptr; + self->last = nullptr; + self->qnn_buf_size = 0; + self->put = qnn_buf_buffer_put; + self->get = qnn_buf_buffer_get; + self->clear = qnn_buf_buffer_clear; + self->size = qnn_buf_buffer_size; + self->num_free = qnn_buf_buffer_num_free; + self->data_size = qnn_buf_buffer_data_size; + self->destroy = qnn_buf_buffer_dispose; + pthread_mutex_init (&self->mutex, nullptr); + pthread_cond_init (&self->not_empty, nullptr); + + + if (buf_size % alignment != 0) + buf_size += alignment - (buf_size % alignment); + + QNN_LOG_INFO("[%s]allocating %d Mbytes memory(alignment = %d)\n", name, (num_buffers * buf_size) / (1 << 20), alignment); + + multi_buffer = (uint8_t *)qnn_xmalloc_aligned (alignment, num_buffers * buf_size, &self->buffer_pool_base); + if (nullptr == multi_buffer) { + QNN_LOG_WARN("malloc memory failed\n"); + free(self); + return nullptr; + } + + self->buffer_pool_top = nullptr; + + pthread_mutex_init (&self->buffer_pool_mutex, nullptr); + pthread_cond_init (&self->buffer_pool_cond_not_empty, nullptr); + + 
self->buffer_pool_num_free = 0;
+    self->buffer_pool_capacity = num_buffers;
+    self->buffer_pool_buf_size = buf_size;
+    self->buffer_alloc         = buffer_pool_alloc;
+    self->buffer_try_alloc     = buffer_pool_try_alloc;
+
+    for (i = 0; i < num_buffers; i++) {
+        buf_element_t * buf = nullptr;
+
+        buf = (buf_element_t *)qnn_xmalloc(sizeof (buf_element_t));
+        if (nullptr == buf) {
+            QNN_LOG_WARN("malloc memory failed");
+            free(multi_buffer);
+            free(self);
+            return nullptr;
+        }
+
+        buf->id          = i;
+        buf->mem         = multi_buffer;
+        multi_buffer    += buf_size;
+
+        buf->max_size    = buf_size;
+        buf->free_buffer = buffer_pool_free;
+        buf->source      = self;
+
+        buffer_pool_free(buf);
+    }
+
+    return self;
+}
+
+
+static const char * get_qnn_backend_name(int n_backend_type) {
+    switch (n_backend_type) {
+        case 0:
+            return "QNN-CPU";
+        case 1:
+            return "QNN-GPU";
+        case 2:
+            return "QNN-HTP(DSP)";
+        case 3:
+            return "ggml"; //the default GGML backend, used to compare performance between the QNN backend and the default GGML backend
+
+#if 0 //the QNN cDSP and HTA backends are not used currently, focus on the QNN CPU/GPU/HTP(aka DSP) backends
+        case 3:
+            return "QNN-cDSP";
+        case 4:
+            return "QNN-HTA";
+#endif
+
+        default:
+            return "unknown";
+    }
+}
+
+
+static intptr_t align_to(size_t alignment, intptr_t offset) {
+    return offset % alignment == 0 ? offset
+                                   : offset +
+                                     (static_cast<intptr_t>(alignment) -
+                                      offset % static_cast<intptr_t>(alignment));
+}
+
+
+static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...) {
+    static std::mutex ggml_qnn_log_internal_mutex;
+    static char s_ggml_qnn_log_internal_buf[GGML_QNN_LOGBUF_LEN];
+
+    {
+        std::lock_guard<std::mutex> lock(ggml_qnn_log_internal_mutex);
+        va_list args;
+        va_start(args, format);
+        int len_prefix = snprintf(s_ggml_qnn_log_internal_buf, GGML_QNN_LOGBUF_LEN, "[%s, %d]: ", func, line);
+        int len = vsnprintf(s_ggml_qnn_log_internal_buf + len_prefix, GGML_QNN_LOGBUF_LEN - len_prefix, format, args);
+        if (len < (GGML_QNN_LOGBUF_LEN - len_prefix)) {
+#if (defined __ANDROID__) || (defined ANDROID)
+            __android_log_print(level, "llamacpp", "%s", s_ggml_qnn_log_internal_buf);
+#else
+            printf("%s", s_ggml_qnn_log_internal_buf); //Qualcomm's QNN can also run on Windows on ARM
+#endif
+        }
+        va_end(args);
+    }
+}
+
+
+// =================================================================================================
+//
+//  wrapper class of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK
+//
+// =================================================================================================
+class qnn_interface {
+
+#define DEFINE_SHIM_FUNCTION_INTERFACE(F, pointer_name)                    \
+    template <typename... Args>                                            \
+    inline auto qnn_##F(Args... args) const {                              \
+        return (_qnn_interface->QNN_INTERFACE_VER_NAME.pointer_name)(      \
+            std::forward<Args>(args)...);                                  \
+    }
+
+
+#define DEFINE_SHIM_FUNCTION_SYS_INTERFACE(F, pointer_name)                \
+    template <typename... Args>                                            \
+    inline auto qnn_##F(Args... 
args) const { \ + return (_qnn_sys_interface->QNN_SYSTEM_INTERFACE_VER_NAME.pointer_name)( \ + std::forward(args)...); \ + } + + friend class qnn_instance; + +public: + qnn_interface() = default; + + // QnnBackend + DEFINE_SHIM_FUNCTION_INTERFACE(backend_create, backendCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_free, backendFree); + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_register_op_package, backendRegisterOpPackage); + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_validate_op_config, backendValidateOpConfig); + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_get_api_version, backendGetApiVersion); + + // QnnDevice + DEFINE_SHIM_FUNCTION_INTERFACE(device_create, deviceCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(device_free, deviceFree); + + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_infrastructure, deviceGetInfrastructure); + + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_platform_info, deviceGetPlatformInfo); + + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_info, deviceGetInfo); + + // QnnContext + DEFINE_SHIM_FUNCTION_INTERFACE(context_create, contextCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary_size, contextGetBinarySize); + + DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary, contextGetBinary); + + DEFINE_SHIM_FUNCTION_INTERFACE(context_create_from_binary, contextCreateFromBinary); + + DEFINE_SHIM_FUNCTION_INTERFACE(context_free, contextFree); + + // QnnGraph + DEFINE_SHIM_FUNCTION_INTERFACE(graph_create, graphCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_add_node, graphAddNode); + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_finalize, graphFinalize); + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_execute, graphExecute); + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_retrieve, graphRetrieve); + + // QnnLog + DEFINE_SHIM_FUNCTION_INTERFACE(log_create, logCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(log_free, logFree); + + DEFINE_SHIM_FUNCTION_INTERFACE(log_set_log_level, logSetLogLevel); + + // QnnProfile + DEFINE_SHIM_FUNCTION_INTERFACE(profile_create, profileCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_events, profileGetEvents); + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_sub_events, profileGetSubEvents); + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_event_data, profileGetEventData); + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_free, profileFree); + + // QnnMem + DEFINE_SHIM_FUNCTION_INTERFACE(mem_register, memRegister); + + DEFINE_SHIM_FUNCTION_INTERFACE(mem_de_register, memDeRegister); + + // QnnProperty + DEFINE_SHIM_FUNCTION_INTERFACE(property_has_capability, propertyHasCapability); + + // QnnTensor + DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_context_tensor, tensorCreateContextTensor); + + DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_graph_tensor, tensorCreateGraphTensor); + + // QnnSystem + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_create, systemContextCreate); + + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_get_binary_info, systemContextGetBinaryInfo); + + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_free, systemContextFree); + + void set_qnn_interface(const QnnInterface_t * qnn_interface) { + _qnn_interface = qnn_interface; + } + + void set_qnn_system_interface(const QnnSystemInterface_t * qnn_sys_interface) { + _qnn_sys_interface = qnn_sys_interface; + } + + uint32_t get_backend_id() const { + return _qnn_interface->backendId; + } + + bool is_loaded() const { + return ((_qnn_sys_interface != nullptr) && (_qnn_interface != nullptr)); + } + +private: + const QnnInterface_t *_qnn_interface = nullptr; + + const QnnSystemInterface_t *_qnn_sys_interface 
= nullptr; +}; + + + +// ================================================================================================= +// +// wrapper class of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK +// +// and +// +// resource management of QNN resources for GGML's QNN backend +// ================================================================================================= +class qnn_instance { +public: + using BackendIdType = decltype(QnnInterface_t{}.backendId); + + explicit qnn_instance(const std::string & lib_path, const std::string & backend_name, + const std::string & model_name) : + _lib_path(std::move(lib_path)), + _backend_name(std::move(backend_name)), + _model_name(std::move(model_name)) {}; + + ~qnn_instance() { + } + + int qnn_init(const QnnSaver_Config_t ** saver_config); + + int qnn_finalize(); + + const qnn_interface &get_qnn_interface() { + if (!_qnn_interface.is_loaded()) { + QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_interface; + } + + + const QNN_INTERFACE_VER_TYPE &get_qnn_raw_interface() { + if (!_qnn_interface.is_loaded()) { + QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_raw_interface; + } + + const QNN_SYSTEM_INTERFACE_VER_TYPE &get_qnn_raw_system_interface() { + if (!_qnn_interface.is_loaded()) { + QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_raw_system_interface; + } + + const Qnn_LogHandle_t get_qnn_log_handle() { return _qnn_log_handle; } + + const Qnn_ProfileHandle_t get_qnn_profile_handle() { return _qnn_profile_handle; } + + const Qnn_DeviceHandle_t get_qnn_device_handle() { return _qnn_device_handle; } + + const Qnn_BackendHandle_t get_qnn_backend_handle() { return _qnn_backend_handle; } + + const Qnn_ContextHandle_t get_qnn_context_handle() { return _qnn_context_handle; } + + const QnnSystemContext_Handle_t get_qnn_system_handle() { return _qnn_system_handle; } + + const Qnn_GraphHandle_t get_qnn_graph_handle() { return _qnn_graph_handle; } + + + int init_qnn_graph(const char * graph_name, + bool debug, + uint8_t do_node_validation = 1, + const QnnGraph_Config_t ** graph_configs = nullptr + ); + + int finalize_qnn_graph(); + + int init_htp_perfinfra() { + QnnDevice_Infrastructure_t device_infra = nullptr; + int error = _qnn_raw_interface.deviceGetInfrastructure(&device_infra); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to get qnn device infra\n"); + return 1; + } + + QnnHtpDevice_Infrastructure_t *htp_infra = static_cast(device_infra); + QnnHtpDevice_PerfInfrastructure_t *htp_perfinfra = &htp_infra->perfInfra; + uint32_t power_configid = 1; + uint32_t device_id = 0; + uint32_t core_id = 0; + htp_perfinfra->createPowerConfigId(device_id, core_id, &power_configid); + _qnn_htp_perfinfra = htp_perfinfra; + _qnn_power_configid = power_configid; + + return 0; + } + + + int set_rpc_polling() { + if (_qnn_rpc_pollingtime > 0) { + QnnHtpPerfInfrastructure_PowerConfig_t rpc_pollingTime; + memset(&rpc_pollingTime, 0, sizeof(rpc_pollingTime)); + rpc_pollingTime.option = + QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; + rpc_pollingTime.rpcPollingTimeConfig = _qnn_rpc_pollingtime; + const QnnHtpPerfInfrastructure_PowerConfig_t *powerConfigs[] = {&rpc_pollingTime, nullptr}; + if (_qnn_htp_perfinfra) { + _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, powerConfigs); + } + } + return 0; + } + + + int set_high_performance_mode() { + if (nullptr == _qnn_htp_perfinfra) { + QNN_LOG_DEBUG("perf intra is 
null\n"); + return 1; + } + + QnnHtpPerfInfrastructure_PowerConfig_t powerConfig; + memset(&powerConfig, 0, sizeof(powerConfig)); + powerConfig.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3; + powerConfig.dcvsV3Config.dcvsEnable = 0; + powerConfig.dcvsV3Config.setDcvsEnable = 1; + powerConfig.dcvsV3Config.contextId = _qnn_power_configid; + powerConfig.dcvsV3Config.powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE; + powerConfig.dcvsV3Config.setSleepLatency = 1; // True to consider Latency parameter otherwise False + powerConfig.dcvsV3Config.setBusParams = 1; // True to consider Bus parameter otherwise False + powerConfig.dcvsV3Config.setCoreParams = 1; // True to consider Core parameter otherwise False + powerConfig.dcvsV3Config.sleepDisable = 0; // True to consider sleep/LPM modes, False to enable + powerConfig.dcvsV3Config.setSleepDisable = 0; // True to consider sleep disable/enable parameter otherwise False + // set Sleep latency parameter + uint32_t latencyValue = 40; + powerConfig.dcvsV3Config.sleepLatency = latencyValue; // range 40-2000 micro sec + // set Bus Clock Parameters (refer QnnHtpPerfInfrastructure_VoltageCorner_t enum) + powerConfig.dcvsV3Config.busVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + powerConfig.dcvsV3Config.busVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + powerConfig.dcvsV3Config.busVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + // set Core Clock Parameters (refer QnnHtpPerfInfrastructure_VoltageCorner_t enum) + powerConfig.dcvsV3Config.coreVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + powerConfig.dcvsV3Config.coreVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + powerConfig.dcvsV3Config.coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + // set power config with different performance parameters + const QnnHtpPerfInfrastructure_PowerConfig_t *powerConfigs[] = {&powerConfig, nullptr}; + + _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, powerConfigs); + + return 0; + } + + std::string &get_qnn_graph_name() { return _graph_name; } + + bool is_rpcmem_initialized() { + return _rpcmem_initialized; + } + + void set_rpcmem_initialized(bool initialized) { + _rpcmem_initialized = initialized; + } + + int32_t rpcmem_to_fd(void * buf); + + int register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor); + + void unregister_rpcmem(); + + void *alloc_rpcmem(size_t bytes, size_t alignment); + + void free_rpcmem(void * buf); + + bool is_rpcmem_allocated(void * buf); + + bool is_rpcmem_registered(Qnn_MemHandle_t handle) { + return _qnn_mem_set.count(handle) != 0U; + } + +public: + std::map> _qnn_graph_map; + +private: + int load_system(); + + int unload_system(); + + int load_backend(std::string &lib_path, const QnnSaver_Config_t ** saver_config); + + int unload_backend(); + + void set_qnn_raw_interface(QNN_INTERFACE_VER_TYPE & raw_interface) { + _qnn_raw_interface = raw_interface; + } + + void set_qnn_raw_system_interface(QNN_SYSTEM_INTERFACE_VER_TYPE &raw_interface) { + _qnn_raw_system_interface = raw_interface; + } + +private: + static constexpr const int _required_num_providers = 1; + +private: + std::string _lib_path; + std::string _backend_name; + std::string _model_name; // prebuilt QNN model name, not used in currently + BackendIdType _backend_id; + + bool _debug_tensor = false; // flag to indicate if requested graph is to be run in debug mode + bool _do_node_validations = true; // flag to indicate whether all add_node calls need to be 
validated + QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; + + ggml_qnn_profile_level _profile_level = ggml_qnn_profile_level::profile_detail; + + qnn_interface _qnn_interface; + + void *_system_lib_handle = nullptr; + void *_model_lib_handle = nullptr; + + Qnn_GraphHandle_t _qnn_graph_handle = nullptr; + + Qnn_LogHandle_t _qnn_log_handle = nullptr; + + Qnn_ProfileHandle_t _qnn_profile_handle = nullptr; + + Qnn_DeviceHandle_t _qnn_device_handle = nullptr; + + Qnn_BackendHandle_t _qnn_backend_handle = nullptr; + + Qnn_ContextHandle_t _qnn_context_handle = nullptr; + + QnnSystemContext_Handle_t _qnn_system_handle = nullptr; + + QnnHtpDevice_PerfInfrastructure_t *_qnn_htp_perfinfra = nullptr; + uint32_t _qnn_power_configid = 1; + uint32_t _qnn_rpc_pollingtime = 9999; // 0-10000 us for high performing + + QNN_INTERFACE_VER_TYPE _qnn_raw_interface; + QNN_SYSTEM_INTERFACE_VER_TYPE _qnn_raw_system_interface; + + std::unordered_set _qnn_mem_set; + + static std::mutex _init_mutex; + static std::unordered_map _loaded_lib_handle; + static std::unordered_map _lib_path_to_backend_id; + static std::unordered_map _loaded_backend; + + void *_rpc_lib_handle = nullptr; + std::atomic_bool _rpcmem_initialized{false}; + pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; + pfn_rpc_mem_free _pfn_rpc_mem_free; + pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd; + pfn_rpc_mem_init _pfn_rpc_mem_init; + pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; + std::unordered_map _rpcmem_store_map; + + + std::string _graph_name; +}; + + + +// ================================================================================================= +// +// implementation of wrapper class +// +// ================================================================================================= +std::mutex qnn_instance::_init_mutex; + +std::unordered_map qnn_instance::_loaded_lib_handle; + +std::unordered_map qnn_instance::_lib_path_to_backend_id; + +std::unordered_map qnn_instance::_loaded_backend; + + +void * qnn_instance::alloc_rpcmem(size_t bytes, size_t alignment) { + if (!_rpcmem_initialized) { + QNN_LOG_WARN("rpc memory not initialized\n"); + return nullptr; + } + + auto allocate_bytes = static_cast(bytes + alignment); + void *buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, allocate_bytes); + if (buf == nullptr) { + QNN_LOG_WARN("failed to allocate rpc memory\n"); + return nullptr; + } + + auto aligned_buf = reinterpret_cast(align_to(alignment, + reinterpret_cast(buf))); + bool status = _rpcmem_store_map.insert(std::pair(aligned_buf, buf)).second; + if (!status) { + QNN_LOG_WARN("failed to allocate rpc memory\n"); + _pfn_rpc_mem_free(buf); + } + + return aligned_buf; +} + + +void qnn_instance::free_rpcmem(void * buf) { + if (!_rpcmem_initialized) { + QNN_LOG_WARN("rpc memory not initialized\n"); + } else if (0 == _rpcmem_store_map.count(buf)) { + QNN_LOG_WARN("no allocated tensor\n"); + } else { + _pfn_rpc_mem_free(_rpcmem_store_map[buf]); + _rpcmem_store_map.erase(buf); + } +} + + +int32_t qnn_instance::rpcmem_to_fd(void *buf) { + int32_t mem_fd = -1; + if (!is_rpcmem_initialized()) { + QNN_LOG_WARN("rpc memory not initialized\n"); + } else { + mem_fd = _pfn_rpc_mem_to_fd(buf); + } + + return mem_fd; +} + + +int qnn_instance::register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor) { + if (nullptr == p_data || (nullptr == p_tensor)) { + QNN_LOG_WARN("invalid param\n"); + return 1; + } + + if (!is_rpcmem_initialized()) { + QNN_LOG_WARN("rpc memory not initialized\n"); + return 2; + } + + if (is_rpcmem_allocated(p_data)) { + 
QNN_LOG_WARN("rpc memory already allocated\n"); + //return 3; + } + if (is_rpcmem_registered((QNN_VER_PTR(*p_tensor)->memHandle))) { + QNN_LOG_WARN("tensor %s has been registered shared memory\n", (QNN_VER_PTR(*p_tensor)->name)); + return 4; + } + + int32_t mem_fd = rpcmem_to_fd(p_data); + if (-1 == mem_fd) { + QNN_LOG_WARN("failed to get file descriptor\n"); + return 5; + } + QNN_LOG_DEBUG("mem_fd %d\n", mem_fd); + Qnn_MemDescriptor_t descriptor = { + {QNN_VER_PTR(*p_tensor)->rank, QNN_VER_PTR(*p_tensor)->dimensions, nullptr}, + QNN_VER_PTR(*p_tensor)->dataType, + QNN_MEM_TYPE_ION, + {{mem_fd}}}; + Qnn_MemHandle_t handle = nullptr; + int error = QNN_SUCCESS; + error = _qnn_interface.qnn_mem_register( + _qnn_context_handle, + &descriptor, + /*numDescriptors=*/1, + &handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to register shared memory, error %d, %s\n", QNN_GET_ERROR_CODE(error), + strerror(error)); + return 6; + } else { + QNN_LOG_INFO("tensor %s successfully register shared memory\n", (QNN_VER_PTR(*p_tensor)->name)); + } + QNN_VER_PTR(*p_tensor)->memHandle = handle; + _qnn_mem_set.insert(handle); + + return 0; +} + + +void qnn_instance::unregister_rpcmem() { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + + if (_qnn_mem_set.empty()) { + QNN_LOG_WARN("no rpcmem registered\n"); + } + + for (auto &mem_handle : _qnn_mem_set) { + error = _qnn_interface.qnn_mem_de_register(&mem_handle, 1); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to unregister shared memory, error %d\n", QNN_GET_ERROR_CODE(error)); + } + } + _qnn_mem_set.clear(); +} + + +bool qnn_instance::is_rpcmem_allocated(void * buf) { + return _rpcmem_store_map.count(buf) != 0U; +} + + +int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t ** saver_config) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + QNN_LOG_DEBUG("lib_path:%s\n", lib_path.c_str()); + + void *lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL); + if (nullptr == lib_handle) { + QNN_LOG_WARN("can not open QNN library %s, with error: %s", lib_path.c_str(), dlerror()); + return 1; + } + + // load get_provider function + auto get_providers = load_qnn_functionpointers<_pfn_QnnInterface_getProviders *>(lib_handle, + "QnnInterface_getProviders"); + if (nullptr == get_providers) { + QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", dlerror()); + return 2; + } + + // get QnnInterface Providers + std::uint32_t num_providers = 0; + const QnnInterface_t **provider_list = nullptr; + error = get_providers(&provider_list, &num_providers); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error)); + return 3; + } + QNN_LOG_DEBUG("num_providers=%d\n", num_providers); + if (num_providers != _required_num_providers) { + QNN_LOG_WARN("providers is %d instead of required %d", num_providers, _required_num_providers); + return 4; + } + + if (nullptr == provider_list) { + QNN_LOG_WARN("failed to get qnn interface providers\n"); + return 5; + } + bool found_valid_interface = false; + QNN_INTERFACE_VER_TYPE qnn_interface; + for (size_t idx = 0; idx < num_providers; idx++) { + if (QNN_API_VERSION_MAJOR == provider_list[idx]->apiVersion.coreApiVersion.major && + QNN_API_VERSION_MINOR <= provider_list[idx]->apiVersion.coreApiVersion.minor) { + found_valid_interface = true; + qnn_interface = provider_list[idx]->QNN_INTERFACE_VER_NAME; + break; + } + } + + if (!found_valid_interface) { + QNN_LOG_WARN("unable to find a valid qnn interface\n"); + return 6; + } else { + 
QNN_LOG_INFO("find a valid qnn interface\n"); + } + set_qnn_raw_interface(qnn_interface); + + BackendIdType backend_id = provider_list[0]->backendId; + _lib_path_to_backend_id[lib_path] = backend_id; + if (_loaded_backend.count(backend_id) > 0) { + QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists\n", + lib_path.c_str(), backend_id); + } + _loaded_backend[backend_id] = provider_list[0]; + if (_loaded_lib_handle.count(backend_id) > 0) { + QNN_LOG_WARN("closing %p\n", _loaded_lib_handle[backend_id]); + int dlclose_error = dlclose(_loaded_lib_handle[backend_id]); + if (dlclose_error != 0) { + QNN_LOG_WARN("fail to close %p with error %s\n", _loaded_lib_handle[backend_id], dlerror()); + } + } + _loaded_lib_handle[backend_id] = lib_handle; + _backend_id = backend_id; + + auto saver_initialize = load_qnn_functionpointers<_pfn_QnnSaver_initialize *>( + _loaded_lib_handle[backend_id], "QnnSaver_initialize"); + if (nullptr != saver_initialize) { + error = saver_initialize(saver_config); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to saver_initialize,error %d", QNN_GET_ERROR_CODE(error)); + return 7; + } + } else { + QNN_LOG_WARN("saver_initialize is null\n"); + } + + return 0; +} + + +int qnn_instance::unload_backend() { + int dlclose_error = 0; + for (auto &it : _loaded_lib_handle) { + dlclose_error = dlclose(it.second); + if (dlclose_error != 0) { + QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, dlerror()); + } + } + + _loaded_lib_handle.clear(); + _lib_path_to_backend_id.clear(); + _loaded_backend.clear(); + + return 0; +} + + +int qnn_instance::load_system() { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + + std::string system_lib_path = _lib_path + "libQnnSystem.so"; + QNN_LOG_DEBUG("system_lib_path:%s\n", system_lib_path.c_str()); + + _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); + if (nullptr == _system_lib_handle) { + QNN_LOG_WARN("can not pen QNN library %s, error: %s\n", system_lib_path.c_str(), dlerror()); + return 1; + } + + auto *get_providers = reinterpret_cast<_pfn_QnnSystemInterface_getProviders *>(dlsym( + _system_lib_handle, "QnnSystemInterface_getProviders")); + if (nullptr == get_providers) { + QNN_LOG_WARN("can not load QNN symbol QnnSystemInterface_getProviders: %s\n", dlerror()); + return 2; + } + + uint32_t num_providers = 0; + const QnnSystemInterface_t ** provider_list = nullptr; + error = get_providers(&provider_list, &num_providers); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to get providers, error %d\n", QNN_GET_ERROR_CODE(error)); + return 3; + } + + if (num_providers != _required_num_providers) { + QNN_LOG_WARN("providers is %d instead of required %d\n", num_providers, _required_num_providers); + return 4; + } + + if (nullptr == provider_list) { + QNN_LOG_WARN("can not get providers\n"); + return 5; + } + + QNN_SYSTEM_INTERFACE_VER_TYPE qnn_system_interface; + bool found_valid_system_interface = false; + for (size_t idx = 0; idx < num_providers; idx++) { + if (QNN_SYSTEM_API_VERSION_MAJOR == + provider_list[idx]->systemApiVersion.major && + QNN_SYSTEM_API_VERSION_MINOR <= + provider_list[idx]->systemApiVersion.minor) { + found_valid_system_interface = true; + qnn_system_interface = provider_list[idx]->QNN_SYSTEM_INTERFACE_VER_NAME; + break; + } + } + if (!found_valid_system_interface) { + QNN_LOG_WARN("unable to find a valid qnn system interface\n"); + return 6; + } else { + QNN_LOG_INFO("find a valid qnn system interface\n"); + } + set_qnn_raw_system_interface(qnn_system_interface); 
+ + _qnn_interface.set_qnn_system_interface(provider_list[0]); + + _qnn_interface.qnn_system_context_create(&_qnn_system_handle); + if (nullptr == _qnn_system_handle) { + LOGW("can not create QNN system contenxt\n"); + } else { + QNN_LOG_DEBUG("initialize qnn system successfully\n"); + } + + return 0; +} + + +int qnn_instance::unload_system() { + int result = 0; + + if (nullptr == _system_lib_handle) { + QNN_LOG_DEBUG("system lib handle is null\n"); + return 1; + } + + if (nullptr != _qnn_system_handle) { + result = _qnn_interface.qnn_system_context_free(_qnn_system_handle); + if (result != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN system context\n"); + } + _qnn_system_handle = nullptr; + } + + int dlclose_error = dlclose(_system_lib_handle); + if (dlclose_error != 0) { + QNN_LOG_WARN("failed to close QnnSystem library, error %s\n", dlerror()); + return 2; + } + + _system_lib_handle = nullptr; + + return 0; +} + + +static void ggml_qnn_logcallback(const char * fmt, + QnnLog_Level_t level, + uint64_t timestamp, + va_list argp) { + + static std::mutex log_mutex; + static unsigned char s_ggml_qnn_logbuf[GGML_QNN_LOGBUF_LEN]; + + const char * levelStr = ""; + switch (level) { + case QNN_LOG_LEVEL_ERROR: + levelStr = " ERROR "; + break; + case QNN_LOG_LEVEL_WARN: + levelStr = "WARNING"; + break; + case QNN_LOG_LEVEL_INFO: + levelStr = " INFO "; + break; + case QNN_LOG_LEVEL_DEBUG: + levelStr = " DEBUG "; + break; + case QNN_LOG_LEVEL_VERBOSE: + levelStr = "VERBOSE"; + break; + case QNN_LOG_LEVEL_MAX: + levelStr = "UNKNOWN"; + break; + } + + double ms = (double) timestamp / 1000000.0; + + { + std::lock_guard lock(log_mutex); + + int len_content = 0; + memset(s_ggml_qnn_logbuf, 0, GGML_QNN_LOGBUF_LEN); + len_content = vsnprintf(reinterpret_cast(s_ggml_qnn_logbuf), GGML_QNN_LOGBUF_LEN, fmt, argp); + //QNN_LOG_DEBUG("%8.1fms [%-7s] %s ", ms, levelStr, s_ggml_qnn_logbuf); + } +} + + +int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { + BackendIdType backend_id = QNN_BACKEND_ID_NULL; + QNN_LOG_DEBUG("enter qni_init\n"); + + const std::lock_guard lock(_init_mutex); + + if (0 != load_system()) { + QNN_LOG_WARN("can not load QNN system lib, pls check why?\n"); + return 1; + } else { + QNN_LOG_DEBUG("load QNN system lib successfully\n"); + } + + std::string bakend_lib_path = _lib_path + _backend_name; + if (0 == _lib_path_to_backend_id.count(bakend_lib_path)) { + int is_load_ok = load_backend(bakend_lib_path, saver_config); + if (0 != is_load_ok) { + QNN_LOG_WARN("failed to load QNN backend\n"); + return 2; + } + } + + backend_id = _lib_path_to_backend_id[bakend_lib_path]; + if (0 == _loaded_backend.count(backend_id) || + 0 == _loaded_lib_handle.count(backend_id)) { + QNN_LOG_WARN("library %s is loaded but loaded backend count=%zu, loaded lib_handle count=%zu\n", + bakend_lib_path.c_str(), + _loaded_backend.count(backend_id), + _loaded_lib_handle.count(backend_id)); + return 3; + } + + _qnn_interface.set_qnn_interface(_loaded_backend[backend_id]); + +#if 1 + _qnn_interface.qnn_log_create(ggml_qnn_logcallback, _qnn_log_level, &_qnn_log_handle); +#else + _qnn_raw_interface.logCreate(ggml_qnn_logcallback, _qnn_log_level, &_qnn_log_handle); +#endif + if (nullptr == _qnn_log_handle) { + QNN_LOG_WARN("why failed to initialize qnn log\n"); //DSP backend not work on Qualcomm SoC based low-end phone + return 4; + } else { + QNN_LOG_DEBUG("initialize qnn log successfully\n"); + } + + + std::vector temp_backend_config; + _qnn_interface.qnn_backend_create(_qnn_log_handle, 
temp_backend_config.empty() ? nullptr + : temp_backend_config.data(), + &_qnn_backend_handle); + if (nullptr == _qnn_backend_handle) { + QNN_LOG_WARN("why failed to initialize qnn backend\n"); + return 5; + } else { + QNN_LOG_DEBUG("initialize qnn backend successfully\n"); + } + + if (nullptr != _qnn_raw_interface.propertyHasCapability) { + auto qnnStatus = _qnn_raw_interface.propertyHasCapability(QNN_PROPERTY_GROUP_DEVICE); + if (QNN_PROPERTY_NOT_SUPPORTED == qnnStatus) { + QNN_LOG_WARN("device property is not supported\n"); + } + if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnnStatus) { + QNN_LOG_WARN("device property is not known to backend\n"); + } + } + + auto qnnStatus = _qnn_raw_interface.deviceCreate(_qnn_log_handle, nullptr, &_qnn_device_handle); + if (QNN_SUCCESS != qnnStatus && QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnnStatus) { + QNN_LOG_WARN("failed to create QNN device\n"); + } else { + QNN_LOG_INFO("create device successfully\n"); + } + + /* + std::vector temp_device_config; + _qnn_interface.qnn_device_create(_qnn_log_handle, temp_device_config.empty() ? nullptr : temp_device_config.data(), &_qnn_device_handle); + if (nullptr == _qnn_device_handle) { + QNN_LOG_WARN("why failed to initialize qnn device\n"); + //return 6; + } + */ + + if (ggml_qnn_profile_level::profile_off != _profile_level) { + QNN_LOG_INFO("profiling turned on; level = %d", _profile_level); + if (ggml_qnn_profile_level::profile_basic == _profile_level) { + QNN_LOG_INFO("basic profiling requested. creating Qnn Profile object\n"); + if (QNN_PROFILE_NO_ERROR != _qnn_raw_interface.profileCreate( + _qnn_backend_handle, QNN_PROFILE_LEVEL_BASIC, &_qnn_profile_handle)) { + QNN_LOG_WARN("unable to create profile handle in the backend\n"); + return 7; + } else { + QNN_LOG_DEBUG("initialize qnn profile successfully\n"); + } + } else if (ggml_qnn_profile_level::profile_detail == _profile_level) { + QNN_LOG_INFO("detailed profiling requested. Creating Qnn Profile object\n"); + if (QNN_PROFILE_NO_ERROR != _qnn_raw_interface.profileCreate( + _qnn_backend_handle, QNN_PROFILE_LEVEL_DETAILED, &_qnn_profile_handle)) { + QNN_LOG_WARN("unable to create profile handle in the backend\n"); + return 7; + } else { + QNN_LOG_DEBUG("initialize qnn profile successfully\n"); + } + } + } + + + _rpc_lib_handle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL); + if (nullptr == _rpc_lib_handle) { + QNN_LOG_WARN("failed to load qualcomm's rpc lib, error:%s\n", dlerror()); + return 9; + } else { + QNN_LOG_DEBUG("load rpcmem lib successfully\n"); + set_rpcmem_initialized(true); + } + _pfn_rpc_mem_init = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_init")); + _pfn_rpc_mem_deinit = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_deinit")); + _pfn_rpc_mem_alloc = reinterpret_cast(dlsym(_rpc_lib_handle,"rpcmem_alloc")); + _pfn_rpc_mem_free = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_free")); + _pfn_rpc_mem_to_fd = reinterpret_cast(dlsym(_rpc_lib_handle,"rpcmem_to_fd")); + if (nullptr == _pfn_rpc_mem_alloc || nullptr == _pfn_rpc_mem_free + || nullptr == _pfn_rpc_mem_to_fd) { + QNN_LOG_WARN("unable to access symbols in QNN RPC lib. dlerror(): %s", dlerror()); + dlclose(_rpc_lib_handle); + return 10; + } + + if (nullptr != _pfn_rpc_mem_init) // make Qualcomm's SoC based low-end phone happy + _pfn_rpc_mem_init(); + + std::vector temp_context_config; + _qnn_interface.qnn_context_create(_qnn_backend_handle, _qnn_device_handle, + temp_context_config.empty() ? 
nullptr + : temp_context_config.data(), + &_qnn_context_handle); + if (nullptr == _qnn_context_handle) { + QNN_LOG_WARN("why failed to initialize qnn context\n"); + return 8; + } else { + QNN_LOG_DEBUG("initialize qnn context successfully\n"); + } + + QNN_LOG_DEBUG("leave qni_init\n"); + + return 0; +} + + +//QNN SDK would/might/should release all allocated resource in SDK's internal +int qnn_instance::qnn_finalize() { + int ret_status = 0; + Qnn_ErrorHandle_t error = QNN_SUCCESS; + + if (nullptr != _pfn_rpc_mem_deinit) // make Qualcomm's SoC based low-end phone happy + _pfn_rpc_mem_deinit(); + + if (dlclose(_rpc_lib_handle) != 0) { + QNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s\n", dlerror()); + } else { + QNN_LOG_DEBUG("succeed to close rpcmem lib\n"); + } + + if (nullptr != _qnn_context_handle) { + error = _qnn_interface.qnn_context_free(_qnn_context_handle, _qnn_profile_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); + + } + _qnn_context_handle = nullptr; + } + + if (nullptr != _qnn_profile_handle) { + error = _qnn_interface.qnn_profile_free(_qnn_profile_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); + + } + _qnn_profile_handle = nullptr; + } + + if (nullptr != _qnn_device_handle) { + error = _qnn_interface.qnn_device_free(_qnn_device_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN device_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); + + } + _qnn_device_handle = nullptr; + } + + if (nullptr != _qnn_backend_handle) { + error = _qnn_interface.qnn_backend_free(_qnn_backend_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); + } + _qnn_backend_handle = nullptr; + + } + + if (nullptr != _qnn_log_handle) { + error = _qnn_interface.qnn_log_free(_qnn_log_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN log_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); + } + _qnn_log_handle = nullptr; + } + + unload_backend(); + + unload_system(); + + return ret_status; +} + + +int qnn_instance::init_qnn_graph(const char * graph_name, bool debug, uint8_t do_node_validation, + const QnnGraph_Config_t ** graph_configs) { + int result = 0; + + if (nullptr == graph_name) { + QNN_LOG_WARN("graph name is null\n"); + return 1; + } + + if (!_graph_name.empty()) { + QNN_LOG_WARN("qnn model for graph %s already initialized\n", graph_name); + return 2; + } + + if (!do_node_validation) { + QNN_LOG_WARN("node validation disabled, backend will not perform op validation prior to adding node\n"); + } + + _graph_name = graph_name; + _debug_tensor = debug; + _do_node_validations = do_node_validation; + + result = _qnn_raw_interface.graphCreate(_qnn_context_handle, graph_name, graph_configs, + &_qnn_graph_handle); + if (result != QNN_GRAPH_NO_ERROR || nullptr == _qnn_graph_handle) { + QNN_LOG_WARN("failed to create graph in qnn context\n"); + return 3; + } else { + QNN_LOG_INFO("succeed to create graph %s, %p\n", graph_name, _qnn_graph_handle); + } + + return 0; +} + + +int qnn_instance::finalize_qnn_graph() { + if (nullptr != _qnn_graph_handle) { + if (_qnn_raw_interface.graphFinalize(_qnn_graph_handle, 
_qnn_profile_handle, nullptr) != + QNN_GRAPH_NO_ERROR) { + QNN_LOG_WARN("finalizing graph failure\n"); + //return 1; + } + } else { + QNN_LOG_DEBUG("qnn graph handle is null\n"); + } + + return 0; +} + + + +// ================================================================================================= +// +// implementation of GGML's QNN backend +// +// ================================================================================================= +static bool ggml_qnn_can_handle_op(const struct ggml_tensor * src0, const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + + //double check + bool supported_op = ((dst->op == GGML_OP_ADD) || (dst->op == GGML_OP_MUL) || (dst->op == GGML_OP_MUL_MAT)); + if (!supported_op) { + QNN_LOG_DEBUG("op %d(%s)not support", dst->op, ggml_op_name(dst->op)); + return false; + } + + + //make QNN SDK happy + if (dst->op == GGML_OP_ADD) { + return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) && + (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16) && + (dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16) && ((ne00 > 1 && ne01 > 1 && ne10 > 1 && ne11 > 1)) && + (src0->rank == src1->rank); + + } + + if (dst->op == GGML_OP_MUL_MAT) { +#if 1 // log output have significant effect to performance but useful during development stage + QNN_LOG_DEBUG("GGML_OP_MUL_MAT"); + QNN_LOG_INFO("%15s: rank = %d, type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, src0->rank, + src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], + src0->nb[0], src0->nb[1], src0->nb[2]); + QNN_LOG_INFO("%15s: rank = %d, type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, src1->rank, + src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], + src1->nb[0], src1->nb[1], src1->nb[2]); + QNN_LOG_INFO("%15s: rank = %d, type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, dst->rank, + dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], + dst->nb[1], dst->nb[2]); +#endif + } + + //make QNN SDK happy + return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) && + (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16) && + (src0->type == src1->type) && (src0->type == dst->type) && ((ne00 > 1 && ne01 > 1 && ne10 > 1 && ne11 > 1)); + + +} + + +static void ggml_qnn_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + bool graph_initialized = false; + int64_t n_begin_time = 0LL; + int64_t n_end_time = 0LL; + int64_t n_durtion = 0LL; + + qnn_instance * instance = nullptr; + struct ggml_backend_qnn_context * ctx = nullptr; + + std::string graph_name = "ggml_op_qnn_add"; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Tensor_t * tensor_0 = nullptr; + Qnn_Tensor_t * tensor_1 = nullptr; + Qnn_Tensor_t * tensor_2 = nullptr; + + Qnn_QuantizeParams_t quantize_param = QNN_QUANTIZE_PARAMS_INIT; + Qnn_OpConfig_t qnn_opconfig = QNN_OPCONFIG_INIT; + Qnn_Param_t qnn_params[] = {}; + + enum ggml_op ggmlop = GGML_OP_ADD; + Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t 
dst_qnn_type = QNN_DATATYPE_FLOAT_32; + + + if ((nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { + QNN_LOG_WARN("pls check why GGML tensor is null"); + return; + } + tensor_0 = (Qnn_Tensor_t *)src0->extra; + tensor_1 = (Qnn_Tensor_t *)src1->extra; + tensor_2 = (Qnn_Tensor_t *)dst->extra; + if ((nullptr == tensor_0) || (nullptr == tensor_1) || (nullptr == tensor_2)) { + QNN_LOG_WARN("pls check why QNN tensor is null"); + return; + } + ctx = (struct ggml_backend_qnn_context *)g_qnn_backend->context; + if (nullptr == ctx) { + QNN_LOG_WARN("pls check why backend ctx is null"); + return; + } + instance = ctx->instance; + if (nullptr == instance) { + QNN_LOG_WARN("pls check why qnn instance is null"); + return; + } + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + + n_begin_time = ggml_time_us(); +#if 0 //it works fine with whisper.cpp and llama.cpp. comment them because focus on mulmat in llama.cpp inference since 04-23-2024 + QNN_LOG_DEBUG("call %s\n", __func__); + QNN_LOG_INFO("%15s: rank = %d, type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, src0->rank, + src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], + src0->nb[0], src0->nb[1], src0->nb[2]); + QNN_LOG_INFO("%15s: rank = %d, type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, src1->rank, + src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], + src1->nb[0], src1->nb[1], src1->nb[2]); + QNN_LOG_INFO("%15s: rank = %d, type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, dst->rank, + dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], + dst->nb[1], dst->nb[2]); + QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); + QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(tensor_0)); + QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(tensor_1)); + QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(tensor_2)); +#endif + + QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; + + uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + + + src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); + src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); + dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); + + std::string map_entry = std::string(ggml_op_name(ggmlop)); + if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { + graph_initialized = true; + auto & graph_item = instance->_qnn_graph_map[map_entry]; + graph_handle = std::get<0>(graph_item); + } + + if (!graph_initialized) { + graph_name = graph_name + "_" + std::to_string(ctx->threads) + src0->name + "_" + src1->name; + QNN_LOG_DEBUG("graph name %s", graph_name.c_str()); + //QnnGraph_Config_t graph_config; + //graph_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + //graph_config.customConfig = strdup(graph_name.c_str()); + //const QnnGraph_Config_t * p_graph_config = 
&graph_config; + error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, &graph_handle); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); + return; + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + + Qnn_Tensor_t tensor_inputs[] = { + *tensor_0, + *tensor_1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *tensor_2 + }; + Qnn_OpConfig_t opconfig = { + (Qnn_OpConfigVersion_t) 1, .v1 = { + "ggml_op_add", + QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_ELEMENT_WISE_ADD, + 0, + qnn_params, + 2, + tensor_inputs, + 1, + tensor_outputs + } + }; + error = qnn_raw_interface.graphAddNode(graph_handle, opconfig); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); + instance->_qnn_graph_map[map_entry] = graph_item; + } else { + auto & graph_item = instance->_qnn_graph_map[map_entry]; + graph_handle = std::get<0>(graph_item); + tensor_0 = std::get<1>(graph_item); + tensor_1 = std::get<2>(graph_item); + tensor_2 = std::get<3>(graph_item); + + //comment them because focus on mulmat in llama.cpp inference since 04-23-2024 + //QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); + uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; + QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; + QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; + QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + + Qnn_Tensor_t tensor_inputs[] = { + *tensor_0, + 
*tensor_1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *tensor_2 + }; + error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + } + n_end_time = ggml_time_us(); + n_durtion = (n_end_time - n_begin_time) / 1000; + //comment them because focus on mulmat in llama.cpp inference since 04-23-2024 + //QNN_LOG_DEBUG("duration of ggml_qnn_add : %lld milliseconds\n", n_durtion); + //QNN_LOG_DEBUG("call %s done\n", __func__); +} + + + +/* + * ggml_qnn_mul_mat was re-added as a standalone function because + * the following comments came from https://github.com/ggerganov/llama.cpp/pull/1632 + * MUL_MAT take most of the compute time (about 95%). So to speed up llama, we have to focus on MUL_MAT. + * We have three kinds of MUL_MAT to compute: + * mul_mat_f32: both src0 and src1 are F32. + * mul_mat_f16_f32: src0 is F16 and src1 is F32. + * mul_mat_q_f32: src0 is quantized (Q4_0, Q4_1, ...), and src1 is F32. +*/ + +static void ggml_qnn_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + bool graph_initialized = false; + int64_t n_begin_time = 0LL; + int64_t n_end_time = 0LL; + int64_t n_durtion = 0LL; + + qnn_instance * instance = nullptr; + struct ggml_backend_qnn_context * ctx = nullptr; + + std::string graph_name = "ggml_op_qnn_mul_mat"; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Tensor_t * tensor_0 = nullptr; + Qnn_Tensor_t * tensor_1 = nullptr; + Qnn_Tensor_t * tensor_2 = nullptr; + + Qnn_QuantizeParams_t quantize_param = QNN_QUANTIZE_PARAMS_INIT; + Qnn_OpConfig_t qnn_opconfig = QNN_OPCONFIG_INIT; + Qnn_Param_t qnn_params[] = {}; + + enum ggml_op ggmlop = GGML_OP_ADD; + Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; + + + if ((nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { + QNN_LOG_WARN("pls check why GGML tensor is null"); + return; + } + tensor_0 = (Qnn_Tensor_t *)src0->extra; + tensor_1 = (Qnn_Tensor_t *)src1->extra; + tensor_2 = (Qnn_Tensor_t *)dst->extra; + if ((nullptr == tensor_0) || (nullptr == tensor_1) || (nullptr == tensor_2)) { + QNN_LOG_WARN("pls check why QNN tensor is null"); + return; + } + ctx = (struct ggml_backend_qnn_context *)g_qnn_backend->context; + if (nullptr == ctx) { + QNN_LOG_WARN("pls check why backend ctx is null"); + return; + } + instance = ctx->instance; + if (nullptr == instance) { + QNN_LOG_WARN("pls check why qnn instance is null"); + return; + } + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + + n_begin_time = ggml_time_us(); + QNN_LOG_DEBUG("call %s\n", __func__); + QNN_LOG_INFO("%15s: rank = %d, type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, src0->rank, + src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], + src0->nb[0], src0->nb[1], src0->nb[2]); + QNN_LOG_INFO("%15s: rank = %d, type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, src1->rank, + src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], + src1->nb[0], src1->nb[1], src1->nb[2]); + QNN_LOG_INFO("%15s: rank = %d, type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, dst->rank, + dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], 
dst->ne[2], dst->nb[0], + dst->nb[1], dst->nb[2]); + QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); + QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(tensor_0)); + QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(tensor_1)); + QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(tensor_2)); + + QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; + + uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + + src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); + src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); + dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); + + std::string map_entry = std::string(ggml_op_name(ggmlop)); + if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { + graph_initialized = true; + auto & graph_item = instance->_qnn_graph_map[map_entry]; + graph_handle = std::get<0>(graph_item); + } + + if (!graph_initialized) { + graph_name = graph_name + "_" + std::to_string(ctx->threads) + src0->name + "_" + src1->name; + QNN_LOG_DEBUG("graph name %s", graph_name.c_str()); + error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, &graph_handle); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); + return; + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + + Qnn_Tensor_t tensor_inputs[] = { + *tensor_0, + *tensor_1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *tensor_2 + }; + Qnn_OpConfig_t opconfig = { + (Qnn_OpConfigVersion_t) 1, .v1 = { + "ggml_op_mul_mat", + QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_MAT_MUL, + 0, + qnn_params, + 2, + tensor_inputs, + 1, + tensor_outputs + } + }; + error = qnn_raw_interface.graphAddNode(graph_handle, opconfig); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); + instance->_qnn_graph_map[map_entry] = graph_item; + } else { + auto & graph_item = instance->_qnn_graph_map[map_entry]; + 
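    // Editor's note: graph_item fetched just above is the cache-hit path. The
    // first execution of an op builds and finalizes a QNN graph and stores it
    // in _qnn_graph_map as a tuple of (graph handle, input tensor 0, input
    // tensor 1, output tensor); the std::get calls below unpack that tuple and
    // only refresh the tensor metadata before re-executing the cached graph.
    // A reduced sketch of the lookup pattern follows; the alias and helper
    // names are illustrative only and not part of this file.
#if 0 // illustrative sketch, never compiled
#include <map>
#include <string>
#include <tuple>

using qnn_graph_cache_t = std::map<std::string,
        std::tuple<Qnn_GraphHandle_t, Qnn_Tensor_t *, Qnn_Tensor_t *, Qnn_Tensor_t *>>;

// returns the cached graph handle, or nullptr when the caller still has to
// build, finalize and insert a graph for this key
static Qnn_GraphHandle_t lookup_cached_graph(qnn_graph_cache_t & cache, const std::string & key) {
    auto it = cache.find(key);
    if (it == cache.end()) {
        return nullptr;
    }
    return std::get<0>(it->second);
}
#endif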
graph_handle = std::get<0>(graph_item); + tensor_0 = std::get<1>(graph_item); + tensor_1 = std::get<2>(graph_item); + tensor_2 = std::get<3>(graph_item); + + QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); + uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; + QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; + QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; + QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + + Qnn_Tensor_t tensor_inputs[] = { + *tensor_0, + *tensor_1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *tensor_2 + }; + error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + } + n_end_time = ggml_time_us(); + n_durtion = (n_end_time - n_begin_time) / 1000; + QNN_LOG_DEBUG("duration of ggml_qnn_mul_mat : %lld milliseconds\n", n_durtion); + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +//common function for GGML OPs using QNN API +static void ggml_qnn_hanlde_op(const enum ggml_op ggmlop, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + bool graph_initialized = false; + int64_t n_begin_time = 0LL; + int64_t n_end_time = 0LL; + int64_t n_durtion = 0LL; + + qnn_instance * instance = nullptr; + struct ggml_backend_qnn_context * ctx = nullptr; + + std::string qnn_graph_name = "ggml_qnn_graph"; + std::string qnn_opconfig_name = "ggml_qnn_opconfig"; + const char * qnn_op_name = nullptr; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Tensor_t * tensor_0 = nullptr; + Qnn_Tensor_t * tensor_1 = nullptr; + Qnn_Tensor_t * tensor_2 = nullptr; + + Qnn_QuantizeParams_t quantize_param = QNN_QUANTIZE_PARAMS_INIT; + Qnn_OpConfig_t qnn_opconfig = QNN_OPCONFIG_INIT; + Qnn_Param_t qnn_params[] = {}; + + Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; + + + if ((nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { + QNN_LOG_WARN("pls check why GGML tensor is null"); + return; + } + tensor_0 = (Qnn_Tensor_t *)src0->extra; + tensor_1 = (Qnn_Tensor_t *)src1->extra; + tensor_2 = (Qnn_Tensor_t *)dst->extra; + if ((nullptr == tensor_0) || (nullptr == tensor_1) || (nullptr == tensor_2)) { + QNN_LOG_WARN("pls check why QNN tensor is null"); + return; + } + ctx = (struct ggml_backend_qnn_context *)g_qnn_backend->context; + if (nullptr == ctx) { + QNN_LOG_WARN("pls check why backend ctx is null"); + return; + } + 
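    // Editor's note: this common handler funnels every supported GGML op
    // through one path -- map the GGML op to a QNN operator name, build a
    // single-node graph around it, then finalize, execute and cache it. The
    // real mapping is done by qnn_opname_from_ggmlop(), defined earlier in
    // this file; the sketch below only illustrates the kind of table such a
    // helper encodes (GGML op -> QNN_OP_* macro from the QNN SDK) and may
    // differ from the actual implementation.
#if 0 // illustrative sketch, never compiled
static const char * example_qnn_opname_from_ggmlop(enum ggml_op op) {
    switch (op) {
        case GGML_OP_ADD:     return QNN_OP_ELEMENT_WISE_ADD; // "ElementWiseAdd"
        case GGML_OP_MUL_MAT: return QNN_OP_MAT_MUL;          // "MatMul"
        // GGML_OP_MUL would map to the QNN element-wise multiply operator
        default:              return nullptr;                 // op not handled via QNN yet
    }
}
#endif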
instance = ctx->instance; + if (nullptr == instance) { + QNN_LOG_WARN("pls check why qnn instance is null"); + return; + } + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + + src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); + src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); + dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); + qnn_op_name = qnn_opname_from_ggmlop(ggmlop); + if (nullptr == qnn_op_name) { + QNN_LOG_WARN("pls check why can not get QNN OP name with ggml op %d(%s)", ggmlop, ggml_op_name(ggmlop)); + return; + } + + n_begin_time = ggml_time_us(); + QNN_LOG_DEBUG("call %s\n", __func__); + QNN_LOG_INFO("%15s: rank = %d, type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, src0->rank, + src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], + src0->nb[0], src0->nb[1], src0->nb[2]); + QNN_LOG_INFO("%15s: rank = %d, type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, src1->rank, + src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], + src1->nb[0], src1->nb[1], src1->nb[2]); + QNN_LOG_INFO("%15s: rank = %d, type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, dst->rank, + dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], + dst->nb[1], dst->nb[2]); + QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); + QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(tensor_0)); + QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(tensor_1)); + QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(tensor_2)); + + QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; + + uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + + std::string map_entry = std::string(ggml_op_name(ggmlop)); + if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { + graph_initialized = true; + auto & graph_item = instance->_qnn_graph_map[map_entry]; + graph_handle = std::get<0>(graph_item); + } + + if (!graph_initialized) { + qnn_graph_name = qnn_graph_name + "_" + ggml_op_name(ggmlop) + std::to_string(ctx->threads) + src0->name + "_" + src1->name; + qnn_opconfig_name = qnn_opconfig_name + "_" + ggml_op_name(ggmlop) + std::to_string(ctx->threads) + src0->name + "_" + src1->name; + QNN_LOG_DEBUG("qnn graph name %s", qnn_graph_name.c_str()); + QNN_LOG_DEBUG("qnn opconfig name %s", qnn_opconfig_name.c_str()); + error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), qnn_graph_name.c_str(), nullptr, &graph_handle); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("can't create qnn graph handle with ggml op %s, graph name %s, error = %d\n", ggml_op_name(ggmlop), qnn_graph_name.c_str(), error); + return; + } + + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = 
qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + + Qnn_Tensor_t tensor_inputs[] = { + *tensor_0, + *tensor_1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *tensor_2 + }; + Qnn_OpConfig_t opconfig = { + (Qnn_OpConfigVersion_t) 1, .v1 = { + qnn_opconfig_name.c_str(), + QNN_OP_PACKAGE_NAME_QTI_AISW, + qnn_op_name, + 0, + qnn_params, + 2, + tensor_inputs, + 1, + tensor_outputs + } + }; + error = qnn_raw_interface.graphAddNode(graph_handle, opconfig); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); + instance->_qnn_graph_map[map_entry] = graph_item; + } else { + auto & graph_item = instance->_qnn_graph_map[map_entry]; + graph_handle = std::get<0>(graph_item); + tensor_0 = std::get<1>(graph_item); + tensor_1 = std::get<2>(graph_item); + tensor_2 = std::get<3>(graph_item); + + QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); + uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; + QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; + QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; + QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + + Qnn_Tensor_t tensor_inputs[] = { + *tensor_0, + *tensor_1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *tensor_2 + }; + error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + } + n_end_time = ggml_time_us(); + n_durtion = (n_end_time - n_begin_time) / 1000; + QNN_LOG_DEBUG("duration of ggml_qnn_%s : %lld milliseconds\n", ggml_op_name(ggmlop), n_durtion); + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_repeat(const ggml_tensor * 
src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_get_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_acc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + + + + +static void ggml_qnn_div(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_gelu_quick(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_tanh(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_hardsigmoid(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_hardswish(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_leaky_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_sqr(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_group_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_concat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_upscale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_pad(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_rms_norm(const ggml_tensor * src0, const 
ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_qnn_cpy(src0, dst, nullptr); + (void) src1; +} + + +static void ggml_qnn_mul_mat_id(const ggml_tensor * src0, + const ggml_tensor * src1, + ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_clamp(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); + +} + + +static void ggml_qnn_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_pool2d(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_sum_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_argsort(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + (void) src0; + (void) src1; + (void) dst; + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +bool ggml_qnn_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { + ggml_qnn_func_t func = nullptr; + ggml_qnn_func_common_t func_common = nullptr; + + bool supported_op = false; + + bool use_hwaccel = false; + + //begin sanity check + if (nullptr == g_qnn_backend) { + QNN_LOG_ERROR("pls check why qnn subsystem not initialized"); + return false; + } + + //this is special scenario for UT function qnn_ggml_op + //borrow some advantages from PyTorch:the user or the upper layer codes could specify 
whether a GGML OP(such as add/mul/mulmat) is accelerated by a specify backend) + //otherwise ggml-qnn.cpp don't known whether current caller is whisper.cpp or other scenario(for example, JNI function...) + + //in the all, use_hwaccel is different with supported_op + //this feature is heavily depend on PR in upstream whisper.cpp https://github.com/ggerganov/whisper.cpp/pull/2073 + use_hwaccel = (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU); + + supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL) || (tensor->op == GGML_OP_MUL_MAT)); + //supported_op = (tensor->op == GGML_OP_ADD); //works very good with whisper.cpp(asr result is correct) + + if ((!use_hwaccel) && (!supported_op)) { + //TODO: should be removed because this is a workaround method during development stage + ggml_compute_forward(params, tensor); + return false; + } + + if ((!use_hwaccel) && (!ggml_qnn_can_handle_op(tensor->src[0], tensor->src[1], tensor))) { + //TODO: should be removed because this is a workaround method during development stage + ggml_compute_forward(params, tensor); + return false; + } + //end sanity check + + switch (tensor->op) { + case GGML_OP_ADD: + func = ggml_qnn_add; + //func_common = ggml_qnn_hanlde_op; + break; + + case GGML_OP_MUL: + func_common = ggml_qnn_hanlde_op; + break; + + case GGML_OP_MUL_MAT: + func = ggml_qnn_mul_mat; + //func_common = ggml_qnn_hanlde_op; + break; + + case GGML_OP_REPEAT: + func = ggml_qnn_repeat; + break; + case GGML_OP_GET_ROWS: + func = ggml_qnn_get_rows; + break; + case GGML_OP_DUP: + func = ggml_qnn_dup; + break; + + case GGML_OP_ACC: + func = ggml_qnn_acc; + break; + + case GGML_OP_DIV: + func = ggml_qnn_div; + break; + + case GGML_OP_UNARY: + switch (ggml_get_unary_op(tensor)) { + case GGML_UNARY_OP_GELU: + func = ggml_qnn_gelu; + break; + case GGML_UNARY_OP_SILU: + func = ggml_qnn_silu; + break; + case GGML_UNARY_OP_GELU_QUICK: + func = ggml_qnn_gelu_quick; + break; + case GGML_UNARY_OP_TANH: + func = ggml_qnn_tanh; + break; + case GGML_UNARY_OP_RELU: + func = ggml_qnn_relu; + break; + case GGML_UNARY_OP_HARDSIGMOID: + func = ggml_qnn_hardsigmoid; + break; + case GGML_UNARY_OP_HARDSWISH: + func = ggml_qnn_hardswish; + break; + default: + return false; + } + break; + case GGML_OP_NORM: + func = ggml_qnn_norm; + break; + case GGML_OP_GROUP_NORM: + func = ggml_qnn_group_norm; + break; + case GGML_OP_CONCAT: + func = ggml_qnn_concat; + break; + case GGML_OP_UPSCALE: + func = ggml_qnn_upscale; + break; + case GGML_OP_PAD: + func = ggml_qnn_pad; + break; + case GGML_OP_LEAKY_RELU: + func = ggml_qnn_leaky_relu; + break; + case GGML_OP_RMS_NORM: + func = ggml_qnn_rms_norm; + break; + + case GGML_OP_MUL_MAT_ID: + func = ggml_qnn_mul_mat_id; + break; + case GGML_OP_SCALE: + func = ggml_qnn_scale; + break; + case GGML_OP_SQR: + func = ggml_qnn_sqr; + break; + case GGML_OP_CLAMP: + func = ggml_qnn_clamp; + break; + case GGML_OP_CPY: + func = ggml_qnn_cpy; + break; + case GGML_OP_CONT: + func = ggml_qnn_dup; + break; + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + func = ggml_qnn_nop; + break; + case GGML_OP_DIAG_MASK_INF: + func = ggml_qnn_diag_mask_inf; + break; + case GGML_OP_SOFT_MAX: + func = ggml_qnn_soft_max; + break; + case GGML_OP_ROPE: + func = ggml_qnn_rope; + break; + case GGML_OP_ALIBI: + func = ggml_qnn_alibi; + break; + case GGML_OP_IM2COL: + func = ggml_qnn_im2col; + break; + case GGML_OP_POOL_2D: + func = ggml_qnn_pool2d; + break; + case GGML_OP_SUM_ROWS: + func = 
ggml_qnn_sum_rows; + break; + case GGML_OP_ARGSORT: + func = ggml_qnn_argsort; + break; + default: + return false; + } + + + //ok, real show time in Qualcomm's QNN internal + if (nullptr != func) + func(tensor->src[0], tensor->src[1], tensor); + if (nullptr != func_common) + func_common(tensor->op, tensor->src[0], tensor->src[1], tensor); + + return true; +} + + +struct ggml_backend_qnn_buffer_context { + ~ggml_backend_qnn_buffer_context() { + if (buffer) { + free(buffer); + } + for (auto * sub_buffer : sub_buffers) { + free(sub_buffer); + } + + for (auto * qnn_tensor : qnn_tensors) { + free_qnn_tensor(*qnn_tensor); + free(qnn_tensor); + } + + std::map>::iterator graph_it; + struct ggml_backend_qnn_context * ctx = (struct ggml_backend_qnn_context *) g_qnn_backend->context; + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->instance->get_qnn_raw_interface(); + for (graph_it = backend_ctx->instance->_qnn_graph_map.begin(); graph_it != backend_ctx->instance->_qnn_graph_map.end(); graph_it++) { + auto & graph_item = graph_it->second; + Qnn_GraphHandle_t & graph_handle = std::get<0>(graph_item); + QNN_LOG_DEBUG("graph type:%s", graph_it->first.c_str()); + } + backend_ctx->instance->_qnn_graph_map.clear(); + + sub_buffers.clear(); + qnn_tensors.clear(); + } + void * buffer = nullptr; + + struct ggml_backend_qnn_context * backend_ctx = nullptr; + + size_t buffer_size = 0; + std::vector sub_buffers; + std::vector qnn_tensors; +}; + +static const char * ggml_backend_qnn_buffer_get_name(ggml_backend_buffer_t buffer) { + GGML_UNUSED(buffer); + return "QNN"; +} + + +GGML_CALL static bool ggml_backend_buffer_is_qnn(ggml_backend_buffer_t buffer) { + return buffer->iface.get_name == ggml_backend_qnn_buffer_get_name; +} + + +static void ggml_backend_qnn_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context; + delete ctx; +} + + +//TODO:not used +static void * ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t buffer) { + ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context; + + return ctx->buffer; +} + + +static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *)buffer->context; + + /* + if (tensor->view_src != nullptr && tensor->view_offs == 0) { + assert(tensor->view_src->buffer->buft == buffer->buft); + tensor->backend = tensor->view_src->backend; + tensor->extra = tensor->view_src->extra; + return; + } + */ + + uint32_t dimensions[] = {(uint32_t) tensor->ne[0], (uint32_t) tensor->ne[1], (uint32_t) tensor->ne[2], (uint32_t) tensor->ne[3]}; + //TODO:only support FP32 & FP16 + Qnn_DataType_t qnn_data_type = QNN_DATATYPE_FLOAT_32; + Qnn_TensorType_t qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; + + + if (tensor->flags & GGML_TENSOR_FLAG_INPUT) { + qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; + } else if (tensor->flags & GGML_TENSOR_FLAG_OUTPUT) { + qnn_tensor_type = QNN_TENSOR_TYPE_APP_READ; + } + Qnn_Tensor_t qnn_tensor = { + .version= QNN_TENSOR_VERSION_1, + {.v1= { + .id=0, + .name= tensor->name, + .type= qnn_tensor_type, + .dataFormat= QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, + .dataType= qnn_data_type, + .quantizeParams= {QNN_DEFINITION_UNDEFINED, + QNN_QUANTIZATION_ENCODING_UNDEFINED, + {.scaleOffsetEncoding= {.scale= 0.0000000000000000f, .offset= 0}}}, + .rank= ggml_get_tensor_rank(tensor), + 
.dimensions=dimensions, + .memType= QNN_TENSORMEMTYPE_RAW, + {.clientBuf= {.data=nullptr, + .dataSize=0}}}} + }; + Qnn_Tensor_t * p_qnn_tensor = (Qnn_Tensor_t *)malloc(sizeof(Qnn_Tensor_t)); + if (nullptr == p_qnn_tensor) { + QNN_LOG_WARN("init tensor failed"); + return; + } + Qnn_Tensor_t tensor_copy; + error = deep_copy_qnn_tensors(qnn_tensor, *p_qnn_tensor); + if (error != QNN_SUCCESS) { + free(p_qnn_tensor); + QNN_LOG_DEBUG("init tensor failed"); + return; + } + tensor->extra = p_qnn_tensor; + ctx->qnn_tensors.push_back(p_qnn_tensor); + + if (ggml_is_quantized(tensor->type)) { + //TODO + QNN_LOG_DEBUG("is quantized"); + } +} + + +static void ggml_backend_qnn_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + GGML_UNUSED(buffer); + + memcpy((char *)tensor->data + offset, data, size); +} + + +static void ggml_backend_qnn_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { + GGML_UNUSED(buffer); + memcpy(data, (const char *)tensor->data + offset, size); +} + + +static bool ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { + GGML_UNUSED(buffer); + if (ggml_backend_buffer_is_host(src->buffer)) { + memcpy(dst->data, src->data, ggml_nbytes(src)); + return true; + } + + return false; +} + + +static void ggml_backend_qnn_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context; + + memset(ctx->buffer, value, ctx->buffer_size); +} + + + +static void ggml_backend_qnn_buffer_reset(ggml_backend_buffer_t buffer) { + ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context; + for (auto * sub_buffer : ctx->sub_buffers) { + free(sub_buffer); + } + ctx->sub_buffers.clear(); +} + + +static ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = { + /* .get_name = */ ggml_backend_qnn_buffer_get_name, + /* .free_buffer = */ ggml_backend_qnn_buffer_free_buffer, + /* .get_base = */ ggml_backend_qnn_buffer_get_base, + /* .init_tensor = */ ggml_backend_qnn_buffer_init_tensor, + /* .set_tensor = */ ggml_backend_qnn_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_qnn_buffer_get_tensor, + /* .cpy_tensor = */ ggml_backend_qnn_buffer_cpy_tensor, + /* .clear = */ ggml_backend_qnn_buffer_clear, + /* .reset = */ nullptr, +}; + + +static const char * ggml_backend_qnn_buffer_type_name(ggml_backend_buffer_type_t buft) { + return "QNN"; +} + + +static void * ggml_qnn_host_malloc(size_t n) { + void * data = nullptr; + const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n); + if (result != 0) { + QNN_LOG_WARN("%s: error: posix_memalign failed\n", __func__); + return nullptr; + } + + return data; +} + + +static ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + ggml_backend_qnn_buffer_context * ctx = new ggml_backend_qnn_buffer_context; + + const size_t size_page = sysconf(_SC_PAGESIZE); + + size_t size_aligned = size; + if ((size_aligned % size_page) != 0) { + size_aligned += (size_page - (size_aligned % size_page)); + } + + //TODO:use pre-allocated buffer in internal memory pool + ctx->buffer = ggml_qnn_host_malloc(size_aligned); + ctx->buffer_size = size_aligned; + + ctx->backend_ctx = &g_qnn_mgr[g_current_device]; + + if (nullptr == ctx->buffer) { + QNN_LOG_WARN("%s: failed to allocate %.2f 
MiB\n", __func__, size / (1 << 20)); + return nullptr; + } + + return ggml_backend_buffer_init(buft, ggml_backend_qnn_buffer_interface, ctx, size); +} + + +static size_t ggml_backend_qnn_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + return 32; +} + + +//TODO: this value is an experimental value +static size_t ggml_backend_qnn_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + + return (38 * 1024 * 1024); +} + + +static bool ggml_backend_qnn_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, + ggml_backend_t backend) { + GGML_UNUSED(buft); + + return ggml_backend_is_qnn(backend) || ggml_backend_is_cpu(backend); +} + + +// attention here because Qualcomm's QNN SDK is a highly well-designed SDK +// +// refer to https://developer.qualcomm.com/sites/default/files/attachments/qnn_software_stack.png +// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/overview.html +static bool ggml_backend_qnn_buffer_is_host(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + return true; +} + +static ggml_backend_buffer_type_i ggml_backend_qnn_buffer_type_interface = { + /* .get_name = */ ggml_backend_qnn_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_qnn_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_qnn_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_qnn_buffer_type_get_max_size, + /* .get_alloc_size = */ nullptr, + /* .supports_backend = */ ggml_backend_qnn_buffer_type_supports_backend, + /* .is_host = */ ggml_backend_qnn_buffer_is_host +}; + + +static const char * ggml_backend_qnn_name(ggml_backend_t backend) { + return "QNN"; +} + + +static void ggml_backend_qnn_free(ggml_backend_t backend) { + QNN_LOG_INFO("enter %s", __func__ ); + ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; + QNN_LOG_DEBUG("idx %d, name:%s", ctx->device, g_qnn_mgr[ctx->device].name); + + qnn_instance * instance = (qnn_instance*)g_qnn_mgr[ctx->device].instance; + if (instance != nullptr) { + instance->qnn_finalize(); + delete instance; + g_qnn_mgr[ctx->device].instance = nullptr; + } + + qnn_buf_t * buffer_pool = (qnn_buf_t*)g_qnn_mgr[ctx->device].buffer_pool; + if (buffer_pool != nullptr) { + buffer_pool->destroy(buffer_pool); + g_qnn_mgr[ctx->device].buffer_pool = nullptr; + } + + if (g_qnn_mgr[ctx->device].backend != nullptr) { + delete backend; + g_qnn_backend = nullptr; + g_qnn_mgr[ctx->device].backend = nullptr; + } + QNN_LOG_INFO("leave %s", __func__ ); +} + + +static ggml_backend_buffer_type_t ggml_backend_qnn_get_default_buffer_type(ggml_backend_t backend) { + ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; + + return ggml_backend_qnn_buffer_type(ctx->device); +} + + +#if 0 +static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const ggml_tensor * op) { + GGML_UNUSED(backend); + + switch (op->op) { + case GGML_OP_UNARY: + switch (ggml_get_unary_op(op)) { + case GGML_UNARY_OP_GELU: + case GGML_UNARY_OP_SILU: + case GGML_UNARY_OP_RELU: + case GGML_UNARY_OP_HARDSIGMOID: + case GGML_UNARY_OP_HARDSWISH: + case GGML_UNARY_OP_GELU_QUICK: + case GGML_UNARY_OP_TANH: + return true; + default: + return false; + } + break; + case GGML_OP_MUL_MAT: + case GGML_OP_MUL_MAT_ID: { + struct ggml_tensor *a; + struct ggml_tensor *b; + if (op->op == GGML_OP_MUL_MAT) { + a = op->src[0]; + b = op->src[1]; + } else { + a = op->src[2]; + b = op->src[1]; + } + if (a->ne[3] != b->ne[3]) { + return false; + } + ggml_type a_type = 
a->type; + if (a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ2_S || + a_type == GGML_TYPE_IQ4_XS) { + return false; + } + return true; + } + break; + case GGML_OP_GET_ROWS: { + switch (op->src[0]->type) { + case GGML_TYPE_F16: + case GGML_TYPE_F32: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + return true; + default: + return false; + } + } + break; + case GGML_OP_CPY: { + ggml_type src0_type = op->src[0]->type; + ggml_type src1_type = op->src[1]->type; + if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) { + return true; + } + if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) { + return true; + } + if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q8_0) { + return true; + } + if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_0) { + return true; + } + if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_1) { + return true; + } + if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) { + return true; + } + if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) { + return true; + } + return false; + } + break; + case GGML_OP_CONCAT: { + ggml_type src0_type = op->src[0]->type; + return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16; + } + break; + case GGML_OP_DUP: + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_REPEAT: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + case GGML_OP_NORM: + case GGML_OP_ADD: + case GGML_OP_MUL: + case GGML_OP_DIV: + case GGML_OP_RMS_NORM: + case GGML_OP_SCALE: + case GGML_OP_SQR: + case GGML_OP_CLAMP: + case GGML_OP_CONT: + case GGML_OP_DIAG_MASK_INF: + case GGML_OP_SOFT_MAX: + case GGML_OP_ROPE: + case GGML_OP_ALIBI: + case GGML_OP_IM2COL: + case GGML_OP_POOL_2D: + case GGML_OP_SUM_ROWS: + case GGML_OP_ARGSORT: + case GGML_OP_ACC: + case GGML_OP_GROUP_NORM: + case GGML_OP_UPSCALE: + case GGML_OP_PAD: + case GGML_OP_LEAKY_RELU: + return true; + default: + return false; + } +} +# else +static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const ggml_tensor * op) { + GGML_UNUSED(backend); + + switch (op->op) { + case GGML_OP_MUL_MAT: + return true; + default: + return false; + } +} +#endif + + +static ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { + enum ggml_status result = GGML_STATUS_SUCCESS; + int node_n = -1; + int task_phase = GGML_TASK_TYPE_FINALIZE; + ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; + + struct ggml_cplan plan = ggml_graph_plan(cgraph, 1); + + buf_element_t * qnn_buf = nullptr; + + if (plan.work_size > 0) { + //plan.work_data = static_cast(malloc(plan.work_size)); + plan.work_data = static_cast(ctx->buffer_pool->buffer_pool_base); + if (plan.work_data == nullptr) { + QNN_LOG_ERROR("malloc failed"); + return GGML_STATUS_FAILED; + } + } + struct ggml_cplan * cplan = &plan; + GGML_ASSERT(cplan->n_threads > 0); + if (cplan->work_size > 0) { + GGML_ASSERT(cplan->work_data); + } + + while (true) { + if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { + result = GGML_STATUS_ABORTED; + break; + } + struct ggml_compute_params params = { + /*.type =*/ GGML_TASK_TYPE_FINALIZE, + /*.ith =*/ 0, + /*.nth =*/ 0, + /*.wsize =*/ cplan->work_size, + /*.wdata =*/ cplan->work_data, + }; + + if (node_n != -1) { + /* FINALIZE */ + struct ggml_tensor * node = cgraph->nodes[node_n]; + if (GGML_OP_HAS_FINALIZE[node->op]) { + params.nth = 1; + 
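                // note: this single-threaded driver mirrors ggml's CPU scheduler: each node is
                // pushed through up to three task phases (GGML_TASK_TYPE_INIT, _COMPUTE, _FINALIZE),
                // and ggml_qnn_compute_forward() decides per node whether to dispatch to a QNN
                // kernel or fall back to ggml_compute_forward() on the CPU;
                // params.type is still GGML_TASK_TYPE_FINALIZE from the initializer above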
ggml_qnn_compute_forward(¶ms, node); + } + } + + while (++node_n < cgraph->n_nodes) { + struct ggml_tensor * node = cgraph->nodes[node_n]; + params.nth = 1; + if (GGML_OP_HAS_INIT[node->op]) { + params.type = GGML_TASK_TYPE_INIT; + ggml_qnn_compute_forward(¶ms, node); + } + params.type = GGML_TASK_TYPE_COMPUTE; + ggml_qnn_compute_forward(¶ms, node); + if (GGML_OP_HAS_FINALIZE[node->op]) { + params.type = GGML_TASK_TYPE_FINALIZE; + ggml_qnn_compute_forward(¶ms, node); + } + if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { + result = GGML_STATUS_ABORTED; + break; + } + } + task_phase = GGML_TASK_TYPE_INIT; + if (node_n >= cgraph->n_nodes) { + //QNN_LOG_INFO("node_n %d", node_n); + //QNN_LOG_INFO("cgraph->n_nodes %d", cgraph->n_nodes); + break; + } + } + + //free(plan.work_data); + + return result; +} + + +struct ggml_compute_state_shared { + const struct ggml_cgraph * cgraph; + const struct ggml_cplan * cplan; + + int64_t perf_node_start_cycles; + int64_t perf_node_start_time_us; + + const int n_threads; + + // synchronization primitives + atomic_int n_active; // num active threads + atomic_int node_n; // active graph node + atomic_int node_task; // active graph node task phase + + ggml_abort_callback abort_callback; // abort ggml_graph_compute when true + void * abort_callback_data; +}; + +struct ggml_compute_state { + pthread_t thrd; + int ith; + struct ggml_compute_state_shared * shared; + enum ggml_status ec; +}; + + +#ifdef GGML_PERF +#define ggml_perf_time_ms() ggml_time_ms() +#define ggml_perf_time_us() ggml_time_us() +#define ggml_perf_cycles() ggml_cycles() +#define ggml_perf_cycles_per_ms() ggml_cycles_per_ms() +#else +#define ggml_perf_time_ms() 0 +#define ggml_perf_time_us() 0 +#define ggml_perf_cycles() 0 +#define ggml_perf_cycles_per_ms() 0 +#endif +#undef MIN +#undef MAX + +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define MAX(a, b) ((a) > (b) ? 
(a) : (b)) + + +static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) { + int64_t cycles_cur = ggml_perf_cycles() - st->perf_node_start_cycles; + int64_t time_us_cur = ggml_perf_time_us() - st->perf_node_start_time_us; + + node->perf_runs++; + node->perf_cycles += cycles_cur; + node->perf_time_us += time_us_cur; +} + + +static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_compute_state * state, const bool do_yield) { + // wait for other threads to finish + const int last_node_n = * node_n; + + while (true) { + if (do_yield) { + sched_yield(); + } + + * node_n = atomic_load(&state->shared->node_n); + if (* node_n != last_node_n) break; + } +} + + +static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) { + // wait for other threads to finish + const int last_task_phase = * task_phase; + + while (true) { + if (do_yield) { + sched_yield(); + } + + * task_phase = atomic_load(&state->shared->node_task); + if (* task_phase != last_task_phase) break; + } +} + + +static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_threads) { + int n_tasks = 0; + + if (ggml_is_empty(node)) { + // no need to multi-thread a no-op + n_tasks = 1; + return n_tasks; + } + + switch (node->op) { + case GGML_OP_CPY: + case GGML_OP_DUP: + case GGML_OP_ADD: + case GGML_OP_ADD1: + case GGML_OP_ACC: { + n_tasks = n_threads; + } + break; + case GGML_OP_SUB: + case GGML_OP_SQR: + case GGML_OP_SQRT: + case GGML_OP_LOG: + case GGML_OP_SUM: + case GGML_OP_SUM_ROWS: + case GGML_OP_MEAN: + case GGML_OP_ARGMAX: + case GGML_OP_REPEAT: + case GGML_OP_REPEAT_BACK: + case GGML_OP_LEAKY_RELU: { + n_tasks = 1; + } + break; + case GGML_OP_UNARY: + switch (ggml_get_unary_op(node)) { + case GGML_UNARY_OP_ABS: + case GGML_UNARY_OP_SGN: + case GGML_UNARY_OP_NEG: + case GGML_UNARY_OP_STEP: + case GGML_UNARY_OP_TANH: + case GGML_UNARY_OP_ELU: + case GGML_UNARY_OP_RELU: + case GGML_UNARY_OP_HARDSWISH: + case GGML_UNARY_OP_HARDSIGMOID: { + n_tasks = 1; + } + break; + + case GGML_UNARY_OP_GELU: + case GGML_UNARY_OP_GELU_QUICK: + case GGML_UNARY_OP_SILU: { + n_tasks = n_threads; + } + break; + default: + GGML_ASSERT(false); + } + break; + case GGML_OP_SILU_BACK: + case GGML_OP_MUL: + case GGML_OP_DIV: + case GGML_OP_NORM: + case GGML_OP_RMS_NORM: + case GGML_OP_RMS_NORM_BACK: + case GGML_OP_GROUP_NORM: + case GGML_OP_CONCAT: { + n_tasks = n_threads; + } + break; + case GGML_OP_MUL_MAT: { + n_tasks = n_threads; + } + break; + case GGML_OP_MUL_MAT_ID: { + n_tasks = n_threads; + } + break; + case GGML_OP_OUT_PROD: { + n_tasks = n_threads; + } + break; + case GGML_OP_GET_ROWS: { + n_tasks = MIN(n_cur_threads, ggml_nelements(node->src[1])); + } + break; + case GGML_OP_SCALE: + case GGML_OP_SET: + case GGML_OP_CONT: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + case GGML_OP_GET_ROWS_BACK: + case GGML_OP_DIAG: { + n_tasks = 1; + } + break; + case GGML_OP_DIAG_MASK_ZERO: + case GGML_OP_DIAG_MASK_INF: + case GGML_OP_SOFT_MAX_BACK: + case GGML_OP_ROPE: + case GGML_OP_ROPE_BACK: + case GGML_OP_ADD_REL_POS: { + n_tasks = n_threads; + } + break; + case GGML_OP_ALIBI: { + n_tasks = 1; + } + break; + case GGML_OP_CLAMP: { + n_tasks = 1; + } + break; + case GGML_OP_SOFT_MAX: { + n_tasks = MIN(n_threads, ggml_nrows(node->src[0])); + } + break; + case GGML_OP_CONV_TRANSPOSE_1D: { + n_tasks = n_threads; + } + break; + case GGML_OP_IM2COL: { + 
n_tasks = n_threads; + } + break; + case GGML_OP_CONV_TRANSPOSE_2D: { + n_tasks = n_threads; + } + break; + case GGML_OP_POOL_1D: + case GGML_OP_POOL_2D: { + n_tasks = 1; + } + break; + case GGML_OP_UPSCALE: { + n_tasks = n_threads; + } + break; + case GGML_OP_PAD: { + n_tasks = n_threads; + } + break; + case GGML_OP_ARANGE: { + n_tasks = n_threads; + } + break; + case GGML_OP_TIMESTEP_EMBEDDING: { + n_tasks = n_threads; + } + break; + case GGML_OP_ARGSORT: { + n_tasks = n_threads; + } + break; + case GGML_OP_FLASH_ATTN: { + n_tasks = n_threads; + } + break; + case GGML_OP_FLASH_FF: { + n_tasks = n_threads; + } + break; + case GGML_OP_FLASH_ATTN_BACK: { + n_tasks = n_threads; + } + break; + case GGML_OP_SSM_CONV: + case GGML_OP_SSM_SCAN: { + n_tasks = n_threads; + } + break; + case GGML_OP_WIN_PART: + case GGML_OP_WIN_UNPART: + case GGML_OP_GET_REL_POS: + case GGML_OP_MAP_UNARY: + case GGML_OP_MAP_BINARY: + case GGML_OP_MAP_CUSTOM1_F32: + case GGML_OP_MAP_CUSTOM2_F32: + case GGML_OP_MAP_CUSTOM3_F32: { + n_tasks = 1; + } + break; + case GGML_OP_MAP_CUSTOM1: { + QNN_LOG_ERROR("not support"); + } + break; + case GGML_OP_MAP_CUSTOM2: { + QNN_LOG_ERROR("not support"); + } + break; + case GGML_OP_MAP_CUSTOM3: { + QNN_LOG_ERROR("not support"); + } + break; + case GGML_OP_CROSS_ENTROPY_LOSS: { + n_tasks = n_threads; + } + break; + case GGML_OP_CROSS_ENTROPY_LOSS_BACK: { + n_tasks = n_threads; + } + break; + case GGML_OP_NONE: { + n_tasks = 1; + } + break; + case GGML_OP_COUNT: { + GGML_ASSERT(false); + } + break; + default: { + QNN_LOG_WARN("%s: op not implemented: ", __func__); + if (node->op < GGML_OP_COUNT) { + QNN_LOG_DEBUG("%s\n", ggml_op_name(node->op)); + } else { + QNN_LOG_DEBUG("%d\n", node->op); + } + GGML_ASSERT(false); + } + break; + } + + assert(n_tasks > 0); + + return n_tasks; +} + + +static void * ggml_graph_compute_thread(void * data) { + struct ggml_compute_state * state = (struct ggml_compute_state *) data; + + const struct ggml_cgraph * cgraph = state->shared->cgraph; + const struct ggml_cplan * cplan = state->shared->cplan; + + const int n_threads = state->shared->n_threads; + + int node_n = -1; + int task_phase = GGML_TASK_TYPE_FINALIZE; + + while (true) { + if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { + state->shared->node_n += 1; + state->ec = GGML_STATUS_ABORTED; + return 0; + } + + if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) { + // all other threads are finished and spinning + // do finalize and init here so we don't have synchronize again + struct ggml_compute_params params = { + /*.type =*/ GGML_TASK_TYPE_FINALIZE, + /*.ith =*/ 0, + /*.nth =*/ 0, + /*.wsize =*/ cplan->work_size, + /*.wdata =*/ cplan->work_data, + }; + + if (node_n != -1) { + /* FINALIZE */ + struct ggml_tensor * node = cgraph->nodes[node_n]; + if (GGML_OP_HAS_FINALIZE[node->op]) { + params.nth = ggml_get_n_tasks(node, n_threads, state->shared->n_threads); + ggml_qnn_compute_forward(¶ms, node); + } + ggml_graph_compute_perf_stats_node(node, state->shared); + } + + // distribute new work or execute it direct if 1T + while (++node_n < cgraph->n_nodes) { + //QNN_LOG_INFO("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes); + struct ggml_tensor * node = cgraph->nodes[node_n]; + const int n_tasks = ggml_get_n_tasks(node, n_threads, state->shared->n_threads); + + state->shared->perf_node_start_cycles = ggml_perf_cycles(); + state->shared->perf_node_start_time_us = ggml_perf_time_us(); + + params.nth = n_tasks; + + if (n_tasks == 1) { + /* INIT */ + if 
(GGML_OP_HAS_INIT[node->op]) { + params.type = GGML_TASK_TYPE_INIT; + ggml_qnn_compute_forward(¶ms, node); + } + + // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1, + // they do something more efficient than spinning (?) + params.type = GGML_TASK_TYPE_COMPUTE; + ggml_qnn_compute_forward(¶ms, node); + + if (GGML_OP_HAS_FINALIZE[node->op]) { + params.type = GGML_TASK_TYPE_FINALIZE; + ggml_qnn_compute_forward(¶ms, node); + } + + ggml_graph_compute_perf_stats_node(node, state->shared); + } else { + break; + } + + if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { + break; + } + } + + task_phase = GGML_TASK_TYPE_INIT; + atomic_store(&state->shared->n_active, n_threads); + atomic_store(&state->shared->node_n, node_n); + atomic_store(&state->shared->node_task, task_phase); + } else { + ggml_graph_compute_thread_sync_node(&node_n, state, false); + ggml_graph_compute_thread_sync_task(&task_phase, state, false); + } + + // check if we should stop + if (node_n >= cgraph->n_nodes) break; + + /* INIT & COMPUTE */ + struct ggml_tensor * node = cgraph->nodes[node_n]; + const int n_tasks = ggml_get_n_tasks(node, n_threads, state->shared->n_threads); + + struct ggml_compute_params params = { + /*.type =*/ GGML_TASK_TYPE_INIT, + /*.ith =*/ state->ith, + /*.nth =*/ n_tasks, + /*.wsize =*/ cplan->work_size, + /*.wdata =*/ cplan->work_data, + }; + + if (state->ith < n_tasks) { + if (GGML_OP_HAS_INIT[node->op]) { + ggml_qnn_compute_forward(¶ms, node); + } + } + + if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) { + task_phase = GGML_TASK_TYPE_COMPUTE; + atomic_store(&state->shared->n_active, n_threads); + atomic_store(&state->shared->node_task, task_phase); + } + else { + const bool do_yield = node_n < 0 || cgraph->nodes[node_n]->op == GGML_OP_MUL_MAT; + ggml_graph_compute_thread_sync_task(&task_phase, state, do_yield); + } + + if (state->ith < n_tasks) { + params.type = GGML_TASK_TYPE_COMPUTE; + ggml_qnn_compute_forward(¶ms, node); + } + + if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) { + task_phase = GGML_TASK_TYPE_FINALIZE; + atomic_store(&state->shared->n_active, n_threads); + atomic_store(&state->shared->node_task, task_phase); + } + else { + ggml_graph_compute_thread_sync_task(&task_phase, state, false); + } + } + + return 0; +} + + +static ggml_status ggml_backend_qnn_graph_compute_multithread(ggml_backend_t backend, ggml_cgraph * cgraph) { + ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; + + int num_threads = ctx->threads; + + if (QNN_GPU == ctx->device || QNN_HTP == ctx->device) { + //TODO:multithreading not supported using QNN GPU/HTP(aka DSP) backend + num_threads = 1; + } + struct ggml_cplan plan = ggml_graph_plan(cgraph, num_threads); + + + if (plan.work_size > 0) { + //QNN_LOG_INFO("work size %d(%d MB)", plan.work_size, plan.work_size / (1 << 20)); + plan.work_data = static_cast(malloc(plan.work_size)); + if (plan.work_data == nullptr) { + QNN_LOG_ERROR("malloc failed"); + return GGML_STATUS_FAILED; + } + } + + struct ggml_cplan * cplan = &plan; + GGML_ASSERT(cplan->n_threads > 0); + if (cplan->work_size > 0) { + GGML_ASSERT(cplan->work_data); + } + + //QNN_LOG_DEBUG("cgraph %p, cplan %p, work size %d, work data %p", cgraph, cplan, cplan->work_size, cplan->work_data); + const int n_threads = cplan->n_threads; + + struct ggml_compute_state_shared state_shared = { + /*.cgraph =*/ cgraph, + /*.cgraph_plan =*/ cplan, + /*.perf_node_start_cycles =*/ 0, + /*.perf_node_start_time_us =*/ 0, + 
/*.n_threads =*/ n_threads, + /*.n_active =*/ n_threads, + /*.node_n =*/ -1, + /*.node_task =*/ GGML_TASK_TYPE_FINALIZE, + /*.abort_callback =*/ nullptr, + /*.abort_callback_data =*/ nullptr, + }; + struct ggml_compute_state * workers = (struct ggml_compute_state*)alloca(sizeof(struct ggml_compute_state) * n_threads); + if (nullptr == workers) { + QNN_LOG_ERROR("malloc failed"); + if (plan.work_data != nullptr) { + free(plan.work_data); + } + return GGML_STATUS_FAILED; + } + + // create thread pool + if (n_threads > 1) { + for (int j = 1; j < n_threads; ++j) { + workers[j] = (struct ggml_compute_state) { + .thrd = 0, + .ith = j, + .shared = &state_shared, + .ec = GGML_STATUS_SUCCESS, + }; + + const int rc = pthread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]); + GGML_ASSERT(rc == 0); + } + } + + workers[0].ith = 0; + workers[0].shared = &state_shared; + workers[0].ec = GGML_STATUS_SUCCESS; + + // this is a work thread too + ggml_graph_compute_thread(&workers[0]); + enum ggml_status compute_status = workers[0].ec; + + // join or kill thread pool + if (n_threads > 1) { + for (int j = 1; j < n_threads; j++) { + const int rc = pthread_join(workers[j].thrd, NULL); + GGML_ASSERT(rc == 0); + if (workers[j].ec != GGML_STATUS_SUCCESS) + compute_status = workers[j].ec; + } + } + + if (plan.work_data != nullptr) { + free(plan.work_data); + } + + return compute_status; +} + + +static bool ggml_backend_qnn_offload_op(ggml_backend_t backend, const ggml_tensor * op) { + GGML_UNUSED(backend); + + const int min_batch_size = 32; + + return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS; +} + + +static ggml_backend_i ggml_backend_qnn_interface = { + /* .get_name = */ ggml_backend_qnn_name, + /* .free = */ ggml_backend_qnn_free, + /* .get_default_buffer_type = */ ggml_backend_qnn_get_default_buffer_type, + /* .set_tensor_async = */ nullptr, + /* .get_tensor_async = */ nullptr, + /* .cpy_tensor_async = */ nullptr, + /* .synchronize = */ nullptr, + /* .graph_plan_create = */ nullptr, + /* .graph_plan_free = */ nullptr, + /* .graph_plan_compute = */ nullptr, + /* .graph_compute = */ ggml_backend_qnn_graph_compute_multithread, + /* .supports_op = */ ggml_backend_qnn_supports_op, + /* .offload_op = */ nullptr, + /* .event_new = */ nullptr, + /* .event_free = */ nullptr, + /* .event_record = */ nullptr, + /* .event_wait = */ nullptr, + /* .event_synchronize = */ nullptr, +}; + + +static ggml_guid_t ggml_backend_qnn_guid() { + static ggml_guid guid = {0x1a, 0x2b, 0x3c, 0x4d, 0x5e, 0x6f, 0x70, 0x81, 0x92, 0xa3, 0xb4, 0xc5, + 0xd6, 0xe7, 0xf8, 0x09}; + return &guid; +} + + +static ggml_backend_t ggml_backend_qnn_reg_init(const char * params, void * user_data) { + ggml_backend_t qnn_backend = ggml_backend_qnn_init((int) (intptr_t) user_data, params); + + return qnn_backend; +} + + +bool ggml_backend_is_qnn(ggml_backend_t backend) { + return backend != nullptr && ggml_guid_matches(backend->guid, ggml_backend_qnn_guid()); +} + + +void ggml_backend_qnn_set_n_threads(ggml_backend_t backend, int n_threads) { + GGML_ASSERT(ggml_backend_is_qnn(backend)); + + struct ggml_backend_qnn_context * ctx = (struct ggml_backend_qnn_context *)backend->context; + ctx->threads = n_threads; +} + +const char * ggml_backend_qnn_get_name(ggml_backend_t backend) { + return backend->iface.get_name(backend); +} + +int ggml_backend_qnn_get_device_count() { + return GGML_QNN_MAX_DEVICES; +} + + +void ggml_backend_qnn_get_device_description(int device, char * description, size_t description_size) { + if (nullptr 
== description || 0 == description_size) { + QNN_LOG_WARN("invalid param"); + return; + } + + if (device >= GGML_QNN_MAX_DEVICES) { + QNN_LOG_WARN("invalid param"); + return; + } + + snprintf(description, description_size, "%s", g_qnn_mgr[device].name); + QNN_LOG_DEBUG("description:%s", description); +} + + +ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device_index) { + if (device_index >= GGML_QNN_MAX_DEVICES) { + QNN_LOG_DEBUG("ggml_backend_qnn_buffer_type error: device_index:%d is out of range [0, %d]\n", + device_index, GGML_QNN_MAX_DEVICES - 1); + return nullptr; + } + + static struct ggml_backend_buffer_type ggml_backend_buffer_type_qnn = { + /* .iface = */ { + /* .get_name = */ ggml_backend_qnn_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_qnn_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_qnn_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_qnn_buffer_type_get_max_size, + /* .get_alloc_size = */ nullptr,// defaults to ggml_nbytes + /* .supports_backend = */ ggml_backend_qnn_buffer_type_supports_backend, + /* .is_host = */ ggml_backend_qnn_buffer_is_host + }, + /* .context = */ nullptr, + }; + + return &ggml_backend_buffer_type_qnn; +} + + +/** + * + * @param device 0: QNN_CPU 1: QNN_GPU 2: QNN_HTP(aka DSP) + * @param qnn_lib_path qnn library path, such as "/data/data/com.ggml.llamacpp/" on Android which can got by JNI from Java layer + * @return + */ +ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { + int result = 0; + + if (nullptr == qnn_lib_path) + return nullptr; + + QNN_LOG_DEBUG("device %d", device); + QNN_LOG_DEBUG("qnn_lib_path %s", qnn_lib_path); + if (device >= GGML_QNN_MAX_DEVICES) { + QNN_LOG_ERROR("invalid device %d", device); + return nullptr; + } + + if (nullptr != g_qnn_mgr[device].backend) { + QNN_LOG_ERROR("qnn backend %d(%s) already loaded, it should not happened, pls check why?", device, get_qnn_backend_name(device)); + if (device == g_current_device) { + g_qnn_backend = g_qnn_mgr[device].backend; + QNN_LOG_INFO("re-use cached backend %d(%s)", device, get_qnn_backend_name(device)); + return g_qnn_mgr[device].backend; + } else { + QNN_LOG_INFO("delete previous backend %d(%s)", device, get_qnn_backend_name(device)); + ggml_backend_qnn_free(g_qnn_backend); + } + } + + static bool is_first_call = true; + if (is_first_call) { + ggml_setup_op_has_task_pass(); + is_first_call = false; + } + + if (QNN_HTP == device) { + std::string path = qnn_lib_path; + if (0 == setenv("LD_LIBRARY_PATH", + (path + + ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/dsp:/vendor/dsp/images").c_str(), + 1)) { + QNN_LOG_INFO("QNN DSP backend setenv successfully"); + } else { + QNN_LOG_ERROR("QNN DSP backend setenv failure"); + } + if (0 == setenv("ADSP_LIBRARY_PATH", + (path + + ";/vendor/dsp/cdsp;/vendor/lib/rfsa/adsp;/system/lib/rfsa/adsp;/vendor/dsp/dsp;/vendor/dsp/images;/dsp").c_str(), + 1)) { + QNN_LOG_INFO("QNN DSP backend setenv successfully"); + } else { + QNN_LOG_ERROR("QNN DSP backend setenv failure"); + } + } + + qnn_instance * instance = nullptr; + instance = new qnn_instance(qnn_lib_path, g_qnn_mgr[device].lib, ""); + result = instance->qnn_init(nullptr); + if (0 != result) { + QNN_LOG_WARN("init qnn subsystem failed with qnn backend %s, pls check why\n", get_qnn_backend_name(device)); + delete instance; + return nullptr; + } + qnn_interface qnn_interface = instance->get_qnn_interface(); + if (!qnn_interface.is_loaded()) { + QNN_LOG_WARN("qnn subsystem failure\n"); + delete instance; + 
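        // no usable interface table came back for the backend library named in
        // g_qnn_mgr[device].lib, so give up on this device instead of returning
        // a half-initialized backend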
return nullptr; + } + + std::string device_name = GGML_QNN_NAME + std::string("_") + std::to_string(device) + std::string("_") + get_qnn_backend_name(device); + QNN_LOG_INFO("qnn device name %s", device_name.c_str()); + instance->init_qnn_graph(device_name.c_str(), false); + g_qnn_mgr[device].instance = instance; + g_qnn_mgr[device].raw_interface = instance->get_qnn_raw_interface(); + g_qnn_mgr[device].raw_system_interface = instance->get_qnn_raw_system_interface(); + //TODO:refine internal buffer management + g_qnn_mgr[device].buffer_pool = qnn_buf_new(get_qnn_backend_name(device), GGML_QNN_MAX_BUFFERS, (1 << 20)); + GGML_ASSERT(g_qnn_mgr[device].buffer_pool != nullptr); + + ggml_backend_t qnn_backend = new ggml_backend{ + /* .guid = */ ggml_backend_qnn_guid(), + /* .iface = */ ggml_backend_qnn_interface, + /* .context = */ &g_qnn_mgr[device] + }; + g_qnn_mgr[device].backend = qnn_backend; + g_qnn_backend = g_qnn_mgr[device].backend; + g_current_device = device; + + return qnn_backend; +} + + +extern "C" int ggml_backend_qnn_reg_devices(); + + +int ggml_backend_qnn_reg_devices() { + for (size_t idx = 0; idx < GGML_QNN_MAX_DEVICES; idx++) { + int id = g_qnn_mgr[idx].device; + char name[GGML_MAX_NAME]; + ggml_backend_qnn_get_device_description(idx, name, GGML_MAX_NAME); + ggml_backend_register(name, ggml_backend_qnn_reg_init, ggml_backend_qnn_buffer_type(idx), + (void *) (intptr_t)idx); + } + + return GGML_QNN_MAX_DEVICES; +} diff --git a/ggml-qnn.h b/ggml-qnn.h new file mode 100644 index 0000000000000..51f02d4ba3078 --- /dev/null +++ b/ggml-qnn.h @@ -0,0 +1,55 @@ +/* + * MIT license + * Copyright (C) 2024 GGML Authors + * SPDX-License-Identifier: MIT + * + * this is implementation of ggml QNN(Qualcomm Nerual Network, aka AI Engine Direct) backend + */ +#pragma once + +#include "ggml.h" +#include "ggml-backend.h" + +#ifdef __cplusplus +extern "C" { +#endif + + +#define GGML_QNN_NAME "QNN" +#define GGML_QNN_MAX_DEVICES 3 + +//QNN cDSP and HTA backend would not be used currently, just focus on QNN CPU/GPU/HTP(aka DSP) backend currently +enum QNNBackend { + QNN_CPU, + QNN_GPU, + QNN_HTP, +}; + +GGML_API int ggml_backend_qnn_reg_devices(); + +/** + * + * @param device 0: QNN_CPU 1: QNN_GPU 2: QNN_HTP(aka DSP) + * @param qnn_lib_path qnn library path, such as "/data/data/com.ggml.llamacpp/" on Android which can got by JNI from Java layer + * @return + */ +GGML_API ggml_backend_t ggml_backend_qnn_init(size_t dev_num, const char * qnn_lib_path); + +GGML_API bool ggml_backend_is_qnn(ggml_backend_t backend); + +GGML_API void ggml_backend_qnn_set_n_threads(ggml_backend_t backend, int n_threads); + +GGML_API int ggml_backend_qnn_get_device_count(void); +GGML_API void ggml_backend_qnn_get_device_description(int device, char * description, size_t description_size); + + +GGML_API ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t dev_num); + + +//temporary API, should be removed in the future +GGML_API bool ggml_qnn_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor); + + +#ifdef __cplusplus +} +#endif diff --git a/ggml.c b/ggml.c index 086db96af7fcd..919eb0b7b1ff1 100644 --- a/ggml.c +++ b/ggml.c @@ -16153,7 +16153,8 @@ static void ggml_compute_forward_cross_entropy_loss_back( ///////////////////////////////// -static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { +//workaround for Qualcomm QNN backend +void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { GGML_ASSERT(params); 
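    // body unchanged: only the static qualifier was dropped so that ggml-qnn.cpp can
    // call this CPU implementation directly as a temporary fallback for unsupported ops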
if (tensor->op == GGML_OP_NONE || ggml_is_empty(tensor)) { diff --git a/llama.cpp b/llama.cpp index 30fe190373b43..a10c3e1fc8488 100644 --- a/llama.cpp +++ b/llama.cpp @@ -17,6 +17,8 @@ # include "ggml-sycl.h" #elif defined(GGML_USE_KOMPUTE) # include "ggml-kompute.h" +#elif defined(GGML_USE_QNN) +# include "ggml-qnn.h" #endif #ifdef GGML_USE_METAL @@ -1680,6 +1682,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) { if (buft == nullptr) { LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu); } +#elif defined(GGML_USE_QNN) + buft = ggml_backend_qnn_buffer_type(gpu); #endif if (buft == nullptr) { @@ -1720,6 +1724,8 @@ static size_t llama_get_device_count() { return ggml_backend_sycl_get_device_count(); #elif defined(GGML_USE_VULKAN) return ggml_backend_vk_get_device_count(); +#elif defined(GGML_USE_QNN) + return ggml_backend_qnn_get_device_count(); #else return 1; #endif @@ -15090,6 +15096,8 @@ size_t llama_max_devices(void) { return GGML_SYCL_MAX_DEVICES; #elif defined(GGML_USE_VULKAN) return GGML_VK_MAX_DEVICES; +#elif defined(GGML_USE_QNN) + return GGML_QNN_MAX_DEVICES; #else return 1; #endif @@ -15105,7 +15113,7 @@ bool llama_supports_mlock(void) { bool llama_supports_gpu_offload(void) { #if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \ - defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) + defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_QNN) // Defined when llama.cpp is compiled with support for offloading model layers to GPU. return true; #else @@ -15392,6 +15400,17 @@ struct llama_context * llama_new_context_with_model( } ctx->backends.push_back(backend); } +#elif defined(GGML_USE_QNN) + if (model->n_gpu_layers > 0) { + //the second param is package name of Andorid app, can be got by JNI from Java layer + ggml_backend_t backend = ggml_backend_qnn_init(QNN_CPU, "/data/data/com.ggml.llamacpp/"); + if (nullptr == backend) { + LLAMA_LOG_ERROR("%s: failed to initialize QNN backend\n", __func__); + llama_free(ctx); + return nullptr; + } + ctx->backends.push_back(backend); + } #endif ctx->backend_cpu = ggml_backend_cpu_init(); if (ctx->backend_cpu == nullptr) { @@ -17558,6 +17577,14 @@ void llama_reset_timings(struct llama_context * ctx) { ctx->t_p_eval_us = ctx->n_p_eval = 0; } +static int llama_has_qnn(void) { +#ifdef GGML_USE_QNN + return 1; +#else + return 0; +#endif +} + const char * llama_print_system_info(void) { static std::string s; @@ -17579,6 +17606,7 @@ const char * llama_print_system_info(void) { s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | "; s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | "; s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | "; + s += "QNN = " + std::to_string(llama_has_qnn()) + " | "; return s.c_str(); } From d325088dbf8e86722a41b37ef44549b86211742d Mon Sep 17 00:00:00 2001 From: "zhou.weiguo" Date: Wed, 24 Apr 2024 16:28:18 +0800 Subject: [PATCH 002/166] ggml: add Qualcomm QNN(Qualcomm Neural Network,aka Qualcomm AI Engine Direct) backend --- ggml-qnn.cpp | 3590 ++++++++++++++++++++++++++++++ ggml-qnn.h | 43 + llama.cpp | 23 +- tests/ggml-qnn/CMakeLists.txt | 60 + tests/ggml-qnn/build-ggml-qnn.sh | 95 + tests/ggml-qnn/run-ggml-qnn.sh | 108 + tests/ggml-qnn/test-qnn-ops.cpp | 450 ++++ 7 files changed, 4368 insertions(+), 1 deletion(-) create mode 100644 ggml-qnn.cpp create mode 100644 ggml-qnn.h create mode 100644 tests/ggml-qnn/CMakeLists.txt create mode 
100755 tests/ggml-qnn/build-ggml-qnn.sh create mode 100755 tests/ggml-qnn/run-ggml-qnn.sh create mode 100644 tests/ggml-qnn/test-qnn-ops.cpp diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp new file mode 100644 index 0000000000000..9319db227795d --- /dev/null +++ b/ggml-qnn.cpp @@ -0,0 +1,3590 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "QnnTypes.h" +#include "QnnCommon.h" +#include "QnnContext.h" +#include "QnnBackend.h" +#include "QnnGraph.h" +#include "QnnProperty.h" +#include "QnnTensor.h" +#include "QnnInterface.h" +#include "Saver/QnnSaver.h" +#include "System/QnnSystemInterface.h" +#include "HTP/QnnHtpDevice.h" + +#include "ggml-qnn.h" + +#include "ggml-backend-impl.h" + + +// ================================================================================================= +// +// forward/external/helper declaration +// +// ================================================================================================= +class qnn_instance; + + +#if (defined __ANDROID__) || (defined ANDROID) +extern "C" int __android_log_print(int prio, const char * tag, const char * fmt, ...) +__attribute__((__format__(printf, 3, 4))); +#endif +static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...); + + +// ================================================================================================= +// +// self-defined macro / data structure +// +// ================================================================================================= +#define RPCMEM_DEFAULT_FLAGS 1 +#define RPCMEM_HEAP_ID_SYSTEM 25 + +#define GGML_DUMP_TENSOR(tensor) ggml_tensor_dump(tensor, #tensor) + +#define GGML_QNN_LOGBUF_LEN 4096 + +#define GGML_QNN_DEBUG 1 //for troubleshooting QNN backend + +#define QNN_LOG_ERROR(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define QNN_LOG_WARN(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define QNN_LOG_INFO(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) + +#if GGML_QNN_DEBUG +#define QNN_LOG_DEBUG(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#else +#define QNN_LOG_DEBUG(...) 
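// when GGML_QNN_DEBUG is 0 the QNN_LOG_DEBUG() calls expand to nothing, so the
// verbose tracing in this file has no cost outside of troubleshooting builds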
+#endif + + +#define VALIDATE(value, status) \ + do { \ + status = value; \ + if (status != QNN_SUCCESS) { \ + QNN_LOG_WARN("%s expected QNN_SUCCESS\n", #value); \ + return status; \ + } \ + } while (0) + +#define VALIDATE_TENSOR_VERSION(tensor, err) VALIDATE(validate_tensor_version(tensor), err) + +#define VALIDATE_OP_CONFIG_VERSION(op, err) VALIDATE(validate_op_config_version(op), err) + +#define QNN_VER_PTR(x) (&((x).v1)) +#define QNN_OP_CFG_VALID(op_config) ((op_config).version == QNN_OPCONFIG_VERSION_1) + +#define QNN_OP_CFG_GET_NAME(op_config) get_qnn_oponfig_name(op_config) +#define QNN_OP_CFG_GET_PACKAGE_NAME(op_config) get_qnn_op_config_packagename(op_config) +#define QNN_OP_CFG_GET_TYPE_NAME(op_config) get_qnn_op_config_typename(op_config) +#define QNN_OP_CFG_GET_NUM_PARAMS(op_config) get_qnn_op_config_numparams(op_config) +#define QNN_OP_CFG_GET_PARAMS(op_config) get_qnn_op_config_params(op_config) +#define QNN_OP_CFG_GET_NUM_INPUTS(op_config) get_qnn_op_config_numinputs(op_config) +#define QNN_OP_CFG_GET_INPUTS(op_config) get_qnn_op_config_inputs(op_config) +#define QNN_OP_CFG_GET_NUM_OUTPUTS(op_config) get_qnn_op_config_numoutputs(op_config) +#define QNN_OP_CFG_GET_OUTPUTS(op_config) get_qnn_op_config_outputs(op_config) + +#define QNN_OP_CFG_SET_NAME(op_config, value) set_qnn_op_config_name(op_config, value) +#define QNN_OP_CFG_SET_PACKAGE_NAME(op_config, value) set_qnn_op_config_packagename(op_config, value) +#define QNN_OP_CFG_SET_TYPE_NAME(op_config, value) set_qnn_op_config_typename(op_config, value) + +#define QNN_OP_CFG_SET_PARAMS(op_config, num_of_params, params) \ + set_qnn_op_config_params(op_config, num_of_params, params) + +#define QNN_OP_CFG_SET_INPUTS(op_config, num_of_inputs, inputTensors) \ + set_qnn_op_config_inputs(op_config, num_of_inputs, inputTensors) + +#define QNN_OP_CFG_SET_OUTPUTS(op_config, num_of_outputs, output_tensors) \ + set_qnn_op_config_outputs(op_config, num_of_outputs, output_tensors) + +#define QNN_TENSOR_GET_ID(tensor) get_qnn_tensorid(tensor) +#define QNN_TENSOR_GET_NAME(tensor) get_qnn_tensorname(tensor) +#define QNN_TENSOR_GET_TYPE(tensor) get_qnn_tensortype(tensor) +#define QNN_TENSOR_GET_DATA_FORMAT(tensor) get_qnn_tensor_dataformat(tensor) +#define QNN_TENSOR_GET_DATA_TYPE(tensor) get_qnn_tensor_datatype(tensor) +#define QNN_TENSOR_GET_QUANT_PARAMS(tensor) get_qnn_tensor_quantparams(tensor) +#define QNN_TENSOR_GET_RANK(tensor) get_qnn_tensor_rank(tensor) +#define QNN_TENSOR_GET_DIMENSIONS(tensor) get_qnn_tensor_dimensions(tensor) +#define QNN_TENSOR_GET_MEM_TYPE(tensor) get_qnn_tensor_memtype(tensor) +#define QNN_TENSOR_GET_CLIENT_BUF(tensor) get_qnn_tensor_clientbuf(tensor) +#define QNN_TENSOR_GET_MEM_HANDLE(tensor) get_qnn_tensor_memhandle(tensor) + +#define QNN_TENSOR_SET_ID(tensor, value) set_qnn_tensor_id(tensor, value) +#define QNN_TENSOR_SET_NAME(tensor, value) set_qnn_tensor_name(tensor, value) +#define QNN_TENSOR_SET_TYPE(tensor, value) set_qnn_tensor_type(tensor, value) +#define QNN_TENSOR_SET_DATA_FORMAT(tensor, value) set_qnn_tensor_dataformat(tensor, value) +#define QNN_TENSOR_SET_DATA_TYPE(tensor, value) set_qnn_tensor_datatype(tensor, value) +#define QNN_TENSOR_SET_QUANT_PARAMS(tensor, value) set_qnn_tensor_quantparams(tensor, value) +#define QNN_TENSOR_SET_RANK(tensor, value) set_qnn_tensor_rank(tensor, value) +#define QNN_TENSOR_SET_DIMENSIONS(tensor, value) set_qnn_tensor_dimensions(tensor, value) +#define QNN_TENSOR_SET_MEM_TYPE(tensor, value) set_qnn_tensor_memtype(tensor, value) +#define 
QNN_TENSOR_SET_CLIENT_BUF(tensor, value) set_qnn_tensor_clientbuf(tensor, value) +#define QNN_TENSOR_SET_MEM_HANDLE(tensor, value) set_qnn_tensor_memhandle(tensor, value) + + +using pfn_rpc_mem_init = void (*)(void); +using pfn_rpc_mem_deinit = void (*)(void); +using pfn_rpc_mem_alloc = void *(*)(int, uint32_t, int); +using pfn_rpc_mem_free = void (*)(void *); +using pfn_rpc_mem_to_fd = int (*)(void *); + +using _pfn_QnnSaver_initialize = decltype(QnnSaver_initialize); +using _pfn_QnnInterface_getProviders = decltype(QnnInterface_getProviders); +using _pfn_QnnSystemInterface_getProviders = decltype(QnnSystemInterface_getProviders); + + +typedef void (* ggml_qnn_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +typedef void (* ggml_qnn_func_common_t)(const ggml_op ggml_op, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); + +enum class ggml_qnn_profile_level { + profile_off = 0, + profile_basic = 1, + profile_detail = 2 +}; + + +struct ggml_backend_qnn_context { + int device; + int threads; + char name[GGML_MAX_NAME]; + char lib[GGML_MAX_NAME]; + qnn_instance * instance; + struct ggml_backend * backend; + QNN_INTERFACE_VER_TYPE raw_interface; + QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; +} ; + + +// ================================================================================================= +// +// static global variables +// +// ================================================================================================= +static ggml_backend_t g_qnn_backend = nullptr; + +static int g_current_device = QNN_BACKEND_GGML; + + +//QNN cDSP and HTA backend would not be used currently, just focus on QNN CPU/GPU/NPU(aka HTP/DSP) backend currently +static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { + [QNN_BACKEND_CPU] = {.device = 0, .threads = 1, .name = "qnn-cpu", .lib = "libQnnCpu.so", .instance = nullptr, .backend = nullptr, .raw_interface = {}, .raw_system_interface = {}}, + [QNN_BACKEND_GPU] = {.device = 1, .threads = 1, .name = "qnn-gpu", .lib = "libQnnGpu.so", .instance = nullptr, .backend = nullptr, .raw_interface = {}, .raw_system_interface = {}}, + [QNN_BACKEND_NPU] = {.device = 2, .threads = 1, .name = "qnn-npu", .lib = "libQnnHtp.so", .instance = nullptr, .backend = nullptr, .raw_interface = {}, .raw_system_interface = {}}, +}; + + +// ================================================================================================= +// +// QNN helper functions and other internal helper functions +// +// ================================================================================================= +static inline int validate_tensor_version(Qnn_Tensor_t tensor) { + if (tensor.version != QNN_TENSOR_VERSION_1) { + QNN_LOG_WARN("validate_tensor_version() tensor %s, got unsupported version %d\n", + tensor.v1.name, + tensor.version); + return 1; + } + return 0; +} + + +[[maybe_unused]] static inline int validate_op_config_version(Qnn_OpConfig_t op_config) { + if (op_config.version != QNN_OPCONFIG_VERSION_1) { + QNN_LOG_WARN("validate_op_config_version() op %s, got unsupported version %d\n", + op_config.v1.name, + op_config.version); + return 1; + } + return 0; +} + + +static inline const char * get_qnn_oponfig_name(const Qnn_OpConfig_t & op_config) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + return op_config.v1.name; + } + return nullptr; +} + + +[[maybe_unused]] static inline const char * get_qnn_oponfig_name(const Qnn_OpConfig_t * op_config) { + return 
get_qnn_oponfig_name(*op_config); +} + + +static inline const char * get_qnn_op_config_packagename(const Qnn_OpConfig_t & op_config) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + return op_config.v1.packageName; + } + return nullptr; +} + + +[[maybe_unused]] static inline const char * get_qnn_op_config_packagename(const Qnn_OpConfig_t * op_config) { + return get_qnn_op_config_packagename(*op_config); +} + + +static inline const char * get_qnn_op_config_typename(const Qnn_OpConfig_t & op_config) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + return op_config.v1.typeName; + } + return nullptr; +} + + +[[maybe_unused]] static inline const char * get_qnn_op_config_typename(const Qnn_OpConfig_t * op_config) { + return get_qnn_op_config_typename(*op_config); +} + + +static inline uint32_t get_qnn_op_config_numparams(const Qnn_OpConfig_t & op_config) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + return op_config.v1.numOfParams; + } + return 0u; +} + + +[[maybe_unused]] static inline uint32_t get_qnn_op_config_numparams(const Qnn_OpConfig_t * op_config) { + return get_qnn_op_config_numparams(*op_config); +} + + +static inline const Qnn_Param_t * get_qnn_op_config_params(const Qnn_OpConfig_t & op_config) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + return op_config.v1.params; + } + return nullptr; +} + + +[[maybe_unused]] static inline const Qnn_Param_t * get_qnn_op_config_params(const Qnn_OpConfig_t * op_config) { + return get_qnn_op_config_params(*op_config); +} + + +static inline uint32_t get_qnn_op_config_numinputs(const Qnn_OpConfig_t & op_config) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + return op_config.v1.numOfInputs; + } + return 0u; +} + + +[[maybe_unused]] static inline uint32_t get_qnn_op_config_numinputs(const Qnn_OpConfig_t * op_config) { + return get_qnn_op_config_numinputs(*op_config); +} + + +static inline const Qnn_Tensor_t * get_qnn_op_config_inputs(const Qnn_OpConfig_t & op_config) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + return op_config.v1.inputTensors; + } + return nullptr; +} + + +[[maybe_unused]] static inline const Qnn_Tensor_t * get_qnn_op_config_inputs(const Qnn_OpConfig_t * op_config) { + return get_qnn_op_config_inputs(*op_config); +} + + +static inline uint32_t get_qnn_op_config_numoutputs(const Qnn_OpConfig_t & op_config) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + return op_config.v1.numOfOutputs; + } + return 0u; +} + + +[[maybe_unused]] static inline uint32_t get_qnn_op_config_numoutputs(const Qnn_OpConfig_t * op_config) { + return get_qnn_op_config_numoutputs(*op_config); +} + + +static inline const Qnn_Tensor_t * get_qnn_op_config_outputs(const Qnn_OpConfig_t & op_config) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + return op_config.v1.outputTensors; + } + return nullptr; +} + + +[[maybe_unused]] static inline const Qnn_Tensor_t * get_qnn_op_config_outputs(const Qnn_OpConfig_t * op_config) { + return get_qnn_op_config_outputs(*op_config); +} + + +static inline void set_qnn_op_config_name(Qnn_OpConfig_t & op_config, const char * name) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + op_config.v1.name = name; + } +} + + +[[maybe_unused]] static inline void set_qnn_op_config_name(Qnn_OpConfig_t * op_config, const char * name) { + set_qnn_op_config_name(*op_config, name); +} + + +static inline void set_qnn_op_config_packagename(Qnn_OpConfig_t & op_config, const char * package_name) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + 
op_config.v1.packageName = package_name; + } +} + + +[[maybe_unused]] static inline void set_qnn_op_config_packagename(Qnn_OpConfig_t * op_config, const char * package_name) { + set_qnn_op_config_packagename(*op_config, package_name); +} + + +static inline void set_qnn_op_config_typename(Qnn_OpConfig_t & op_config, const char * type_name) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + op_config.v1.typeName = type_name; + } +} + + +[[maybe_unused]] static inline void set_qnn_op_config_typename(Qnn_OpConfig_t * op_config, const char * type_name) { + set_qnn_op_config_typename(*op_config, type_name); +} + + +static inline void set_qnn_op_config_params(Qnn_OpConfig_t & op_config, + uint32_t num_of_params, + Qnn_Param_t * params) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + op_config.v1.numOfParams = num_of_params; + op_config.v1.params = params; + } +} + + +[[maybe_unused]] static inline void set_qnn_op_config_params(Qnn_OpConfig_t * op_config, + uint32_t num_of_params, + Qnn_Param_t * params) { + set_qnn_op_config_params(*op_config, num_of_params, params); +} + + +static inline void set_qnn_op_config_inputs(Qnn_OpConfig_t & op_config, + uint32_t num_of_inputs, + Qnn_Tensor_t * input_tensors) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + op_config.v1.numOfInputs = num_of_inputs; + op_config.v1.inputTensors = input_tensors; + } +} + + +[[maybe_unused]] static inline void set_qnn_op_config_inputs(Qnn_OpConfig_t * op_config, + uint32_t num_of_inputs, + Qnn_Tensor_t * input_tensors) { + set_qnn_op_config_inputs(*op_config, num_of_inputs, input_tensors); +} + + +static inline void set_qnn_op_config_outputs(Qnn_OpConfig_t & op_config, + uint32_t num_of_outputs, + Qnn_Tensor_t * output_tensors) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + op_config.v1.numOfOutputs = num_of_outputs; + op_config.v1.outputTensors = output_tensors; + } +} + + +[[maybe_unused]] static inline void set_qnn_op_config_outputs(Qnn_OpConfig_t * op_config, + uint32_t num_of_outputs, + Qnn_Tensor_t * output_tensors) { + set_qnn_op_config_outputs(*op_config, num_of_outputs, output_tensors); +} + + +static inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.id; + } + + return 0u; +} + + +[[maybe_unused]] static inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t * tensor) { + return get_qnn_tensorid(*tensor); +} + + +static inline const char * get_qnn_tensorname(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.name; + } + return nullptr; +} + + +static inline const char * get_qnn_tensorname(const Qnn_Tensor_t * tensor) { + return get_qnn_tensorname(*tensor); +} + + +static inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.type; + } + return QNN_TENSOR_TYPE_UNDEFINED; +} + + +[[maybe_unused]] static inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t * tensor) { + return get_qnn_tensortype(*tensor); +} + + +static inline Qnn_TensorDataFormat_t get_qnn_tensor_dataformat(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dataFormat; + } + return QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER; +} + + +[[maybe_unused]] static inline Qnn_TensorDataFormat_t get_qnn_tensor_dataformat(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_dataformat(*tensor); +} + + +static inline Qnn_DataType_t get_qnn_tensor_datatype(const Qnn_Tensor_t 
& tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dataType; + } + return QNN_DATATYPE_UNDEFINED; +} + + +[[maybe_unused]] static inline Qnn_DataType_t get_qnn_tensor_datatype(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_datatype(*tensor); +} + + +static inline Qnn_QuantizeParams_t get_qnn_tensor_quantparams(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.quantizeParams; + } + return QNN_QUANTIZE_PARAMS_INIT; +} + + +[[maybe_unused]] static inline Qnn_QuantizeParams_t get_qnn_tensor_quantparams(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_quantparams(*tensor); +} + + +static inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.rank; + } + return 0u; +} + + +[[maybe_unused]] static inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_rank(*tensor); +} + + +static inline uint32_t * get_qnn_tensor_dimensions(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dimensions; + } + return nullptr; +} + + +[[maybe_unused]] static inline uint32_t * get_qnn_tensor_dimensions(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_dimensions(*tensor); +} + + +static inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.memType; + } + return QNN_TENSORMEMTYPE_UNDEFINED; +} + + +[[maybe_unused]] static inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_memtype(*tensor); +} + + +static inline Qnn_ClientBuffer_t get_qnn_tensor_clientbuf(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.clientBuf; + } + return QNN_CLIENT_BUFFER_INIT; +} + + +[[maybe_unused]] static inline Qnn_ClientBuffer_t get_qnn_tensor_clientbuf(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_clientbuf(*tensor); +} + + +static inline Qnn_MemHandle_t get_qnn_tensor_memhandle(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.memHandle; + } + return nullptr; +} + + +[[maybe_unused]] static inline Qnn_MemHandle_t get_qnn_tensor_memhandle(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_memhandle(*tensor); +} + + +static inline void set_qnn_tensor_id(Qnn_Tensor_t & tensor, uint32_t id) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.id = id; + } +} + + +[[maybe_unused]] static inline void set_qnn_tensor_id(Qnn_Tensor_t * tensor, uint32_t id) { + set_qnn_tensor_id(*tensor, id); +} + + +static inline void set_qnn_tensor_name(Qnn_Tensor_t & tensor, const char * name) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.name = name; + } +} + + +[[maybe_unused]] static inline void set_qnn_tensor_name(Qnn_Tensor_t * tensor, const char * name) { + set_qnn_tensor_name(*tensor, name); +} + + +static inline void set_qnn_tensor_type(Qnn_Tensor_t & tensor, Qnn_TensorType_t type) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.type = type; + } +} + + +[[maybe_unused]] static inline void set_qnn_tensor_type(Qnn_Tensor_t * tensor, Qnn_TensorType_t type) { + set_qnn_tensor_type(*tensor, type); +} + + +static inline void set_qnn_tensor_dataformat(Qnn_Tensor_t & tensor, Qnn_TensorDataFormat_t format) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.dataFormat = format; + } +} + + +[[maybe_unused]] 
static inline void set_qnn_tensor_dataformat(Qnn_Tensor_t * tensor, Qnn_TensorDataFormat_t format) { + set_qnn_tensor_dataformat(*tensor, format); +} + + +static inline void set_qnn_tensor_datatype(Qnn_Tensor_t & tensor, Qnn_DataType_t dataType) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.dataType = dataType; + } +} + + +[[maybe_unused]] static inline void set_qnn_tensor_datatype(Qnn_Tensor_t * tensor, Qnn_DataType_t dataType) { + set_qnn_tensor_datatype(*tensor, dataType); +} + + +static inline void set_qnn_tensor_quantparams(Qnn_Tensor_t & tensor, Qnn_QuantizeParams_t params) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.quantizeParams = params; + } +} + + +[[maybe_unused]] static inline void set_qnn_tensor_quantparams(Qnn_Tensor_t * tensor, Qnn_QuantizeParams_t params) { + set_qnn_tensor_quantparams(*tensor, params); +} + + +static inline void set_qnn_tensor_rank(Qnn_Tensor_t & tensor, uint32_t rank) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.rank = rank; + } +} + + +[[maybe_unused]] static inline void set_qnn_tensor_rank(Qnn_Tensor_t * tensor, uint32_t rank) { + set_qnn_tensor_rank(*tensor, rank); +} + + +static inline void set_qnn_tensor_dimensions(Qnn_Tensor_t & tensor, uint32_t * dims) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.dimensions = dims; + } +} + + +[[maybe_unused]] static inline void set_qnn_tensor_dimensions(Qnn_Tensor_t * tensor, uint32_t * dims) { + set_qnn_tensor_dimensions(*tensor, dims); +} + + +static inline void set_qnn_tensor_memtype(Qnn_Tensor_t & tensor, Qnn_TensorMemType_t memType) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.memType = memType; + } +} + + +[[maybe_unused]] static inline void set_qnn_tensor_memtype(Qnn_Tensor_t * tensor, Qnn_TensorMemType_t memType) { + set_qnn_tensor_memtype(*tensor, memType); +} + + +static inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t & tensor, Qnn_ClientBuffer_t clientBuf) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.clientBuf = clientBuf; + } +} + + +[[maybe_unused]] static inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t * tensor, Qnn_ClientBuffer_t clientBuf) { + set_qnn_tensor_clientbuf(*tensor, clientBuf); +} + + +static inline void set_qnn_tensor_memhandle(Qnn_Tensor_t & tensor, Qnn_MemHandle_t handle) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.memHandle = handle; + } +} + + +[[maybe_unused]] static inline void set_qnn_tensor_memhandle(Qnn_Tensor_t * tensor, Qnn_MemHandle_t handle) { + set_qnn_tensor_memhandle(*tensor, handle); +} + + +static size_t memscpy(void * dst, size_t dstSize, const void * src, size_t copySize) { + if (!dst || !src || !dstSize || !copySize) + return 0; + + size_t minSize = dstSize < copySize ? 
dstSize : copySize; + + memcpy(dst, src, minSize); + + return minSize; +} + + +static char * ggml_qnn_strndup(const char * source, size_t maxlen) { + return ::strndup(source, maxlen); +} + + +static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { + int err = 0; + VALIDATE_TENSOR_VERSION(src, err); + + dst.version = src.version; + QNN_TENSOR_SET_NAME( + dst, ggml_qnn_strndup(QNN_TENSOR_GET_NAME(src), std::string(QNN_TENSOR_GET_NAME(src)).size())); + if (QNN_TENSOR_GET_NAME(dst) == nullptr) { + return 1; + } + QNN_TENSOR_SET_ID(dst, QNN_TENSOR_GET_ID(src)); + QNN_TENSOR_SET_TYPE(dst, QNN_TENSOR_GET_TYPE(src)); + QNN_TENSOR_SET_DATA_FORMAT(dst, QNN_TENSOR_GET_DATA_FORMAT(src)); + QNN_TENSOR_SET_DATA_TYPE(dst, QNN_TENSOR_GET_DATA_TYPE(src)); + QNN_TENSOR_SET_MEM_TYPE(dst, QNN_TENSOR_GET_MEM_TYPE(src)); + + // Only metadata (i.e. non-static data) is copied from source to destination. The union still + // must be initialized so that the clientBuf/memHandle do not contain garbage data + if (QNN_TENSOR_GET_MEM_TYPE(src) == QNN_TENSORMEMTYPE_RAW) { + Qnn_ClientBuffer_t client_buf = {nullptr, 0}; + QNN_TENSOR_SET_CLIENT_BUF(dst, client_buf); + } else if (QNN_TENSOR_GET_MEM_TYPE(src) == QNN_TENSORMEMTYPE_MEMHANDLE) { + QNN_TENSOR_SET_MEM_HANDLE(dst, nullptr); + } else { + return 1; + } + + Qnn_QuantizeParams_t src_qparam = QNN_TENSOR_GET_QUANT_PARAMS(src); + Qnn_QuantizationEncoding_t encoding = src_qparam.quantizationEncoding; + if (encoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) { + // need to allocate and copy memory for scaleOffset as it is a pointer array + Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; + Qnn_AxisScaleOffset_t &axis_scale_offset = src_qparam_cpy.axisScaleOffsetEncoding; + Qnn_ScaleOffset_t **scaleOffset = &axis_scale_offset.scaleOffset; + size_t scaleOffsetSize = axis_scale_offset.numScaleOffsets * sizeof(Qnn_ScaleOffset_t); + *scaleOffset = (Qnn_ScaleOffset_t *)malloc(scaleOffsetSize); + memscpy(*scaleOffset, + scaleOffsetSize, + src_qparam.axisScaleOffsetEncoding.scaleOffset, + scaleOffsetSize); + QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam_cpy); + } else if (encoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) { + // need to allocate and copy memory for scaleOffset as it is a pointer array + Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; + Qnn_BwAxisScaleOffset_t &bwaxis_scale_offset = src_qparam_cpy.bwAxisScaleOffsetEncoding; + size_t scaleSize = bwaxis_scale_offset.numElements * sizeof(float); + float **scales = &bwaxis_scale_offset.scales; + int32_t **offsets = &bwaxis_scale_offset.offsets; + *scales = (float *)malloc(scaleSize); + memscpy(*scales, scaleSize, src_qparam.bwAxisScaleOffsetEncoding.scales, scaleSize); + + // only copy offsets if present, nullptr implies all offsets are 0 + if (bwaxis_scale_offset.offsets != nullptr) { + size_t offsetSize = bwaxis_scale_offset.numElements * sizeof(int32_t); + *offsets = (int32_t *)malloc(offsetSize); + memscpy(*offsets, offsetSize, src_qparam.bwAxisScaleOffsetEncoding.offsets, offsetSize); + } + QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam_cpy); + } else { + QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam); + } + + // allocate and copy memory for all the pointer members + uint32_t rank = QNN_TENSOR_GET_RANK(src); + QNN_TENSOR_SET_RANK(dst, rank); + size_t dim_size = rank * sizeof(uint32_t); + uint32_t * dimensions = (uint32_t *)malloc(dim_size); + if (dimensions == nullptr) { + QNN_LOG_WARN("deep_copy_qnn_tensors() allocation error while copying tensor %s\n", QNN_TENSOR_GET_NAME(src)); + 
return 1;
+    }
+    memscpy(dimensions, dim_size, QNN_TENSOR_GET_DIMENSIONS(src), dim_size);
+    QNN_TENSOR_SET_DIMENSIONS(dst, dimensions);
+
+    return err;
+}
+
+
+static int free_qnn_tensor(Qnn_Tensor_t & tensor) {
+    int err = 0;
+    VALIDATE_TENSOR_VERSION(tensor, err);
+
+    free((void *) QNN_TENSOR_GET_NAME(tensor));
+    free(QNN_TENSOR_GET_DIMENSIONS(tensor));
+
+    return err;
+}
+
+
+[[maybe_unused]] static int free_qnn_tensors(Qnn_Tensor_t *& tensors, uint32_t num_tensors) {
+    int err = 0;
+
+    // free all pointer allocations in struct
+    for (size_t i = 0; i < num_tensors; i++) {
+        free_qnn_tensor(tensors[i]);
+    }
+    free(tensors);
+
+    return err;
+}
+
+
+static uint32_t ggml_get_tensor_rank(const ggml_tensor * tensor) {
+    uint32_t rank = 0;
+    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+        if ((0 != tensor->ne[i]) && (1 != tensor->ne[i])) {
+            rank++;
+        }
+    }
+    return rank;
+}
+
+
+//TODO: map more ggml data types to QNN data types
+//ref: explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684
+static Qnn_DataType_t qnn_datatype_from_ggml_datatype(enum ggml_type ggmltype) {
+    switch (ggmltype) {
+        case GGML_TYPE_F16:
+            return QNN_DATATYPE_FLOAT_16;
+        case GGML_TYPE_F32:
+            return QNN_DATATYPE_FLOAT_32;
+        default:
+            break;
+    }
+    return QNN_DATATYPE_UNDEFINED;
+}
+
+
+//TODO: only GGML_OP_ADD / GGML_OP_MUL / GGML_OP_MUL_MAT are supported
+static const char * qnn_opname_from_ggmlop(enum ggml_op ggmlop) {
+    switch (ggmlop) {
+        case GGML_OP_ADD:
+            return QNN_OP_ELEMENT_WISE_ADD;
+        case GGML_OP_MUL:
+            return QNN_OP_ELEMENT_WISE_MULTIPLY;
+        case GGML_OP_MUL_MAT:
+            return QNN_OP_MAT_MUL;
+        default:
+            break;
+    }
+
+    return nullptr;
+}
+
+
+static uint32_t ggml_get_tensor_data_size(const ggml_tensor * tensor) {
+    /*
+    size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]);
+    size_t n_dims = ggml_get_tensor_rank(tensor);
+    for (int i = 1; i < n_dims; i++) {
+        data_size *= tensor->ne[i];
+    }
+
+    return data_size;
+    */
+    return ggml_nbytes(tensor);
+}
+
+
+template<typename Fn>
+Fn load_qnn_functionpointers(void * handle, const char * function_name) {
+    return reinterpret_cast<Fn>(dlsym(handle, function_name));
+}
+
+
+static const char * get_qnn_backend_name(int n_backend_type) {
+    switch (n_backend_type) {
+        case 0:
+            return "QNN-CPU";
+        case 1:
+            return "QNN-GPU";
+        case 2:
+            return "QNN-NPU";
+        case 3:
+            return "ggml"; //the default GGML backend, used to compare performance between the QNN backend and the default GGML backend
+
+#if 0 //the QNN cDSP and HTA backends are not used currently; focus is on the QNN CPU/GPU/NPU(aka HTP/DSP) backends
+        case 3:
+            return "QNN-cDSP";
+        case 4:
+            return "QNN-HTA";
+#endif
+        default:
+            return "unknown";
+    }
+}
+
+
+static intptr_t align_to(size_t alignment, intptr_t offset) {
+    return offset % alignment == 0 ? offset
+                                   : offset +
+                                     (static_cast<intptr_t>(alignment) -
+                                      offset % static_cast<intptr_t>(alignment));
+}
+
+
+static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...)
{ + static std::mutex ggml_qnn_log_internal_mutex; + static char s_ggml_qnn_log_internal_buf[GGML_QNN_LOGBUF_LEN]; + + { + std::lock_guard lock(ggml_qnn_log_internal_mutex); + va_list args; + va_start(args, format); + int len_prefix = snprintf(s_ggml_qnn_log_internal_buf, GGML_QNN_LOGBUF_LEN, "[%s, %d]: ", func, line); + int len = vsnprintf(s_ggml_qnn_log_internal_buf + len_prefix, GGML_QNN_LOGBUF_LEN - len_prefix, format, args); + if (len < (GGML_QNN_LOGBUF_LEN - len_prefix)) { +#if (defined __ANDROID__) || (defined ANDROID) + //for Android APK + __android_log_print(level, "ggml-qnn", "%s\n", s_ggml_qnn_log_internal_buf); +#endif + //for Android command line application or WoA + printf("%s\n", s_ggml_qnn_log_internal_buf); + } + va_end(args); + } +} + + +// ================================================================================================= +// +// wrapper class of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK +// +// ================================================================================================= +class qnn_interface { + +#define DEFINE_SHIM_FUNCTION_INTERFACE(F, pointer_name) \ + template \ + inline auto qnn_##F(Args... args) const { \ + return (_qnn_interface->QNN_INTERFACE_VER_NAME.pointer_name)( \ + std::forward(args)...); \ + } + + +#define DEFINE_SHIM_FUNCTION_SYS_INTERFACE(F, pointer_name) \ + template \ + inline auto qnn_##F(Args... args) const { \ + return (_qnn_sys_interface->QNN_SYSTEM_INTERFACE_VER_NAME.pointer_name)( \ + std::forward(args)...); \ + } + + friend class qnn_instance; + +public: + qnn_interface() = default; + + // QnnBackend + DEFINE_SHIM_FUNCTION_INTERFACE(backend_create, backendCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_free, backendFree); + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_register_op_package, backendRegisterOpPackage); + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_validate_op_config, backendValidateOpConfig); + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_get_api_version, backendGetApiVersion); + + // QnnDevice + DEFINE_SHIM_FUNCTION_INTERFACE(device_create, deviceCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(device_free, deviceFree); + + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_infrastructure, deviceGetInfrastructure); + + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_platform_info, deviceGetPlatformInfo); + + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_info, deviceGetInfo); + + // QnnContext + DEFINE_SHIM_FUNCTION_INTERFACE(context_create, contextCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary_size, contextGetBinarySize); + + DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary, contextGetBinary); + + DEFINE_SHIM_FUNCTION_INTERFACE(context_create_from_binary, contextCreateFromBinary); + + DEFINE_SHIM_FUNCTION_INTERFACE(context_free, contextFree); + + // QnnGraph + DEFINE_SHIM_FUNCTION_INTERFACE(graph_create, graphCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_add_node, graphAddNode); + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_finalize, graphFinalize); + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_execute, graphExecute); + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_retrieve, graphRetrieve); + + // QnnLog + DEFINE_SHIM_FUNCTION_INTERFACE(log_create, logCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(log_free, logFree); + + DEFINE_SHIM_FUNCTION_INTERFACE(log_set_log_level, logSetLogLevel); + + // QnnProfile + DEFINE_SHIM_FUNCTION_INTERFACE(profile_create, profileCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_events, profileGetEvents); + + 
DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_sub_events, profileGetSubEvents); + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_event_data, profileGetEventData); + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_free, profileFree); + + // QnnMem + DEFINE_SHIM_FUNCTION_INTERFACE(mem_register, memRegister); + + DEFINE_SHIM_FUNCTION_INTERFACE(mem_de_register, memDeRegister); + + // QnnProperty + DEFINE_SHIM_FUNCTION_INTERFACE(property_has_capability, propertyHasCapability); + + // QnnTensor + DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_context_tensor, tensorCreateContextTensor); + + DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_graph_tensor, tensorCreateGraphTensor); + + // QnnSystem + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_create, systemContextCreate); + + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_get_binary_info, systemContextGetBinaryInfo); + + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_free, systemContextFree); + + void set_qnn_interface(const QnnInterface_t * qnn_interface) { + _qnn_interface = qnn_interface; + } + + void set_qnn_system_interface(const QnnSystemInterface_t * qnn_sys_interface) { + _qnn_sys_interface = qnn_sys_interface; + } + + uint32_t get_backend_id() const { + return _qnn_interface->backendId; + } + + bool is_loaded() const { + return ((_qnn_sys_interface != nullptr) && (_qnn_interface != nullptr)); + } + +private: + const QnnInterface_t *_qnn_interface = nullptr; + + const QnnSystemInterface_t *_qnn_sys_interface = nullptr; +}; + + + +// ================================================================================================= +// +// wrapper class of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK +// +// and +// +// resource management of QNN resources for GGML's QNN backend +// ================================================================================================= +class qnn_instance { +public: + using BackendIdType = decltype(QnnInterface_t{}.backendId); + + explicit qnn_instance(const std::string & lib_path, const std::string & backend_name, + const std::string & model_name) : + _lib_path(std::move(lib_path)), + _backend_name(std::move(backend_name)), + _model_name(std::move(model_name)) {}; + + ~qnn_instance() { + } + + int qnn_init(const QnnSaver_Config_t ** saver_config); + + int qnn_finalize(); + + const qnn_interface &get_qnn_interface() { + if (!_qnn_interface.is_loaded()) { + QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_interface; + } + + + const QNN_INTERFACE_VER_TYPE &get_qnn_raw_interface() { + if (!_qnn_interface.is_loaded()) { + QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_raw_interface; + } + + const QNN_SYSTEM_INTERFACE_VER_TYPE &get_qnn_raw_system_interface() { + if (!_qnn_interface.is_loaded()) { + QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_raw_system_interface; + } + + const Qnn_LogHandle_t get_qnn_log_handle() { return _qnn_log_handle; } + + const Qnn_ProfileHandle_t get_qnn_profile_handle() { return _qnn_profile_handle; } + + const Qnn_DeviceHandle_t get_qnn_device_handle() { return _qnn_device_handle; } + + const Qnn_BackendHandle_t get_qnn_backend_handle() { return _qnn_backend_handle; } + + const Qnn_ContextHandle_t get_qnn_context_handle() { return _qnn_context_handle; } + + const QnnSystemContext_Handle_t get_qnn_system_handle() { return _qnn_system_handle; } + + const Qnn_GraphHandle_t get_qnn_graph_handle() { return _qnn_graph_handle; } + + + int init_qnn_graph(const char * 
graph_name,
+                       bool debug,
+                       uint8_t do_node_validation = 1,
+                       const QnnGraph_Config_t ** graph_configs = nullptr
+                       );
+
+    int finalize_qnn_graph();
+
+    int init_htp_perfinfra() {
+        QnnDevice_Infrastructure_t device_infra = nullptr;
+        int error = _qnn_raw_interface.deviceGetInfrastructure(&device_infra);
+        if (error != QNN_SUCCESS) {
+            QNN_LOG_WARN("failed to get qnn device infra\n");
+            return 1;
+        }
+
+        QnnHtpDevice_Infrastructure_t * htp_infra = static_cast<QnnHtpDevice_Infrastructure_t *>(device_infra);
+        QnnHtpDevice_PerfInfrastructure_t * htp_perfinfra = &htp_infra->perfInfra;
+        uint32_t power_configid = 1;
+        uint32_t device_id = 0;
+        uint32_t core_id = 0;
+        htp_perfinfra->createPowerConfigId(device_id, core_id, &power_configid);
+        _qnn_htp_perfinfra = htp_perfinfra;
+        _qnn_power_configid = power_configid;
+
+        return 0;
+    }
+
+
+    int set_rpc_polling() {
+        if (_qnn_rpc_pollingtime > 0) {
+            QnnHtpPerfInfrastructure_PowerConfig_t rpc_pollingTime;
+            memset(&rpc_pollingTime, 0, sizeof(rpc_pollingTime));
+            rpc_pollingTime.option =
+                QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME;
+            rpc_pollingTime.rpcPollingTimeConfig = _qnn_rpc_pollingtime;
+            const QnnHtpPerfInfrastructure_PowerConfig_t * powerConfigs[] = {&rpc_pollingTime, nullptr};
+            if (_qnn_htp_perfinfra) {
+                _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, powerConfigs);
+            }
+        }
+        return 0;
+    }
+
+
+    int set_high_performance_mode() {
+        if (nullptr == _qnn_htp_perfinfra) {
+            QNN_LOG_DEBUG("perf infra is null\n");
+            return 1;
+        }
+
+        QnnHtpPerfInfrastructure_PowerConfig_t powerConfig;
+        memset(&powerConfig, 0, sizeof(powerConfig));
+        powerConfig.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3;
+        powerConfig.dcvsV3Config.dcvsEnable = 0;
+        powerConfig.dcvsV3Config.setDcvsEnable = 1;
+        powerConfig.dcvsV3Config.contextId = _qnn_power_configid;
+        powerConfig.dcvsV3Config.powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE;
+        powerConfig.dcvsV3Config.setSleepLatency = 1; // true to consider the latency parameter, otherwise false
+        powerConfig.dcvsV3Config.setBusParams = 1; // true to consider the bus parameters, otherwise false
+        powerConfig.dcvsV3Config.setCoreParams = 1; // true to consider the core parameters, otherwise false
+        powerConfig.dcvsV3Config.sleepDisable = 0; // true to disable sleep/LPM modes, false to leave them enabled
+        powerConfig.dcvsV3Config.setSleepDisable = 0; // true to consider the sleep disable/enable parameter, otherwise false
+        // set the sleep latency parameter
+        uint32_t latencyValue = 40;
+        powerConfig.dcvsV3Config.sleepLatency = latencyValue; // range 40-2000 microseconds
+        // set bus clock parameters (refer to the QnnHtpPerfInfrastructure_VoltageCorner_t enum)
+        powerConfig.dcvsV3Config.busVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER;
+        powerConfig.dcvsV3Config.busVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER;
+        powerConfig.dcvsV3Config.busVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER;
+        // set core clock parameters (refer to the QnnHtpPerfInfrastructure_VoltageCorner_t enum)
+        powerConfig.dcvsV3Config.coreVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER;
+        powerConfig.dcvsV3Config.coreVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER;
+        powerConfig.dcvsV3Config.coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER;
+        // set the power config with the chosen performance parameters
+        const QnnHtpPerfInfrastructure_PowerConfig_t * powerConfigs[] = {&powerConfig, nullptr};
+
+        _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, powerConfigs);
+
+        return 0;
+    }
+
+    std::string
&get_qnn_graph_name() { return _graph_name; } + + bool is_rpcmem_initialized() { + return _rpcmem_initialized; + } + + void set_rpcmem_initialized(bool initialized) { + _rpcmem_initialized = initialized; + } + + int32_t rpcmem_to_fd(void * buf); + + int register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor); + + void unregister_rpcmem(); + + void *alloc_rpcmem(size_t bytes, size_t alignment); + + void free_rpcmem(void * buf); + + bool is_rpcmem_allocated(void * buf); + + bool is_rpcmem_registered(Qnn_MemHandle_t handle) { + return _qnn_mem_set.count(handle) != 0U; + } + +public: + std::map> _qnn_graph_map; + +private: + int load_system(); + + int unload_system(); + + int load_backend(std::string &lib_path, const QnnSaver_Config_t ** saver_config); + + int unload_backend(); + + void set_qnn_raw_interface(QNN_INTERFACE_VER_TYPE & raw_interface) { + _qnn_raw_interface = raw_interface; + } + + void set_qnn_raw_system_interface(QNN_SYSTEM_INTERFACE_VER_TYPE &raw_interface) { + _qnn_raw_system_interface = raw_interface; + } + +private: + static constexpr const int _required_num_providers = 1; + +private: + std::string _lib_path; + std::string _backend_name; + std::string _model_name; // prebuilt QNN model name, not used in currently + BackendIdType _backend_id; + + bool _debug_tensor = false; // flag to indicate if requested graph is to be run in debug mode + bool _do_node_validations = true; // flag to indicate whether all add_node calls need to be validated + QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; + + ggml_qnn_profile_level _profile_level = ggml_qnn_profile_level::profile_detail; + + qnn_interface _qnn_interface; + + void *_system_lib_handle = nullptr; + + Qnn_GraphHandle_t _qnn_graph_handle = nullptr; + + Qnn_LogHandle_t _qnn_log_handle = nullptr; + + Qnn_ProfileHandle_t _qnn_profile_handle = nullptr; + + Qnn_DeviceHandle_t _qnn_device_handle = nullptr; + + Qnn_BackendHandle_t _qnn_backend_handle = nullptr; + + Qnn_ContextHandle_t _qnn_context_handle = nullptr; + + QnnSystemContext_Handle_t _qnn_system_handle = nullptr; + + QnnHtpDevice_PerfInfrastructure_t *_qnn_htp_perfinfra = nullptr; + uint32_t _qnn_power_configid = 1; + uint32_t _qnn_rpc_pollingtime = 9999; // 0-10000 us for high performing + + QNN_INTERFACE_VER_TYPE _qnn_raw_interface; + QNN_SYSTEM_INTERFACE_VER_TYPE _qnn_raw_system_interface; + + std::unordered_set _qnn_mem_set; + + static std::mutex _init_mutex; + static std::unordered_map _loaded_lib_handle; + static std::unordered_map _lib_path_to_backend_id; + static std::unordered_map _loaded_backend; + + void *_rpc_lib_handle = nullptr; + std::atomic_bool _rpcmem_initialized{false}; + pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; + pfn_rpc_mem_free _pfn_rpc_mem_free; + pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd; + pfn_rpc_mem_init _pfn_rpc_mem_init; + pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; + std::unordered_map _rpcmem_store_map; + + + std::string _graph_name; +}; + + + +// ================================================================================================= +// +// implementation of wrapper class +// +// ================================================================================================= +std::mutex qnn_instance::_init_mutex; + +std::unordered_map qnn_instance::_loaded_lib_handle; + +std::unordered_map qnn_instance::_lib_path_to_backend_id; + +std::unordered_map qnn_instance::_loaded_backend; + + +void * qnn_instance::alloc_rpcmem(size_t bytes, size_t alignment) { + if (!_rpcmem_initialized) { + QNN_LOG_WARN("rpc memory not initialized\n"); + return 
nullptr; + } + + auto allocate_bytes = static_cast(bytes + alignment); + void *buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, allocate_bytes); + if (buf == nullptr) { + QNN_LOG_WARN("failed to allocate rpc memory\n"); + return nullptr; + } + + auto aligned_buf = reinterpret_cast(align_to(alignment, + reinterpret_cast(buf))); + bool status = _rpcmem_store_map.insert(std::pair(aligned_buf, buf)).second; + if (!status) { + QNN_LOG_WARN("failed to allocate rpc memory\n"); + _pfn_rpc_mem_free(buf); + } + + return aligned_buf; +} + + +void qnn_instance::free_rpcmem(void * buf) { + if (!_rpcmem_initialized) { + QNN_LOG_WARN("rpc memory not initialized\n"); + } else if (0 == _rpcmem_store_map.count(buf)) { + QNN_LOG_WARN("no allocated tensor\n"); + } else { + _pfn_rpc_mem_free(_rpcmem_store_map[buf]); + _rpcmem_store_map.erase(buf); + } +} + + +int32_t qnn_instance::rpcmem_to_fd(void *buf) { + int32_t mem_fd = -1; + if (!is_rpcmem_initialized()) { + QNN_LOG_WARN("rpc memory not initialized\n"); + } else { + mem_fd = _pfn_rpc_mem_to_fd(buf); + } + + return mem_fd; +} + + +int qnn_instance::register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor) { + if (nullptr == p_data || (nullptr == p_tensor)) { + QNN_LOG_WARN("invalid param\n"); + return 1; + } + + if (!is_rpcmem_initialized()) { + QNN_LOG_WARN("rpc memory not initialized\n"); + return 2; + } + + if (is_rpcmem_allocated(p_data)) { + QNN_LOG_WARN("rpc memory already allocated\n"); + //return 3; + } + if (is_rpcmem_registered((QNN_VER_PTR(*p_tensor)->memHandle))) { + QNN_LOG_WARN("tensor %s has been registered shared memory\n", (QNN_VER_PTR(*p_tensor)->name)); + return 4; + } + + int32_t mem_fd = rpcmem_to_fd(p_data); + if (-1 == mem_fd) { + QNN_LOG_WARN("failed to get file descriptor\n"); + return 5; + } + QNN_LOG_DEBUG("mem_fd %d\n", mem_fd); + Qnn_MemDescriptor_t descriptor = { + {QNN_VER_PTR(*p_tensor)->rank, QNN_VER_PTR(*p_tensor)->dimensions, nullptr}, + QNN_VER_PTR(*p_tensor)->dataType, + QNN_MEM_TYPE_ION, + {{mem_fd}}}; + Qnn_MemHandle_t handle = nullptr; + int error = QNN_SUCCESS; + error = _qnn_interface.qnn_mem_register( + _qnn_context_handle, + &descriptor, + /*numDescriptors=*/1, + &handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to register shared memory, error %d, %s\n", QNN_GET_ERROR_CODE(error), + strerror(error)); + return 6; + } else { + QNN_LOG_INFO("tensor %s successfully register shared memory\n", (QNN_VER_PTR(*p_tensor)->name)); + } + QNN_VER_PTR(*p_tensor)->memHandle = handle; + _qnn_mem_set.insert(handle); + + return 0; +} + + +void qnn_instance::unregister_rpcmem() { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + + if (_qnn_mem_set.empty()) { + QNN_LOG_WARN("no rpcmem registered\n"); + } + + for (auto &mem_handle : _qnn_mem_set) { + error = _qnn_interface.qnn_mem_de_register(&mem_handle, 1); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to unregister shared memory, error %d\n", QNN_GET_ERROR_CODE(error)); + } + } + _qnn_mem_set.clear(); +} + + +bool qnn_instance::is_rpcmem_allocated(void * buf) { + return _rpcmem_store_map.count(buf) != 0U; +} + + +int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t ** saver_config) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + QNN_LOG_DEBUG("lib_path:%s\n", lib_path.c_str()); + + void *lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL); + if (nullptr == lib_handle) { + QNN_LOG_WARN("can not open QNN library %s, with error: %s", lib_path.c_str(), dlerror()); + return 1; + } + + // load get_provider function + 
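+    // editorial note (inferred from the calls below, not from QNN documentation):
+    // QnnInterface_getProviders is the discovery symbol exported by each QNN backend
+    // library; it fills in an array of QnnInterface_t providers plus their count, and
+    // the loop further down selects the first provider whose core API version is
+    // compatible with the headers this file was built against.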
auto get_providers = load_qnn_functionpointers<_pfn_QnnInterface_getProviders *>(lib_handle, + "QnnInterface_getProviders"); + if (nullptr == get_providers) { + QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", dlerror()); + return 2; + } + + // get QnnInterface Providers + std::uint32_t num_providers = 0; + const QnnInterface_t **provider_list = nullptr; + error = get_providers(&provider_list, &num_providers); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error)); + return 3; + } + QNN_LOG_DEBUG("num_providers=%d\n", num_providers); + if (num_providers != _required_num_providers) { + QNN_LOG_WARN("providers is %d instead of required %d", num_providers, _required_num_providers); + return 4; + } + + if (nullptr == provider_list) { + QNN_LOG_WARN("failed to get qnn interface providers\n"); + return 5; + } + bool found_valid_interface = false; + QNN_INTERFACE_VER_TYPE qnn_interface; + for (size_t idx = 0; idx < num_providers; idx++) { + if (QNN_API_VERSION_MAJOR == provider_list[idx]->apiVersion.coreApiVersion.major && + QNN_API_VERSION_MINOR <= provider_list[idx]->apiVersion.coreApiVersion.minor) { + found_valid_interface = true; + qnn_interface = provider_list[idx]->QNN_INTERFACE_VER_NAME; + break; + } + } + + if (!found_valid_interface) { + QNN_LOG_WARN("unable to find a valid qnn interface\n"); + return 6; + } else { + QNN_LOG_INFO("find a valid qnn interface\n"); + } + set_qnn_raw_interface(qnn_interface); + + BackendIdType backend_id = provider_list[0]->backendId; + _lib_path_to_backend_id[lib_path] = backend_id; + if (_loaded_backend.count(backend_id) > 0) { + QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists\n", + lib_path.c_str(), backend_id); + } + _loaded_backend[backend_id] = provider_list[0]; + if (_loaded_lib_handle.count(backend_id) > 0) { + QNN_LOG_WARN("closing %p\n", _loaded_lib_handle[backend_id]); + int dlclose_error = dlclose(_loaded_lib_handle[backend_id]); + if (dlclose_error != 0) { + QNN_LOG_WARN("fail to close %p with error %s\n", _loaded_lib_handle[backend_id], dlerror()); + } + } + _loaded_lib_handle[backend_id] = lib_handle; + _backend_id = backend_id; + +#if 0 //comment it for purpose of reduce size of APK + QnnSaver_Config_t outputdir_cfg; + outputdir_cfg.option = QNN_SAVER_CONFIG_OPTION_OUTPUT_DIRECTORY; + outputdir_cfg.outputDirectory = "/data/local/tmp/"; + + QnnSaver_Config_t backendid_cfg; + backendid_cfg.option = QNN_SAVER_CONFIG_OPTION_BACKEND_ID; + backendid_cfg.backendId = _backend_id; + const QnnSaver_Config_t *saverCfg[] = {&outputdir_cfg, &backendid_cfg, nullptr}; + if (0 == QnnSaver_initialize(saverCfg)) { + QNN_LOG_INFO("QnnSaver_initialize successfully"); + } else { + QNN_LOG_WARN("QnnSaver_initialize failure"); + } +#endif + auto saver_initialize = load_qnn_functionpointers<_pfn_QnnSaver_initialize *>( + _loaded_lib_handle[backend_id], "QnnSaver_initialize"); + if (nullptr != saver_initialize) { + error = saver_initialize(saver_config); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to saver_initialize,error %d", QNN_GET_ERROR_CODE(error)); + return 7; + } + } else { + QNN_LOG_WARN("saver_initialize is null\n"); + } + + return 0; +} + + +int qnn_instance::unload_backend() { + int dlclose_error = 0; + for (auto &it : _loaded_lib_handle) { + dlclose_error = dlclose(it.second); + if (dlclose_error != 0) { + QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, dlerror()); + } + } + + _loaded_lib_handle.clear(); + 
_lib_path_to_backend_id.clear(); + _loaded_backend.clear(); + + return 0; +} + + +int qnn_instance::load_system() { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + + std::string system_lib_path = _lib_path + "libQnnSystem.so"; + QNN_LOG_DEBUG("system_lib_path:%s\n", system_lib_path.c_str()); + + _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); + if (nullptr == _system_lib_handle) { + QNN_LOG_WARN("can not open QNN library %s, error: %s\n", system_lib_path.c_str(), dlerror()); + return 1; + } + + auto * get_providers = reinterpret_cast<_pfn_QnnSystemInterface_getProviders *>(dlsym( + _system_lib_handle, "QnnSystemInterface_getProviders")); + if (nullptr == get_providers) { + QNN_LOG_WARN("can not load QNN symbol QnnSystemInterface_getProviders: %s\n", dlerror()); + return 2; + } + + uint32_t num_providers = 0; + const QnnSystemInterface_t ** provider_list = nullptr; + error = get_providers(&provider_list, &num_providers); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to get providers, error %d\n", QNN_GET_ERROR_CODE(error)); + return 3; + } + + if (num_providers != _required_num_providers) { + QNN_LOG_WARN("providers is %d instead of required %d\n", num_providers, _required_num_providers); + return 4; + } + + if (nullptr == provider_list) { + QNN_LOG_WARN("can not get providers\n"); + return 5; + } + + QNN_SYSTEM_INTERFACE_VER_TYPE qnn_system_interface; + bool found_valid_system_interface = false; + for (size_t idx = 0; idx < num_providers; idx++) { + if (QNN_SYSTEM_API_VERSION_MAJOR == + provider_list[idx]->systemApiVersion.major && + QNN_SYSTEM_API_VERSION_MINOR <= + provider_list[idx]->systemApiVersion.minor) { + found_valid_system_interface = true; + qnn_system_interface = provider_list[idx]->QNN_SYSTEM_INTERFACE_VER_NAME; + break; + } + } + if (!found_valid_system_interface) { + QNN_LOG_WARN("unable to find a valid qnn system interface\n"); + return 6; + } else { + QNN_LOG_INFO("find a valid qnn system interface\n"); + } + set_qnn_raw_system_interface(qnn_system_interface); + + _qnn_interface.set_qnn_system_interface(provider_list[0]); + + _qnn_interface.qnn_system_context_create(&_qnn_system_handle); + if (nullptr == _qnn_system_handle) { + QNN_LOG_WARN("can not create QNN system contenxt\n"); + } else { + QNN_LOG_INFO("initialize qnn system successfully\n"); + } + + return 0; +} + + +int qnn_instance::unload_system() { + int result = 0; + + if (nullptr == _system_lib_handle) { + QNN_LOG_DEBUG("system lib handle is null\n"); + return 1; + } + + if (nullptr != _qnn_system_handle) { + result = _qnn_interface.qnn_system_context_free(_qnn_system_handle); + if (result != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN system context\n"); + } + _qnn_system_handle = nullptr; + } + + int dlclose_error = dlclose(_system_lib_handle); + if (dlclose_error != 0) { + QNN_LOG_WARN("failed to close QnnSystem library, error %s\n", dlerror()); + return 2; + } + + _system_lib_handle = nullptr; + + return result; +} + + +static void ggml_qnn_logcallback(const char * fmt, + QnnLog_Level_t level, + uint64_t timestamp, + va_list argp) { + + static std::mutex log_mutex; + static unsigned char s_ggml_qnn_logbuf[GGML_QNN_LOGBUF_LEN]; + + const char * log_level_desc = ""; + switch (level) { + case QNN_LOG_LEVEL_ERROR: + log_level_desc = " ERROR "; + break; + case QNN_LOG_LEVEL_WARN: + log_level_desc = "WARNING"; + break; + case QNN_LOG_LEVEL_INFO: + log_level_desc = " INFO "; + break; + case QNN_LOG_LEVEL_DEBUG: + log_level_desc = " DEBUG "; + break; + case QNN_LOG_LEVEL_VERBOSE: 
+ log_level_desc = "VERBOSE"; + break; + case QNN_LOG_LEVEL_MAX: + log_level_desc = "UNKNOWN"; + break; + } + + double ms = (double) timestamp / 1000000.0; + + { + std::lock_guard lock(log_mutex); + + memset(s_ggml_qnn_logbuf, 0, GGML_QNN_LOGBUF_LEN); + vsnprintf(reinterpret_cast(s_ggml_qnn_logbuf), GGML_QNN_LOGBUF_LEN, fmt, argp); + QNN_LOG_DEBUG("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggml_qnn_logbuf); + } +} + + +int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { + BackendIdType backend_id = QNN_BACKEND_ID_NULL; + QNN_LOG_DEBUG("enter qni_init\n"); + + const std::lock_guard lock(_init_mutex); + + if (0 != load_system()) { + QNN_LOG_WARN("can not load QNN system lib, pls check why?\n"); + return 1; + } else { + QNN_LOG_DEBUG("load QNN system lib successfully\n"); + } + + std::string bakend_lib_path = _lib_path + _backend_name; + if (0 == _lib_path_to_backend_id.count(bakend_lib_path)) { + int is_load_ok = load_backend(bakend_lib_path, saver_config); + if (0 != is_load_ok) { + QNN_LOG_WARN("failed to load QNN backend\n"); + return 2; + } + } + + backend_id = _lib_path_to_backend_id[bakend_lib_path]; + if (0 == _loaded_backend.count(backend_id) || + 0 == _loaded_lib_handle.count(backend_id)) { + QNN_LOG_WARN("library %s is loaded but loaded backend count=%zu, loaded lib_handle count=%zu\n", + bakend_lib_path.c_str(), + _loaded_backend.count(backend_id), + _loaded_lib_handle.count(backend_id)); + return 3; + } + + _qnn_interface.set_qnn_interface(_loaded_backend[backend_id]); + +#if 1 + _qnn_interface.qnn_log_create(ggml_qnn_logcallback, _qnn_log_level, &_qnn_log_handle); +#else + _qnn_raw_interface.logCreate(ggml_qnn_logcallback, _qnn_log_level, &_qnn_log_handle); +#endif + if (nullptr == _qnn_log_handle) { + QNN_LOG_WARN("why failed to initialize qnn log\n"); //DSP backend not work on Qualcomm SoC based low-end phone + return 4; + } else { + QNN_LOG_DEBUG("initialize qnn log successfully\n"); + } + + std::vector temp_backend_config; + _qnn_interface.qnn_backend_create(_qnn_log_handle, temp_backend_config.empty() ? nullptr + : temp_backend_config.data(), + &_qnn_backend_handle); + if (nullptr == _qnn_backend_handle) { + QNN_LOG_WARN("why failed to initialize qnn backend\n"); + return 5; + } else { + QNN_LOG_DEBUG("initialize qnn backend successfully\n"); + } + + if (nullptr != _qnn_raw_interface.propertyHasCapability) { + auto qnnStatus = _qnn_raw_interface.propertyHasCapability(QNN_PROPERTY_GROUP_DEVICE); + if (QNN_PROPERTY_NOT_SUPPORTED == qnnStatus) { + QNN_LOG_WARN("device property is not supported\n"); + } + if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnnStatus) { + QNN_LOG_WARN("device property is not known to backend\n"); + } + } + + auto qnnStatus = _qnn_raw_interface.deviceCreate(_qnn_log_handle, nullptr, &_qnn_device_handle); + if (QNN_SUCCESS != qnnStatus && QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnnStatus) { + QNN_LOG_WARN("failed to create QNN device\n"); + } else { + QNN_LOG_INFO("create device successfully\n"); + } + + if (ggml_qnn_profile_level::profile_off != _profile_level) { + QNN_LOG_INFO("profiling turned on; level = %d", _profile_level); + if (ggml_qnn_profile_level::profile_basic == _profile_level) { + QNN_LOG_INFO("basic profiling requested. 
creating Qnn Profile object\n"); + if (QNN_PROFILE_NO_ERROR != _qnn_raw_interface.profileCreate( + _qnn_backend_handle, QNN_PROFILE_LEVEL_BASIC, &_qnn_profile_handle)) { + QNN_LOG_WARN("unable to create profile handle in the backend\n"); + return 7; + } else { + QNN_LOG_DEBUG("initialize qnn profile successfully\n"); + } + } else if (ggml_qnn_profile_level::profile_detail == _profile_level) { + QNN_LOG_INFO("detailed profiling requested. Creating Qnn Profile object\n"); + if (QNN_PROFILE_NO_ERROR != _qnn_raw_interface.profileCreate( + _qnn_backend_handle, QNN_PROFILE_LEVEL_DETAILED, &_qnn_profile_handle)) { + QNN_LOG_WARN("unable to create profile handle in the backend\n"); + return 7; + } else { + QNN_LOG_DEBUG("initialize qnn profile successfully\n"); + } + } + } + + _rpc_lib_handle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL); + if (nullptr == _rpc_lib_handle) { + QNN_LOG_WARN("failed to load qualcomm's rpc lib, error:%s\n", dlerror()); + return 9; + } else { + QNN_LOG_DEBUG("load rpcmem lib successfully\n"); + set_rpcmem_initialized(true); + } + _pfn_rpc_mem_init = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_init")); + _pfn_rpc_mem_deinit = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_deinit")); + _pfn_rpc_mem_alloc = reinterpret_cast(dlsym(_rpc_lib_handle,"rpcmem_alloc")); + _pfn_rpc_mem_free = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_free")); + _pfn_rpc_mem_to_fd = reinterpret_cast(dlsym(_rpc_lib_handle,"rpcmem_to_fd")); + if (nullptr == _pfn_rpc_mem_alloc || nullptr == _pfn_rpc_mem_free + || nullptr == _pfn_rpc_mem_to_fd) { + QNN_LOG_WARN("unable to access symbols in QNN RPC lib. dlerror(): %s", dlerror()); + dlclose(_rpc_lib_handle); + return 10; + } + + if (nullptr != _pfn_rpc_mem_init) // make Qualcomm's SoC based low-end phone happy + _pfn_rpc_mem_init(); + + std::vector temp_context_config; + _qnn_interface.qnn_context_create(_qnn_backend_handle, _qnn_device_handle, + temp_context_config.empty() ? 
nullptr + : temp_context_config.data(), + &_qnn_context_handle); + if (nullptr == _qnn_context_handle) { + QNN_LOG_WARN("why failed to initialize qnn context\n"); + return 8; + } else { + QNN_LOG_DEBUG("initialize qnn context successfully\n"); + } + + QNN_LOG_DEBUG("leave qni_init\n"); + + return 0; +} + + +int qnn_instance::qnn_finalize() { + int ret_status = 0; + Qnn_ErrorHandle_t error = QNN_SUCCESS; + + if (nullptr != _pfn_rpc_mem_deinit) // make Qualcomm's SoC based low-end phone happy + _pfn_rpc_mem_deinit(); + + if (dlclose(_rpc_lib_handle) != 0) { + QNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s\n", dlerror()); + } else { + QNN_LOG_DEBUG("succeed to close rpcmem lib\n"); + } + + if (nullptr != _qnn_context_handle) { + error = _qnn_interface.qnn_context_free(_qnn_context_handle, _qnn_profile_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); + + } + _qnn_context_handle = nullptr; + } + + if (nullptr != _qnn_profile_handle) { + error = _qnn_interface.qnn_profile_free(_qnn_profile_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); + + } + _qnn_profile_handle = nullptr; + } + + if (nullptr != _qnn_device_handle) { + error = _qnn_interface.qnn_device_free(_qnn_device_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN device_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); + + } + _qnn_device_handle = nullptr; + } + + if (nullptr != _qnn_backend_handle) { + error = _qnn_interface.qnn_backend_free(_qnn_backend_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); + } + _qnn_backend_handle = nullptr; + + } + + if (nullptr != _qnn_log_handle) { + error = _qnn_interface.qnn_log_free(_qnn_log_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN log_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); + } + _qnn_log_handle = nullptr; + } + + unload_backend(); + + unload_system(); + + return ret_status; +} + + +int qnn_instance::init_qnn_graph(const char * graph_name, bool debug, uint8_t do_node_validation, + const QnnGraph_Config_t ** graph_configs) { + int result = 0; + + if (nullptr == graph_name) { + QNN_LOG_WARN("graph name is null\n"); + return 1; + } + + if (!_graph_name.empty()) { + QNN_LOG_WARN("qnn model for graph %s already initialized\n", graph_name); + return 2; + } + + if (!do_node_validation) { + QNN_LOG_WARN("node validation disabled, backend will not perform op validation prior to adding node\n"); + } + + _graph_name = graph_name; + _debug_tensor = debug; + _do_node_validations = do_node_validation; + + result = _qnn_raw_interface.graphCreate(_qnn_context_handle, graph_name, graph_configs, + &_qnn_graph_handle); + if (result != QNN_GRAPH_NO_ERROR || nullptr == _qnn_graph_handle) { + QNN_LOG_WARN("failed to create graph in qnn context\n"); + return 3; + } else { + QNN_LOG_INFO("succeed to create graph %s, %p\n", graph_name, _qnn_graph_handle); + } + + return 0; +} + + +int qnn_instance::finalize_qnn_graph() { + if (nullptr != _qnn_graph_handle) { + if (_qnn_raw_interface.graphFinalize(_qnn_graph_handle, _qnn_profile_handle, nullptr) != + QNN_GRAPH_NO_ERROR) { + QNN_LOG_WARN("finalizing 
graph failure\n");
+            //return 1;
+        }
+    } else {
+        QNN_LOG_DEBUG("qnn graph handle is null\n");
+    }
+
+    return 0;
+}
+
+
+
+// =================================================================================================
+//
+//  implementation of GGML's QNN backend
+//
+// =================================================================================================
+static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor, bool b_dump_tensor_info) {
+    if (nullptr == tensor)
+        return false;
+    if (b_dump_tensor_info) {
+        QNN_LOG_DEBUG("op name:%s, tensor type:%s", ggml_op_name(tensor->op),
+                      ggml_type_name(tensor->type));
+    }
+    // only the following 3 OPs are supported currently; for these ops tensor->src[0] and tensor->src[1] are never nullptr
+    bool supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL) || (tensor->op == GGML_OP_MUL_MAT));
+    if (!supported_op) {
+        return false;
+    }
+
+    const struct ggml_tensor * src0 = tensor->src[0];
+    const struct ggml_tensor * src1 = tensor->src[1];
+
+    const int64_t ne00 = tensor->src[0]->ne[0];
+    const int64_t ne01 = tensor->src[0]->ne[1];
+
+    const int64_t ne10 = tensor->src[1]->ne[0];
+    const int64_t ne11 = tensor->src[1]->ne[1];
+
+    const int64_t ne0 = tensor->ne[0];
+    const int64_t ne1 = tensor->ne[1];
+
+    GGML_UNUSED(ne0);
+    GGML_UNUSED(ne1);
+
+    if (b_dump_tensor_info) {
+        QNN_LOG_DEBUG("src0 type:%s", ggml_type_name(tensor->src[0]->type));
+        QNN_LOG_DEBUG("src1 type:%s", ggml_type_name(tensor->src[1]->type));
+
+        if (tensor->op == GGML_OP_MUL_MAT) {
+            QNN_LOG_DEBUG("GGML_OP_MUL_MAT");
+            QNN_LOG_DEBUG(
+                    "src0 %15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
+                    src0->name,
+                    src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2],
+                    src0->nb[0], src0->nb[1], src0->nb[2]);
+            QNN_LOG_DEBUG(
+                    "src1 %15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
+                    src1->name,
+                    src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2],
+                    src1->nb[0], src1->nb[1], src1->nb[2]);
+            QNN_LOG_DEBUG(
+                    "     %15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
+                    tensor->name,
+                    tensor->type, ggml_type_name(tensor->type), tensor->ne[0], tensor->ne[1], tensor->ne[2],
+                    tensor->nb[0],
+                    tensor->nb[1], tensor->nb[2]);
+        }
+    }
+
+    if (ggml_is_empty(tensor) || tensor->op == GGML_OP_RESHAPE || tensor->op == GGML_OP_TRANSPOSE || tensor->op == GGML_OP_VIEW || tensor->op == GGML_OP_PERMUTE || tensor->op == GGML_OP_NONE) {
+        return false;
+    }
+
+    // make ggml_get_tensor_rank and the QNN SDK happy
+    if ((ne00 <= 1 || ne01 <= 1 || ne10 <= 1 || ne11 <= 1)) {
+        return false;
+    }
+
+    if (tensor->op == GGML_OP_ADD) {
+        //TODO: this is a limitation
+        return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16)
+               && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16)
+               && (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16);
+    }
+
+    if (tensor->op == GGML_OP_MUL_MAT) {
+        if (tensor->ne[1] < 32) { // GPU/NPU inference is slower than CPU inference when tensor->ne[1] < min batch size
+            return false;
+        }
+        //TODO: this is a limitation
+        return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16)
+               && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16)
+               && (src0->type == src1->type) && (src0->type == tensor->type);
+    }
+
+    //TODO: this is a limitation
+    return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16)
+           && (src1->type ==
GGML_TYPE_F32 || src1->type == GGML_TYPE_F16) + && (src0->type == src1->type) && (src0->type == tensor->type); +} + + +static void ggml_qnn_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + bool graph_initialized = false; + int64_t n_begin_time = 0LL; + int64_t n_end_time = 0LL; + int64_t n_duration = 0LL; + + qnn_instance * instance = nullptr; + struct ggml_backend_qnn_context * ctx = nullptr; + + std::string graph_name = "ggml_op_qnn_add"; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Tensor_t * tensor_0 = nullptr; + Qnn_Tensor_t * tensor_1 = nullptr; + Qnn_Tensor_t * tensor_2 = nullptr; + Qnn_Param_t qnn_params[] = {}; + enum ggml_op ggmlop = GGML_OP_ADD; + Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; + + if ((nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { + QNN_LOG_WARN("pls check why GGML tensor is null"); + return; + } + tensor_0 = (Qnn_Tensor_t *)src0->extra; + tensor_1 = (Qnn_Tensor_t *)src1->extra; + tensor_2 = (Qnn_Tensor_t *)dst->extra; + if ((nullptr == tensor_0) || (nullptr == tensor_1) || (nullptr == tensor_2)) { + QNN_LOG_WARN("pls check why QNN tensor is null"); + return; + } + ctx = (struct ggml_backend_qnn_context *)g_qnn_backend->context; + if (nullptr == ctx) { + QNN_LOG_WARN("pls check why backend ctx is null"); + return; + } + instance = ctx->instance; + if (nullptr == instance) { + QNN_LOG_WARN("pls check why qnn instance is null"); + return; + } + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + + n_begin_time = ggml_time_us(); +#if 1 + QNN_LOG_DEBUG("call %s\n", __func__); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, + src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], + src0->nb[0], src0->nb[1], src0->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, + src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], + src1->nb[0], src1->nb[1], src1->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, + dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], + dst->nb[1], dst->nb[2]); + QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); + QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(tensor_0)); + QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(tensor_1)); + QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(tensor_2)); +#endif + + QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; + + src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); + src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); + dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); + + std::string map_entry = std::string(ggml_op_name(ggmlop)); + if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { + graph_initialized = true; + auto & graph_item = instance->_qnn_graph_map[map_entry]; + graph_handle = std::get<0>(graph_item); + } + + uint32_t * tensor_0_dimensions = QNN_VER_PTR(*tensor_0)->dimensions; + uint32_t * tensor_1_dimensions = 
QNN_VER_PTR(*tensor_1)->dimensions; + uint32_t * tensor_2_dimensions = QNN_VER_PTR(*tensor_2)->dimensions; + + if (!graph_initialized) { + graph_name = graph_name + "_" + std::to_string(ctx->threads) + src0->name + "_" + src1->name; + QNN_LOG_DEBUG("graph name %s", graph_name.c_str()); + //QnnGraph_Config_t graph_config; + //graph_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + //graph_config.customConfig = strdup(graph_name.c_str()); + //const QnnGraph_Config_t * p_graph_config = &graph_config; + error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, &graph_handle); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); + return; + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + + Qnn_Tensor_t tensor_inputs[] = { + *tensor_0, + *tensor_1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *tensor_2 + }; + Qnn_OpConfig_t op_config = { + (Qnn_OpConfigVersion_t) 1, .v1 = { + "ggml_op_add", + QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_ELEMENT_WISE_ADD, + 0, + qnn_params, + 2, + tensor_inputs, + 1, + tensor_outputs + } + }; + error = qnn_raw_interface.graphAddNode(graph_handle, op_config); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); + instance->_qnn_graph_map[map_entry] = graph_item; + } else { + auto & graph_item = instance->_qnn_graph_map[map_entry]; + graph_handle = std::get<0>(graph_item); + tensor_0 = std::get<1>(graph_item); + tensor_1 = std::get<2>(graph_item); + tensor_2 = std::get<3>(graph_item); + + //QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); + uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; + QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; + QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; + 
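+            // note: dimensions_input_0/1 and dimensions_output above are stack-allocated per
+            // call, so the cached QNN tensors must not keep pointing at them after this
+            // function returns; the rank, data type and clientBuf pointers are refreshed
+            // below as well, and the original dimension pointers saved earlier are
+            // restored right after graphExecute.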
QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst);
+        QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type;
+
+        QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)};
+        QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)};
+        QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)};
+
+        Qnn_Tensor_t tensor_inputs[] = {
+            *tensor_0,
+            *tensor_1
+        };
+        Qnn_Tensor_t tensor_outputs[] = {
+            *tensor_2
+        };
+        error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr);
+        if (QNN_SUCCESS != error) {
+            QNN_LOG_INFO("error = %d\n", error);
+        }
+    }
+    QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions;
+    QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions;
+    QNN_VER_PTR(*tensor_2)->dimensions = tensor_2_dimensions;
+    n_end_time = ggml_time_us();
+    n_duration = (n_end_time - n_begin_time) / 1000;
+    QNN_LOG_DEBUG("duration of ggml_qnn_add : %lld milliseconds\n", n_duration);
+}
+
+
+
+/*
+ * ggml_qnn_mul_mat is kept as a standalone function because of the following
+ * observations from https://github.com/ggerganov/llama.cpp/pull/1632:
+ * MUL_MAT takes most of the compute time (about 95%), so to speed up llama, we have to focus on MUL_MAT.
+ * There are three kinds of MUL_MAT to compute:
+ * mul_mat_f32: both src0 and src1 are F32.
+ * mul_mat_f16_f32: src0 is F16 and src1 is F32.
+ * mul_mat_q_f32: src0 is quantized (Q4_0, Q4_1, ...), and src1 is F32.
+*/
+static void ggml_qnn_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    Qnn_ErrorHandle_t error = QNN_SUCCESS;
+    bool graph_initialized = false;
+    int64_t n_begin_time = 0LL;
+    int64_t n_end_time = 0LL;
+    int64_t n_duration = 0LL;
+
+    qnn_instance * instance = nullptr;
+    struct ggml_backend_qnn_context * ctx = nullptr;
+
+    std::string graph_name = "ggml_op_qnn_mul_mat";
+    Qnn_GraphHandle_t graph_handle = nullptr;
+    Qnn_Tensor_t * tensor_0 = nullptr;
+    Qnn_Tensor_t * tensor_1 = nullptr;
+    Qnn_Tensor_t * tensor_2 = nullptr;
+
+    Qnn_Param_t qnn_params[] = {};
+
+    enum ggml_op ggmlop = GGML_OP_MUL_MAT; // keys the entry for this op in _qnn_graph_map
+    Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32;
+    Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32;
+    Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32;
+
+    if ((nullptr == src0) || (nullptr == src1) || (nullptr == dst)) {
+        QNN_LOG_WARN("pls check why GGML tensor is null");
+        return;
+    }
+    tensor_0 = (Qnn_Tensor_t *)src0->extra;
+    tensor_1 = (Qnn_Tensor_t *)src1->extra;
+    tensor_2 = (Qnn_Tensor_t *)dst->extra;
+    if ((nullptr == tensor_0) || (nullptr == tensor_1) || (nullptr == tensor_2)) {
+        QNN_LOG_WARN("pls check why QNN tensor is null");
+        return;
+    }
+    ctx = (struct ggml_backend_qnn_context *)g_qnn_backend->context;
+    if (nullptr == ctx) {
+        QNN_LOG_WARN("pls check why backend ctx is null");
+        return;
+    }
+    instance = ctx->instance;
+    if (nullptr == instance) {
+        QNN_LOG_WARN("pls check why qnn instance is null");
+        return;
+    }
+    QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface;
+
+    n_begin_time = ggml_time_us();
+#if 1
+    QNN_LOG_DEBUG("call %s\n", __func__);
+    QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
+                  src0->name,
+                  src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2],
+                  src0->nb[0], src0->nb[1], src0->nb[2]);
+    QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
+                  src1->name,
+                  src1->type,
ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], + src1->nb[0], src1->nb[1], src1->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, + dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], + dst->nb[1], dst->nb[2]); + QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); + QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(tensor_0)); + QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(tensor_1)); + QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(tensor_2)); +#endif + + QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; + + src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); + src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); + dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); + + std::string map_entry = std::string(ggml_op_name(ggmlop)); + if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { + graph_initialized = true; + auto & graph_item = instance->_qnn_graph_map[map_entry]; + graph_handle = std::get<0>(graph_item); + } + + uint32_t * tensor_0_dimensions = QNN_VER_PTR(*tensor_0)->dimensions; + uint32_t * tensor_1_dimensions = QNN_VER_PTR(*tensor_1)->dimensions; + uint32_t * tensor_2_dimensions = QNN_VER_PTR(*tensor_2)->dimensions; + + if (!graph_initialized) { + graph_name = graph_name + "_" + std::to_string(ctx->threads) + src0->name + "_" + src1->name; + QNN_LOG_DEBUG("graph name %s", graph_name.c_str()); + error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, &graph_handle); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); + return; + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + + Qnn_Tensor_t tensor_inputs[] = { + *tensor_0, + *tensor_1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *tensor_2 + }; + Qnn_OpConfig_t op_config = { + (Qnn_OpConfigVersion_t) 1, .v1 = { + "ggml_op_mul_mat", + QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_MAT_MUL, + 0, + qnn_params, + 2, + tensor_inputs, + 1, + tensor_outputs + } + }; + error = qnn_raw_interface.graphAddNode(graph_handle, op_config); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, 
tensor_2); + instance->_qnn_graph_map[map_entry] = graph_item; + } else { + auto & graph_item = instance->_qnn_graph_map[map_entry]; + graph_handle = std::get<0>(graph_item); + tensor_0 = std::get<1>(graph_item); + tensor_1 = std::get<2>(graph_item); + tensor_2 = std::get<3>(graph_item); + + QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); + uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; + QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; + QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; + QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + + Qnn_Tensor_t tensor_inputs[] = { + *tensor_0, + *tensor_1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *tensor_2 + }; + error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + } + QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions; + QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions; + QNN_VER_PTR(*tensor_2)->dimensions = tensor_2_dimensions; + n_end_time = ggml_time_us(); + n_duration = (n_end_time - n_begin_time) / 1000; + QNN_LOG_DEBUG("duration of ggml_qnn_mul_mat : %lld milliseconds\n", n_duration); + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +//common function for GGML OPs using QNN API +static void ggml_qnn_hanlde_op(const enum ggml_op ggmlop, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + bool graph_initialized = false; + int64_t n_begin_time = 0LL; + int64_t n_end_time = 0LL; + int64_t n_duration = 0LL; + + qnn_instance * instance = nullptr; + struct ggml_backend_qnn_context * ctx = nullptr; + + std::string qnn_graph_name = "ggml_qnn_graph"; + std::string qnn_op_config_name = "ggml_qnn_op_config"; + const char * qnn_op_name = nullptr; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Tensor_t * tensor_0 = nullptr; + Qnn_Tensor_t * tensor_1 = nullptr; + Qnn_Tensor_t * tensor_2 = nullptr; + + Qnn_Param_t qnn_params[] = {}; + + Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; + + if ((nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { + QNN_LOG_WARN("pls check why GGML tensor is null"); + return; + } + tensor_0 = (Qnn_Tensor_t *)src0->extra; + tensor_1 = (Qnn_Tensor_t *)src1->extra; + tensor_2 = (Qnn_Tensor_t *)dst->extra; + if ((nullptr == tensor_0) || (nullptr == tensor_1) || (nullptr == tensor_2)) { + QNN_LOG_WARN("pls check why 
QNN tensor is null"); + return; + } + ctx = (struct ggml_backend_qnn_context *)g_qnn_backend->context; + if (nullptr == ctx) { + QNN_LOG_WARN("pls check why backend ctx is null"); + return; + } + instance = ctx->instance; + if (nullptr == instance) { + QNN_LOG_WARN("pls check why qnn instance is null"); + return; + } + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + + src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); + src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); + dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); + qnn_op_name = qnn_opname_from_ggmlop(ggmlop); + if (nullptr == qnn_op_name) { + QNN_LOG_WARN("pls check why can not get QNN OP name with ggml op %d(%s)", ggmlop, ggml_op_name(ggmlop)); + return; + } + + n_begin_time = ggml_time_us(); +#if 1 + QNN_LOG_DEBUG("call %s\n", __func__); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, + src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], + src0->nb[0], src0->nb[1], src0->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, + src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], + src1->nb[0], src1->nb[1], src1->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, + dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], + dst->nb[1], dst->nb[2]); + QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); + QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(tensor_0)); + QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(tensor_1)); + QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(tensor_2)); +#endif + + QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; + + std::string map_entry = std::string(ggml_op_name(ggmlop)); + if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { + graph_initialized = true; + auto & graph_item = instance->_qnn_graph_map[map_entry]; + graph_handle = std::get<0>(graph_item); + } + + uint32_t * tensor_0_dimensions = QNN_VER_PTR(*tensor_0)->dimensions; + uint32_t * tensor_1_dimensions = QNN_VER_PTR(*tensor_1)->dimensions; + uint32_t * tensor_2_dimensions = QNN_VER_PTR(*tensor_2)->dimensions; + + if (!graph_initialized) { + qnn_graph_name = qnn_graph_name + "_" + ggml_op_name(ggmlop) + std::to_string(ctx->threads) + src0->name + "_" + src1->name; + qnn_op_config_name = qnn_op_config_name + "_" + ggml_op_name(ggmlop) + std::to_string(ctx->threads) + src0->name + "_" + src1->name; + QNN_LOG_DEBUG("qnn graph name %s", qnn_graph_name.c_str()); + QNN_LOG_DEBUG("qnn op_config name %s", qnn_op_config_name.c_str()); + error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), qnn_graph_name.c_str(), nullptr, &graph_handle); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("can't create qnn graph handle with ggml op %s, graph name %s, error = %d\n", ggml_op_name(ggmlop), qnn_graph_name.c_str(), error); + return; + } + + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1); + if 
(QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + + Qnn_Tensor_t tensor_inputs[] = { + *tensor_0, + *tensor_1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *tensor_2 + }; + Qnn_OpConfig_t op_config = { + (Qnn_OpConfigVersion_t) 1, .v1 = { + qnn_op_config_name.c_str(), + QNN_OP_PACKAGE_NAME_QTI_AISW, + qnn_op_name, + 0, + qnn_params, + 2, + tensor_inputs, + 1, + tensor_outputs + } + }; + error = qnn_raw_interface.graphAddNode(graph_handle, op_config); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); + instance->_qnn_graph_map[map_entry] = graph_item; + } else { + auto & graph_item = instance->_qnn_graph_map[map_entry]; + graph_handle = std::get<0>(graph_item); + tensor_0 = std::get<1>(graph_item); + tensor_1 = std::get<2>(graph_item); + tensor_2 = std::get<3>(graph_item); + + QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); + uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; + QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; + QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; + QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + + Qnn_Tensor_t tensor_inputs[] = { + *tensor_0, + *tensor_1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *tensor_2 + }; + error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + } + QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions; + QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions; + QNN_VER_PTR(*tensor_2)->dimensions = tensor_2_dimensions; + n_end_time = ggml_time_us(); + n_duration = (n_end_time - n_begin_time) / 1000; + QNN_LOG_DEBUG("duration of ggml_qnn_%s : %lld milliseconds\n", ggml_op_name(ggmlop), 
n_duration); + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_get_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_acc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_div(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_gelu_quick(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_tanh(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_hardsigmoid(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_hardswish(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_leaky_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_sqr(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_group_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_concat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_upscale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_pad(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + 
QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_qnn_cpy(src0, dst, nullptr); + (void) src1; +} + + +static void ggml_qnn_mul_mat_id(const ggml_tensor * src0, + const ggml_tensor * src1, + ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_clamp(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); + +} + + +static void ggml_qnn_pool2d(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_sum_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_argsort(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + (void) src0; + (void) src1; + (void) dst; + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +bool ggml_qnn_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { + ggml_qnn_func_t func = nullptr; + ggml_qnn_func_common_t func_common = nullptr; + + switch (tensor->op) { + case GGML_OP_ADD: + func = ggml_qnn_add; + break; + + case GGML_OP_MUL: + func_common = ggml_qnn_hanlde_op; + break; + + case GGML_OP_MUL_MAT: + func = ggml_qnn_mul_mat; + break; + + case GGML_OP_REPEAT: + func = ggml_qnn_repeat; + break; + case GGML_OP_GET_ROWS: + func = ggml_qnn_get_rows; + break; + case GGML_OP_DUP: + func = ggml_qnn_dup; + break; + + case GGML_OP_ACC: + func = ggml_qnn_acc; + break; + 
+ case GGML_OP_DIV: + func = ggml_qnn_div; + break; + + case GGML_OP_UNARY: + switch (ggml_get_unary_op(tensor)) { + case GGML_UNARY_OP_GELU: + func = ggml_qnn_gelu; + break; + case GGML_UNARY_OP_SILU: + func = ggml_qnn_silu; + break; + case GGML_UNARY_OP_GELU_QUICK: + func = ggml_qnn_gelu_quick; + break; + case GGML_UNARY_OP_TANH: + func = ggml_qnn_tanh; + break; + case GGML_UNARY_OP_RELU: + func = ggml_qnn_relu; + break; + case GGML_UNARY_OP_HARDSIGMOID: + func = ggml_qnn_hardsigmoid; + break; + case GGML_UNARY_OP_HARDSWISH: + func = ggml_qnn_hardswish; + break; + default: + return false; + } + break; + case GGML_OP_NORM: + func = ggml_qnn_norm; + break; + case GGML_OP_GROUP_NORM: + func = ggml_qnn_group_norm; + break; + case GGML_OP_CONCAT: + func = ggml_qnn_concat; + break; + case GGML_OP_UPSCALE: + func = ggml_qnn_upscale; + break; + case GGML_OP_PAD: + func = ggml_qnn_pad; + break; + case GGML_OP_LEAKY_RELU: + func = ggml_qnn_leaky_relu; + break; + case GGML_OP_RMS_NORM: + func = ggml_qnn_rms_norm; + break; + case GGML_OP_MUL_MAT_ID: + func = ggml_qnn_mul_mat_id; + break; + case GGML_OP_SCALE: + func = ggml_qnn_scale; + break; + case GGML_OP_SQR: + func = ggml_qnn_sqr; + break; + case GGML_OP_CLAMP: + func = ggml_qnn_clamp; + break; + case GGML_OP_CPY: + func = ggml_qnn_cpy; + break; + case GGML_OP_CONT: + func = ggml_qnn_dup; + break; + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + func = ggml_qnn_nop; + break; + case GGML_OP_DIAG_MASK_INF: + func = ggml_qnn_diag_mask_inf; + break; + case GGML_OP_SOFT_MAX: + func = ggml_qnn_soft_max; + break; + case GGML_OP_ROPE: + func = ggml_qnn_rope; + break; + case GGML_OP_IM2COL: + func = ggml_qnn_im2col; + break; + case GGML_OP_POOL_2D: + func = ggml_qnn_pool2d; + break; + case GGML_OP_SUM_ROWS: + func = ggml_qnn_sum_rows; + break; + case GGML_OP_ARGSORT: + func = ggml_qnn_argsort; + break; + default: + return false; + } + + if (nullptr != func) + func(tensor->src[0], tensor->src[1], tensor); + + if (nullptr != func_common) + func_common(tensor->op, tensor->src[0], tensor->src[1], tensor); + + return true; +} + + +struct ggml_backend_qnn_buffer_context { + ~ggml_backend_qnn_buffer_context() { + if (buffer) { + free(buffer); + } + + for (auto * sub_buffer : sub_buffers) { + free(sub_buffer); + } + + for (auto * qnn_tensor : qnn_tensors) { + free_qnn_tensor(*qnn_tensor); + free(qnn_tensor); + } + + sub_buffers.clear(); + qnn_tensors.clear(); + } + void * buffer = nullptr; + + struct ggml_backend_qnn_context * backend_ctx = nullptr; + + size_t buffer_size = 0; + std::vector sub_buffers; + std::vector qnn_tensors; +}; + + +static const char * ggml_backend_qnn_buffer_get_name(ggml_backend_buffer_t buffer) { + GGML_UNUSED(buffer); + return "QNN"; +} + + +[[maybe_unused]] GGML_CALL static bool ggml_backend_buffer_is_qnn(ggml_backend_buffer_t buffer) { + return buffer->iface.get_name == ggml_backend_qnn_buffer_get_name; +} + + +GGML_CALL static void ggml_backend_qnn_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context; + delete ctx; +} + + +GGML_CALL static void * ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t buffer) { + ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context; + + return ctx->buffer; +} + + +GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + Qnn_ErrorHandle_t error = 
QNN_SUCCESS; + ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *)buffer->context; + + static int idx = 0; + char tensor_name[GGML_MAX_NAME] = { 0 }; + snprintf(tensor_name, GGML_MAX_NAME, "tensor_%2d", idx++); + + uint32_t dimensions[] = {(uint32_t) tensor->ne[0], (uint32_t) tensor->ne[1], (uint32_t) tensor->ne[2], (uint32_t) tensor->ne[3]}; + //TODO:only support FP32 & FP16 + Qnn_DataType_t qnn_data_type = QNN_DATATYPE_FLOAT_32; + Qnn_TensorType_t qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; + + if (tensor->flags & GGML_TENSOR_FLAG_INPUT) { + qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; + } else if (tensor->flags & GGML_TENSOR_FLAG_OUTPUT) { + qnn_tensor_type = QNN_TENSOR_TYPE_APP_READ; + } + Qnn_Tensor_t qnn_tensor = { + .version= QNN_TENSOR_VERSION_1, + {.v1= { + .id = 0, + .name = tensor_name, + .type = qnn_tensor_type, + .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, + .dataType = qnn_data_type, + .quantizeParams = {QNN_DEFINITION_UNDEFINED, + QNN_QUANTIZATION_ENCODING_UNDEFINED, + {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0}}}, + .rank = ggml_get_tensor_rank(tensor), + .dimensions = dimensions, + .memType = QNN_TENSORMEMTYPE_RAW, + {.clientBuf = {.data = nullptr, + .dataSize = 0}}}} + }; + Qnn_Tensor_t * p_qnn_tensor = (Qnn_Tensor_t *)calloc(1, sizeof(Qnn_Tensor_t)); + if (nullptr == p_qnn_tensor) { + QNN_LOG_WARN("calloc failed"); + return; + } + error = deep_copy_qnn_tensors(qnn_tensor, *p_qnn_tensor); + if (error != QNN_SUCCESS) { + free(p_qnn_tensor); + QNN_LOG_DEBUG("init tensor failed"); + return; + } + tensor->extra = p_qnn_tensor; + ctx->qnn_tensors.push_back(p_qnn_tensor); +} + + +GGML_CALL static void ggml_backend_qnn_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + GGML_UNUSED(buffer); + + memcpy((char *)tensor->data + offset, data, size); +} + + +GGML_CALL static void ggml_backend_qnn_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { + GGML_UNUSED(buffer); + memcpy(data, (const char *)tensor->data + offset, size); +} + + +GGML_CALL static bool ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { + GGML_UNUSED(buffer); + if (ggml_backend_buffer_is_host(src->buffer)) { + memcpy(dst->data, src->data, ggml_nbytes(src)); + return true; + } + + return false; +} + + +GGML_CALL static void ggml_backend_qnn_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context; + + memset(ctx->buffer, value, ctx->buffer_size); +} + + +[[maybe_unused]] GGML_CALL static void ggml_backend_qnn_buffer_reset(ggml_backend_buffer_t buffer) { + ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context; + for (auto * sub_buffer : ctx->sub_buffers) { + free(sub_buffer); + } + ctx->sub_buffers.clear(); +} + + +static ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = { + /* .get_name = */ ggml_backend_qnn_buffer_get_name, + /* .free_buffer = */ ggml_backend_qnn_buffer_free_buffer, + /* .get_base = */ ggml_backend_qnn_buffer_get_base, + /* .init_tensor = */ ggml_backend_qnn_buffer_init_tensor, + /* .set_tensor = */ ggml_backend_qnn_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_qnn_buffer_get_tensor, + /* .cpy_tensor = */ ggml_backend_qnn_buffer_cpy_tensor, + /* .clear = */ ggml_backend_qnn_buffer_clear, + 
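+    //note: ggml_backend_qnn_buffer_reset above is kept [[maybe_unused]] and .reset stays nullptr
+    //for now; only sub_buffers would need to be released on a reset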
/* .reset = */ nullptr, +}; + + +GGML_CALL static const char * ggml_backend_qnn_buffer_type_name(ggml_backend_buffer_type_t buft) { + return "QNN"; +} + + +static void * ggml_qnn_host_malloc(size_t n) { + void * data = nullptr; + const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n); + if (result != 0) { + QNN_LOG_WARN("%s: error: posix_memalign failed\n", __func__); + return nullptr; + } + + return data; +} + + +GGML_CALL static ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + ggml_backend_qnn_buffer_context * ctx = new ggml_backend_qnn_buffer_context; + + const size_t size_page = sysconf(_SC_PAGESIZE); + + size_t size_aligned = size; + if ((size_aligned % size_page) != 0) { + size_aligned += (size_page - (size_aligned % size_page)); + } + + //TODO:use pre-allocated buffer in internal memory pool + ctx->buffer = ggml_qnn_host_malloc(size_aligned); + ctx->buffer_size = size_aligned; + + ctx->backend_ctx = &g_qnn_mgr[g_current_device]; + + if (nullptr == ctx->buffer) { + QNN_LOG_WARN("%s: failed to allocate %.2f MiB\n", __func__, size / (1 << 20)); + return nullptr; + } + + return ggml_backend_buffer_init(buft, ggml_backend_qnn_buffer_interface, ctx, size); +} + + +GGML_CALL static size_t ggml_backend_qnn_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + return 32; +} + + +//TODO: this value is an experimental value, works fine with whisper/llm/minicpm-v inference on Android +GGML_CALL static size_t ggml_backend_qnn_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + + return (96 * 1024 * 1024); +} + + +GGML_CALL static bool ggml_backend_qnn_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, + ggml_backend_t backend) { + GGML_UNUSED(buft); + + return ggml_backend_is_qnn(backend) || ggml_backend_is_cpu(backend); +} + + +GGML_CALL static bool ggml_backend_qnn_buffer_is_host(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + return true; +} + + +GGML_CALL static const char * ggml_backend_qnn_name(ggml_backend_t backend) { + return "QNN"; +} + + +GGML_CALL static void ggml_backend_qnn_free(ggml_backend_t backend) { + QNN_LOG_INFO("enter %s", __func__ ); + ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; + QNN_LOG_DEBUG("idx %d, name:%s", ctx->device, g_qnn_mgr[ctx->device].name); + + qnn_instance * instance = (qnn_instance*)g_qnn_mgr[ctx->device].instance; + if (instance != nullptr) { + std::map>::iterator graph_it; + for (graph_it = instance->_qnn_graph_map.begin(); graph_it != instance->_qnn_graph_map.end(); graph_it++) { + auto & graph_item = graph_it->second; + Qnn_GraphHandle_t & graph_handle = std::get<0>(graph_item); + GGML_UNUSED(graph_handle); + QNN_LOG_DEBUG("graph type:%s", graph_it->first.c_str()); + } + instance->_qnn_graph_map.clear(); + + instance->qnn_finalize(); + delete instance; + g_qnn_mgr[ctx->device].instance = nullptr; + } + + if (g_qnn_mgr[ctx->device].backend != nullptr) { + delete backend; + g_qnn_backend = nullptr; + g_qnn_mgr[ctx->device].backend = nullptr; + } + QNN_LOG_INFO("leave %s", __func__ ); +} + + +GGML_CALL static ggml_backend_buffer_type_t ggml_backend_qnn_get_default_buffer_type(ggml_backend_t backend) { + ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; + + return ggml_backend_qnn_buffer_type(ctx->device); +} + + +GGML_CALL static ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { + 
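+    //note: the ggml cgraph is not converted into one big QNN graph; every supported node is
+    //dispatched one by one through ggml_qnn_compute_forward (which builds or reuses a small
+    //single-op QNN graph), while empty and view-like nodes are skipped below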
enum ggml_status result = GGML_STATUS_SUCCESS; + ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; + GGML_UNUSED(ctx); + + ggml_compute_params params = {}; + params.type = GGML_TASK_TYPE_COMPUTE; + params.ith = 0; + for (int i = 0; i < cgraph->n_nodes; i++) { + ggml_tensor * node = cgraph->nodes[i]; + if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { + continue; + } + bool ok = ggml_qnn_compute_forward(¶ms, node); + if (!ok) { + QNN_LOG_DEBUG("%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); + } + } + + return result; +} + + +GGML_CALL static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const ggml_tensor * op) { + GGML_UNUSED(backend); + + return (ggml_qnn_can_handle_op(op, true)); +} + + +//note: this function be used with proposal/refined ggml backend subsystem in this PR: +// https://github.com/ggerganov/llama.cpp/pull/7641 +// new ggml backend(only using system memory: ggml_backend_xxx_buffer_is_host return true) +// can following this style for mixed inference between CPU&GPU / CPU&NPU very easily +GGML_CALL static bool ggml_backend_qnn_offload_op(ggml_backend_t backend, const ggml_tensor * tensor) { + GGML_UNUSED(backend); + + return ggml_qnn_compute_forward(nullptr, (ggml_tensor*)tensor); +} + + +static ggml_backend_i ggml_backend_qnn_interface = { + /* .get_name = */ ggml_backend_qnn_name, + /* .free = */ ggml_backend_qnn_free, + /* .get_default_buffer_type = */ ggml_backend_qnn_get_default_buffer_type, + /* .set_tensor_async = */ nullptr, + /* .get_tensor_async = */ nullptr, + /* .cpy_tensor_async = */ nullptr, + /* .synchronize = */ nullptr, + /* .graph_plan_create = */ nullptr, + /* .graph_plan_free = */ nullptr, + /* .graph_plan_compute = */ nullptr, + /* .graph_compute = */ ggml_backend_qnn_graph_compute, + /* .supports_op = */ ggml_backend_qnn_supports_op, + /* .offload_op = */ ggml_backend_qnn_offload_op, + /* .event_new = */ nullptr, + /* .event_free = */ nullptr, + /* .event_record = */ nullptr, + /* .event_wait = */ nullptr, + /* .event_synchronize = */ nullptr, +}; + + +static ggml_guid_t ggml_backend_qnn_guid() { + static ggml_guid guid = {0x1a, 0x2b, 0x3c, 0x4d, 0x5e, 0x6f, 0x70, 0x81, 0x92, 0xa3, 0xb4, 0xc5, + 0xd6, 0xe7, 0xf8, 0x09}; + return &guid; +} + + +static ggml_backend_t ggml_backend_qnn_reg_init(const char * params, void * user_data) { + if (nullptr == params) { + //QNN library path + //can be hardcoded to "/data/local/tmp/" for Android command line application + //or specified in JNI layer for Android APK + params = "/data/local/tmp/"; + } + ggml_backend_t qnn_backend = ggml_backend_qnn_init((int) (intptr_t) user_data, params); + + return qnn_backend; +} + + +bool ggml_backend_is_qnn(ggml_backend_t backend) { + return backend != nullptr && ggml_guid_matches(backend->guid, ggml_backend_qnn_guid()); +} + + +void ggml_backend_qnn_set_n_threads(ggml_backend_t backend, int n_threads) { + GGML_ASSERT(ggml_backend_is_qnn(backend)); + + struct ggml_backend_qnn_context * ctx = (struct ggml_backend_qnn_context *)backend->context; + ctx->threads = n_threads; +} + + +const char * ggml_backend_qnn_get_name(ggml_backend_t backend) { + return backend->iface.get_name(backend); +} + + +int ggml_backend_qnn_get_device_count() { + return GGML_QNN_MAX_DEVICES; +} + + +void ggml_backend_qnn_get_device_description(size_t dev_num, char * description, size_t description_size) 
{ + if (nullptr == description || 0 == description_size) { + QNN_LOG_WARN("invalid param"); + return; + } + + if (dev_num >= GGML_QNN_MAX_DEVICES) { + QNN_LOG_WARN("invalid param"); + return; + } + + snprintf(description, description_size, "%s", g_qnn_mgr[dev_num].name); + QNN_LOG_DEBUG("description:%s", description); +} + + +ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device_index) { + if (device_index >= GGML_QNN_MAX_DEVICES) { + QNN_LOG_DEBUG("ggml_backend_qnn_buffer_type error: device_index:%d is out of range [0, %d]\n", + device_index, GGML_QNN_MAX_DEVICES - 1); + return nullptr; + } + + static struct ggml_backend_buffer_type ggml_backend_buffer_type_qnn = { + /* .iface = */ { + /* .get_name = */ ggml_backend_qnn_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_qnn_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_qnn_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_qnn_buffer_type_get_max_size, + /* .get_alloc_size = */ nullptr,// defaults to ggml_nbytes + /* .supports_backend = */ ggml_backend_qnn_buffer_type_supports_backend, + /* .is_host = */ ggml_backend_qnn_buffer_is_host + }, + /* .context = */ nullptr, + }; + + return &ggml_backend_buffer_type_qnn; +} + + +/** + * + * @param device 0: QNN_BACKEND_CPU 1: QNN_BACKEND_GPU 2: QNN_BACKEND_NPU(aka HTP/DSP) + * @param qnn_lib_path qnn library path, such as "/data/local/tmp/" on Android or specified in JNI layer + * @return + */ +ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { + int result = 0; + + if (nullptr == qnn_lib_path) + return nullptr; + + QNN_LOG_DEBUG("device %d", device); + QNN_LOG_DEBUG("qnn_lib_path %s", qnn_lib_path); + if (device >= GGML_QNN_MAX_DEVICES) { + QNN_LOG_ERROR("invalid device %d", device); + return nullptr; + } + + if (nullptr != g_qnn_mgr[device].backend) { + QNN_LOG_ERROR("qnn backend %d(%s) already loaded", device, get_qnn_backend_name(device)); + if (device == g_current_device) { + g_qnn_backend = g_qnn_mgr[device].backend; + QNN_LOG_INFO("re-use cached backend %d(%s)", device, get_qnn_backend_name(device)); + return g_qnn_mgr[device].backend; + } else { + QNN_LOG_INFO("delete previous backend %d(%s)", device, get_qnn_backend_name(device)); + ggml_backend_qnn_free(g_qnn_backend); + } + } + + std::string path = qnn_lib_path; + if (QNN_BACKEND_NPU == device) { + if (0 == setenv("LD_LIBRARY_PATH", + (path + + ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/dsp:/vendor/dsp/images").c_str(), + 1)) { + QNN_LOG_INFO("QNN DSP backend setenv successfully"); + } else { + QNN_LOG_ERROR("QNN DSP backend setenv failure"); + } + if (0 == setenv("ADSP_LIBRARY_PATH", + (path + + ";/vendor/dsp/cdsp;/vendor/lib/rfsa/adsp;/system/lib/rfsa/adsp;/vendor/dsp/dsp;/vendor/dsp/images;/dsp").c_str(), + 1)) { + QNN_LOG_INFO("QNN DSP backend setenv successfully"); + } else { + QNN_LOG_ERROR("QNN DSP backend setenv failure"); + } + } else { + if (0 == setenv("LD_LIBRARY_PATH", + (path + + ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/dsp:/vendor/dsp/images").c_str(), + 1)) { + QNN_LOG_INFO("%s backend setenv successfully\n", get_qnn_backend_name(device)); + } else { + QNN_LOG_ERROR("%s backend setenv failure\n", get_qnn_backend_name(device)); + } + } + + qnn_instance * instance = nullptr; + instance = new qnn_instance(qnn_lib_path, g_qnn_mgr[device].lib, ""); + result = instance->qnn_init(nullptr); + if (0 != result) { + QNN_LOG_WARN("init qnn subsystem failed with qnn backend %s, pls check why\n", get_qnn_backend_name(device)); + delete instance; + 
return nullptr; + } + qnn_interface qnn_interface = instance->get_qnn_interface(); + if (!qnn_interface.is_loaded()) { + QNN_LOG_WARN("qnn subsystem failure\n"); + delete instance; + return nullptr; + } + + std::string device_name = get_qnn_backend_name(device); + QNN_LOG_INFO("qnn device name %s", device_name.c_str()); + instance->init_qnn_graph(device_name.c_str(), false); + g_qnn_mgr[device].instance = instance; + g_qnn_mgr[device].raw_interface = instance->get_qnn_raw_interface(); + g_qnn_mgr[device].raw_system_interface = instance->get_qnn_raw_system_interface(); + + ggml_backend_t qnn_backend = new ggml_backend{ + /* .guid = */ ggml_backend_qnn_guid(), + /* .iface = */ ggml_backend_qnn_interface, + /* .context = */ &g_qnn_mgr[device] + }; + g_qnn_mgr[device].backend = qnn_backend; + g_qnn_backend = g_qnn_mgr[device].backend; + g_current_device = device; + + return qnn_backend; +} + + +extern "C" GGML_CALL int ggml_backend_qnn_reg_devices(void); + +GGML_CALL int ggml_backend_qnn_reg_devices() { + for (size_t idx = 0; idx < GGML_QNN_MAX_DEVICES; idx++) { + char name[GGML_MAX_NAME]; + ggml_backend_qnn_get_device_description(idx, name, GGML_MAX_NAME); + ggml_backend_register(name, ggml_backend_qnn_reg_init, ggml_backend_qnn_buffer_type(idx), + (void *) (intptr_t)idx); + } + + return GGML_QNN_MAX_DEVICES; +} diff --git a/ggml-qnn.h b/ggml-qnn.h new file mode 100644 index 0000000000000..c61ebd25d9ba6 --- /dev/null +++ b/ggml-qnn.h @@ -0,0 +1,43 @@ +#pragma once + +#include "ggml.h" +#include "ggml-backend.h" + +#ifdef __cplusplus +extern "C" { +#endif + + +#define GGML_QNN_MAX_DEVICES 3 + +//QNN cDSP and HTA backend would not be used currently, just focus on QNN CPU/GPU/NPU(aka HTP/DSP) backend currently +enum QNNBackend { + QNN_BACKEND_CPU, + QNN_BACKEND_GPU, + QNN_BACKEND_NPU, + QNN_BACKEND_GGML, //"fake" QNN backend just for compare performance between QNN and original GGML +}; + +GGML_API int ggml_backend_qnn_reg_devices(void); + +/** + * + * @param device 0: QNN_BACKEND_CPU 1: QNN_BACKEND_GPU 2: QNN_BACKEND_NPU(aka HTP/DSP) + * @param qnn_lib_path qnn library path, such as "/data/local/tmp/" on Android or specified in JNI layer + * @return + */ +GGML_API ggml_backend_t ggml_backend_qnn_init(size_t dev_num, const char * qnn_lib_path); + +GGML_API bool ggml_backend_is_qnn(ggml_backend_t backend); + +GGML_API void ggml_backend_qnn_set_n_threads(ggml_backend_t backend, int thread_counts); + +GGML_API int ggml_backend_qnn_get_device_count(void); + +GGML_API void ggml_backend_qnn_get_device_description(size_t dev_num, char * description, size_t description_size); + +GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t dev_num); + +#ifdef __cplusplus +} +#endif diff --git a/llama.cpp b/llama.cpp index 06889126ecdc4..42a9cb2a44981 100644 --- a/llama.cpp +++ b/llama.cpp @@ -19,6 +19,8 @@ # include "ggml-sycl.h" #elif defined(GGML_USE_KOMPUTE) # include "ggml-kompute.h" +#elif defined(GGML_USE_QNN) +# include "ggml-qnn.h" #endif #ifdef GGML_USE_METAL @@ -2377,6 +2379,8 @@ static size_t llama_get_device_count(const llama_model & model) { count = ggml_backend_sycl_get_device_count(); #elif defined(GGML_USE_VULKAN) count = ggml_backend_vk_get_device_count(); +#elif defined(GGML_USE_QNN) + count = ggml_backend_qnn_get_device_count(); #endif #if defined(GGML_USE_RPC) count += model.rpc_servers.size(); @@ -2409,6 +2413,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_ if (buft == nullptr) { LLAMA_LOG_WARN("%s: cannot use GPU %d, check 
`vulkaninfo --summary`\n", __func__, gpu); } +#elif defined(GGML_USE_QNN) + buft = ggml_backend_qnn_buffer_type(gpu); #endif if (buft == nullptr) { @@ -15899,6 +15905,8 @@ size_t llama_max_devices(void) { return GGML_SYCL_MAX_DEVICES; #elif defined(GGML_USE_VULKAN) return GGML_VK_MAX_DEVICES; +#elif defined(GGML_USE_QNN) + return GGML_QNN_MAX_DEVICES; #else return 1; #endif @@ -15914,7 +15922,7 @@ bool llama_supports_mlock(void) { bool llama_supports_gpu_offload(void) { #if defined(GGML_USE_CUDA) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \ - defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC) + defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC) || defined(GGML_USE_QNN) // Defined when llama.cpp is compiled with support for offloading model layers to GPU. return true; #else @@ -16225,6 +16233,19 @@ struct llama_context * llama_new_context_with_model( } ctx->backends.push_back(backend); } +#elif defined(GGML_USE_QNN) + if (model->n_gpu_layers > 0) { + //the second param is data path of prebuit QNN libs provided by Qualcomm + //can be hardcoded to "/data/local/tmp/" for Android command line application + //or specified in JNI layer for Android APK application + ggml_backend_t backend = ggml_backend_qnn_init(model->main_gpu, "/data/local/tmp/"); + if (nullptr == backend) { + LLAMA_LOG_ERROR("%s: failed to initialize QNN backend\n", __func__); + llama_free(ctx); + return nullptr; + } + ctx->backends.push_back(backend); + } #endif #if defined(GGML_USE_RPC) if (model->n_gpu_layers > 0) { diff --git a/tests/ggml-qnn/CMakeLists.txt b/tests/ggml-qnn/CMakeLists.txt new file mode 100644 index 0000000000000..15ad7be6f6c88 --- /dev/null +++ b/tests/ggml-qnn/CMakeLists.txt @@ -0,0 +1,60 @@ +cmake_minimum_required(VERSION 3.22.1) +project(ggml-qnn-test) + +set(CMAKE_VERBOSE_MAKEFILE on) +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_POSITION_INDEPENDENT_CODE ON) + +#set to ON if target Android phone is based on Qualcomm Snapdragon 8 Gen 3 +set(TARGET_SNAPDRAGON_8_GEN3 OFF) + +set(QNN_INC_PATH ${QNN_SDK_PATH}/include/QNN) +set(QNN_LIB_PATH ${QNN_SDK_PATH}/lib/aarch64-android) + +include_directories(${QNN_INC_PATH}) +include_directories(../../) # ggml.h + +set(SOURCE_FILES + ../../ggml.c + ../../ggml-alloc.c + ../../ggml-backend.c + ../../ggml-quants.c + ../../ggml-qnn.cpp + test-qnn-ops.cpp +) + + +message("QNN_SDK_PATH : ${QNN_SDK_PATH}") +message("QNN_INC_PATH : ${QNN_INC_PATH}") +message("QNN_LIB_PATH : ${QNN_LIB_PATH}") + +add_definitions(-D__ARM_NEON) +add_definitions(-DGGML_USE_QNN) + +if(CMAKE_BUILD_TYPE STREQUAL "Release") +add_definitions(-DNDEBUG) +add_definitions(-O3) +endif() + +if (TARGET_SNAPDRAGON_8_GEN3) +# the below build optimization only verified and works well on Qualcomm SM8650-AB Snapdragon 8 Gen 3 +add_definitions(-march=armv8.7-a) +add_definitions(-mcpu=cortex-x1) +add_definitions(-mtune=cortex-x1) + +else() +# the below build optimization might be works well on ALL mainstream Android phone based on Qualcomm mobile SoC +add_definitions(-mcpu=cortex-a72) + +endif() + +add_compile_options("-Wall" "-Wno-sign-compare") + +find_library(LOG_LIB log) + +link_libraries(${LOG_LIB} android) + +add_executable(${TARGET_NAME} + ${SOURCE_FILES} +) diff --git a/tests/ggml-qnn/build-ggml-qnn.sh b/tests/ggml-qnn/build-ggml-qnn.sh new file mode 100755 index 0000000000000..baca02f91347d --- /dev/null +++ b/tests/ggml-qnn/build-ggml-qnn.sh @@ -0,0 +1,95 @@ +#!/bin/bash + +set -e + 
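+#note: this script is expected to run from tests/ggml-qnn/ on a Linux host with the Qualcomm
+#AI Engine Direct SDK installed at QNN_SDK_PATH below; it fetches android-ndk-r26c if missing,
+#cross-compiles ${TARGET} for arm64-v8a and copies the binary back into this directory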
+#https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct +#https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools +QNN_SDK_PATH=/opt/qcom/aistack/qnn/2.20.0.240223/ + +ANDROID_NDK=`pwd`/android-ndk-r26c +ANDROID_PLATFORM=android-34 +TARGET=ggml-qnn-test + + +function dump_vars() +{ + echo -e "ANDROID_NDK: ${ANDROID_NDK}" + echo -e "QNN_SDK_PATH: ${QNN_SDK_PATH}" +} + + +function show_pwd() +{ + echo -e "current working path:$(pwd)\n" +} + + +function check_qnn_sdk() +{ + if [ ! -d ${QNN_SDK_PATH} ]; then + echo -e "QNN_SDK_PATH ${QNN_SDK_PATH} not exist, pls check...\n" + exit 1 + fi +} + + +function check_and_download_ndk() +{ + is_android_ndk_exist=1 + + if [ ! -d ${ANDROID_NDK} ]; then + is_android_ndk_exist=0 + fi + + if [ ! -f ${ANDROID_NDK}/build/cmake/android.toolchain.cmake ]; then + is_android_ndk_exist=0 + fi + + if [ ${is_android_ndk_exist} -eq 0 ]; then + + if [ ! -f android-ndk-r26c-linux.zip ]; then + wget --no-config --quiet --show-progress -O android-ndk-r26c-linux.zip https://dl.google.com/android/repository/android-ndk-r26c-linux.zip + fi + + unzip android-ndk-r26c-linux.zip + + if [ $? -ne 0 ]; then + printf "failed to download android ndk to %s \n" "${ANDROID_NDK}" + exit 1 + fi + + printf "android ndk saved to ${ANDROID_NDK} \n\n" + else + printf "android ndk already exist:${ANDROID_NDK} \n\n" + fi +} + + +function build_arm64 +{ + cmake -H. -B./out/arm64-v8a -DTARGET_NAME=${TARGET} -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=${ANDROID_PLATFORM} -DANDROID_NDK=${ANDROID_NDK} -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake -DQNN_SDK_PATH=${QNN_SDK_PATH} + + cd ./out/arm64-v8a + make + + ls -lah ${TARGET} + /bin/cp ${TARGET} ../../ + cd - +} + + +function remove_temp_dir() +{ + if [ -d out ]; then + echo "remove out directory in `pwd`" + rm -rf out + fi +} + + +show_pwd +check_and_download_ndk +check_qnn_sdk +dump_vars +remove_temp_dir +build_arm64 diff --git a/tests/ggml-qnn/run-ggml-qnn.sh b/tests/ggml-qnn/run-ggml-qnn.sh new file mode 100755 index 0000000000000..a4c1f22ad70cd --- /dev/null +++ b/tests/ggml-qnn/run-ggml-qnn.sh @@ -0,0 +1,108 @@ +#!/bin/bash + +#https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct +#https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools +QNN_SDK_PATH=/opt/qcom/aistack/qnn/2.20.0.240223/ +GGML_QNN_TEST=ggml-qnn-test +REMOTE_PATH=/data/local/tmp/ + + +function check_qnn_sdk() +{ + if [ ! -d ${QNN_SDK_PATH} ]; then + echo -e "QNN_SDK_PATH ${QNN_SDK_PATH} not exist, pls check or download it from https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct...\n" + exit 1 + fi +} + + +function check_qnn_libs() +{ + #reuse the cached qnn libs in Android phone + adb shell ls ${REMOTE_PATH}/libQnnCpu.so + if [ $? 
-eq 0 ]; then + printf "QNN libs already exist on Android phone\n" + else + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnSystem.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnCpu.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnGpu.so ${REMOTE_PATH}/ + + #the QNN NPU(aka HTP/DSP) backend only verified on Xiaomi14(Qualcomm SM8650-AB Snapdragon 8 Gen 3) successfully + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtp.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpNetRunExtensions.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpPrepare.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpV75Stub.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so ${REMOTE_PATH}/ + fi +} + + +function show_usage() +{ + echo "Usage:" + echo " $0 GGML_OP_ADD 0/1/2" + echo " $0 GGML_OP_MUL 0/1/2" + echo " $0 GGML_OP_MUL_MAT 0/1/2" + echo -e "\n\n\n" +} + + +function main() +{ + check_qnn_libs + + #upload the latest ggml_qnn_test + adb push ${GGML_QNN_TEST} ${REMOTE_PATH} + adb shell chmod +x ${REMOTE_PATH}/${GGML_QNN_TEST} + + case "$ggmlop" in + GGML_OP_ADD) + echo "adb shell ${REMOTE_PATH}/${GGML_QNN_TEST} -t GGML_OP_ADD -b $qnnbackend" + adb shell ${REMOTE_PATH}/${GGML_QNN_TEST} -t GGML_OP_ADD -b $qnnbackend + ;; + + GGML_OP_MUL) + adb shell ${REMOTE_PATH}/${GGML_QNN_TEST} -t GGML_OP_MUL -b $qnnbackend + ;; + + GGML_OP_MUL_MAT) + adb shell ${REMOTE_PATH}/${GGML_QNN_TEST} -t GGML_OP_MUL_MAT -b $qnnbackend + ;; + + *) + printf " \n$arg not supported currently\n" + show_usage + exit 1 + ;; + esac +} + + +check_qnn_sdk + +unset ggmlop +unset qnnbackend +if [ $# == 0 ]; then + show_usage + exit 1 +elif [ $# == 1 ]; then + if [ "$1" == "-h" ]; then + #avoid upload command line program to Android phone in this scenario + show_usage + exit 1 + elif [ "$1" == "help" ]; then + #avoid upload command line program to Android phone in this scenario + show_usage + exit 1 + else + ggmlop=$1 + qnnbackend=0 + fi +elif [ $# == 2 ]; then + ggmlop=$1 + qnnbackend=$2 +else + show_usage + exit 1 +fi +main $arg diff --git a/tests/ggml-qnn/test-qnn-ops.cpp b/tests/ggml-qnn/test-qnn-ops.cpp new file mode 100644 index 0000000000000..27967270bdcd4 --- /dev/null +++ b/tests/ggml-qnn/test-qnn-ops.cpp @@ -0,0 +1,450 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ggml.h" +#include "ggml-alloc.h" +#include "ggml-backend.h" +#include "ggml-qnn.h" + +#define GGML_QNN_DEBUG 1 +#define GGML_QNN_LOGBUF_LEN 4096 + +#define QNN_LOG_ERROR(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define QNN_LOG_WARN(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define QNN_LOG_INFO(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) + +#if GGML_QNN_DEBUG +#define QNN_LOG_DEBUG(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#else +#define QNN_LOG_DEBUG(...) 
+#endif + +static void tensor_dump(const ggml_tensor * tensor, const char * name); + +#define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor) + +static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...) { + static std::mutex ggml_qnn_log_internal_mutex; + static char s_ggml_qnn_log_internal_buf[GGML_QNN_LOGBUF_LEN]; + + { + std::lock_guard lock(ggml_qnn_log_internal_mutex); + va_list args; + va_start(args, format); + int len_prefix = snprintf(s_ggml_qnn_log_internal_buf, GGML_QNN_LOGBUF_LEN, "[%s, %d]: ", func, line); + int len = vsnprintf(s_ggml_qnn_log_internal_buf + len_prefix, GGML_QNN_LOGBUF_LEN - len_prefix, format, args); + if (len < (GGML_QNN_LOGBUF_LEN - len_prefix)) { + //for Android command line application or WoA + printf("%s\n", s_ggml_qnn_log_internal_buf); + } + va_end(args); + } +} + + +static const char * get_qnn_backend_name(int n_backend_type) { + switch (n_backend_type) { + case 0: + return "QNN-CPU"; + case 1: + return "QNN-GPU"; + case 2: + return "QNN-NPU(HTP/DSP)"; + case 3: + return "ggml"; + default: + return "unknown"; + } +} + + +static bool ggml_graph_compute_helper( + struct ggml_backend * backend, + struct ggml_cgraph * graph, + std::vector & buf, + int n_threads, + ggml_abort_callback abort_callback, + void * abort_callback_data) { + struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); + + plan.abort_callback = abort_callback; + plan.abort_callback_data = abort_callback_data; + + if (plan.work_size > 0) { + buf.resize(plan.work_size); + plan.work_data = buf.data(); + } + + if (ggml_backend_is_cpu(backend)) { + ggml_backend_cpu_set_n_threads(backend, n_threads); + } + +#ifdef GGML_USE_QNN + if (ggml_backend_is_qnn(backend)) { + ggml_backend_qnn_set_n_threads(backend, n_threads); + } +#endif + + //a new approch of mixed inference + if (nullptr != backend) + return ggml_backend_graph_compute(backend, graph) == GGML_STATUS_SUCCESS; + else + return ggml_graph_compute(graph, &plan); +} + + +static void tensor_dump_elements(const ggml_tensor * tensor) { + float value = 0; + std::ostringstream tmposs; + if (tensor->type == GGML_TYPE_F32) { + for (int h = 0; h < tensor->ne[3]; h++) { + for (int i = 0; i < tensor->ne[2]; i++) { + for (int j = 0; j < tensor->ne[1]; j++) { + for (int k = 0; k < tensor->ne[0]; k++) { + value = ((float *) tensor->data)[h * tensor->ne[2] + i * tensor->ne[1] + + j * tensor->ne[0] + k]; + tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value + << " "; + } + if (strlen(tmposs.str().c_str()) <= (GGML_QNN_LOGBUF_LEN - 96)) { + QNN_LOG_DEBUG("%s", tmposs.str().c_str()); + } + tmposs.clear(); + tmposs.str(""); + //QNN_LOG_DEBUG("\n"); + } + } + } + } + + //QNN_LOG_DEBUG("\n"); +} + + +static void tensor_dump(const ggml_tensor * tensor, const char * name) { + QNN_LOG_DEBUG("dump ggml tensor %s(%s)", name, tensor->name); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)", + name, + tensor->type, ggml_type_name(tensor->type), + tensor->ne[0], tensor->ne[1], tensor->ne[2], + tensor->nb[0], tensor->nb[1], tensor->nb[2]); + tensor_dump_elements(tensor); + + QNN_LOG_DEBUG("\n"); +} + + +static uint32_t get_tensor_rank(const ggml_tensor * tensor) { + uint32_t rank = 0; + for (int i = 0; i < GGML_MAX_DIMS; i++) { + if ((0 != tensor->ne[i]) && (1 != tensor->ne[i])) { + rank++; + } + } + return rank; +} + + +static uint32_t get_tensor_data_size(const ggml_tensor * tensor) { + size_t data_size = 
ggml_row_size(tensor->type, tensor->ne[0]); + size_t n_dims = get_tensor_rank(tensor); + for (int i = 1; i < n_dims; i++) { + data_size *= tensor->ne[i]; + } + + QNN_LOG_DEBUG("get_tensor_data_size %d", data_size); + QNN_LOG_DEBUG("ggml_nbytes(tensor) %d", ggml_nbytes(tensor)); + + return ggml_nbytes(tensor); +} + + +//ref: https://github.com/ggerganov/llama.cpp/blob/master/tests/test-backend-ops.cpp#L20 +static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) { + // static RNG initialization (revisit if n_threads stops being constant) + static const size_t n_threads = std::thread::hardware_concurrency(); + static std::vector generators = []() { + std::random_device rd; + std::vector vec; + vec.reserve(n_threads); + //for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(1234 + i); } // fixed seed + for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(rd()); } + return vec; + }(); + + size_t size = ggml_nelements(tensor); + std::vector data(size); + + auto init_thread = [&](size_t ith, size_t start, size_t end) { + std::uniform_real_distribution distribution(min, max); + for (size_t i = start; i < end; i++) { + data[i] = distribution(generators[ith]); + } + }; + + std::vector threads; + threads.reserve(n_threads); + for (size_t i = 0; i < n_threads; i++) { + size_t start = i*size/n_threads; + size_t end = (i+1)*size/n_threads; + threads.emplace_back(init_thread, i, start, end); + } + for (auto & t : threads) { + t.join(); + } + if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) { + ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float)); + } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) { + GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0); + std::vector dataq(ggml_row_size(tensor->type, size)); + std::vector imatrix(tensor->ne[0], 1.0f); // dummy importance matrix + const float * im = imatrix.data(); + if (!ggml_quantize_requires_imatrix(tensor->type)) { + // when the imatrix is optional, we want to test both quantization with and without imatrix + // use one of the random numbers to decide + if (data[0] > 0.5f*(min + max)) { + im = nullptr; + } + } + ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], im); + GGML_ASSERT(ggml_validate_row_data(tensor->type, dataq.data(), dataq.size())); + ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size()); + } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) { + // This is going to create some weird integers though. 
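+        // the float buffer is copied bit-for-bit into the integer tensor below, so the resulting
+        // values are reinterpreted float bit patterns rather than a uniform integer distribution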
+ ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor)); + } else { + GGML_ASSERT(false); + } +} + + +//ref: https://github.com/ggerganov/llama.cpp/blob/master/tests/test-backend-ops.cpp#L310 +static void initialize_tensors(ggml_context * ctx) { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { + init_tensor_uniform(t); + } +} + + +static void show_usage() { + printf(" " \ + "\nUsage: test_qnn_ops [options]\n" \ + "\n" \ + "Options:\n" \ + " -t GGML_OP_ADD / GGML_OP_MUL / GGML_OP_MULMAT\n" \ + " -b 0(QNN_CPU) 1(QNN_GPU) 2(QNN_NPU)\n" \ + " ?/h print usage infomation\n\n" + ); +} + + +int main(int argc, char * argv[]) { + int64_t n_begin_time = 0LL; + int64_t n_end_time = 0LL; + int64_t n_duration = 0LL; + size_t ctx_size = 0; + int sizey = 4; + int sizex = 4; + int num_threads = 4; + int n_backend_type = QNN_BACKEND_CPU; + int n_ggml_op_type = GGML_OP_ADD; + + struct ggml_context * ctx = nullptr; + struct ggml_cgraph * gf = nullptr; + struct ggml_tensor * src0 = nullptr; + struct ggml_tensor * src1 = nullptr; + struct ggml_tensor * dst = nullptr; + ggml_backend_t backend = nullptr; + ggml_backend_buffer_t buffer= nullptr; + ggml_type qtype = GGML_TYPE_F32; + std::vector work_buffer; + + for (int i = 1; i < argc; i++) { + if (0 == strcmp(argv[i], "-t")) { + if (i + 1 < argc) { + if (0 == memcmp(argv[i + 1], "GGML_OP_ADD", 11)) { + n_ggml_op_type = GGML_OP_ADD; + } else if (0 == memcmp(argv[i + 1], "GGML_OP_MUL_MAT", 15)) { + n_ggml_op_type = GGML_OP_MUL_MAT; + } else if (0 == memcmp(argv[i + 1], "GGML_OP_MUL", 11)) { + n_ggml_op_type = GGML_OP_MUL; + } else { + show_usage(); + return 1; + } + i++; + } + } else if (0 == strcmp(argv[i], "-b")) { + if (i + 1 < argc) { + int backend = atoi(argv[i + 1]); + if (backend <= QNN_BACKEND_NPU) + n_backend_type = backend; + else { + show_usage(); + return 1; + } + i++; + } + } else { + show_usage(); + return 1; + } + } + + QNN_LOG_DEBUG("enter qnn_ggml_op\n"); + QNN_LOG_DEBUG("ggml op:%d(%s)", n_ggml_op_type, ggml_op_name((enum ggml_op) n_ggml_op_type)); + + n_begin_time = ggml_time_us(); + srand(time(NULL)); + + ctx_size += 1024 * 1024 * 32; + QNN_LOG_DEBUG("Allocating Memory of size %zi bytes, %zi MB\n", ctx_size, + (ctx_size / 1024 / 1024)); + + struct ggml_init_params params = { + /*.mem_size =*/ ctx_size, + /*.mem_buffer =*/ NULL, + /* no_alloc =*/ 0 + }; + + if (n_backend_type != QNN_BACKEND_GGML) { + params.no_alloc = true; + backend = ggml_backend_qnn_init(n_backend_type, "/data/local/tmp/"); + if (nullptr == backend) { + QNN_LOG_ERROR("create qnn backend %d(%s) failed", n_backend_type, get_qnn_backend_name(n_backend_type)); + return 1; + } + } + + ctx = ggml_init(params); + if (!ctx) { + QNN_LOG_ERROR("%s: ggml_init() failed\n"); + return 2; + } + + QNN_LOG_DEBUG("creating new tensors\n"); + QNN_LOG_DEBUG("ggml_blck_size(%s) %d", ggml_type_name(qtype), ggml_blck_size(qtype)); + QNN_LOG_DEBUG("ggml_type_size(%s) %d", ggml_type_name(qtype), ggml_type_size(qtype)); + if (qtype != GGML_TYPE_F32) { + sizex = ggml_blck_size(qtype); + } + + src0 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey); + ggml_set_input(src0); + src1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); + ggml_set_input(src1); + + switch (n_ggml_op_type) { + case GGML_OP_ADD: + dst = ggml_add(ctx, src0, src1); + break; + case GGML_OP_MUL: + dst = ggml_mul(ctx, src0, src1); + break; + case GGML_OP_MUL_MAT: + dst = ggml_mul_mat(ctx, src0, src1); + break; + default: + QNN_LOG_WARN("ggml op %d(%s) not supported", 
n_ggml_op_type, + ggml_op_name((enum ggml_op) n_ggml_op_type)); + ggml_free(ctx); + ggml_backend_free(backend); + return 3; + } + + ggml_set_output(dst); +#ifdef GGML_USE_QNN + if (n_backend_type != QNN_BACKEND_GGML) { + buffer = ggml_backend_alloc_ctx_tensors(ctx, backend); + if (!buffer) { + QNN_LOG_ERROR("%s: failed to allocate backend buffer\n", __func__); + ggml_free(ctx); + ggml_backend_free(backend); + return 4; + } + } +#endif + + QNN_LOG_DEBUG("creating compute graph\n"); + gf = ggml_new_graph(ctx); + ggml_build_forward_expand(gf, dst); + +#if 0 + ggml_set_f32(src0, (rand() % 100 + 1)); + ggml_set_f32(src1, (rand() % 100 + 1)); + ggml_set_f32(dst, 0.0f); +#else + if (n_backend_type != QNN_BACKEND_GGML) { + initialize_tensors(ctx); + } +#endif + + ggml_graph_compute_helper(backend, gf, work_buffer, num_threads, nullptr, nullptr); + if (get_tensor_data_size(dst) < (32 * 32)) { + QNN_LOG_DEBUG("dump tensors:\n"); + TENSOR_DUMP(src0); + TENSOR_DUMP(src1); + TENSOR_DUMP(dst); + } else { + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, + src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], + src0->nb[0], src0->nb[1], src0->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, + src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], + src1->nb[0], src1->nb[1], src1->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, + dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], + dst->nb[1], dst->nb[2]); + } + + ggml_free(ctx); + ggml_backend_buffer_free(buffer); + ggml_backend_free(backend); + + n_end_time = ggml_time_us(); + n_duration = (n_end_time - n_begin_time) / 1000; + QNN_LOG_DEBUG("duration of ut GGML_OP_%s using QNN backend %s: %lld milliseconds\n", ggml_op_name((enum ggml_op)n_ggml_op_type), get_qnn_backend_name(n_backend_type), n_duration); + + return 0; +} From 9c872cbbce2fb76b11766fb4012e9206b27726b9 Mon Sep 17 00:00:00 2001 From: "zhou.weiguo" Date: Wed, 5 Jun 2024 12:06:17 +0800 Subject: [PATCH 003/166] refine ggml-qnn-ut program and script to make reviewers happy --- tests/ggml-qnn/CMakeLists.txt | 2 +- tests/ggml-qnn/build-ggml-qnn.sh | 95 --------- tests/ggml-qnn/ggml-qnn-ut-build-run.sh | 198 ++++++++++++++++++ .../{test-qnn-ops.cpp => ggml-qnn-ut.cpp} | 0 tests/ggml-qnn/run-ggml-qnn.sh | 108 ---------- 5 files changed, 199 insertions(+), 204 deletions(-) delete mode 100755 tests/ggml-qnn/build-ggml-qnn.sh create mode 100755 tests/ggml-qnn/ggml-qnn-ut-build-run.sh rename tests/ggml-qnn/{test-qnn-ops.cpp => ggml-qnn-ut.cpp} (100%) delete mode 100755 tests/ggml-qnn/run-ggml-qnn.sh diff --git a/tests/ggml-qnn/CMakeLists.txt b/tests/ggml-qnn/CMakeLists.txt index 15ad7be6f6c88..a78bdaeaf8009 100644 --- a/tests/ggml-qnn/CMakeLists.txt +++ b/tests/ggml-qnn/CMakeLists.txt @@ -21,7 +21,7 @@ set(SOURCE_FILES ../../ggml-backend.c ../../ggml-quants.c ../../ggml-qnn.cpp - test-qnn-ops.cpp + ggml-qnn-ut.cpp ) diff --git a/tests/ggml-qnn/build-ggml-qnn.sh b/tests/ggml-qnn/build-ggml-qnn.sh deleted file mode 100755 index baca02f91347d..0000000000000 --- a/tests/ggml-qnn/build-ggml-qnn.sh +++ /dev/null @@ -1,95 +0,0 @@ -#!/bin/bash - -set -e - -#https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct 
-#https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools -QNN_SDK_PATH=/opt/qcom/aistack/qnn/2.20.0.240223/ - -ANDROID_NDK=`pwd`/android-ndk-r26c -ANDROID_PLATFORM=android-34 -TARGET=ggml-qnn-test - - -function dump_vars() -{ - echo -e "ANDROID_NDK: ${ANDROID_NDK}" - echo -e "QNN_SDK_PATH: ${QNN_SDK_PATH}" -} - - -function show_pwd() -{ - echo -e "current working path:$(pwd)\n" -} - - -function check_qnn_sdk() -{ - if [ ! -d ${QNN_SDK_PATH} ]; then - echo -e "QNN_SDK_PATH ${QNN_SDK_PATH} not exist, pls check...\n" - exit 1 - fi -} - - -function check_and_download_ndk() -{ - is_android_ndk_exist=1 - - if [ ! -d ${ANDROID_NDK} ]; then - is_android_ndk_exist=0 - fi - - if [ ! -f ${ANDROID_NDK}/build/cmake/android.toolchain.cmake ]; then - is_android_ndk_exist=0 - fi - - if [ ${is_android_ndk_exist} -eq 0 ]; then - - if [ ! -f android-ndk-r26c-linux.zip ]; then - wget --no-config --quiet --show-progress -O android-ndk-r26c-linux.zip https://dl.google.com/android/repository/android-ndk-r26c-linux.zip - fi - - unzip android-ndk-r26c-linux.zip - - if [ $? -ne 0 ]; then - printf "failed to download android ndk to %s \n" "${ANDROID_NDK}" - exit 1 - fi - - printf "android ndk saved to ${ANDROID_NDK} \n\n" - else - printf "android ndk already exist:${ANDROID_NDK} \n\n" - fi -} - - -function build_arm64 -{ - cmake -H. -B./out/arm64-v8a -DTARGET_NAME=${TARGET} -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=${ANDROID_PLATFORM} -DANDROID_NDK=${ANDROID_NDK} -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake -DQNN_SDK_PATH=${QNN_SDK_PATH} - - cd ./out/arm64-v8a - make - - ls -lah ${TARGET} - /bin/cp ${TARGET} ../../ - cd - -} - - -function remove_temp_dir() -{ - if [ -d out ]; then - echo "remove out directory in `pwd`" - rm -rf out - fi -} - - -show_pwd -check_and_download_ndk -check_qnn_sdk -dump_vars -remove_temp_dir -build_arm64 diff --git a/tests/ggml-qnn/ggml-qnn-ut-build-run.sh b/tests/ggml-qnn/ggml-qnn-ut-build-run.sh new file mode 100755 index 0000000000000..c7bff2ee9c20e --- /dev/null +++ b/tests/ggml-qnn/ggml-qnn-ut-build-run.sh @@ -0,0 +1,198 @@ +#!/bin/bash + +set -e + +#https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct +#https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools +QNN_SDK_PATH=/opt/qcom/aistack/qnn/2.20.0.240223/ + +ANDROID_NDK=`pwd`/android-ndk-r26c +ANDROID_PLATFORM=android-34 + +GGML_QNN_UT=ggml-qnn-ut +REMOTE_PATH=/data/local/tmp/ + + +function dump_vars() +{ + echo -e "ANDROID_NDK: ${ANDROID_NDK}" + echo -e "QNN_SDK_PATH: ${QNN_SDK_PATH}" +} + + +function show_pwd() +{ + echo -e "current working path:$(pwd)\n" +} + + +function check_qnn_sdk() +{ + if [ ! -d ${QNN_SDK_PATH} ]; then + echo -e "QNN_SDK_PATH ${QNN_SDK_PATH} not exist, pls check or download it from https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct...\n" + exit 1 + fi +} + + +function check_and_download_ndk() +{ + is_android_ndk_exist=1 + + if [ ! -d ${ANDROID_NDK} ]; then + is_android_ndk_exist=0 + fi + + if [ ! -f ${ANDROID_NDK}/build/cmake/android.toolchain.cmake ]; then + is_android_ndk_exist=0 + fi + + if [ ${is_android_ndk_exist} -eq 0 ]; then + + if [ ! -f android-ndk-r26c-linux.zip ]; then + wget --no-config --quiet --show-progress -O android-ndk-r26c-linux.zip https://dl.google.com/android/repository/android-ndk-r26c-linux.zip + fi + + unzip android-ndk-r26c-linux.zip + + if [ $? 
-ne 0 ]; then + printf "failed to download android ndk to %s \n" "${ANDROID_NDK}" + exit 1 + fi + + printf "android ndk saved to ${ANDROID_NDK} \n\n" + else + printf "android ndk already exist:${ANDROID_NDK} \n\n" + fi +} + + +function build_arm64 +{ + cmake -H. -B./out/arm64-v8a -DTARGET_NAME=${GGML_QNN_UT} -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=${ANDROID_PLATFORM} -DANDROID_NDK=${ANDROID_NDK} -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake -DQNN_SDK_PATH=${QNN_SDK_PATH} + + cd ./out/arm64-v8a + make + + ls -lah ${GGML_QNN_UT} + /bin/cp ${GGML_QNN_UT} ../../ + cd - +} + + +function remove_temp_dir() +{ + if [ -d out ]; then + echo "remove out directory in `pwd`" + rm -rf out + fi +} + + +function check_qnn_libs() +{ + #reuse the cached qnn libs in Android phone + adb shell ls ${REMOTE_PATH}/libQnnCpu.so + if [ $? -eq 0 ]; then + printf "QNN libs already exist on Android phone\n" + else + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnSystem.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnCpu.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnGpu.so ${REMOTE_PATH}/ + + #the QNN NPU(aka HTP/DSP) backend only verified on Xiaomi14(Qualcomm SM8650-AB Snapdragon 8 Gen 3) successfully + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtp.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpNetRunExtensions.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpPrepare.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpV75Stub.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so ${REMOTE_PATH}/ + fi +} + + +function build_ggml_qnn_ut() +{ + show_pwd + check_and_download_ndk + check_qnn_sdk + dump_vars + remove_temp_dir + build_arm64 +} + + +function run_ggml_qnn_ut() +{ + check_qnn_libs + + #upload the latest ggml_qnn_test + adb push ${GGML_QNN_UT} ${REMOTE_PATH} + adb shell chmod +x ${REMOTE_PATH}/${GGML_QNN_UT} + + case "$ggmlop" in + GGML_OP_ADD) + echo "adb shell ${REMOTE_PATH}/${GGML_QNN_UT} -t GGML_OP_ADD -b $qnnbackend" + adb shell ${REMOTE_PATH}/${GGML_QNN_UT} -t GGML_OP_ADD -b $qnnbackend + ;; + + GGML_OP_MUL) + adb shell ${REMOTE_PATH}/${GGML_QNN_UT} -t GGML_OP_MUL -b $qnnbackend + ;; + + GGML_OP_MUL_MAT) + adb shell ${REMOTE_PATH}/${GGML_QNN_UT} -t GGML_OP_MUL_MAT -b $qnnbackend + ;; + + *) + printf " \n$arg not supported currently\n" + show_usage + exit 1 + ;; + esac +} + + +function show_usage() +{ + echo "Usage:" + echo " $0 build" + echo " $0 GGML_OP_ADD 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU)" + echo " $0 GGML_OP_MUL 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU)" + echo " $0 GGML_OP_MUL_MAT 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU)" + echo -e "\n\n\n" +} + + +unset ggmlop +unset qnnbackend + +check_qnn_sdk + +if [ $# == 0 ]; then + show_usage + exit 1 +elif [ $# == 1 ]; then + if [ "$1" == "-h" ]; then + #avoid upload command line program to Android phone in this scenario + show_usage + exit 1 + elif [ "$1" == "help" ]; then + #avoid upload command line program to Android phone in this scenario + show_usage + exit 1 + elif [ "$1" == "build" ]; then + build_ggml_qnn_ut + exit 0 + else + ggmlop=$1 + qnnbackend=0 + run_ggml_qnn_ut + fi +elif [ $# == 2 ]; then + ggmlop=$1 + qnnbackend=$2 + run_ggml_qnn_ut +else + show_usage + exit 1 +fi diff --git a/tests/ggml-qnn/test-qnn-ops.cpp b/tests/ggml-qnn/ggml-qnn-ut.cpp similarity index 100% rename from tests/ggml-qnn/test-qnn-ops.cpp rename to 
tests/ggml-qnn/ggml-qnn-ut.cpp diff --git a/tests/ggml-qnn/run-ggml-qnn.sh b/tests/ggml-qnn/run-ggml-qnn.sh deleted file mode 100755 index a4c1f22ad70cd..0000000000000 --- a/tests/ggml-qnn/run-ggml-qnn.sh +++ /dev/null @@ -1,108 +0,0 @@ -#!/bin/bash - -#https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct -#https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools -QNN_SDK_PATH=/opt/qcom/aistack/qnn/2.20.0.240223/ -GGML_QNN_TEST=ggml-qnn-test -REMOTE_PATH=/data/local/tmp/ - - -function check_qnn_sdk() -{ - if [ ! -d ${QNN_SDK_PATH} ]; then - echo -e "QNN_SDK_PATH ${QNN_SDK_PATH} not exist, pls check or download it from https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct...\n" - exit 1 - fi -} - - -function check_qnn_libs() -{ - #reuse the cached qnn libs in Android phone - adb shell ls ${REMOTE_PATH}/libQnnCpu.so - if [ $? -eq 0 ]; then - printf "QNN libs already exist on Android phone\n" - else - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnSystem.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnCpu.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnGpu.so ${REMOTE_PATH}/ - - #the QNN NPU(aka HTP/DSP) backend only verified on Xiaomi14(Qualcomm SM8650-AB Snapdragon 8 Gen 3) successfully - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtp.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpNetRunExtensions.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpPrepare.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpV75Stub.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so ${REMOTE_PATH}/ - fi -} - - -function show_usage() -{ - echo "Usage:" - echo " $0 GGML_OP_ADD 0/1/2" - echo " $0 GGML_OP_MUL 0/1/2" - echo " $0 GGML_OP_MUL_MAT 0/1/2" - echo -e "\n\n\n" -} - - -function main() -{ - check_qnn_libs - - #upload the latest ggml_qnn_test - adb push ${GGML_QNN_TEST} ${REMOTE_PATH} - adb shell chmod +x ${REMOTE_PATH}/${GGML_QNN_TEST} - - case "$ggmlop" in - GGML_OP_ADD) - echo "adb shell ${REMOTE_PATH}/${GGML_QNN_TEST} -t GGML_OP_ADD -b $qnnbackend" - adb shell ${REMOTE_PATH}/${GGML_QNN_TEST} -t GGML_OP_ADD -b $qnnbackend - ;; - - GGML_OP_MUL) - adb shell ${REMOTE_PATH}/${GGML_QNN_TEST} -t GGML_OP_MUL -b $qnnbackend - ;; - - GGML_OP_MUL_MAT) - adb shell ${REMOTE_PATH}/${GGML_QNN_TEST} -t GGML_OP_MUL_MAT -b $qnnbackend - ;; - - *) - printf " \n$arg not supported currently\n" - show_usage - exit 1 - ;; - esac -} - - -check_qnn_sdk - -unset ggmlop -unset qnnbackend -if [ $# == 0 ]; then - show_usage - exit 1 -elif [ $# == 1 ]; then - if [ "$1" == "-h" ]; then - #avoid upload command line program to Android phone in this scenario - show_usage - exit 1 - elif [ "$1" == "help" ]; then - #avoid upload command line program to Android phone in this scenario - show_usage - exit 1 - else - ggmlop=$1 - qnnbackend=0 - fi -elif [ $# == 2 ]; then - ggmlop=$1 - qnnbackend=$2 -else - show_usage - exit 1 -fi -main $arg From 926a8661f31c85499314c3b15f47c0709041ee07 Mon Sep 17 00:00:00 2001 From: "zhou.weiguo" Date: Wed, 5 Jun 2024 21:10:59 +0800 Subject: [PATCH 004/166] review: replace external declaration with NDK header file --- ggml-qnn.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index 9319db227795d..15c6538d1870d 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -50,6 +50,9 @@ #include "ggml-backend-impl.h" +#if (defined __ANDROID__) || (defined 
ANDROID) +#include +#endif // ================================================================================================= // @@ -58,11 +61,6 @@ // ================================================================================================= class qnn_instance; - -#if (defined __ANDROID__) || (defined ANDROID) -extern "C" int __android_log_print(int prio, const char * tag, const char * fmt, ...) -__attribute__((__format__(printf, 3, 4))); -#endif static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...); From dd29834c115f5c644b34fb7e60c0175b9890da29 Mon Sep 17 00:00:00 2001 From: "zhou.weiguo" Date: Thu, 6 Jun 2024 17:12:28 +0800 Subject: [PATCH 005/166] add supportive of quantize data type Q8_0 --- ggml-qnn.cpp | 176 +++++++++------ ggml-qnn.h | 5 +- tests/ggml-qnn/ggml-qnn-ut-build-run.sh | 37 ++-- tests/ggml-qnn/ggml-qnn-ut.cpp | 274 ++++++++++++++++-------- 4 files changed, 321 insertions(+), 171 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index 15c6538d1870d..d0927f22e514a 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -72,8 +72,6 @@ static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const #define RPCMEM_DEFAULT_FLAGS 1 #define RPCMEM_HEAP_ID_SYSTEM 25 -#define GGML_DUMP_TENSOR(tensor) ggml_tensor_dump(tensor, #tensor) - #define GGML_QNN_LOGBUF_LEN 4096 #define GGML_QNN_DEBUG 1 //for troubleshooting QNN backend @@ -195,8 +193,17 @@ static ggml_backend_t g_qnn_backend = nullptr; static int g_current_device = QNN_BACKEND_GGML; - -//QNN cDSP and HTA backend would not be used currently, just focus on QNN CPU/GPU/NPU(aka HTP/DSP) backend currently +//according to the QNN SDK Reference Guide, +//CPU - Choose a non-quantized model. Quantized models are currently incompatible with the CPU backend +//GPU - Choose a non-quantized model. Quantized models are currently incompatible with the GPU backend +//HTP - Choose a quantized model. Quantized models are required when running on the HTP backend +//DSP - Choose a quantized model. Quantized models are required when running on the DSP backend +//HTA - Choose a quantized model. 
Quantized models are required when running on the HTA backend +// +//only focus on Qualcomm CPU/GPU/NPU backend in this implementation of QNN backend for ggml currently +//Qualcomm CPU: Qualcomm Kryo CPU +//Qualcomm GPU: Qualcomm Adreno GPU +//Qualcomm NPU: aka HTP(Hexagon Tensor Processor), ~= cDSP(Compute DSP) + HMX(Hexagon Matrix eXtensions)/HTA(Hexagon Tensor Accelerator) static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { [QNN_BACKEND_CPU] = {.device = 0, .threads = 1, .name = "qnn-cpu", .lib = "libQnnCpu.so", .instance = nullptr, .backend = nullptr, .raw_interface = {}, .raw_system_interface = {}}, [QNN_BACKEND_GPU] = {.device = 1, .threads = 1, .name = "qnn-gpu", .lib = "libQnnGpu.so", .instance = nullptr, .backend = nullptr, .raw_interface = {}, .raw_system_interface = {}}, @@ -849,6 +856,10 @@ static Qnn_DataType_t qnn_datatype_from_ggml_datatype(enum ggml_type ggmltype) { return QNN_DATATYPE_FLOAT_16; case GGML_TYPE_F32: return QNN_DATATYPE_FLOAT_32; + case GGML_TYPE_I8: + return QNN_DATATYPE_INT_8; + case GGML_TYPE_Q8_0: + return QNN_DATATYPE_SFIXED_POINT_8; default: break; @@ -903,14 +914,8 @@ static const char * get_qnn_backend_name(int n_backend_type) { case 2: return "QNN-NPU"; case 3: - return "ggml"; //the default GGML backend, used to compare performance between QNN backend and the default GGML backend + return "ggml"; //"fake" QNN backend, used for compare performance between QNN backend and original GGML -#if 0 //QNN cDSP and HTA backend would not be used currently, focus on QNN CPU/GPU/NPU(aka HTP/DSP) backend currently - case 3: - return "QNN-cDSP"; - case 4: - return "QNN-HTA"; -#endif default: return "unknown"; } @@ -1720,7 +1725,7 @@ static void ggml_qnn_logcallback(const char * fmt, double ms = (double) timestamp / 1000000.0; - { + if (0) { std::lock_guard lock(log_mutex); memset(s_ggml_qnn_logbuf, 0, GGML_QNN_LOGBUF_LEN); @@ -1770,7 +1775,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { _qnn_raw_interface.logCreate(ggml_qnn_logcallback, _qnn_log_level, &_qnn_log_handle); #endif if (nullptr == _qnn_log_handle) { - QNN_LOG_WARN("why failed to initialize qnn log\n"); //DSP backend not work on Qualcomm SoC based low-end phone + QNN_LOG_WARN("why failed to initialize qnn log\n"); //NPU backend not work on Qualcomm SoC based low-end phone return 4; } else { QNN_LOG_DEBUG("initialize qnn log successfully\n"); @@ -2010,14 +2015,14 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor, bool b_dum const struct ggml_tensor * src0 = tensor->src[0]; const struct ggml_tensor * src1 = tensor->src[1]; - const int64_t ne00 = tensor->src[0]->ne[0]; - const int64_t ne01 = tensor->src[0]->ne[1]; + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; - const int64_t ne10 = tensor->src[1]->ne[0]; - const int64_t ne11 = tensor->src[1]->ne[1]; + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; - const int64_t ne0 = tensor->ne[0]; - const int64_t ne1 = tensor->ne[1]; + const int64_t ne0 = tensor->ne[0]; + const int64_t ne1 = tensor->ne[1]; GGML_UNUSED(ne0); GGML_UNUSED(ne1); @@ -2057,30 +2062,15 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor, bool b_dum return false; } - if (tensor->op == GGML_OP_ADD) { - //TODO: this is limitation - return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) - && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16) - && (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16); - + // GPU/NPU inference 
will slower then CPU inference when tensor->ne[1] < min batch size + if (tensor->ne[1] < 32) { + return false; } - if (tensor->op == GGML_OP_MUL_MAT) { - //TODO: this is limitation - return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) - && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16) - && (src0->type == src1->type) && (src0->type == tensor->type); + int qtype = src0->type; + return (qtype == GGML_TYPE_F32 || qtype == GGML_TYPE_F16 || qtype == GGML_TYPE_Q8_0) + && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); - if (tensor->ne[1] < 32) { // GPU/NPU inference will slower then CPU inference when tensor->ne[1] < min batch size - return false; - } - - } - - //TODO: this is limitation - return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) - && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16) - && (src0->type == src1->type) && (src0->type == tensor->type); } @@ -2129,7 +2119,7 @@ static void ggml_qnn_add(const ggml_tensor * src0, const ggml_tensor * src1, ggm QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; n_begin_time = ggml_time_us(); -#if 1 + QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", src0->name, @@ -2147,17 +2137,23 @@ static void ggml_qnn_add(const ggml_tensor * src0, const ggml_tensor * src1, ggm QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(tensor_0)); QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(tensor_1)); QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(tensor_2)); -#endif QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; - src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); - src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); - dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); + src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); + src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); + dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); - std::string map_entry = std::string(ggml_op_name(ggmlop)); + uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + + std::string map_entry = std::string(ggml_op_name(ggmlop)); if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { graph_initialized = true; auto & graph_item = instance->_qnn_graph_map[map_entry]; @@ -2197,6 +2193,16 @@ static void ggml_qnn_add(const ggml_tensor * src0, const ggml_tensor * src1, ggm QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; + QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; + QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; + QNN_VER_PTR(*tensor_2)->rank = 
ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + Qnn_Tensor_t tensor_inputs[] = { *tensor_0, *tensor_1 @@ -2245,6 +2251,11 @@ static void ggml_qnn_add(const ggml_tensor * src0, const ggml_tensor * src1, ggm (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; @@ -2255,10 +2266,6 @@ static void ggml_qnn_add(const ggml_tensor * src0, const ggml_tensor * src1, ggm QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; - Qnn_Tensor_t tensor_inputs[] = { *tensor_0, *tensor_1 @@ -2337,7 +2344,6 @@ static void ggml_qnn_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; n_begin_time = ggml_time_us(); -#if 1 QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", src0->name, @@ -2355,17 +2361,23 @@ static void ggml_qnn_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(tensor_0)); QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(tensor_1)); QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(tensor_2)); -#endif QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; - src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); - src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); - dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); + src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); + src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); + dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); - std::string map_entry = std::string(ggml_op_name(ggmlop)); + uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + + std::string map_entry = std::string(ggml_op_name(ggmlop)); if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { graph_initialized = true; auto & graph_item = instance->_qnn_graph_map[map_entry]; @@ -2401,6 +2413,16 @@ static void ggml_qnn_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + 
QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; + QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; + QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; + QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + Qnn_Tensor_t tensor_inputs[] = { *tensor_0, *tensor_1 @@ -2543,7 +2565,7 @@ static void ggml_qnn_hanlde_op(const enum ggml_op ggmlop, const ggml_tensor * sr } n_begin_time = ggml_time_us(); -#if 1 + QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", src0->name, @@ -2561,11 +2583,17 @@ static void ggml_qnn_hanlde_op(const enum ggml_op ggmlop, const ggml_tensor * sr QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(tensor_0)); QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(tensor_1)); QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(tensor_2)); -#endif QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; + uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + std::string map_entry = std::string(ggml_op_name(ggmlop)); if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { @@ -2606,6 +2634,16 @@ static void ggml_qnn_hanlde_op(const enum ggml_op ggmlop, const ggml_tensor * sr QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; + QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; + QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; + QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + Qnn_Tensor_t tensor_inputs[] = { *tensor_0, *tensor_1 @@ -3125,10 +3163,9 @@ GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t char tensor_name[GGML_MAX_NAME] = { 0 }; snprintf(tensor_name, GGML_MAX_NAME, "tensor_%2d", idx++); - uint32_t dimensions[] = {(uint32_t) tensor->ne[0], (uint32_t) tensor->ne[1], (uint32_t) tensor->ne[2], (uint32_t) tensor->ne[3]}; - //TODO:only support FP32 & FP16 - Qnn_DataType_t qnn_data_type = QNN_DATATYPE_FLOAT_32; - Qnn_TensorType_t qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; + uint32_t dimensions[] = {(uint32_t) tensor->ne[0], (uint32_t) tensor->ne[1], (uint32_t) tensor->ne[2], (uint32_t) tensor->ne[3]}; + Qnn_DataType_t qnn_data_type = qnn_datatype_from_ggml_datatype(tensor->type); + Qnn_TensorType_t qnn_tensor_type= QNN_TENSOR_TYPE_APP_WRITE; if (tensor->flags & GGML_TENSOR_FLAG_INPUT) { 
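        // tensors flagged as graph inputs are exposed to QNN as APP_WRITE tensors: the host writes them and the QNN graph reads them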
qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; @@ -3365,7 +3402,7 @@ GGML_CALL static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const //note: this function be used with proposal/refined ggml backend subsystem in this PR: // https://github.com/ggerganov/llama.cpp/pull/7641 -// new ggml backend(only using system memory: ggml_backend_xxx_buffer_is_host return true) +// any ggml backend(only using system memory: ggml_backend_xxx_buffer_is_host return true) // can following this style for mixed inference between CPU&GPU / CPU&NPU very easily GGML_CALL static bool ggml_backend_qnn_offload_op(ggml_backend_t backend, const ggml_tensor * tensor) { GGML_UNUSED(backend); @@ -3481,7 +3518,7 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device_index) { /** * - * @param device 0: QNN_BACKEND_CPU 1: QNN_BACKEND_GPU 2: QNN_BACKEND_NPU(aka HTP/DSP) + * @param device 0: QNN_BACKEND_CPU 1: QNN_BACKEND_GPU 2: QNN_BACKEND_NPU * @param qnn_lib_path qnn library path, such as "/data/local/tmp/" on Android or specified in JNI layer * @return */ @@ -3516,22 +3553,21 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { (path + ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/dsp:/vendor/dsp/images").c_str(), 1)) { - QNN_LOG_INFO("QNN DSP backend setenv successfully"); + QNN_LOG_INFO("QNN NPU backend setenv successfully"); } else { - QNN_LOG_ERROR("QNN DSP backend setenv failure"); + QNN_LOG_ERROR("QNN NPU backend setenv failure"); } if (0 == setenv("ADSP_LIBRARY_PATH", (path + ";/vendor/dsp/cdsp;/vendor/lib/rfsa/adsp;/system/lib/rfsa/adsp;/vendor/dsp/dsp;/vendor/dsp/images;/dsp").c_str(), 1)) { - QNN_LOG_INFO("QNN DSP backend setenv successfully"); + QNN_LOG_INFO("QNN NPU backend setenv successfully"); } else { - QNN_LOG_ERROR("QNN DSP backend setenv failure"); + QNN_LOG_ERROR("QNN NPU backend setenv failure"); } } else { if (0 == setenv("LD_LIBRARY_PATH", - (path + - ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/dsp:/vendor/dsp/images").c_str(), + path.c_str(), 1)) { QNN_LOG_INFO("%s backend setenv successfully\n", get_qnn_backend_name(device)); } else { diff --git a/ggml-qnn.h b/ggml-qnn.h index c61ebd25d9ba6..9ea3dcda62c64 100644 --- a/ggml-qnn.h +++ b/ggml-qnn.h @@ -10,19 +10,18 @@ extern "C" { #define GGML_QNN_MAX_DEVICES 3 -//QNN cDSP and HTA backend would not be used currently, just focus on QNN CPU/GPU/NPU(aka HTP/DSP) backend currently enum QNNBackend { QNN_BACKEND_CPU, QNN_BACKEND_GPU, QNN_BACKEND_NPU, - QNN_BACKEND_GGML, //"fake" QNN backend just for compare performance between QNN and original GGML + QNN_BACKEND_GGML, //"fake" QNN backend, used for compare performance between QNN and original GGML }; GGML_API int ggml_backend_qnn_reg_devices(void); /** * - * @param device 0: QNN_BACKEND_CPU 1: QNN_BACKEND_GPU 2: QNN_BACKEND_NPU(aka HTP/DSP) + * @param device 0: QNN_BACKEND_CPU 1: QNN_BACKEND_GPU 2: QNN_BACKEND_NPU * @param qnn_lib_path qnn library path, such as "/data/local/tmp/" on Android or specified in JNI layer * @return */ diff --git a/tests/ggml-qnn/ggml-qnn-ut-build-run.sh b/tests/ggml-qnn/ggml-qnn-ut-build-run.sh index c7bff2ee9c20e..192f2f4bda2f5 100755 --- a/tests/ggml-qnn/ggml-qnn-ut-build-run.sh +++ b/tests/ggml-qnn/ggml-qnn-ut-build-run.sh @@ -4,7 +4,8 @@ set -e #https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct #https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools -QNN_SDK_PATH=/opt/qcom/aistack/qnn/2.20.0.240223/ +#QNN SDK released on 20240531 +QNN_SDK_PATH=/opt/qcom/aistack/qairt/2.23.0.240531/ 
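+#note: adjust QNN_SDK_PATH to match the QNN SDK version installed on the local build host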
ANDROID_NDK=`pwd`/android-ndk-r26c ANDROID_PLATFORM=android-34 @@ -89,6 +90,23 @@ function remove_temp_dir() } +function update_qnn_libs() +{ + check_qnn_sdk + + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnSystem.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnCpu.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnGpu.so ${REMOTE_PATH}/ + + #the QNN NPU(aka HTP/DSP) backend only verified on Xiaomi14(Qualcomm SM8650-AB Snapdragon 8 Gen 3) successfully + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtp.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpNetRunExtensions.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpPrepare.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpV75Stub.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so ${REMOTE_PATH}/ +} + + function check_qnn_libs() { #reuse the cached qnn libs in Android phone @@ -96,16 +114,7 @@ function check_qnn_libs() if [ $? -eq 0 ]; then printf "QNN libs already exist on Android phone\n" else - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnSystem.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnCpu.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnGpu.so ${REMOTE_PATH}/ - - #the QNN NPU(aka HTP/DSP) backend only verified on Xiaomi14(Qualcomm SM8650-AB Snapdragon 8 Gen 3) successfully - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtp.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpNetRunExtensions.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpPrepare.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpV75Stub.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so ${REMOTE_PATH}/ + update_qnn_libs fi } @@ -155,7 +164,8 @@ function run_ggml_qnn_ut() function show_usage() { echo "Usage:" - echo " $0 build" + echo " $0 build (build Android command line UT program)" + echo " $0 updateqnnlibs (upload the latest QNN libs to Android phone)" echo " $0 GGML_OP_ADD 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU)" echo " $0 GGML_OP_MUL 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU)" echo " $0 GGML_OP_MUL_MAT 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU)" @@ -183,6 +193,9 @@ elif [ $# == 1 ]; then elif [ "$1" == "build" ]; then build_ggml_qnn_ut exit 0 + elif [ "$1" == "updateqnnlibs" ]; then + update_qnn_libs + exit 0 else ggmlop=$1 qnnbackend=0 diff --git a/tests/ggml-qnn/ggml-qnn-ut.cpp b/tests/ggml-qnn/ggml-qnn-ut.cpp index 27967270bdcd4..1041252f3770f 100644 --- a/tests/ggml-qnn/ggml-qnn-ut.cpp +++ b/tests/ggml-qnn/ggml-qnn-ut.cpp @@ -87,7 +87,7 @@ static const char * get_qnn_backend_name(int n_backend_type) { case 1: return "QNN-GPU"; case 2: - return "QNN-NPU(HTP/DSP)"; + return "QNN-NPU"; case 3: return "ggml"; default: @@ -131,9 +131,54 @@ static bool ggml_graph_compute_helper( } -static void tensor_dump_elements(const ggml_tensor * tensor) { +#define QK8_0 32 +typedef struct { + uint16_t d; // delta + int8_t qs[QK8_0]; // quants +} block_q8_0; + + +static inline float ggml_compute_fp16_to_fp32(uint16_t h) { + __fp16 tmp; + memcpy(&tmp, &h, sizeof(uint16_t)); + return (float)tmp; +} +#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) + +static void tensor_dump(const ggml_tensor * tensor, const char * name) { + QNN_LOG_DEBUG("dump ggml tensor %s(%s): type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = 
(%5zi, %5zi, %5zi)\n", + name, tensor->name, + tensor->type, ggml_type_name(tensor->type), + tensor->ne[0], tensor->ne[1], tensor->ne[2], + tensor->nb[0], tensor->nb[1], tensor->nb[2]); + float value = 0; std::ostringstream tmposs; + if (nullptr == tensor) { + QNN_LOG_WARN("tensor is null"); + return; + } + if (tensor->type == GGML_TYPE_I8) { + for (int h = 0; h < tensor->ne[3]; h++) { + for (int i = 0; i < tensor->ne[2]; i++) { + for (int j = 0; j < tensor->ne[1]; j++) { + for (int k = 0; k < tensor->ne[0]; k++) { + value = ((int8_t *) tensor->data)[h * tensor->ne[2] + i * tensor->ne[1] + + j * tensor->ne[0] + k]; + tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value + << " "; + } + tmposs << "\n"; + } + } + } + if (strlen(tmposs.str().c_str()) <= (GGML_QNN_LOGBUF_LEN - 96)) { + QNN_LOG_DEBUG("\n%s\n", tmposs.str().c_str()); + tmposs.clear(); + tmposs.str(""); + } + } + if (tensor->type == GGML_TYPE_F32) { for (int h = 0; h < tensor->ne[3]; h++) { for (int i = 0; i < tensor->ne[2]; i++) { @@ -144,31 +189,59 @@ static void tensor_dump_elements(const ggml_tensor * tensor) { tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value << " "; } - if (strlen(tmposs.str().c_str()) <= (GGML_QNN_LOGBUF_LEN - 96)) { - QNN_LOG_DEBUG("%s", tmposs.str().c_str()); - } - tmposs.clear(); - tmposs.str(""); - //QNN_LOG_DEBUG("\n"); + tmposs << "\n"; } } } + if (strlen(tmposs.str().c_str()) <= (GGML_QNN_LOGBUF_LEN - 96)) { + QNN_LOG_DEBUG("\n%s\n", tmposs.str().c_str()); + tmposs.clear(); + tmposs.str(""); + } } - //QNN_LOG_DEBUG("\n"); -} - - -static void tensor_dump(const ggml_tensor * tensor, const char * name) { - QNN_LOG_DEBUG("dump ggml tensor %s(%s)", name, tensor->name); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)", - name, - tensor->type, ggml_type_name(tensor->type), - tensor->ne[0], tensor->ne[1], tensor->ne[2], - tensor->nb[0], tensor->nb[1], tensor->nb[2]); - tensor_dump_elements(tensor); + if (tensor->type == GGML_TYPE_F16) { + for (int h = 0; h < tensor->ne[3]; h++) { + for (int i = 0; i < tensor->ne[2]; i++) { + for (int j = 0; j < tensor->ne[1]; j++) { + for (int k = 0; k < tensor->ne[0]; k++) { + unsigned short tmpvalue = ((unsigned short *) tensor->data)[h * tensor->ne[2] + i * tensor->ne[1] + + j * tensor->ne[0] + k]; + value = GGML_FP16_TO_FP32(tmpvalue); + tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value + << " "; + } + tmposs << "\n"; + } + } + } + if (strlen(tmposs.str().c_str()) <= (GGML_QNN_LOGBUF_LEN - 96)) { + QNN_LOG_DEBUG("\n%s\n", tmposs.str().c_str()); + tmposs.clear(); + tmposs.str(""); + } + } - QNN_LOG_DEBUG("\n"); + if (tensor->type == GGML_TYPE_Q8_0) { + block_q8_0 * tmp = ((block_q8_0 *)tensor->data); + for (int j = 0; j < tensor->ne[1]; j++) { + int n = tensor->ne[0] / QK8_0; //blocks per row + for (int z = 0; z < n; z++) { + const float d = GGML_FP16_TO_FP32(tmp[ j * n + z ].d); + for (int k = 0; k < QK8_0; k++) { + value = tmp[j * n + z].qs[k] * d; + tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value + << " "; + } + } + tmposs << "\n"; + } + if (strlen(tmposs.str().c_str()) <= (GGML_QNN_LOGBUF_LEN - 96)) { + QNN_LOG_DEBUG("\n%s\n", tmposs.str().c_str()); + tmposs.clear(); + tmposs.str(""); + } + } } @@ -231,7 +304,8 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m t.join(); } if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) { - ggml_backend_tensor_set(tensor, data.data(), 0, size 
* sizeof(float)); + //ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float)); + memcpy((char*)tensor->data, data.data(), size * sizeof(float)); } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) { GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0); std::vector dataq(ggml_row_size(tensor->type, size)); @@ -246,10 +320,12 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m } ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], im); GGML_ASSERT(ggml_validate_row_data(tensor->type, dataq.data(), dataq.size())); - ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size()); + //ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size()); + memcpy((char*)tensor->data, dataq.data(), dataq.size()); } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) { // This is going to create some weird integers though. - ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor)); + //ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor)); + memcpy((char*)tensor->data, data.data(), ggml_nbytes(tensor)); } else { GGML_ASSERT(false); } @@ -276,16 +352,13 @@ static void show_usage() { } -int main(int argc, char * argv[]) { +static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { int64_t n_begin_time = 0LL; int64_t n_end_time = 0LL; int64_t n_duration = 0LL; size_t ctx_size = 0; int sizey = 4; int sizex = 4; - int num_threads = 4; - int n_backend_type = QNN_BACKEND_CPU; - int n_ggml_op_type = GGML_OP_ADD; struct ggml_context * ctx = nullptr; struct ggml_cgraph * gf = nullptr; @@ -294,50 +367,23 @@ int main(int argc, char * argv[]) { struct ggml_tensor * dst = nullptr; ggml_backend_t backend = nullptr; ggml_backend_buffer_t buffer= nullptr; - ggml_type qtype = GGML_TYPE_F32; - std::vector work_buffer; - for (int i = 1; i < argc; i++) { - if (0 == strcmp(argv[i], "-t")) { - if (i + 1 < argc) { - if (0 == memcmp(argv[i + 1], "GGML_OP_ADD", 11)) { - n_ggml_op_type = GGML_OP_ADD; - } else if (0 == memcmp(argv[i + 1], "GGML_OP_MUL_MAT", 15)) { - n_ggml_op_type = GGML_OP_MUL_MAT; - } else if (0 == memcmp(argv[i + 1], "GGML_OP_MUL", 11)) { - n_ggml_op_type = GGML_OP_MUL; - } else { - show_usage(); - return 1; - } - i++; - } - } else if (0 == strcmp(argv[i], "-b")) { - if (i + 1 < argc) { - int backend = atoi(argv[i + 1]); - if (backend <= QNN_BACKEND_NPU) - n_backend_type = backend; - else { - show_usage(); - return 1; - } - i++; - } - } else { - show_usage(); - return 1; - } - } + ggml_type qtype = GGML_TYPE_I8; + qtype = GGML_TYPE_F32; + qtype = GGML_TYPE_F16; + qtype = GGML_TYPE_Q8_0; + std::vector work_buffer; QNN_LOG_DEBUG("enter qnn_ggml_op\n"); - QNN_LOG_DEBUG("ggml op:%d(%s)", n_ggml_op_type, ggml_op_name((enum ggml_op) n_ggml_op_type)); + QNN_LOG_DEBUG("ggml op:%d(%s)\n", n_ggml_op_type, ggml_op_name((enum ggml_op) n_ggml_op_type)); + n_begin_time = ggml_time_us(); srand(time(NULL)); ctx_size += 1024 * 1024 * 32; QNN_LOG_DEBUG("Allocating Memory of size %zi bytes, %zi MB\n", ctx_size, - (ctx_size / 1024 / 1024)); + (ctx_size / 1024 / 1024)); struct ggml_init_params params = { /*.mem_size =*/ ctx_size, @@ -349,7 +395,7 @@ int main(int argc, char * argv[]) { params.no_alloc = true; backend = ggml_backend_qnn_init(n_backend_type, "/data/local/tmp/"); if (nullptr == backend) { - QNN_LOG_ERROR("create qnn backend %d(%s) failed", n_backend_type, 
get_qnn_backend_name(n_backend_type)); + QNN_LOG_ERROR("create qnn backend %d(%s) failed\n", n_backend_type, get_qnn_backend_name(n_backend_type)); return 1; } } @@ -361,15 +407,25 @@ int main(int argc, char * argv[]) { } QNN_LOG_DEBUG("creating new tensors\n"); - QNN_LOG_DEBUG("ggml_blck_size(%s) %d", ggml_type_name(qtype), ggml_blck_size(qtype)); - QNN_LOG_DEBUG("ggml_type_size(%s) %d", ggml_type_name(qtype), ggml_type_size(qtype)); - if (qtype != GGML_TYPE_F32) { + QNN_LOG_DEBUG("ggml_blck_size(%s) %d\n", ggml_type_name(qtype), ggml_blck_size(qtype)); + QNN_LOG_DEBUG("ggml_type_size(%s) %d\n", ggml_type_name(qtype), ggml_type_size(qtype)); + if (ggml_is_quantized(qtype)) { sizex = ggml_blck_size(qtype); + + if (n_ggml_op_type == GGML_OP_MUL_MAT) { + sizex = ggml_blck_size(qtype) * 2; + } } + QNN_LOG_DEBUG("sizex %d\n", sizex); - src0 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey); + if (n_ggml_op_type == GGML_OP_MUL) { + src0 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); + src1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); + } else { + src0 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey); + src1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); + } ggml_set_input(src0); - src1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); ggml_set_input(src1); switch (n_ggml_op_type) { @@ -384,7 +440,7 @@ int main(int argc, char * argv[]) { break; default: QNN_LOG_WARN("ggml op %d(%s) not supported", n_ggml_op_type, - ggml_op_name((enum ggml_op) n_ggml_op_type)); + ggml_op_name((enum ggml_op) n_ggml_op_type)); ggml_free(ctx); ggml_backend_free(backend); return 3; @@ -407,17 +463,20 @@ int main(int argc, char * argv[]) { gf = ggml_new_graph(ctx); ggml_build_forward_expand(gf, dst); -#if 0 - ggml_set_f32(src0, (rand() % 100 + 1)); - ggml_set_f32(src1, (rand() % 100 + 1)); - ggml_set_f32(dst, 0.0f); -#else if (n_backend_type != QNN_BACKEND_GGML) { initialize_tensors(ctx); + } else { + if (qtype == GGML_TYPE_F32) { + ggml_set_f32(src0, (rand() % 100 + 1)); + } else { + initialize_tensors(ctx); + } + ggml_set_f32(src1, (rand() % 100 + 1)); + //ggml_set_f32(dst, 0.0f); } -#endif ggml_graph_compute_helper(backend, gf, work_buffer, num_threads, nullptr, nullptr); + if (get_tensor_data_size(dst) < (32 * 32)) { QNN_LOG_DEBUG("dump tensors:\n"); TENSOR_DUMP(src0); @@ -425,26 +484,69 @@ int main(int argc, char * argv[]) { TENSOR_DUMP(dst); } else { QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, - src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], - src0->nb[0], src0->nb[1], src0->nb[2]); + src0->name, + src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], + src0->nb[0], src0->nb[1], src0->nb[2]); QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, - src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], - src1->nb[0], src1->nb[1], src1->nb[2]); + src1->name, + src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], + src1->nb[0], src1->nb[1], src1->nb[2]); QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - dst->name, - dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], - dst->nb[1], dst->nb[2]); + dst->name, + dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], + dst->nb[1], dst->nb[2]); } 
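    //buffer is only allocated when a real QNN backend is used; it stays nullptr on the default ggml backend path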
ggml_free(ctx); ggml_backend_buffer_free(buffer); ggml_backend_free(backend); - n_end_time = ggml_time_us(); n_duration = (n_end_time - n_begin_time) / 1000; QNN_LOG_DEBUG("duration of ut GGML_OP_%s using QNN backend %s: %lld milliseconds\n", ggml_op_name((enum ggml_op)n_ggml_op_type), get_qnn_backend_name(n_backend_type), n_duration); + return 0; +} + + +int main(int argc, char * argv[]) { + int num_threads = 4; + int n_backend_type = QNN_BACKEND_CPU; + int n_ggml_op_type = GGML_OP_ADD; + + for (int i = 1; i < argc; i++) { + if (0 == strcmp(argv[i], "-t")) { + if (i + 1 < argc) { + if (0 == memcmp(argv[i + 1], "GGML_OP_ADD", 11)) { + n_ggml_op_type = GGML_OP_ADD; + } else if (0 == memcmp(argv[i + 1], "GGML_OP_MUL_MAT", 15)) { + n_ggml_op_type = GGML_OP_MUL_MAT; + } else if (0 == memcmp(argv[i + 1], "GGML_OP_MUL", 11)) { + n_ggml_op_type = GGML_OP_MUL; + } else { + show_usage(); + return 1; + } + i++; + } + } else if (0 == strcmp(argv[i], "-b")) { + if (i + 1 < argc) { + int backend = atoi(argv[i + 1]); + if (backend <= QNN_BACKEND_NPU) + n_backend_type = backend; + else { + show_usage(); + return 1; + } + i++; + } + } else { + show_usage(); + return 1; + } + } + + QNN_LOG_DEBUG("enter qnn_ggml_op\n"); + QNN_LOG_DEBUG("backend %d, ggml op:%d(%s)", n_backend_type, n_ggml_op_type, ggml_op_name((enum ggml_op) n_ggml_op_type)); + qnn_op_ut(num_threads, n_backend_type, n_ggml_op_type); return 0; } From f4c53037abff299f20a1d40e1247e29d2d7b82dc Mon Sep 17 00:00:00 2001 From: "zhou.weiguo" Date: Thu, 6 Jun 2024 20:24:03 +0800 Subject: [PATCH 006/166] review: remove unused QNN helper functions --- ggml-qnn.cpp | 404 +-------------------------------------------------- 1 file changed, 8 insertions(+), 396 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index d0927f22e514a..e81704305e988 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -54,6 +54,7 @@ #include #endif + // ================================================================================================= // // forward/external/helper declaration @@ -61,6 +62,7 @@ // ================================================================================================= class qnn_instance; + static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...); @@ -74,7 +76,7 @@ static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const #define GGML_QNN_LOGBUF_LEN 4096 -#define GGML_QNN_DEBUG 1 //for troubleshooting QNN backend +#define GGML_QNN_DEBUG 0 //for troubleshooting QNN backend #define QNN_LOG_ERROR(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) #define QNN_LOG_WARN(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) @@ -86,6 +88,8 @@ static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const #define QNN_LOG_DEBUG(...) 
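//QNN_LOG_DEBUG expands to nothing when GGML_QNN_DEBUG is 0, so debug logging adds no runtime cost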
#endif +#define QNN_VER_PTR(x) (&((x).v1)) + #define VALIDATE(value, status) \ do { \ @@ -98,34 +102,6 @@ static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const #define VALIDATE_TENSOR_VERSION(tensor, err) VALIDATE(validate_tensor_version(tensor), err) -#define VALIDATE_OP_CONFIG_VERSION(op, err) VALIDATE(validate_op_config_version(op), err) - -#define QNN_VER_PTR(x) (&((x).v1)) -#define QNN_OP_CFG_VALID(op_config) ((op_config).version == QNN_OPCONFIG_VERSION_1) - -#define QNN_OP_CFG_GET_NAME(op_config) get_qnn_oponfig_name(op_config) -#define QNN_OP_CFG_GET_PACKAGE_NAME(op_config) get_qnn_op_config_packagename(op_config) -#define QNN_OP_CFG_GET_TYPE_NAME(op_config) get_qnn_op_config_typename(op_config) -#define QNN_OP_CFG_GET_NUM_PARAMS(op_config) get_qnn_op_config_numparams(op_config) -#define QNN_OP_CFG_GET_PARAMS(op_config) get_qnn_op_config_params(op_config) -#define QNN_OP_CFG_GET_NUM_INPUTS(op_config) get_qnn_op_config_numinputs(op_config) -#define QNN_OP_CFG_GET_INPUTS(op_config) get_qnn_op_config_inputs(op_config) -#define QNN_OP_CFG_GET_NUM_OUTPUTS(op_config) get_qnn_op_config_numoutputs(op_config) -#define QNN_OP_CFG_GET_OUTPUTS(op_config) get_qnn_op_config_outputs(op_config) - -#define QNN_OP_CFG_SET_NAME(op_config, value) set_qnn_op_config_name(op_config, value) -#define QNN_OP_CFG_SET_PACKAGE_NAME(op_config, value) set_qnn_op_config_packagename(op_config, value) -#define QNN_OP_CFG_SET_TYPE_NAME(op_config, value) set_qnn_op_config_typename(op_config, value) - -#define QNN_OP_CFG_SET_PARAMS(op_config, num_of_params, params) \ - set_qnn_op_config_params(op_config, num_of_params, params) - -#define QNN_OP_CFG_SET_INPUTS(op_config, num_of_inputs, inputTensors) \ - set_qnn_op_config_inputs(op_config, num_of_inputs, inputTensors) - -#define QNN_OP_CFG_SET_OUTPUTS(op_config, num_of_outputs, output_tensors) \ - set_qnn_op_config_outputs(op_config, num_of_outputs, output_tensors) - #define QNN_TENSOR_GET_ID(tensor) get_qnn_tensorid(tensor) #define QNN_TENSOR_GET_NAME(tensor) get_qnn_tensorname(tensor) #define QNN_TENSOR_GET_TYPE(tensor) get_qnn_tensortype(tensor) @@ -135,8 +111,6 @@ static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const #define QNN_TENSOR_GET_RANK(tensor) get_qnn_tensor_rank(tensor) #define QNN_TENSOR_GET_DIMENSIONS(tensor) get_qnn_tensor_dimensions(tensor) #define QNN_TENSOR_GET_MEM_TYPE(tensor) get_qnn_tensor_memtype(tensor) -#define QNN_TENSOR_GET_CLIENT_BUF(tensor) get_qnn_tensor_clientbuf(tensor) -#define QNN_TENSOR_GET_MEM_HANDLE(tensor) get_qnn_tensor_memhandle(tensor) #define QNN_TENSOR_SET_ID(tensor, value) set_qnn_tensor_id(tensor, value) #define QNN_TENSOR_SET_NAME(tensor, value) set_qnn_tensor_name(tensor, value) @@ -150,7 +124,6 @@ static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const #define QNN_TENSOR_SET_CLIENT_BUF(tensor, value) set_qnn_tensor_clientbuf(tensor, value) #define QNN_TENSOR_SET_MEM_HANDLE(tensor, value) set_qnn_tensor_memhandle(tensor, value) - using pfn_rpc_mem_init = void (*)(void); using pfn_rpc_mem_deinit = void (*)(void); using pfn_rpc_mem_alloc = void *(*)(int, uint32_t, int); @@ -204,6 +177,7 @@ static int g_current_device = QNN_BACKEND_GGML; //Qualcomm CPU: Qualcomm Kryo CPU //Qualcomm GPU: Qualcomm Adreno GPU //Qualcomm NPU: aka HTP(Hexagon Tensor Processor), ~= cDSP(Compute DSP) + HMX(Hexagon Matrix eXtensions)/HTA(Hexagon Tensor Accelerator) + static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { [QNN_BACKEND_CPU] = {.device = 0, 
.threads = 1, .name = "qnn-cpu", .lib = "libQnnCpu.so", .instance = nullptr, .backend = nullptr, .raw_interface = {}, .raw_system_interface = {}}, [QNN_BACKEND_GPU] = {.device = 1, .threads = 1, .name = "qnn-gpu", .lib = "libQnnGpu.so", .instance = nullptr, .backend = nullptr, .raw_interface = {}, .raw_system_interface = {}}, @@ -227,221 +201,6 @@ static inline int validate_tensor_version(Qnn_Tensor_t tensor) { } -[[maybe_unused]] static inline int validate_op_config_version(Qnn_OpConfig_t op_config) { - if (op_config.version != QNN_OPCONFIG_VERSION_1) { - QNN_LOG_WARN("validate_op_config_version() op %s, got unsupported version %d\n", - op_config.v1.name, - op_config.version); - return 1; - } - return 0; -} - - -static inline const char * get_qnn_oponfig_name(const Qnn_OpConfig_t & op_config) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - return op_config.v1.name; - } - return nullptr; -} - - -[[maybe_unused]] static inline const char * get_qnn_oponfig_name(const Qnn_OpConfig_t * op_config) { - return get_qnn_oponfig_name(*op_config); -} - - -static inline const char * get_qnn_op_config_packagename(const Qnn_OpConfig_t & op_config) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - return op_config.v1.packageName; - } - return nullptr; -} - - -[[maybe_unused]] static inline const char * get_qnn_op_config_packagename(const Qnn_OpConfig_t * op_config) { - return get_qnn_op_config_packagename(*op_config); -} - - -static inline const char * get_qnn_op_config_typename(const Qnn_OpConfig_t & op_config) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - return op_config.v1.typeName; - } - return nullptr; -} - - -[[maybe_unused]] static inline const char * get_qnn_op_config_typename(const Qnn_OpConfig_t * op_config) { - return get_qnn_op_config_typename(*op_config); -} - - -static inline uint32_t get_qnn_op_config_numparams(const Qnn_OpConfig_t & op_config) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - return op_config.v1.numOfParams; - } - return 0u; -} - - -[[maybe_unused]] static inline uint32_t get_qnn_op_config_numparams(const Qnn_OpConfig_t * op_config) { - return get_qnn_op_config_numparams(*op_config); -} - - -static inline const Qnn_Param_t * get_qnn_op_config_params(const Qnn_OpConfig_t & op_config) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - return op_config.v1.params; - } - return nullptr; -} - - -[[maybe_unused]] static inline const Qnn_Param_t * get_qnn_op_config_params(const Qnn_OpConfig_t * op_config) { - return get_qnn_op_config_params(*op_config); -} - - -static inline uint32_t get_qnn_op_config_numinputs(const Qnn_OpConfig_t & op_config) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - return op_config.v1.numOfInputs; - } - return 0u; -} - - -[[maybe_unused]] static inline uint32_t get_qnn_op_config_numinputs(const Qnn_OpConfig_t * op_config) { - return get_qnn_op_config_numinputs(*op_config); -} - - -static inline const Qnn_Tensor_t * get_qnn_op_config_inputs(const Qnn_OpConfig_t & op_config) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - return op_config.v1.inputTensors; - } - return nullptr; -} - - -[[maybe_unused]] static inline const Qnn_Tensor_t * get_qnn_op_config_inputs(const Qnn_OpConfig_t * op_config) { - return get_qnn_op_config_inputs(*op_config); -} - - -static inline uint32_t get_qnn_op_config_numoutputs(const Qnn_OpConfig_t & op_config) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - return op_config.v1.numOfOutputs; - } - return 0u; -} - - -[[maybe_unused]] static inline uint32_t 
get_qnn_op_config_numoutputs(const Qnn_OpConfig_t * op_config) { - return get_qnn_op_config_numoutputs(*op_config); -} - - -static inline const Qnn_Tensor_t * get_qnn_op_config_outputs(const Qnn_OpConfig_t & op_config) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - return op_config.v1.outputTensors; - } - return nullptr; -} - - -[[maybe_unused]] static inline const Qnn_Tensor_t * get_qnn_op_config_outputs(const Qnn_OpConfig_t * op_config) { - return get_qnn_op_config_outputs(*op_config); -} - - -static inline void set_qnn_op_config_name(Qnn_OpConfig_t & op_config, const char * name) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - op_config.v1.name = name; - } -} - - -[[maybe_unused]] static inline void set_qnn_op_config_name(Qnn_OpConfig_t * op_config, const char * name) { - set_qnn_op_config_name(*op_config, name); -} - - -static inline void set_qnn_op_config_packagename(Qnn_OpConfig_t & op_config, const char * package_name) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - op_config.v1.packageName = package_name; - } -} - - -[[maybe_unused]] static inline void set_qnn_op_config_packagename(Qnn_OpConfig_t * op_config, const char * package_name) { - set_qnn_op_config_packagename(*op_config, package_name); -} - - -static inline void set_qnn_op_config_typename(Qnn_OpConfig_t & op_config, const char * type_name) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - op_config.v1.typeName = type_name; - } -} - - -[[maybe_unused]] static inline void set_qnn_op_config_typename(Qnn_OpConfig_t * op_config, const char * type_name) { - set_qnn_op_config_typename(*op_config, type_name); -} - - -static inline void set_qnn_op_config_params(Qnn_OpConfig_t & op_config, - uint32_t num_of_params, - Qnn_Param_t * params) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - op_config.v1.numOfParams = num_of_params; - op_config.v1.params = params; - } -} - - -[[maybe_unused]] static inline void set_qnn_op_config_params(Qnn_OpConfig_t * op_config, - uint32_t num_of_params, - Qnn_Param_t * params) { - set_qnn_op_config_params(*op_config, num_of_params, params); -} - - -static inline void set_qnn_op_config_inputs(Qnn_OpConfig_t & op_config, - uint32_t num_of_inputs, - Qnn_Tensor_t * input_tensors) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - op_config.v1.numOfInputs = num_of_inputs; - op_config.v1.inputTensors = input_tensors; - } -} - - -[[maybe_unused]] static inline void set_qnn_op_config_inputs(Qnn_OpConfig_t * op_config, - uint32_t num_of_inputs, - Qnn_Tensor_t * input_tensors) { - set_qnn_op_config_inputs(*op_config, num_of_inputs, input_tensors); -} - - -static inline void set_qnn_op_config_outputs(Qnn_OpConfig_t & op_config, - uint32_t num_of_outputs, - Qnn_Tensor_t * output_tensors) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - op_config.v1.numOfOutputs = num_of_outputs; - op_config.v1.outputTensors = output_tensors; - } -} - - -[[maybe_unused]] static inline void set_qnn_op_config_outputs(Qnn_OpConfig_t * op_config, - uint32_t num_of_outputs, - Qnn_Tensor_t * output_tensors) { - set_qnn_op_config_outputs(*op_config, num_of_outputs, output_tensors); -} - - static inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.id; @@ -451,11 +210,6 @@ static inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t & tensor) { } -[[maybe_unused]] static inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t * tensor) { - return get_qnn_tensorid(*tensor); -} - - static inline const char * 
get_qnn_tensorname(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.name; @@ -464,10 +218,6 @@ static inline const char * get_qnn_tensorname(const Qnn_Tensor_t & tensor) { } -static inline const char * get_qnn_tensorname(const Qnn_Tensor_t * tensor) { - return get_qnn_tensorname(*tensor); -} - static inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { @@ -477,11 +227,6 @@ static inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t & tensor) { } -[[maybe_unused]] static inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t * tensor) { - return get_qnn_tensortype(*tensor); -} - - static inline Qnn_TensorDataFormat_t get_qnn_tensor_dataformat(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.dataFormat; @@ -490,11 +235,6 @@ static inline Qnn_TensorDataFormat_t get_qnn_tensor_dataformat(const Qnn_Tensor_ } -[[maybe_unused]] static inline Qnn_TensorDataFormat_t get_qnn_tensor_dataformat(const Qnn_Tensor_t * tensor) { - return get_qnn_tensor_dataformat(*tensor); -} - - static inline Qnn_DataType_t get_qnn_tensor_datatype(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.dataType; @@ -503,11 +243,6 @@ static inline Qnn_DataType_t get_qnn_tensor_datatype(const Qnn_Tensor_t & tensor } -[[maybe_unused]] static inline Qnn_DataType_t get_qnn_tensor_datatype(const Qnn_Tensor_t * tensor) { - return get_qnn_tensor_datatype(*tensor); -} - - static inline Qnn_QuantizeParams_t get_qnn_tensor_quantparams(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.quantizeParams; @@ -516,11 +251,6 @@ static inline Qnn_QuantizeParams_t get_qnn_tensor_quantparams(const Qnn_Tensor_t } -[[maybe_unused]] static inline Qnn_QuantizeParams_t get_qnn_tensor_quantparams(const Qnn_Tensor_t * tensor) { - return get_qnn_tensor_quantparams(*tensor); -} - - static inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.rank; @@ -529,11 +259,6 @@ static inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t & tensor) { } -[[maybe_unused]] static inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t * tensor) { - return get_qnn_tensor_rank(*tensor); -} - - static inline uint32_t * get_qnn_tensor_dimensions(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.dimensions; @@ -542,11 +267,6 @@ static inline uint32_t * get_qnn_tensor_dimensions(const Qnn_Tensor_t & tensor) } -[[maybe_unused]] static inline uint32_t * get_qnn_tensor_dimensions(const Qnn_Tensor_t * tensor) { - return get_qnn_tensor_dimensions(*tensor); -} - - static inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.memType; @@ -555,37 +275,6 @@ static inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t & te } -[[maybe_unused]] static inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t * tensor) { - return get_qnn_tensor_memtype(*tensor); -} - - -static inline Qnn_ClientBuffer_t get_qnn_tensor_clientbuf(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.clientBuf; - } - return QNN_CLIENT_BUFFER_INIT; -} - - -[[maybe_unused]] static inline Qnn_ClientBuffer_t get_qnn_tensor_clientbuf(const Qnn_Tensor_t * tensor) { - return 
get_qnn_tensor_clientbuf(*tensor); -} - - -static inline Qnn_MemHandle_t get_qnn_tensor_memhandle(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.memHandle; - } - return nullptr; -} - - -[[maybe_unused]] static inline Qnn_MemHandle_t get_qnn_tensor_memhandle(const Qnn_Tensor_t * tensor) { - return get_qnn_tensor_memhandle(*tensor); -} - - static inline void set_qnn_tensor_id(Qnn_Tensor_t & tensor, uint32_t id) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.id = id; @@ -593,11 +282,6 @@ static inline void set_qnn_tensor_id(Qnn_Tensor_t & tensor, uint32_t id) { } -[[maybe_unused]] static inline void set_qnn_tensor_id(Qnn_Tensor_t * tensor, uint32_t id) { - set_qnn_tensor_id(*tensor, id); -} - - static inline void set_qnn_tensor_name(Qnn_Tensor_t & tensor, const char * name) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.name = name; @@ -605,11 +289,6 @@ static inline void set_qnn_tensor_name(Qnn_Tensor_t & tensor, const char * name) } -[[maybe_unused]] static inline void set_qnn_tensor_name(Qnn_Tensor_t * tensor, const char * name) { - set_qnn_tensor_name(*tensor, name); -} - - static inline void set_qnn_tensor_type(Qnn_Tensor_t & tensor, Qnn_TensorType_t type) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.type = type; @@ -617,11 +296,6 @@ static inline void set_qnn_tensor_type(Qnn_Tensor_t & tensor, Qnn_TensorType_t t } -[[maybe_unused]] static inline void set_qnn_tensor_type(Qnn_Tensor_t * tensor, Qnn_TensorType_t type) { - set_qnn_tensor_type(*tensor, type); -} - - static inline void set_qnn_tensor_dataformat(Qnn_Tensor_t & tensor, Qnn_TensorDataFormat_t format) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.dataFormat = format; @@ -629,11 +303,6 @@ static inline void set_qnn_tensor_dataformat(Qnn_Tensor_t & tensor, Qnn_TensorDa } -[[maybe_unused]] static inline void set_qnn_tensor_dataformat(Qnn_Tensor_t * tensor, Qnn_TensorDataFormat_t format) { - set_qnn_tensor_dataformat(*tensor, format); -} - - static inline void set_qnn_tensor_datatype(Qnn_Tensor_t & tensor, Qnn_DataType_t dataType) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.dataType = dataType; @@ -641,11 +310,6 @@ static inline void set_qnn_tensor_datatype(Qnn_Tensor_t & tensor, Qnn_DataType_t } -[[maybe_unused]] static inline void set_qnn_tensor_datatype(Qnn_Tensor_t * tensor, Qnn_DataType_t dataType) { - set_qnn_tensor_datatype(*tensor, dataType); -} - - static inline void set_qnn_tensor_quantparams(Qnn_Tensor_t & tensor, Qnn_QuantizeParams_t params) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.quantizeParams = params; @@ -653,11 +317,6 @@ static inline void set_qnn_tensor_quantparams(Qnn_Tensor_t & tensor, Qnn_Quantiz } -[[maybe_unused]] static inline void set_qnn_tensor_quantparams(Qnn_Tensor_t * tensor, Qnn_QuantizeParams_t params) { - set_qnn_tensor_quantparams(*tensor, params); -} - - static inline void set_qnn_tensor_rank(Qnn_Tensor_t & tensor, uint32_t rank) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.rank = rank; @@ -665,11 +324,6 @@ static inline void set_qnn_tensor_rank(Qnn_Tensor_t & tensor, uint32_t rank) { } -[[maybe_unused]] static inline void set_qnn_tensor_rank(Qnn_Tensor_t * tensor, uint32_t rank) { - set_qnn_tensor_rank(*tensor, rank); -} - - static inline void set_qnn_tensor_dimensions(Qnn_Tensor_t & tensor, uint32_t * dims) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.dimensions = dims; @@ -677,11 +331,6 @@ static inline void 
set_qnn_tensor_dimensions(Qnn_Tensor_t & tensor, uint32_t * d } -[[maybe_unused]] static inline void set_qnn_tensor_dimensions(Qnn_Tensor_t * tensor, uint32_t * dims) { - set_qnn_tensor_dimensions(*tensor, dims); -} - - static inline void set_qnn_tensor_memtype(Qnn_Tensor_t & tensor, Qnn_TensorMemType_t memType) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.memType = memType; @@ -689,11 +338,6 @@ static inline void set_qnn_tensor_memtype(Qnn_Tensor_t & tensor, Qnn_TensorMemTy } -[[maybe_unused]] static inline void set_qnn_tensor_memtype(Qnn_Tensor_t * tensor, Qnn_TensorMemType_t memType) { - set_qnn_tensor_memtype(*tensor, memType); -} - - static inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t & tensor, Qnn_ClientBuffer_t clientBuf) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.clientBuf = clientBuf; @@ -701,11 +345,6 @@ static inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t & tensor, Qnn_ClientBuf } -[[maybe_unused]] static inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t * tensor, Qnn_ClientBuffer_t clientBuf) { - set_qnn_tensor_clientbuf(*tensor, clientBuf); -} - - static inline void set_qnn_tensor_memhandle(Qnn_Tensor_t & tensor, Qnn_MemHandle_t handle) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.memHandle = handle; @@ -713,11 +352,6 @@ static inline void set_qnn_tensor_memhandle(Qnn_Tensor_t & tensor, Qnn_MemHandle } -[[maybe_unused]] static inline void set_qnn_tensor_memhandle(Qnn_Tensor_t * tensor, Qnn_MemHandle_t handle) { - set_qnn_tensor_memhandle(*tensor, handle); -} - - static size_t memscpy(void * dst, size_t dstSize, const void * src, size_t copySize) { if (!dst || !src || !dstSize || !copySize) return 0; @@ -824,19 +458,6 @@ static int free_qnn_tensor(Qnn_Tensor_t & tensor) { } -[[maybe_unused]] static int free_qnn_tensors(Qnn_Tensor_t *& tensors, uint32_t num_tensors) { - int err = 0; - - // free all pointer allocations in struct - for (size_t i = 0; i < num_tensors; i++) { - free_qnn_tensor(tensors[i]); - } - free(tensors); - - return err; -} - - static uint32_t ggml_get_tensor_rank(const ggml_tensor * tensor) { uint32_t rank = 0; for (int i = 0; i < GGML_MAX_DIMS; i++) { @@ -3137,7 +2758,7 @@ static const char * ggml_backend_qnn_buffer_get_name(ggml_backend_buffer_t buffe } -[[maybe_unused]] GGML_CALL static bool ggml_backend_buffer_is_qnn(ggml_backend_buffer_t buffer) { +GGML_CALL static bool ggml_backend_buffer_is_qnn(ggml_backend_buffer_t buffer) { return buffer->iface.get_name == ggml_backend_qnn_buffer_get_name; } @@ -3236,15 +2857,6 @@ GGML_CALL static void ggml_backend_qnn_buffer_clear(ggml_backend_buffer_t buffer } -[[maybe_unused]] GGML_CALL static void ggml_backend_qnn_buffer_reset(ggml_backend_buffer_t buffer) { - ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context; - for (auto * sub_buffer : ctx->sub_buffers) { - free(sub_buffer); - } - ctx->sub_buffers.clear(); -} - - static ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = { /* .get_name = */ ggml_backend_qnn_buffer_get_name, /* .free_buffer = */ ggml_backend_qnn_buffer_free_buffer, @@ -3402,7 +3014,7 @@ GGML_CALL static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const //note: this function be used with proposal/refined ggml backend subsystem in this PR: // https://github.com/ggerganov/llama.cpp/pull/7641 -// any ggml backend(only using system memory: ggml_backend_xxx_buffer_is_host return true) +// new ggml backend(only using system memory: ggml_backend_xxx_buffer_is_host return true) // can following 
this style for mixed inference between CPU&GPU / CPU&NPU very easily GGML_CALL static bool ggml_backend_qnn_offload_op(ggml_backend_t backend, const ggml_tensor * tensor) { GGML_UNUSED(backend); From 2fab33d8250db70e872a12af7ffd41af04592acc Mon Sep 17 00:00:00 2001 From: "zhou.weiguo" Date: Fri, 7 Jun 2024 12:51:04 +0800 Subject: [PATCH 007/166] ggml-qnn: remove static global vars to support multi-instance simultaneously --- ggml-qnn.cpp | 250 +++++++++++++++------------------ tests/ggml-qnn/ggml-qnn-ut.cpp | 3 +- 2 files changed, 113 insertions(+), 140 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index e81704305e988..867f01625ad7f 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -76,7 +76,7 @@ static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const #define GGML_QNN_LOGBUF_LEN 4096 -#define GGML_QNN_DEBUG 0 //for troubleshooting QNN backend +#define GGML_QNN_DEBUG 1 //for troubleshooting QNN backend #define QNN_LOG_ERROR(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) #define QNN_LOG_WARN(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) @@ -89,7 +89,7 @@ static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const #endif #define QNN_VER_PTR(x) (&((x).v1)) - +#define GGML_QNN_NAME "qnn" #define VALIDATE(value, status) \ do { \ @@ -135,8 +135,6 @@ using _pfn_QnnInterface_getProviders = decltype(QnnInterface_ using _pfn_QnnSystemInterface_getProviders = decltype(QnnSystemInterface_getProviders); -typedef void (* ggml_qnn_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); -typedef void (* ggml_qnn_func_common_t)(const ggml_op ggml_op, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); enum class ggml_qnn_profile_level { profile_off = 0, @@ -144,7 +142,6 @@ enum class ggml_qnn_profile_level { profile_detail = 2 }; - struct ggml_backend_qnn_context { int device; int threads; @@ -156,15 +153,16 @@ struct ggml_backend_qnn_context { QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; } ; +typedef void (* ggml_qnn_func_t)(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); + +typedef void (* ggml_qnn_func_common_t)(ggml_backend_qnn_context * ctx, const ggml_op ggml_op, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); // ================================================================================================= // // static global variables // // ================================================================================================= -static ggml_backend_t g_qnn_backend = nullptr; - -static int g_current_device = QNN_BACKEND_GGML; +//static ggml_backend_t g_qnn_backend = nullptr; //according to the QNN SDK Reference Guide, //CPU - Choose a non-quantized model. 
Quantized models are currently incompatible with the CPU backend @@ -184,7 +182,6 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { [QNN_BACKEND_NPU] = {.device = 2, .threads = 1, .name = "qnn-npu", .lib = "libQnnHtp.so", .instance = nullptr, .backend = nullptr, .raw_interface = {}, .raw_system_interface = {}}, }; - // ================================================================================================= // // QNN helper functions and other internal helper functions @@ -1010,7 +1007,7 @@ void qnn_instance::free_rpcmem(void * buf) { } -int32_t qnn_instance::rpcmem_to_fd(void *buf) { +int32_t qnn_instance::rpcmem_to_fd(void * buf) { int32_t mem_fd = -1; if (!is_rpcmem_initialized()) { QNN_LOG_WARN("rpc memory not initialized\n"); @@ -1168,33 +1165,6 @@ int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t * _loaded_lib_handle[backend_id] = lib_handle; _backend_id = backend_id; -#if 0 //comment it for purpose of reduce size of APK - QnnSaver_Config_t outputdir_cfg; - outputdir_cfg.option = QNN_SAVER_CONFIG_OPTION_OUTPUT_DIRECTORY; - outputdir_cfg.outputDirectory = "/data/local/tmp/"; - - QnnSaver_Config_t backendid_cfg; - backendid_cfg.option = QNN_SAVER_CONFIG_OPTION_BACKEND_ID; - backendid_cfg.backendId = _backend_id; - const QnnSaver_Config_t *saverCfg[] = {&outputdir_cfg, &backendid_cfg, nullptr}; - if (0 == QnnSaver_initialize(saverCfg)) { - QNN_LOG_INFO("QnnSaver_initialize successfully"); - } else { - QNN_LOG_WARN("QnnSaver_initialize failure"); - } -#endif - auto saver_initialize = load_qnn_functionpointers<_pfn_QnnSaver_initialize *>( - _loaded_lib_handle[backend_id], "QnnSaver_initialize"); - if (nullptr != saver_initialize) { - error = saver_initialize(saver_config); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to saver_initialize,error %d", QNN_GET_ERROR_CODE(error)); - return 7; - } - } else { - QNN_LOG_WARN("saver_initialize is null\n"); - } - return 0; } @@ -1345,14 +1315,15 @@ static void ggml_qnn_logcallback(const char * fmt, } double ms = (double) timestamp / 1000000.0; - - if (0) { +#if GGML_QNN_DEBUG + { std::lock_guard lock(log_mutex); memset(s_ggml_qnn_logbuf, 0, GGML_QNN_LOGBUF_LEN); vsnprintf(reinterpret_cast(s_ggml_qnn_logbuf), GGML_QNN_LOGBUF_LEN, fmt, argp); QNN_LOG_DEBUG("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggml_qnn_logbuf); } +#endif } @@ -1390,11 +1361,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { _qnn_interface.set_qnn_interface(_loaded_backend[backend_id]); -#if 1 _qnn_interface.qnn_log_create(ggml_qnn_logcallback, _qnn_log_level, &_qnn_log_handle); -#else - _qnn_raw_interface.logCreate(ggml_qnn_logcallback, _qnn_log_level, &_qnn_log_handle); -#endif if (nullptr == _qnn_log_handle) { QNN_LOG_WARN("why failed to initialize qnn log\n"); //NPU backend not work on Qualcomm SoC based low-end phone return 4; @@ -1437,7 +1404,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { if (QNN_PROFILE_NO_ERROR != _qnn_raw_interface.profileCreate( _qnn_backend_handle, QNN_PROFILE_LEVEL_BASIC, &_qnn_profile_handle)) { QNN_LOG_WARN("unable to create profile handle in the backend\n"); - return 7; + return 6; } else { QNN_LOG_DEBUG("initialize qnn profile successfully\n"); } @@ -1456,7 +1423,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { _rpc_lib_handle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL); if (nullptr == _rpc_lib_handle) { QNN_LOG_WARN("failed to load qualcomm's rpc lib, error:%s\n", dlerror()); - return 9; + 
return 8; } else { QNN_LOG_DEBUG("load rpcmem lib successfully\n"); set_rpcmem_initialized(true); @@ -1470,7 +1437,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { || nullptr == _pfn_rpc_mem_to_fd) { QNN_LOG_WARN("unable to access symbols in QNN RPC lib. dlerror(): %s", dlerror()); dlclose(_rpc_lib_handle); - return 10; + return 9; } if (nullptr != _pfn_rpc_mem_init) // make Qualcomm's SoC based low-end phone happy @@ -1483,7 +1450,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { &_qnn_context_handle); if (nullptr == _qnn_context_handle) { QNN_LOG_WARN("why failed to initialize qnn context\n"); - return 8; + return 10; } else { QNN_LOG_DEBUG("initialize qnn context successfully\n"); } @@ -1695,7 +1662,7 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor, bool b_dum } -static void ggml_qnn_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { Qnn_ErrorHandle_t error = QNN_SUCCESS; bool graph_initialized = false; int64_t n_begin_time = 0LL; @@ -1703,7 +1670,6 @@ static void ggml_qnn_add(const ggml_tensor * src0, const ggml_tensor * src1, ggm int64_t n_duration = 0LL; qnn_instance * instance = nullptr; - struct ggml_backend_qnn_context * ctx = nullptr; std::string graph_name = "ggml_op_qnn_add"; Qnn_GraphHandle_t graph_handle = nullptr; @@ -1727,7 +1693,6 @@ static void ggml_qnn_add(const ggml_tensor * src0, const ggml_tensor * src1, ggm QNN_LOG_WARN("pls check why QNN tensor is null"); return; } - ctx = (struct ggml_backend_qnn_context *)g_qnn_backend->context; if (nullptr == ctx) { QNN_LOG_WARN("pls check why backend ctx is null"); return; @@ -1755,9 +1720,9 @@ static void ggml_qnn_add(const ggml_tensor * src0, const ggml_tensor * src1, ggm dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); - QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(tensor_0)); - QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(tensor_1)); - QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(tensor_2)); + QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); + QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1)); + QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2)); QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; @@ -1918,7 +1883,7 @@ static void ggml_qnn_add(const ggml_tensor * src0, const ggml_tensor * src1, ggm * mul_mat_f16_f32: src0 is F16 and src1 is F32. * mul_mat_q_f32: src0 is quantized (Q4_0, Q4_1, ...), and src1 is F32. 
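 *
 * A minimal dispatch sketch for these three cases (hypothetical helper, not part
 * of this patch; shown only to illustrate how the QNN path could route on src0's
 * type -- the actual graph construction happens in the function body below):
 *
 *   static void ggml_qnn_mul_mat_dispatch(ggml_backend_qnn_context * ctx,
 *                                         const ggml_tensor * src0,
 *                                         const ggml_tensor * src1,
 *                                         ggml_tensor * dst) {
 *       if (src0->type == GGML_TYPE_F32) {
 *           // mul_mat_f32 path: both inputs already FP32
 *       } else if (src0->type == GGML_TYPE_F16) {
 *           // mul_mat_f16_f32 path: src0 is FP16, src1 stays FP32
 *       } else if (ggml_is_quantized(src0->type)) {
 *           // mul_mat_q_f32 path: src0 must be dequantized before the matmul
 *       }
 *   }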
*/ -static void ggml_qnn_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { Qnn_ErrorHandle_t error = QNN_SUCCESS; bool graph_initialized = false; int64_t n_begin_time = 0LL; @@ -1926,7 +1891,6 @@ static void ggml_qnn_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, int64_t n_duration = 0LL; qnn_instance * instance = nullptr; - struct ggml_backend_qnn_context * ctx = nullptr; std::string graph_name = "ggml_op_qnn_mul_mat"; Qnn_GraphHandle_t graph_handle = nullptr; @@ -1952,7 +1916,6 @@ static void ggml_qnn_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, QNN_LOG_WARN("pls check why QNN tensor is null"); return; } - ctx = (struct ggml_backend_qnn_context *)g_qnn_backend->context; if (nullptr == ctx) { QNN_LOG_WARN("pls check why backend ctx is null"); return; @@ -1979,9 +1942,9 @@ static void ggml_qnn_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); - QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(tensor_0)); - QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(tensor_1)); - QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(tensor_2)); + QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); + QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1)); + QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2)); QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; @@ -2129,7 +2092,7 @@ static void ggml_qnn_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, //common function for GGML OPs using QNN API -static void ggml_qnn_hanlde_op(const enum ggml_op ggmlop, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, const enum ggml_op ggmlop, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { Qnn_ErrorHandle_t error = QNN_SUCCESS; bool graph_initialized = false; int64_t n_begin_time = 0LL; @@ -2137,7 +2100,6 @@ static void ggml_qnn_hanlde_op(const enum ggml_op ggmlop, const ggml_tensor * sr int64_t n_duration = 0LL; qnn_instance * instance = nullptr; - struct ggml_backend_qnn_context * ctx = nullptr; std::string qnn_graph_name = "ggml_qnn_graph"; std::string qnn_op_config_name = "ggml_qnn_op_config"; @@ -2164,7 +2126,6 @@ static void ggml_qnn_hanlde_op(const enum ggml_op ggmlop, const ggml_tensor * sr QNN_LOG_WARN("pls check why QNN tensor is null"); return; } - ctx = (struct ggml_backend_qnn_context *)g_qnn_backend->context; if (nullptr == ctx) { QNN_LOG_WARN("pls check why backend ctx is null"); return; @@ -2201,9 +2162,9 @@ static void ggml_qnn_hanlde_op(const enum ggml_op ggmlop, const ggml_tensor * sr dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); - QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(tensor_0)); - QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(tensor_1)); - QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(tensor_2)); + QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); + QNN_LOG_DEBUG("tensor1 name %s", 
QNN_TENSOR_GET_NAME(*tensor_1)); + QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2)); QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; @@ -2349,153 +2310,154 @@ static void ggml_qnn_hanlde_op(const enum ggml_op ggmlop, const ggml_tensor * sr } -static void ggml_qnn_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_repeat(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_get_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_get_rows(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_acc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_acc(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_div(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_div(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_gelu(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_silu(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_gelu_quick(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_gelu_quick(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_tanh(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_tanh(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_relu(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_hardsigmoid(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_hardsigmoid(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_hardswish(const 
ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_hardswish(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_leaky_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_leaky_relu(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_sqr(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_sqr(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_norm(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_group_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_group_norm(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_concat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_concat(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_upscale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_upscale(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_pad(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_pad(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_rms_norm(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_cpy(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_qnn_cpy(src0, dst, nullptr); +static void ggml_qnn_dup(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_qnn_cpy(ctx, src0, dst, nullptr); (void) src1; } -static void ggml_qnn_mul_mat_id(const ggml_tensor * src0, +static void 
ggml_qnn_mul_mat_id(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); @@ -2504,35 +2466,35 @@ static void ggml_qnn_mul_mat_id(const ggml_tensor * src0, } -static void ggml_qnn_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_scale(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_clamp(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_clamp(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_diag_mask_inf(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_soft_max(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_rope(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(ggml_is_contiguous(src0)); QNN_LOG_DEBUG("call %s\n", __func__); @@ -2541,21 +2503,21 @@ static void ggml_qnn_rope(const ggml_tensor * src0, const ggml_tensor * src1, gg } -static void ggml_qnn_pool2d(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_pool2d(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_im2col(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_sum_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_sum_rows(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(ggml_is_contiguous(src0)); QNN_LOG_DEBUG("call %s\n", __func__); @@ -2563,7 +2525,7 @@ static void ggml_qnn_sum_rows(const ggml_tensor * src0, const ggml_tensor * src1 } -static void ggml_qnn_argsort(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_argsort(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(ggml_is_contiguous(src0)); QNN_LOG_DEBUG("call %s\n", __func__); @@ -2571,7 +2533,7 @@ static void ggml_qnn_argsort(const ggml_tensor * src0, const ggml_tensor * src1, } -static void ggml_qnn_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void 
ggml_qnn_nop(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { (void) src0; (void) src1; (void) dst; @@ -2581,7 +2543,7 @@ static void ggml_qnn_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggm } -bool ggml_qnn_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { +bool ggml_qnn_compute_forward(ggml_backend_qnn_context * ctx, struct ggml_compute_params * params, struct ggml_tensor * tensor) { ggml_qnn_func_t func = nullptr; ggml_qnn_func_common_t func_common = nullptr; @@ -2715,16 +2677,21 @@ bool ggml_qnn_compute_forward(struct ggml_compute_params * params, struct ggml_t } if (nullptr != func) - func(tensor->src[0], tensor->src[1], tensor); + func(ctx, tensor->src[0], tensor->src[1], tensor); if (nullptr != func_common) - func_common(tensor->op, tensor->src[0], tensor->src[1], tensor); + func_common(ctx, tensor->op, tensor->src[0], tensor->src[1], tensor); return true; } struct ggml_backend_qnn_buffer_context { + ggml_backend_qnn_buffer_context(size_t device) : + device(device), + name(GGML_QNN_NAME + std::to_string(device)) { + } + ~ggml_backend_qnn_buffer_context() { if (buffer) { free(buffer); @@ -2749,6 +2716,14 @@ struct ggml_backend_qnn_buffer_context { size_t buffer_size = 0; std::vector sub_buffers; std::vector qnn_tensors; + size_t device; + std::string name; +}; + + +struct ggml_backend_qnn_buffer_type_context { + size_t device; + std::string name; }; @@ -2782,7 +2757,7 @@ GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t static int idx = 0; char tensor_name[GGML_MAX_NAME] = { 0 }; - snprintf(tensor_name, GGML_MAX_NAME, "tensor_%2d", idx++); + snprintf(tensor_name, GGML_MAX_NAME, "tensor_%04d", idx++); uint32_t dimensions[] = {(uint32_t) tensor->ne[0], (uint32_t) tensor->ne[1], (uint32_t) tensor->ne[2], (uint32_t) tensor->ne[3]}; Qnn_DataType_t qnn_data_type = qnn_datatype_from_ggml_datatype(tensor->type); @@ -2888,7 +2863,8 @@ static void * ggml_qnn_host_malloc(size_t n) { GGML_CALL static ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - ggml_backend_qnn_buffer_context * ctx = new ggml_backend_qnn_buffer_context; + ggml_backend_qnn_buffer_type_context * buft_ctx = (ggml_backend_qnn_buffer_type_context *)buft->context; + ggml_backend_qnn_buffer_context * ctx = new ggml_backend_qnn_buffer_context(buft_ctx->device); const size_t size_page = sysconf(_SC_PAGESIZE); @@ -2901,7 +2877,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer ctx->buffer = ggml_qnn_host_malloc(size_aligned); ctx->buffer_size = size_aligned; - ctx->backend_ctx = &g_qnn_mgr[g_current_device]; + ctx->backend_ctx = &g_qnn_mgr[buft_ctx->device]; if (nullptr == ctx->buffer) { QNN_LOG_WARN("%s: failed to allocate %.2f MiB\n", __func__, size / (1 << 20)); @@ -2968,7 +2944,6 @@ GGML_CALL static void ggml_backend_qnn_free(ggml_backend_t backend) { if (g_qnn_mgr[ctx->device].backend != nullptr) { delete backend; - g_qnn_backend = nullptr; g_qnn_mgr[ctx->device].backend = nullptr; } QNN_LOG_INFO("leave %s", __func__ ); @@ -2995,7 +2970,7 @@ GGML_CALL static ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backe if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { continue; } - bool ok = ggml_qnn_compute_forward(¶ms, node); + bool ok = ggml_qnn_compute_forward(ctx, ¶ms, 
node); if (!ok) { QNN_LOG_DEBUG("%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); } @@ -3017,9 +2992,9 @@ GGML_CALL static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const // new ggml backend(only using system memory: ggml_backend_xxx_buffer_is_host return true) // can following this style for mixed inference between CPU&GPU / CPU&NPU very easily GGML_CALL static bool ggml_backend_qnn_offload_op(ggml_backend_t backend, const ggml_tensor * tensor) { - GGML_UNUSED(backend); + ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; - return ggml_qnn_compute_forward(nullptr, (ggml_tensor*)tensor); + return ggml_qnn_compute_forward(ctx, nullptr, (ggml_tensor*)tensor); } @@ -3104,27 +3079,36 @@ void ggml_backend_qnn_get_device_description(size_t dev_num, char * description, } -ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device_index) { - if (device_index >= GGML_QNN_MAX_DEVICES) { +ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device) { + if (device >= GGML_QNN_MAX_DEVICES) { QNN_LOG_DEBUG("ggml_backend_qnn_buffer_type error: device_index:%d is out of range [0, %d]\n", - device_index, GGML_QNN_MAX_DEVICES - 1); + device, GGML_QNN_MAX_DEVICES - 1); return nullptr; } - static struct ggml_backend_buffer_type ggml_backend_buffer_type_qnn = { - /* .iface = */ { - /* .get_name = */ ggml_backend_qnn_buffer_type_name, - /* .alloc_buffer = */ ggml_backend_qnn_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_qnn_buffer_type_get_alignment, - /* .get_max_size = */ ggml_backend_qnn_buffer_type_get_max_size, - /* .get_alloc_size = */ nullptr,// defaults to ggml_nbytes - /* .supports_backend = */ ggml_backend_qnn_buffer_type_supports_backend, - /* .is_host = */ ggml_backend_qnn_buffer_is_host - }, - /* .context = */ nullptr, - }; + static ggml_backend_buffer_type ggml_backend_qnn_buffer_types[GGML_QNN_MAX_DEVICES]; + + static bool ggml_backend_qnn_buffer_type_initialized = false; + + if (!ggml_backend_qnn_buffer_type_initialized) { + for (int i = 0; i < GGML_QNN_MAX_DEVICES; i++) { + ggml_backend_qnn_buffer_types[i] = { + /* .iface = */ { + /* .get_name = */ ggml_backend_qnn_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_qnn_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_qnn_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_qnn_buffer_type_get_max_size, + /* .get_alloc_size = */ nullptr,// defaults to ggml_nbytes + /* .supports_backend = */ ggml_backend_qnn_buffer_type_supports_backend, + /* .is_host = */ ggml_backend_qnn_buffer_is_host + }, + /* .context = */ new ggml_backend_qnn_buffer_type_context { device, GGML_QNN_NAME + std::to_string(device) }, + }; + } + ggml_backend_qnn_buffer_type_initialized = true; + } - return &ggml_backend_buffer_type_qnn; + return &ggml_backend_qnn_buffer_types[device]; } @@ -3137,8 +3121,10 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device_index) { ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { int result = 0; - if (nullptr == qnn_lib_path) + if (nullptr == qnn_lib_path) { + QNN_LOG_ERROR("invalid qnn lib path\n"); return nullptr; + } QNN_LOG_DEBUG("device %d", device); QNN_LOG_DEBUG("qnn_lib_path %s", qnn_lib_path); @@ -3147,18 +3133,6 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { return nullptr; } - if (nullptr != g_qnn_mgr[device].backend) { - QNN_LOG_ERROR("qnn backend %d(%s) already loaded", device, 
get_qnn_backend_name(device)); - if (device == g_current_device) { - g_qnn_backend = g_qnn_mgr[device].backend; - QNN_LOG_INFO("re-use cached backend %d(%s)", device, get_qnn_backend_name(device)); - return g_qnn_mgr[device].backend; - } else { - QNN_LOG_INFO("delete previous backend %d(%s)", device, get_qnn_backend_name(device)); - ggml_backend_qnn_free(g_qnn_backend); - } - } - std::string path = qnn_lib_path; if (QNN_BACKEND_NPU == device) { if (0 == setenv("LD_LIBRARY_PATH", @@ -3215,8 +3189,6 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { /* .context = */ &g_qnn_mgr[device] }; g_qnn_mgr[device].backend = qnn_backend; - g_qnn_backend = g_qnn_mgr[device].backend; - g_current_device = device; return qnn_backend; } diff --git a/tests/ggml-qnn/ggml-qnn-ut.cpp b/tests/ggml-qnn/ggml-qnn-ut.cpp index 1041252f3770f..eb072beae6bd4 100644 --- a/tests/ggml-qnn/ggml-qnn-ut.cpp +++ b/tests/ggml-qnn/ggml-qnn-ut.cpp @@ -133,7 +133,7 @@ static bool ggml_graph_compute_helper( #define QK8_0 32 typedef struct { - uint16_t d; // delta + uint16_t d; // delta int8_t qs[QK8_0]; // quants } block_q8_0; @@ -158,6 +158,7 @@ static void tensor_dump(const ggml_tensor * tensor, const char * name) { QNN_LOG_WARN("tensor is null"); return; } + if (tensor->type == GGML_TYPE_I8) { for (int h = 0; h < tensor->ne[3]; h++) { for (int i = 0; i < tensor->ne[2]; i++) { From 94ee77505832bdaf5fa72fd72c2fd4031c57eefc Mon Sep 17 00:00:00 2001 From: "zhou.weiguo" Date: Fri, 7 Jun 2024 14:56:07 +0800 Subject: [PATCH 008/166] review: remove static global vars to support multi-instance simultaneously and thread safe --- ggml-qnn.cpp | 40 ++++++++++++++-------------------------- 1 file changed, 14 insertions(+), 26 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index 867f01625ad7f..f45a6449ccae3 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -697,9 +697,9 @@ class qnn_interface { } private: - const QnnInterface_t *_qnn_interface = nullptr; + const QnnInterface_t * _qnn_interface = nullptr; - const QnnSystemInterface_t *_qnn_sys_interface = nullptr; + const QnnSystemInterface_t * _qnn_sys_interface = nullptr; }; @@ -848,7 +848,7 @@ class qnn_instance { return 0; } - std::string &get_qnn_graph_name() { return _graph_name; } + std::string & get_qnn_graph_name() { return _graph_name; } bool is_rpcmem_initialized() { return _rpcmem_initialized; @@ -911,7 +911,7 @@ class qnn_instance { qnn_interface _qnn_interface; - void *_system_lib_handle = nullptr; + void * _system_lib_handle = nullptr; Qnn_GraphHandle_t _qnn_graph_handle = nullptr; @@ -927,7 +927,7 @@ class qnn_instance { QnnSystemContext_Handle_t _qnn_system_handle = nullptr; - QnnHtpDevice_PerfInfrastructure_t *_qnn_htp_perfinfra = nullptr; + QnnHtpDevice_PerfInfrastructure_t * _qnn_htp_perfinfra = nullptr; uint32_t _qnn_power_configid = 1; uint32_t _qnn_rpc_pollingtime = 9999; // 0-10000 us for high performing @@ -936,12 +936,12 @@ class qnn_instance { std::unordered_set _qnn_mem_set; - static std::mutex _init_mutex; - static std::unordered_map _loaded_lib_handle; - static std::unordered_map _lib_path_to_backend_id; - static std::unordered_map _loaded_backend; + std::mutex _init_mutex; + std::unordered_map _loaded_lib_handle; + std::unordered_map _lib_path_to_backend_id; + std::unordered_map _loaded_backend; - void *_rpc_lib_handle = nullptr; + void * _rpc_lib_handle = nullptr; std::atomic_bool _rpcmem_initialized{false}; pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; pfn_rpc_mem_free _pfn_rpc_mem_free; @@ -950,26 +950,15 @@ class qnn_instance { 
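    // Qualcomm rpcmem entry points resolved via dlsym() from libcdsprpc.so in
    // qnn_init(); they back alloc_rpcmem()/free_rpcmem()/rpcmem_to_fd(), which
    // provide the shared buffers used when talking to the HTP (aka DSP) backend.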
pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; std::unordered_map _rpcmem_store_map; - std::string _graph_name; }; - // ================================================================================================= // // implementation of wrapper class // // ================================================================================================= -std::mutex qnn_instance::_init_mutex; - -std::unordered_map qnn_instance::_loaded_lib_handle; - -std::unordered_map qnn_instance::_lib_path_to_backend_id; - -std::unordered_map qnn_instance::_loaded_backend; - - void * qnn_instance::alloc_rpcmem(size_t bytes, size_t alignment) { if (!_rpcmem_initialized) { QNN_LOG_WARN("rpc memory not initialized\n"); @@ -977,14 +966,13 @@ void * qnn_instance::alloc_rpcmem(size_t bytes, size_t alignment) { } auto allocate_bytes = static_cast(bytes + alignment); - void *buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, allocate_bytes); + void * buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, allocate_bytes); if (buf == nullptr) { QNN_LOG_WARN("failed to allocate rpc memory\n"); return nullptr; } - auto aligned_buf = reinterpret_cast(align_to(alignment, - reinterpret_cast(buf))); + auto aligned_buf = reinterpret_cast(align_to(alignment,reinterpret_cast(buf))); bool status = _rpcmem_store_map.insert(std::pair(aligned_buf, buf)).second; if (!status) { QNN_LOG_WARN("failed to allocate rpc memory\n"); @@ -1097,7 +1085,7 @@ int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t * Qnn_ErrorHandle_t error = QNN_SUCCESS; QNN_LOG_DEBUG("lib_path:%s\n", lib_path.c_str()); - void *lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL); + void * lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL); if (nullptr == lib_handle) { QNN_LOG_WARN("can not open QNN library %s, with error: %s", lib_path.c_str(), dlerror()); return 1; @@ -1113,7 +1101,7 @@ int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t * // get QnnInterface Providers std::uint32_t num_providers = 0; - const QnnInterface_t **provider_list = nullptr; + const QnnInterface_t ** provider_list = nullptr; error = get_providers(&provider_list, &num_providers); if (error != QNN_SUCCESS) { QNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error)); From 5d691c6cd05b4ff51f181272b8cb4df0dcb0e0ba Mon Sep 17 00:00:00 2001 From: "zhou.weiguo" Date: Sat, 8 Jun 2024 09:22:39 +0800 Subject: [PATCH 009/166] review: put qnn's internal log inside preprocessor diretive --- ggml-qnn.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index f45a6449ccae3..072003e1d76b8 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -1277,6 +1277,7 @@ static void ggml_qnn_logcallback(const char * fmt, uint64_t timestamp, va_list argp) { +#if GGML_QNN_DEBUG static std::mutex log_mutex; static unsigned char s_ggml_qnn_logbuf[GGML_QNN_LOGBUF_LEN]; @@ -1303,7 +1304,6 @@ static void ggml_qnn_logcallback(const char * fmt, } double ms = (double) timestamp / 1000000.0; -#if GGML_QNN_DEBUG { std::lock_guard lock(log_mutex); From fdf0272dfb29cd640de92d6e54dce448c48a156e Mon Sep 17 00:00:00 2001 From: "zhou.weiguo" Date: Sat, 8 Jun 2024 17:56:32 +0800 Subject: [PATCH 010/166] review: code format using clang-format + manually modification according to review comments --- ggml-qnn.cpp | 2793 +++++++++++++++++++++++++------------------------- 1 file changed, 1414 insertions(+), 1379 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index 
072003e1d76b8..3c5ff332a1df2 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -54,132 +54,166 @@ #include #endif - // ================================================================================================= // -// forward/external/helper declaration +// forward declaration // // ================================================================================================= class qnn_instance; - -static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...); - +static void ggml_qnn_log_internal(ggml_log_level level, const char * file, + const char * func, int line, + const char * format, ...); // ================================================================================================= // // self-defined macro / data structure // // ================================================================================================= -#define RPCMEM_DEFAULT_FLAGS 1 -#define RPCMEM_HEAP_ID_SYSTEM 25 +#define RPCMEM_DEFAULT_FLAGS 1 +#define RPCMEM_HEAP_ID_SYSTEM 25 -#define GGML_QNN_LOGBUF_LEN 4096 +#define GGML_QNN_DEBUG 1 // for troubleshooting QNN backend +#define ENABLE_QNN_LOG 0 // enable/disable QNN internal log +#define GGML_QNN_LOGBUF_LEN 4096 +#define QNN_VER_PTR(x) (&((x).v1)) +#define GGML_QNN_NAME "qnn" -#define GGML_QNN_DEBUG 1 //for troubleshooting QNN backend +#define QNN_LOG_ERROR(...) \ + ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#define QNN_LOG_ERROR(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#define QNN_LOG_WARN(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#define QNN_LOG_INFO(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define QNN_LOG_WARN(...) \ + ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) + +#define QNN_LOG_INFO(...) \ + ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) #if GGML_QNN_DEBUG -#define QNN_LOG_DEBUG(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define QNN_LOG_DEBUG(...) \ + ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) #else #define QNN_LOG_DEBUG(...) 
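// note: QNN_LOG_ERROR/WARN/INFO above all forward GGML_LOG_LEVEL_DEBUG to
// ggml_qnn_log_internal() for now; with GGML_QNN_DEBUG set to 0 this branch makes
// QNN_LOG_DEBUG(...) expand to nothing, so debug-only logging has no runtime cost.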
#endif -#define QNN_VER_PTR(x) (&((x).v1)) -#define GGML_QNN_NAME "qnn" - -#define VALIDATE(value, status) \ - do { \ - status = value; \ - if (status != QNN_SUCCESS) { \ - QNN_LOG_WARN("%s expected QNN_SUCCESS\n", #value); \ - return status; \ - } \ - } while (0) - -#define VALIDATE_TENSOR_VERSION(tensor, err) VALIDATE(validate_tensor_version(tensor), err) - -#define QNN_TENSOR_GET_ID(tensor) get_qnn_tensorid(tensor) -#define QNN_TENSOR_GET_NAME(tensor) get_qnn_tensorname(tensor) -#define QNN_TENSOR_GET_TYPE(tensor) get_qnn_tensortype(tensor) -#define QNN_TENSOR_GET_DATA_FORMAT(tensor) get_qnn_tensor_dataformat(tensor) -#define QNN_TENSOR_GET_DATA_TYPE(tensor) get_qnn_tensor_datatype(tensor) -#define QNN_TENSOR_GET_QUANT_PARAMS(tensor) get_qnn_tensor_quantparams(tensor) -#define QNN_TENSOR_GET_RANK(tensor) get_qnn_tensor_rank(tensor) -#define QNN_TENSOR_GET_DIMENSIONS(tensor) get_qnn_tensor_dimensions(tensor) -#define QNN_TENSOR_GET_MEM_TYPE(tensor) get_qnn_tensor_memtype(tensor) - -#define QNN_TENSOR_SET_ID(tensor, value) set_qnn_tensor_id(tensor, value) -#define QNN_TENSOR_SET_NAME(tensor, value) set_qnn_tensor_name(tensor, value) -#define QNN_TENSOR_SET_TYPE(tensor, value) set_qnn_tensor_type(tensor, value) -#define QNN_TENSOR_SET_DATA_FORMAT(tensor, value) set_qnn_tensor_dataformat(tensor, value) -#define QNN_TENSOR_SET_DATA_TYPE(tensor, value) set_qnn_tensor_datatype(tensor, value) -#define QNN_TENSOR_SET_QUANT_PARAMS(tensor, value) set_qnn_tensor_quantparams(tensor, value) -#define QNN_TENSOR_SET_RANK(tensor, value) set_qnn_tensor_rank(tensor, value) -#define QNN_TENSOR_SET_DIMENSIONS(tensor, value) set_qnn_tensor_dimensions(tensor, value) -#define QNN_TENSOR_SET_MEM_TYPE(tensor, value) set_qnn_tensor_memtype(tensor, value) -#define QNN_TENSOR_SET_CLIENT_BUF(tensor, value) set_qnn_tensor_clientbuf(tensor, value) -#define QNN_TENSOR_SET_MEM_HANDLE(tensor, value) set_qnn_tensor_memhandle(tensor, value) - -using pfn_rpc_mem_init = void (*)(void); -using pfn_rpc_mem_deinit = void (*)(void); -using pfn_rpc_mem_alloc = void *(*)(int, uint32_t, int); -using pfn_rpc_mem_free = void (*)(void *); -using pfn_rpc_mem_to_fd = int (*)(void *); - -using _pfn_QnnSaver_initialize = decltype(QnnSaver_initialize); -using _pfn_QnnInterface_getProviders = decltype(QnnInterface_getProviders); -using _pfn_QnnSystemInterface_getProviders = decltype(QnnSystemInterface_getProviders); - - +#define VALIDATE(value, status) \ + do { \ + status = value; \ + if (status != QNN_SUCCESS) { \ + QNN_LOG_WARN("%s expected QNN_SUCCESS\n", #value); \ + return status; \ + } \ + } while (0) + +#define VALIDATE_TENSOR_VERSION(tensor, err) \ + VALIDATE(validate_tensor_version(tensor), err) + +#define QNN_TENSOR_GET_ID(tensor) get_qnn_tensorid(tensor) +#define QNN_TENSOR_GET_NAME(tensor) get_qnn_tensorname(tensor) +#define QNN_TENSOR_GET_TYPE(tensor) get_qnn_tensortype(tensor) +#define QNN_TENSOR_GET_DATA_FORMAT(tensor) get_qnn_tensor_dataformat(tensor) +#define QNN_TENSOR_GET_DATA_TYPE(tensor) get_qnn_tensor_datatype(tensor) +#define QNN_TENSOR_GET_QUANT_PARAMS(tensor) get_qnn_tensor_quantparams(tensor) +#define QNN_TENSOR_GET_RANK(tensor) get_qnn_tensor_rank(tensor) +#define QNN_TENSOR_GET_DIMENSIONS(tensor) get_qnn_tensor_dimensions(tensor) +#define QNN_TENSOR_GET_MEM_TYPE(tensor) get_qnn_tensor_memtype(tensor) + +#define QNN_TENSOR_SET_ID(tensor, value) set_qnn_tensor_id(tensor, value) +#define QNN_TENSOR_SET_NAME(tensor, value) set_qnn_tensor_name(tensor, value) +#define QNN_TENSOR_SET_TYPE(tensor, value) 
set_qnn_tensor_type(tensor, value) +#define QNN_TENSOR_SET_DATA_FORMAT(tensor, value) \ + set_qnn_tensor_dataformat(tensor, value) +#define QNN_TENSOR_SET_DATA_TYPE(tensor, value) \ + set_qnn_tensor_datatype(tensor, value) +#define QNN_TENSOR_SET_QUANT_PARAMS(tensor, value) \ + set_qnn_tensor_quantparams(tensor, value) +#define QNN_TENSOR_SET_RANK(tensor, value) set_qnn_tensor_rank(tensor, value) +#define QNN_TENSOR_SET_DIMENSIONS(tensor, value) \ + set_qnn_tensor_dimensions(tensor, value) +#define QNN_TENSOR_SET_MEM_TYPE(tensor, value) \ + set_qnn_tensor_memtype(tensor, value) +#define QNN_TENSOR_SET_CLIENT_BUF(tensor, value) \ + set_qnn_tensor_clientbuf(tensor, value) +#define QNN_TENSOR_SET_MEM_HANDLE(tensor, value) \ + set_qnn_tensor_memhandle(tensor, value) + +using pfn_rpc_mem_init = void (*)(void); +using pfn_rpc_mem_deinit = void (*)(void); +using pfn_rpc_mem_alloc = void *(*) (int, uint32_t, int); +using pfn_rpc_mem_free = void (*)(void *); +using pfn_rpc_mem_to_fd = int (*)(void *); + +using _pfn_QnnSaver_initialize = decltype(QnnSaver_initialize); +using _pfn_QnnInterface_getProviders = decltype(QnnInterface_getProviders); +using _pfn_QnnSystemInterface_getProviders = decltype(QnnSystemInterface_getProviders); enum class ggml_qnn_profile_level { - profile_off = 0, - profile_basic = 1, - profile_detail = 2 + profile_off = 0, + profile_basic = 1, + profile_detail = 2 }; struct ggml_backend_qnn_context { - int device; - int threads; - char name[GGML_MAX_NAME]; - char lib[GGML_MAX_NAME]; - qnn_instance * instance; - struct ggml_backend * backend; - QNN_INTERFACE_VER_TYPE raw_interface; + int device; + int threads; + char name[GGML_MAX_NAME]; + char lib[GGML_MAX_NAME]; + qnn_instance * instance; + struct ggml_backend * backend; + QNN_INTERFACE_VER_TYPE raw_interface; QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; -} ; - -typedef void (* ggml_qnn_func_t)(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); - -typedef void (* ggml_qnn_func_common_t)(ggml_backend_qnn_context * ctx, const ggml_op ggml_op, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +}; -// ================================================================================================= -// -// static global variables -// -// ================================================================================================= -//static ggml_backend_t g_qnn_backend = nullptr; - -//according to the QNN SDK Reference Guide, -//CPU - Choose a non-quantized model. Quantized models are currently incompatible with the CPU backend -//GPU - Choose a non-quantized model. Quantized models are currently incompatible with the GPU backend -//HTP - Choose a quantized model. Quantized models are required when running on the HTP backend -//DSP - Choose a quantized model. Quantized models are required when running on the DSP backend -//HTA - Choose a quantized model. 
Quantized models are required when running on the HTA backend +typedef void (*ggml_qnn_func_t)(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, + const ggml_tensor * src1, + ggml_tensor * dst); + +typedef void (*ggml_qnn_func_common_t)(ggml_backend_qnn_context * ctx, + const ggml_op ggml_op, + const ggml_tensor * src0, + const ggml_tensor * src1, + ggml_tensor * dst); + +// according to the QNN SDK Reference Guide, +// CPU - Choose a non-quantized model.Quantized models are currently incompatible with the CPU backend +// GPU - Choose a non-quantized model.Quantized models are currently incompatible with the GPU backend +// HTP - Choose a quantized model. Quantized models are required when running on the HTP backend +// DSP - Choose a quantized model. Quantized models are required when running on the DSP backend +// HTA - Choose a quantized model. Quantized models are required when running on the HTA backend // -//only focus on Qualcomm CPU/GPU/NPU backend in this implementation of QNN backend for ggml currently -//Qualcomm CPU: Qualcomm Kryo CPU -//Qualcomm GPU: Qualcomm Adreno GPU -//Qualcomm NPU: aka HTP(Hexagon Tensor Processor), ~= cDSP(Compute DSP) + HMX(Hexagon Matrix eXtensions)/HTA(Hexagon Tensor Accelerator) +// only focus on Qualcomm CPU/GPU/NPU backend in this implementation of QNN backend for ggml currently, +// CPU: Qualcomm Kryo CPU +// GPU: Qualcomm Adreno GPU +// NPU: Qualcomm NPU: aka HTP(Hexagon Tensor Processor), ~= cDSP(Compute DSP) + +// HMX(Hexagon Matrix eXtensions)/HTA(Hexagon Tensor Accelerator) static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { - [QNN_BACKEND_CPU] = {.device = 0, .threads = 1, .name = "qnn-cpu", .lib = "libQnnCpu.so", .instance = nullptr, .backend = nullptr, .raw_interface = {}, .raw_system_interface = {}}, - [QNN_BACKEND_GPU] = {.device = 1, .threads = 1, .name = "qnn-gpu", .lib = "libQnnGpu.so", .instance = nullptr, .backend = nullptr, .raw_interface = {}, .raw_system_interface = {}}, - [QNN_BACKEND_NPU] = {.device = 2, .threads = 1, .name = "qnn-npu", .lib = "libQnnHtp.so", .instance = nullptr, .backend = nullptr, .raw_interface = {}, .raw_system_interface = {}}, + [QNN_BACKEND_CPU] = {.device = 0, + .threads = 1, + .name = "qnn-cpu", + .lib = "libQnnCpu.so", + .instance = nullptr, + .backend = nullptr, + .raw_interface = {}, + .raw_system_interface = {}}, + + [QNN_BACKEND_GPU] = {.device = 1, + .threads = 1, + .name = "qnn-gpu", + .lib = "libQnnGpu.so", + .instance = nullptr, + .backend = nullptr, + .raw_interface = {}, + .raw_system_interface = {}}, + + [QNN_BACKEND_NPU] = {.device = 2, + .threads = 1, + .name = "qnn-npu", + .lib = "libQnnHtp.so", + .instance = nullptr, + .backend = nullptr, + .raw_interface = {}, + .raw_system_interface = {}}, }; // ================================================================================================= @@ -189,15 +223,14 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { // ================================================================================================= static inline int validate_tensor_version(Qnn_Tensor_t tensor) { if (tensor.version != QNN_TENSOR_VERSION_1) { - QNN_LOG_WARN("validate_tensor_version() tensor %s, got unsupported version %d\n", - tensor.v1.name, - tensor.version); + QNN_LOG_WARN( + "validate_tensor_version() tensor %s, got unsupported version %d\n", + tensor.v1.name, tensor.version); return 1; } return 0; } - static inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t & tensor) { if (tensor.version == 
QNN_TENSOR_VERSION_1) { return tensor.v1.id; @@ -206,7 +239,6 @@ static inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t & tensor) { return 0u; } - static inline const char * get_qnn_tensorname(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.name; @@ -214,8 +246,6 @@ static inline const char * get_qnn_tensorname(const Qnn_Tensor_t & tensor) { return nullptr; } - - static inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.type; @@ -223,31 +253,30 @@ static inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t & tensor) { return QNN_TENSOR_TYPE_UNDEFINED; } - -static inline Qnn_TensorDataFormat_t get_qnn_tensor_dataformat(const Qnn_Tensor_t & tensor) { +static inline Qnn_TensorDataFormat_t + get_qnn_tensor_dataformat(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.dataFormat; } return QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER; } - -static inline Qnn_DataType_t get_qnn_tensor_datatype(const Qnn_Tensor_t & tensor) { +static inline Qnn_DataType_t + get_qnn_tensor_datatype(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.dataType; } return QNN_DATATYPE_UNDEFINED; } - -static inline Qnn_QuantizeParams_t get_qnn_tensor_quantparams(const Qnn_Tensor_t & tensor) { +static inline Qnn_QuantizeParams_t + get_qnn_tensor_quantparams(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.quantizeParams; } return QNN_QUANTIZE_PARAMS_INIT; } - static inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.rank; @@ -255,7 +284,6 @@ static inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t & tensor) { return 0u; } - static inline uint32_t * get_qnn_tensor_dimensions(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.dimensions; @@ -263,7 +291,6 @@ static inline uint32_t * get_qnn_tensor_dimensions(const Qnn_Tensor_t & tensor) return nullptr; } - static inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.memType; @@ -271,109 +298,95 @@ static inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t & te return QNN_TENSORMEMTYPE_UNDEFINED; } - static inline void set_qnn_tensor_id(Qnn_Tensor_t & tensor, uint32_t id) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.id = id; } } - static inline void set_qnn_tensor_name(Qnn_Tensor_t & tensor, const char * name) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.name = name; } } - static inline void set_qnn_tensor_type(Qnn_Tensor_t & tensor, Qnn_TensorType_t type) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.type = type; } } - static inline void set_qnn_tensor_dataformat(Qnn_Tensor_t & tensor, Qnn_TensorDataFormat_t format) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.dataFormat = format; } } - static inline void set_qnn_tensor_datatype(Qnn_Tensor_t & tensor, Qnn_DataType_t dataType) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.dataType = dataType; } } - static inline void set_qnn_tensor_quantparams(Qnn_Tensor_t & tensor, Qnn_QuantizeParams_t params) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.quantizeParams = params; } } - static inline void set_qnn_tensor_rank(Qnn_Tensor_t & tensor, uint32_t rank) { if 
(tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.rank = rank; } } - static inline void set_qnn_tensor_dimensions(Qnn_Tensor_t & tensor, uint32_t * dims) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.dimensions = dims; } } - -static inline void set_qnn_tensor_memtype(Qnn_Tensor_t & tensor, Qnn_TensorMemType_t memType) { +static inline void set_qnn_tensor_memtype(Qnn_Tensor_t & tensor, Qnn_TensorMemType_t mem_type) { if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.memType = memType; + tensor.v1.memType = mem_type; } } - -static inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t & tensor, Qnn_ClientBuffer_t clientBuf) { +static inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t & tensor, Qnn_ClientBuffer_t client_buf) { if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.clientBuf = clientBuf; + tensor.v1.clientBuf = client_buf; } } - static inline void set_qnn_tensor_memhandle(Qnn_Tensor_t & tensor, Qnn_MemHandle_t handle) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.memHandle = handle; } } +static size_t memscpy(void * dst, size_t dst_size, const void * src, size_t copy_size) { + if (!dst || !src || !dst_size || !copy_size) return 0; -static size_t memscpy(void * dst, size_t dstSize, const void * src, size_t copySize) { - if (!dst || !src || !dstSize || !copySize) - return 0; - - size_t minSize = dstSize < copySize ? dstSize : copySize; + size_t min_size = dst_size < copy_size ? dst_size : copy_size; - memcpy(dst, src, minSize); + memcpy(dst, src, min_size); - return minSize; + return min_size; } - static char * ggml_qnn_strndup(const char * source, size_t maxlen) { return ::strndup(source, maxlen); } - static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { int err = 0; VALIDATE_TENSOR_VERSION(src, err); dst.version = src.version; QNN_TENSOR_SET_NAME( - dst, ggml_qnn_strndup(QNN_TENSOR_GET_NAME(src), std::string(QNN_TENSOR_GET_NAME(src)).size())); - if (QNN_TENSOR_GET_NAME(dst) == nullptr) { + dst, ggml_qnn_strndup(QNN_TENSOR_GET_NAME(src), + std::string(QNN_TENSOR_GET_NAME(src)).size())); + if (nullptr == QNN_TENSOR_GET_NAME(dst)) { return 1; } QNN_TENSOR_SET_ID(dst, QNN_TENSOR_GET_ID(src)); @@ -382,8 +395,6 @@ static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { QNN_TENSOR_SET_DATA_TYPE(dst, QNN_TENSOR_GET_DATA_TYPE(src)); QNN_TENSOR_SET_MEM_TYPE(dst, QNN_TENSOR_GET_MEM_TYPE(src)); - // Only metadata (i.e. non-static data) is copied from source to destination. 
The union still - // must be initialized so that the clientBuf/memHandle do not contain garbage data if (QNN_TENSOR_GET_MEM_TYPE(src) == QNN_TENSORMEMTYPE_RAW) { Qnn_ClientBuffer_t client_buf = {nullptr, 0}; QNN_TENSOR_SET_CLIENT_BUF(dst, client_buf); @@ -393,48 +404,47 @@ static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { return 1; } - Qnn_QuantizeParams_t src_qparam = QNN_TENSOR_GET_QUANT_PARAMS(src); - Qnn_QuantizationEncoding_t encoding = src_qparam.quantizationEncoding; + Qnn_QuantizeParams_t src_qparam = QNN_TENSOR_GET_QUANT_PARAMS(src); + Qnn_QuantizationEncoding_t encoding = src_qparam.quantizationEncoding; if (encoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) { - // need to allocate and copy memory for scaleOffset as it is a pointer array - Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; - Qnn_AxisScaleOffset_t &axis_scale_offset = src_qparam_cpy.axisScaleOffsetEncoding; - Qnn_ScaleOffset_t **scaleOffset = &axis_scale_offset.scaleOffset; - size_t scaleOffsetSize = axis_scale_offset.numScaleOffsets * sizeof(Qnn_ScaleOffset_t); - *scaleOffset = (Qnn_ScaleOffset_t *)malloc(scaleOffsetSize); - memscpy(*scaleOffset, - scaleOffsetSize, + Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; + Qnn_AxisScaleOffset_t & axis_scale_offset = src_qparam_cpy.axisScaleOffsetEncoding; + Qnn_ScaleOffset_t **scaleOffset = & axis_scale_offset.scaleOffset; + size_t scaleOffsetSize = axis_scale_offset.numScaleOffsets * sizeof(Qnn_ScaleOffset_t); + *scaleOffset = (Qnn_ScaleOffset_t *) malloc(scaleOffsetSize); + memscpy(*scaleOffset, scaleOffsetSize, src_qparam.axisScaleOffsetEncoding.scaleOffset, scaleOffsetSize); QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam_cpy); } else if (encoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) { - // need to allocate and copy memory for scaleOffset as it is a pointer array - Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; - Qnn_BwAxisScaleOffset_t &bwaxis_scale_offset = src_qparam_cpy.bwAxisScaleOffsetEncoding; - size_t scaleSize = bwaxis_scale_offset.numElements * sizeof(float); - float **scales = &bwaxis_scale_offset.scales; - int32_t **offsets = &bwaxis_scale_offset.offsets; - *scales = (float *)malloc(scaleSize); - memscpy(*scales, scaleSize, src_qparam.bwAxisScaleOffsetEncoding.scales, scaleSize); - - // only copy offsets if present, nullptr implies all offsets are 0 + Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; + Qnn_BwAxisScaleOffset_t & bwaxis_scale_offset = src_qparam_cpy.bwAxisScaleOffsetEncoding; + size_t scaleSize = bwaxis_scale_offset.numElements * sizeof(float); + float ** scales = &bwaxis_scale_offset.scales; + int32_t ** offsets = &bwaxis_scale_offset.offsets; + *scales = (float *) malloc(scaleSize); + memscpy(*scales, scaleSize, src_qparam.bwAxisScaleOffsetEncoding.scales, + scaleSize); + if (bwaxis_scale_offset.offsets != nullptr) { size_t offsetSize = bwaxis_scale_offset.numElements * sizeof(int32_t); - *offsets = (int32_t *)malloc(offsetSize); - memscpy(*offsets, offsetSize, src_qparam.bwAxisScaleOffsetEncoding.offsets, offsetSize); + *offsets = (int32_t *) malloc(offsetSize); + memscpy(*offsets, offsetSize, + src_qparam.bwAxisScaleOffsetEncoding.offsets, offsetSize); } QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam_cpy); } else { QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam); } - // allocate and copy memory for all the pointer members uint32_t rank = QNN_TENSOR_GET_RANK(src); QNN_TENSOR_SET_RANK(dst, rank); - size_t dim_size = rank * sizeof(uint32_t); - uint32_t * dimensions = (uint32_t *)malloc(dim_size); + 
size_t dim_size = rank * sizeof(uint32_t); + uint32_t *dimensions = (uint32_t *) malloc(dim_size); if (dimensions == nullptr) { - QNN_LOG_WARN("deep_copy_qnn_tensors() allocation error while copying tensor %s\n", QNN_TENSOR_GET_NAME(src)); + QNN_LOG_WARN("deep_copy_qnn_tensors() allocation error while copying " + "tensor %s\n", + QNN_TENSOR_GET_NAME(src)); return 1; } memscpy(dimensions, dim_size, QNN_TENSOR_GET_DIMENSIONS(src), dim_size); @@ -443,7 +453,6 @@ static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { return err; } - static int free_qnn_tensor(Qnn_Tensor_t & tensor) { int err = 0; VALIDATE_TENSOR_VERSION(tensor, err); @@ -454,7 +463,6 @@ static int free_qnn_tensor(Qnn_Tensor_t & tensor) { return err; } - static uint32_t ggml_get_tensor_rank(const ggml_tensor * tensor) { uint32_t rank = 0; for (int i = 0; i < GGML_MAX_DIMS; i++) { @@ -465,44 +473,40 @@ static uint32_t ggml_get_tensor_rank(const ggml_tensor * tensor) { return rank; } - -//TODO: mapping more ggml data type to QNN data type -//ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684 +// TODO: mapping more ggml data type to QNN data type +// ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684 static Qnn_DataType_t qnn_datatype_from_ggml_datatype(enum ggml_type ggmltype) { switch (ggmltype) { - case GGML_TYPE_F16: - return QNN_DATATYPE_FLOAT_16; - case GGML_TYPE_F32: - return QNN_DATATYPE_FLOAT_32; - case GGML_TYPE_I8: - return QNN_DATATYPE_INT_8; - case GGML_TYPE_Q8_0: - return QNN_DATATYPE_SFIXED_POINT_8; - default: - break; - + case GGML_TYPE_F16: + return QNN_DATATYPE_FLOAT_16; + case GGML_TYPE_F32: + return QNN_DATATYPE_FLOAT_32; + case GGML_TYPE_I8: + return QNN_DATATYPE_INT_8; + case GGML_TYPE_Q8_0: + return QNN_DATATYPE_SFIXED_POINT_8; + default: + break; } return QNN_DATATYPE_UNDEFINED; } - -//TODO: only support GGML_OP_ADD/GGML_OP_MUL/GGML_OP_MUL_MAT +// TODO: only support GGML_OP_ADD/GGML_OP_MUL/GGML_OP_MUL_MAT static const char * qnn_opname_from_ggmlop(enum ggml_op ggmlop) { switch (ggmlop) { - case GGML_OP_ADD: - return QNN_OP_ELEMENT_WISE_ADD; - case GGML_OP_MUL: - return QNN_OP_ELEMENT_WISE_MULTIPLY; - case GGML_OP_MUL_MAT: - return QNN_OP_MAT_MUL; - default: - break; + case GGML_OP_ADD: + return QNN_OP_ELEMENT_WISE_ADD; + case GGML_OP_MUL: + return QNN_OP_ELEMENT_WISE_MULTIPLY; + case GGML_OP_MUL_MAT: + return QNN_OP_MAT_MUL; + default: + break; } return nullptr; } - static uint32_t ggml_get_tensor_data_size(const ggml_tensor * tensor) { /* size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); @@ -516,86 +520,85 @@ static uint32_t ggml_get_tensor_data_size(const ggml_tensor * tensor) { return ggml_nbytes(tensor); } - -template +template Fn load_qnn_functionpointers(void * handle, const char * function_name) { return reinterpret_cast(dlsym(handle, function_name)); } - static const char * get_qnn_backend_name(int n_backend_type) { switch (n_backend_type) { - case 0: - return "QNN-CPU"; - case 1: - return "QNN-GPU"; - case 2: - return "QNN-NPU"; - case 3: - return "ggml"; //"fake" QNN backend, used for compare performance between QNN backend and original GGML - - default: - return "unknown"; + case 0: + return "QNN-CPU"; + case 1: + return "QNN-GPU"; + case 2: + return "QNN-NPU"; + case 3: + return "ggml"; //"fake" QNN backend, used for compare performance between QNN backend and original GGML + default: + return "unknown"; } } - static intptr_t align_to(size_t alignment, intptr_t offset) { - return offset % alignment == 0 ? 
offset - : offset + - (static_cast(alignment) - - offset % static_cast(alignment)); + return offset % alignment == 0 + ? offset + : offset + (static_cast(alignment) - + offset % static_cast(alignment)); } - -static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...) { +static void ggml_qnn_log_internal(ggml_log_level level, const char * file, + const char * func, int line, + const char * format, ...) { static std::mutex ggml_qnn_log_internal_mutex; - static char s_ggml_qnn_log_internal_buf[GGML_QNN_LOGBUF_LEN]; + static char s_ggml_qnn_log_internal_buf[GGML_QNN_LOGBUF_LEN]; { std::lock_guard lock(ggml_qnn_log_internal_mutex); - va_list args; + va_list args; + va_start(args, format); - int len_prefix = snprintf(s_ggml_qnn_log_internal_buf, GGML_QNN_LOGBUF_LEN, "[%s, %d]: ", func, line); - int len = vsnprintf(s_ggml_qnn_log_internal_buf + len_prefix, GGML_QNN_LOGBUF_LEN - len_prefix, format, args); + int len_prefix = + snprintf(s_ggml_qnn_log_internal_buf, GGML_QNN_LOGBUF_LEN, + "[%s, %d]: ", func, line); + int len = vsnprintf(s_ggml_qnn_log_internal_buf + len_prefix, + GGML_QNN_LOGBUF_LEN - len_prefix, format, args); if (len < (GGML_QNN_LOGBUF_LEN - len_prefix)) { #if (defined __ANDROID__) || (defined ANDROID) - //for Android APK + // for Android APK __android_log_print(level, "ggml-qnn", "%s\n", s_ggml_qnn_log_internal_buf); #endif - //for Android command line application or WoA + // for Android command line application or WoA printf("%s\n", s_ggml_qnn_log_internal_buf); } va_end(args); } } - // ================================================================================================= // -// wrapper class of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK +// wrapper class of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI +// Engine Direct) SDK // // ================================================================================================= class qnn_interface { -#define DEFINE_SHIM_FUNCTION_INTERFACE(F, pointer_name) \ - template \ - inline auto qnn_##F(Args... args) const { \ - return (_qnn_interface->QNN_INTERFACE_VER_NAME.pointer_name)( \ - std::forward(args)...); \ - } - +#define DEFINE_SHIM_FUNCTION_INTERFACE(F, pointer_name) \ + template inline auto qnn_##F(Args... args) const { \ + return (_qnn_interface->QNN_INTERFACE_VER_NAME.pointer_name)( \ + std::forward(args)...); \ + } -#define DEFINE_SHIM_FUNCTION_SYS_INTERFACE(F, pointer_name) \ - template \ - inline auto qnn_##F(Args... args) const { \ - return (_qnn_sys_interface->QNN_SYSTEM_INTERFACE_VER_NAME.pointer_name)( \ - std::forward(args)...); \ - } +#define DEFINE_SHIM_FUNCTION_SYS_INTERFACE(F, pointer_name) \ + template inline auto qnn_##F(Args... 
args) const { \ + return ( \ + _qnn_sys_interface->QNN_SYSTEM_INTERFACE_VER_NAME.pointer_name)( \ + std::forward(args)...); \ + } friend class qnn_instance; -public: + public: qnn_interface() = default; // QnnBackend @@ -603,31 +606,38 @@ class qnn_interface { DEFINE_SHIM_FUNCTION_INTERFACE(backend_free, backendFree); - DEFINE_SHIM_FUNCTION_INTERFACE(backend_register_op_package, backendRegisterOpPackage); + DEFINE_SHIM_FUNCTION_INTERFACE(backend_register_op_package, + backendRegisterOpPackage); - DEFINE_SHIM_FUNCTION_INTERFACE(backend_validate_op_config, backendValidateOpConfig); + DEFINE_SHIM_FUNCTION_INTERFACE(backend_validate_op_config, + backendValidateOpConfig); - DEFINE_SHIM_FUNCTION_INTERFACE(backend_get_api_version, backendGetApiVersion); + DEFINE_SHIM_FUNCTION_INTERFACE(backend_get_api_version, + backendGetApiVersion); // QnnDevice DEFINE_SHIM_FUNCTION_INTERFACE(device_create, deviceCreate); DEFINE_SHIM_FUNCTION_INTERFACE(device_free, deviceFree); - DEFINE_SHIM_FUNCTION_INTERFACE(device_get_infrastructure, deviceGetInfrastructure); + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_infrastructure, + deviceGetInfrastructure); - DEFINE_SHIM_FUNCTION_INTERFACE(device_get_platform_info, deviceGetPlatformInfo); + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_platform_info, + deviceGetPlatformInfo); DEFINE_SHIM_FUNCTION_INTERFACE(device_get_info, deviceGetInfo); // QnnContext DEFINE_SHIM_FUNCTION_INTERFACE(context_create, contextCreate); - DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary_size, contextGetBinarySize); + DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary_size, + contextGetBinarySize); DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary, contextGetBinary); - DEFINE_SHIM_FUNCTION_INTERFACE(context_create_from_binary, contextCreateFromBinary); + DEFINE_SHIM_FUNCTION_INTERFACE(context_create_from_binary, + contextCreateFromBinary); DEFINE_SHIM_FUNCTION_INTERFACE(context_free, contextFree); @@ -666,17 +676,22 @@ class qnn_interface { DEFINE_SHIM_FUNCTION_INTERFACE(mem_de_register, memDeRegister); // QnnProperty - DEFINE_SHIM_FUNCTION_INTERFACE(property_has_capability, propertyHasCapability); + DEFINE_SHIM_FUNCTION_INTERFACE(property_has_capability, + propertyHasCapability); // QnnTensor - DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_context_tensor, tensorCreateContextTensor); + DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_context_tensor, + tensorCreateContextTensor); - DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_graph_tensor, tensorCreateGraphTensor); + DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_graph_tensor, + tensorCreateGraphTensor); // QnnSystem - DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_create, systemContextCreate); + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_create, + systemContextCreate); - DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_get_binary_info, systemContextGetBinaryInfo); + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_get_binary_info, + systemContextGetBinaryInfo); DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_free, systemContextFree); @@ -684,67 +699,60 @@ class qnn_interface { _qnn_interface = qnn_interface; } - void set_qnn_system_interface(const QnnSystemInterface_t * qnn_sys_interface) { + void set_qnn_system_interface( + const QnnSystemInterface_t * qnn_sys_interface) { _qnn_sys_interface = qnn_sys_interface; } - uint32_t get_backend_id() const { - return _qnn_interface->backendId; - } + uint32_t get_backend_id() const { return _qnn_interface->backendId; } bool is_loaded() const { return ((_qnn_sys_interface != nullptr) && 
(_qnn_interface != nullptr)); } -private: + private: const QnnInterface_t * _qnn_interface = nullptr; const QnnSystemInterface_t * _qnn_sys_interface = nullptr; }; - - // ================================================================================================= // // wrapper class of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK // -// and -// -// resource management of QNN resources for GGML's QNN backend // ================================================================================================= class qnn_instance { -public: + public: using BackendIdType = decltype(QnnInterface_t{}.backendId); - explicit qnn_instance(const std::string & lib_path, const std::string & backend_name, - const std::string & model_name) : - _lib_path(std::move(lib_path)), - _backend_name(std::move(backend_name)), - _model_name(std::move(model_name)) {}; + explicit qnn_instance(const std::string & lib_path, + const std::string & backend_name, + const std::string & model_name) + : _lib_path(std::move(lib_path)) + , _backend_name(std::move(backend_name)) + , _model_name(std::move(model_name)){}; - ~qnn_instance() { - } + ~qnn_instance() {} int qnn_init(const QnnSaver_Config_t ** saver_config); int qnn_finalize(); - const qnn_interface &get_qnn_interface() { + const qnn_interface & get_qnn_interface() { if (!_qnn_interface.is_loaded()) { QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); } return _qnn_interface; } - - const QNN_INTERFACE_VER_TYPE &get_qnn_raw_interface() { + const QNN_INTERFACE_VER_TYPE & get_qnn_raw_interface() { if (!_qnn_interface.is_loaded()) { QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); } return _qnn_raw_interface; } - const QNN_SYSTEM_INTERFACE_VER_TYPE &get_qnn_raw_system_interface() { + const QNN_SYSTEM_INTERFACE_VER_TYPE & get_qnn_raw_system_interface() { if (!_qnn_interface.is_loaded()) { QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); } @@ -753,24 +761,31 @@ class qnn_instance { const Qnn_LogHandle_t get_qnn_log_handle() { return _qnn_log_handle; } - const Qnn_ProfileHandle_t get_qnn_profile_handle() { return _qnn_profile_handle; } + const Qnn_ProfileHandle_t get_qnn_profile_handle() { + return _qnn_profile_handle; + } - const Qnn_DeviceHandle_t get_qnn_device_handle() { return _qnn_device_handle; } + const Qnn_DeviceHandle_t get_qnn_device_handle() { + return _qnn_device_handle; + } - const Qnn_BackendHandle_t get_qnn_backend_handle() { return _qnn_backend_handle; } + const Qnn_BackendHandle_t get_qnn_backend_handle() { + return _qnn_backend_handle; + } - const Qnn_ContextHandle_t get_qnn_context_handle() { return _qnn_context_handle; } + const Qnn_ContextHandle_t get_qnn_context_handle() { + return _qnn_context_handle; + } - const QnnSystemContext_Handle_t get_qnn_system_handle() { return _qnn_system_handle; } + const QnnSystemContext_Handle_t get_qnn_system_handle() { + return _qnn_system_handle; + } const Qnn_GraphHandle_t get_qnn_graph_handle() { return _qnn_graph_handle; } - - int init_qnn_graph(const char * graph_name, - bool debug, - uint8_t do_node_validation = 1, - const QnnGraph_Config_t ** graph_configs = nullptr - ); + int init_qnn_graph(const char * graph_name, bool debug, + uint8_t do_node_validation = 1, + const QnnGraph_Config_t ** graph_configs = nullptr); int finalize_qnn_graph(); @@ -782,35 +797,35 @@ class qnn_instance { return 1; } - QnnHtpDevice_Infrastructure_t *htp_infra = static_cast(device_infra); - QnnHtpDevice_PerfInfrastructure_t *htp_perfinfra = &htp_infra->perfInfra; + 
QnnHtpDevice_Infrastructure_t * htp_infra = static_cast(device_infra); + QnnHtpDevice_PerfInfrastructure_t * htp_perfinfra = &htp_infra->perfInfra; uint32_t power_configid = 1; - uint32_t device_id = 0; - uint32_t core_id = 0; + uint32_t device_id = 0; + uint32_t core_id = 0; htp_perfinfra->createPowerConfigId(device_id, core_id, &power_configid); - _qnn_htp_perfinfra = htp_perfinfra; + _qnn_htp_perfinfra = htp_perfinfra; _qnn_power_configid = power_configid; return 0; } - int set_rpc_polling() { if (_qnn_rpc_pollingtime > 0) { QnnHtpPerfInfrastructure_PowerConfig_t rpc_pollingTime; memset(&rpc_pollingTime, 0, sizeof(rpc_pollingTime)); rpc_pollingTime.option = - QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; + QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; rpc_pollingTime.rpcPollingTimeConfig = _qnn_rpc_pollingtime; - const QnnHtpPerfInfrastructure_PowerConfig_t *powerConfigs[] = {&rpc_pollingTime, nullptr}; + const QnnHtpPerfInfrastructure_PowerConfig_t * powerConfigs[] = { + &rpc_pollingTime, nullptr}; if (_qnn_htp_perfinfra) { - _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, powerConfigs); + _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, + powerConfigs); } } return 0; } - int set_high_performance_mode() { if (nullptr == _qnn_htp_perfinfra) { QNN_LOG_DEBUG("perf intra is null\n"); @@ -820,39 +835,49 @@ class qnn_instance { QnnHtpPerfInfrastructure_PowerConfig_t powerConfig; memset(&powerConfig, 0, sizeof(powerConfig)); powerConfig.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3; - powerConfig.dcvsV3Config.dcvsEnable = 0; + powerConfig.dcvsV3Config.dcvsEnable = 0; powerConfig.dcvsV3Config.setDcvsEnable = 1; - powerConfig.dcvsV3Config.contextId = _qnn_power_configid; + powerConfig.dcvsV3Config.contextId = _qnn_power_configid; powerConfig.dcvsV3Config.powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE; - powerConfig.dcvsV3Config.setSleepLatency = 1; // True to consider Latency parameter otherwise False - powerConfig.dcvsV3Config.setBusParams = 1; // True to consider Bus parameter otherwise False - powerConfig.dcvsV3Config.setCoreParams = 1; // True to consider Core parameter otherwise False - powerConfig.dcvsV3Config.sleepDisable = 0; // True to consider sleep/LPM modes, False to enable - powerConfig.dcvsV3Config.setSleepDisable = 0; // True to consider sleep disable/enable parameter otherwise False - // set Sleep latency parameter + powerConfig.dcvsV3Config.setSleepLatency = + 1; // true to consider Latency parameter otherwise False + powerConfig.dcvsV3Config.setBusParams = + 1; // true to consider Bus parameter otherwise False + powerConfig.dcvsV3Config.setCoreParams = + 1; // true to consider Core parameter otherwise False + powerConfig.dcvsV3Config.sleepDisable = + 0; // true to consider sleep/LPM modes, False to enable + powerConfig.dcvsV3Config.setSleepDisable = + 0; // true to consider sleep disable/enable parameter otherwise False set sleep latency parameter uint32_t latencyValue = 40; - powerConfig.dcvsV3Config.sleepLatency = latencyValue; // range 40-2000 micro sec - // set Bus Clock Parameters (refer QnnHtpPerfInfrastructure_VoltageCorner_t enum) - powerConfig.dcvsV3Config.busVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - powerConfig.dcvsV3Config.busVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - powerConfig.dcvsV3Config.busVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - // set Core Clock Parameters (refer QnnHtpPerfInfrastructure_VoltageCorner_t 
enum) - powerConfig.dcvsV3Config.coreVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - powerConfig.dcvsV3Config.coreVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - powerConfig.dcvsV3Config.coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + powerConfig.dcvsV3Config.sleepLatency = + latencyValue; // range 40-2000 micro sec + // set Bus Clock Parameters + powerConfig.dcvsV3Config.busVoltageCornerMin = + DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + powerConfig.dcvsV3Config.busVoltageCornerTarget = + DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + powerConfig.dcvsV3Config.busVoltageCornerMax = + DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + // set Core Clock Parameters + powerConfig.dcvsV3Config.coreVoltageCornerMin = + DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + powerConfig.dcvsV3Config.coreVoltageCornerTarget = + DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + powerConfig.dcvsV3Config.coreVoltageCornerMax = + DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; // set power config with different performance parameters - const QnnHtpPerfInfrastructure_PowerConfig_t *powerConfigs[] = {&powerConfig, nullptr}; + const QnnHtpPerfInfrastructure_PowerConfig_t *powerConfigs[] = { + &powerConfig, nullptr}; _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, powerConfigs); return 0; } - std::string & get_qnn_graph_name() { return _graph_name; } + std::string &get_qnn_graph_name() { return _graph_name; } - bool is_rpcmem_initialized() { - return _rpcmem_initialized; - } + bool is_rpcmem_initialized() { return _rpcmem_initialized; } void set_rpcmem_initialized(bool initialized) { _rpcmem_initialized = initialized; @@ -864,7 +889,7 @@ class qnn_instance { void unregister_rpcmem(); - void *alloc_rpcmem(size_t bytes, size_t alignment); + void * alloc_rpcmem(size_t bytes, size_t alignment); void free_rpcmem(void * buf); @@ -874,15 +899,17 @@ class qnn_instance { return _qnn_mem_set.count(handle) != 0U; } -public: - std::map> _qnn_graph_map; + public: + std::map> + _qnn_graph_map; -private: + private: int load_system(); int unload_system(); - int load_backend(std::string &lib_path, const QnnSaver_Config_t ** saver_config); + int load_backend(std::string & lib_path, const QnnSaver_Config_t ** saver_config); int unload_backend(); @@ -890,24 +917,25 @@ class qnn_instance { _qnn_raw_interface = raw_interface; } - void set_qnn_raw_system_interface(QNN_SYSTEM_INTERFACE_VER_TYPE &raw_interface) { + void set_qnn_raw_system_interface(QNN_SYSTEM_INTERFACE_VER_TYPE & raw_interface) { _qnn_raw_system_interface = raw_interface; } -private: + private: static constexpr const int _required_num_providers = 1; -private: - std::string _lib_path; - std::string _backend_name; - std::string _model_name; // prebuilt QNN model name, not used in currently + private: + std::string _lib_path; + std::string _backend_name; + std::string _model_name; // prebuilt QNN model name, not used currently BackendIdType _backend_id; - bool _debug_tensor = false; // flag to indicate if requested graph is to be run in debug mode - bool _do_node_validations = true; // flag to indicate whether all add_node calls need to be validated - QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; + bool _debug_tensor = false; // flag to indicate if requested graph is to be run in debug mode + bool _do_node_validations = true; // flag to indicate whether all add_node + // calls need to be validated + QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; - ggml_qnn_profile_level _profile_level = ggml_qnn_profile_level::profile_detail; + 
ggml_qnn_profile_level _profile_level = ggml_qnn_profile_level::profile_detail; qnn_interface _qnn_interface; @@ -927,36 +955,35 @@ class qnn_instance { QnnSystemContext_Handle_t _qnn_system_handle = nullptr; - QnnHtpDevice_PerfInfrastructure_t * _qnn_htp_perfinfra = nullptr; - uint32_t _qnn_power_configid = 1; + QnnHtpDevice_PerfInfrastructure_t * _qnn_htp_perfinfra = nullptr; + uint32_t _qnn_power_configid = 1; uint32_t _qnn_rpc_pollingtime = 9999; // 0-10000 us for high performing - QNN_INTERFACE_VER_TYPE _qnn_raw_interface; + QNN_INTERFACE_VER_TYPE _qnn_raw_interface; QNN_SYSTEM_INTERFACE_VER_TYPE _qnn_raw_system_interface; std::unordered_set _qnn_mem_set; - std::mutex _init_mutex; - std::unordered_map _loaded_lib_handle; + std::mutex _init_mutex; + std::unordered_map _loaded_lib_handle; std::unordered_map _lib_path_to_backend_id; std::unordered_map _loaded_backend; - void * _rpc_lib_handle = nullptr; - std::atomic_bool _rpcmem_initialized{false}; - pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; - pfn_rpc_mem_free _pfn_rpc_mem_free; - pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd; - pfn_rpc_mem_init _pfn_rpc_mem_init; - pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; + void * _rpc_lib_handle = nullptr; + std::atomic_bool _rpcmem_initialized{false}; + pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; + pfn_rpc_mem_free _pfn_rpc_mem_free; + pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd; + pfn_rpc_mem_init _pfn_rpc_mem_init; + pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; std::unordered_map _rpcmem_store_map; std::string _graph_name; }; - // ================================================================================================= // -// implementation of wrapper class +// implementation of QNN wrapper class // // ================================================================================================= void * qnn_instance::alloc_rpcmem(size_t bytes, size_t alignment) { @@ -965,15 +992,18 @@ void * qnn_instance::alloc_rpcmem(size_t bytes, size_t alignment) { return nullptr; } - auto allocate_bytes = static_cast(bytes + alignment); - void * buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, allocate_bytes); + auto allocate_bytes = static_cast(bytes + alignment); + void * buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, + allocate_bytes); if (buf == nullptr) { QNN_LOG_WARN("failed to allocate rpc memory\n"); return nullptr; } - auto aligned_buf = reinterpret_cast(align_to(alignment,reinterpret_cast(buf))); - bool status = _rpcmem_store_map.insert(std::pair(aligned_buf, buf)).second; + auto aligned_buf = reinterpret_cast( + align_to(alignment, reinterpret_cast(buf))); + bool status = + _rpcmem_store_map.insert(std::pair(aligned_buf, buf)).second; if (!status) { QNN_LOG_WARN("failed to allocate rpc memory\n"); _pfn_rpc_mem_free(buf); @@ -982,7 +1012,6 @@ void * qnn_instance::alloc_rpcmem(size_t bytes, size_t alignment) { return aligned_buf; } - void qnn_instance::free_rpcmem(void * buf) { if (!_rpcmem_initialized) { QNN_LOG_WARN("rpc memory not initialized\n"); @@ -994,7 +1023,6 @@ void qnn_instance::free_rpcmem(void * buf) { } } - int32_t qnn_instance::rpcmem_to_fd(void * buf) { int32_t mem_fd = -1; if (!is_rpcmem_initialized()) { @@ -1006,7 +1034,6 @@ int32_t qnn_instance::rpcmem_to_fd(void * buf) { return mem_fd; } - int qnn_instance::register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor) { if (nullptr == p_data || (nullptr == p_tensor)) { QNN_LOG_WARN("invalid param\n"); @@ -1020,10 +1047,11 @@ int qnn_instance::register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor) { if 
(is_rpcmem_allocated(p_data)) { QNN_LOG_WARN("rpc memory already allocated\n"); - //return 3; + // return 3; } if (is_rpcmem_registered((QNN_VER_PTR(*p_tensor)->memHandle))) { - QNN_LOG_WARN("tensor %s has been registered shared memory\n", (QNN_VER_PTR(*p_tensor)->name)); + QNN_LOG_WARN("tensor %s has been registered shared memory\n", + (QNN_VER_PTR(*p_tensor)->name)); return 4; } @@ -1033,24 +1061,23 @@ int qnn_instance::register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor) { return 5; } QNN_LOG_DEBUG("mem_fd %d\n", mem_fd); - Qnn_MemDescriptor_t descriptor = { - {QNN_VER_PTR(*p_tensor)->rank, QNN_VER_PTR(*p_tensor)->dimensions, nullptr}, - QNN_VER_PTR(*p_tensor)->dataType, - QNN_MEM_TYPE_ION, - {{mem_fd}}}; - Qnn_MemHandle_t handle = nullptr; - int error = QNN_SUCCESS; - error = _qnn_interface.qnn_mem_register( - _qnn_context_handle, - &descriptor, - /*numDescriptors=*/1, - &handle); + Qnn_MemDescriptor_t descriptor = {{QNN_VER_PTR(*p_tensor)->rank, + QNN_VER_PTR(*p_tensor)->dimensions, + nullptr}, + QNN_VER_PTR(*p_tensor)->dataType, + QNN_MEM_TYPE_ION, + {{mem_fd}}}; + Qnn_MemHandle_t handle = nullptr; + int error = QNN_SUCCESS; + error = _qnn_interface.qnn_mem_register(_qnn_context_handle, &descriptor, + /*numDescriptors=*/1, &handle); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to register shared memory, error %d, %s\n", QNN_GET_ERROR_CODE(error), - strerror(error)); + QNN_LOG_WARN("failed to register shared memory, error %d, %s\n", + QNN_GET_ERROR_CODE(error), strerror(error)); return 6; } else { - QNN_LOG_INFO("tensor %s successfully register shared memory\n", (QNN_VER_PTR(*p_tensor)->name)); + QNN_LOG_INFO("tensor %s successfully register shared memory\n", + (QNN_VER_PTR(*p_tensor)->name)); } QNN_VER_PTR(*p_tensor)->memHandle = handle; _qnn_mem_set.insert(handle); @@ -1058,7 +1085,6 @@ int qnn_instance::register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor) { return 0; } - void qnn_instance::unregister_rpcmem() { Qnn_ErrorHandle_t error = QNN_SUCCESS; @@ -1069,47 +1095,49 @@ void qnn_instance::unregister_rpcmem() { for (auto &mem_handle : _qnn_mem_set) { error = _qnn_interface.qnn_mem_de_register(&mem_handle, 1); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to unregister shared memory, error %d\n", QNN_GET_ERROR_CODE(error)); + QNN_LOG_WARN("failed to unregister shared memory, error %d\n", + QNN_GET_ERROR_CODE(error)); } } _qnn_mem_set.clear(); } - bool qnn_instance::is_rpcmem_allocated(void * buf) { return _rpcmem_store_map.count(buf) != 0U; } - -int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t ** saver_config) { +int qnn_instance::load_backend(std::string & lib_path, + const QnnSaver_Config_t ** saver_config) { Qnn_ErrorHandle_t error = QNN_SUCCESS; QNN_LOG_DEBUG("lib_path:%s\n", lib_path.c_str()); void * lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL); if (nullptr == lib_handle) { - QNN_LOG_WARN("can not open QNN library %s, with error: %s", lib_path.c_str(), dlerror()); + QNN_LOG_WARN("can not open QNN library %s, with error: %s", + lib_path.c_str(), dlerror()); return 1; } - // load get_provider function - auto get_providers = load_qnn_functionpointers<_pfn_QnnInterface_getProviders *>(lib_handle, - "QnnInterface_getProviders"); + auto get_providers = load_qnn_functionpointers<_pfn_QnnInterface_getProviders *>( + lib_handle, "QnnInterface_getProviders"); if (nullptr == get_providers) { - QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", dlerror()); + QNN_LOG_WARN("can not load symbol 
QnnInterface_getProviders : %s", + dlerror()); return 2; } - // get QnnInterface Providers std::uint32_t num_providers = 0; const QnnInterface_t ** provider_list = nullptr; error = get_providers(&provider_list, &num_providers); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error)); + QNN_LOG_WARN("failed to get providers, error %d", + QNN_GET_ERROR_CODE(error)); return 3; } QNN_LOG_DEBUG("num_providers=%d\n", num_providers); if (num_providers != _required_num_providers) { - QNN_LOG_WARN("providers is %d instead of required %d", num_providers, _required_num_providers); + QNN_LOG_WARN("providers is %d instead of required %d", num_providers, + _required_num_providers); return 4; } @@ -1120,10 +1148,12 @@ int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t * bool found_valid_interface = false; QNN_INTERFACE_VER_TYPE qnn_interface; for (size_t idx = 0; idx < num_providers; idx++) { - if (QNN_API_VERSION_MAJOR == provider_list[idx]->apiVersion.coreApiVersion.major && - QNN_API_VERSION_MINOR <= provider_list[idx]->apiVersion.coreApiVersion.minor) { + if (QNN_API_VERSION_MAJOR == + provider_list[idx]->apiVersion.coreApiVersion.major && + QNN_API_VERSION_MINOR <= + provider_list[idx]->apiVersion.coreApiVersion.minor) { found_valid_interface = true; - qnn_interface = provider_list[idx]->QNN_INTERFACE_VER_NAME; + qnn_interface = provider_list[idx]->QNN_INTERFACE_VER_NAME; break; } } @@ -1136,33 +1166,34 @@ int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t * } set_qnn_raw_interface(qnn_interface); - BackendIdType backend_id = provider_list[0]->backendId; + BackendIdType backend_id = provider_list[0]->backendId; _lib_path_to_backend_id[lib_path] = backend_id; if (_loaded_backend.count(backend_id) > 0) { QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists\n", - lib_path.c_str(), backend_id); + lib_path.c_str(), backend_id); } _loaded_backend[backend_id] = provider_list[0]; if (_loaded_lib_handle.count(backend_id) > 0) { QNN_LOG_WARN("closing %p\n", _loaded_lib_handle[backend_id]); int dlclose_error = dlclose(_loaded_lib_handle[backend_id]); if (dlclose_error != 0) { - QNN_LOG_WARN("fail to close %p with error %s\n", _loaded_lib_handle[backend_id], dlerror()); + QNN_LOG_WARN("fail to close %p with error %s\n", + _loaded_lib_handle[backend_id], dlerror()); } } _loaded_lib_handle[backend_id] = lib_handle; - _backend_id = backend_id; + _backend_id = backend_id; return 0; } - int qnn_instance::unload_backend() { int dlclose_error = 0; - for (auto &it : _loaded_lib_handle) { + for (auto & it : _loaded_lib_handle) { dlclose_error = dlclose(it.second); if (dlclose_error != 0) { - QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, dlerror()); + QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, + dlerror()); } } @@ -1173,7 +1204,6 @@ int qnn_instance::unload_backend() { return 0; } - int qnn_instance::load_system() { Qnn_ErrorHandle_t error = QNN_SUCCESS; @@ -1182,14 +1212,18 @@ int qnn_instance::load_system() { _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); if (nullptr == _system_lib_handle) { - QNN_LOG_WARN("can not open QNN library %s, error: %s\n", system_lib_path.c_str(), dlerror()); + QNN_LOG_WARN("can not open QNN library %s, error: %s\n", + system_lib_path.c_str(), dlerror()); return 1; } - auto * get_providers = reinterpret_cast<_pfn_QnnSystemInterface_getProviders *>(dlsym( - _system_lib_handle, 
"QnnSystemInterface_getProviders")); + auto * get_providers = + reinterpret_cast<_pfn_QnnSystemInterface_getProviders *>( + dlsym(_system_lib_handle, "QnnSystemInterface_getProviders")); if (nullptr == get_providers) { - QNN_LOG_WARN("can not load QNN symbol QnnSystemInterface_getProviders: %s\n", dlerror()); + QNN_LOG_WARN( + "can not load QNN symbol QnnSystemInterface_getProviders: %s\n", + dlerror()); return 2; } @@ -1197,12 +1231,14 @@ int qnn_instance::load_system() { const QnnSystemInterface_t ** provider_list = nullptr; error = get_providers(&provider_list, &num_providers); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to get providers, error %d\n", QNN_GET_ERROR_CODE(error)); + QNN_LOG_WARN("failed to get providers, error %d\n", + QNN_GET_ERROR_CODE(error)); return 3; } if (num_providers != _required_num_providers) { - QNN_LOG_WARN("providers is %d instead of required %d\n", num_providers, _required_num_providers); + QNN_LOG_WARN("providers is %d instead of required %d\n", num_providers, + _required_num_providers); return 4; } @@ -1215,11 +1251,12 @@ int qnn_instance::load_system() { bool found_valid_system_interface = false; for (size_t idx = 0; idx < num_providers; idx++) { if (QNN_SYSTEM_API_VERSION_MAJOR == - provider_list[idx]->systemApiVersion.major && + provider_list[idx]->systemApiVersion.major && QNN_SYSTEM_API_VERSION_MINOR <= - provider_list[idx]->systemApiVersion.minor) { + provider_list[idx]->systemApiVersion.minor) { found_valid_system_interface = true; - qnn_system_interface = provider_list[idx]->QNN_SYSTEM_INTERFACE_VER_NAME; + qnn_system_interface = + provider_list[idx]->QNN_SYSTEM_INTERFACE_VER_NAME; break; } } @@ -1243,7 +1280,6 @@ int qnn_instance::load_system() { return 0; } - int qnn_instance::unload_system() { int result = 0; @@ -1262,7 +1298,8 @@ int qnn_instance::unload_system() { int dlclose_error = dlclose(_system_lib_handle); if (dlclose_error != 0) { - QNN_LOG_WARN("failed to close QnnSystem library, error %s\n", dlerror()); + QNN_LOG_WARN("failed to close QnnSystem library, error %s\n", + dlerror()); return 2; } @@ -1271,36 +1308,33 @@ int qnn_instance::unload_system() { return result; } +static void ggml_qnn_logcallback(const char * fmt, QnnLog_Level_t level, + uint64_t timestamp, va_list argp) { -static void ggml_qnn_logcallback(const char * fmt, - QnnLog_Level_t level, - uint64_t timestamp, - va_list argp) { - -#if GGML_QNN_DEBUG - static std::mutex log_mutex; +#if ENABLE_QNN_LOG + static std::mutex log_mutex; static unsigned char s_ggml_qnn_logbuf[GGML_QNN_LOGBUF_LEN]; const char * log_level_desc = ""; switch (level) { - case QNN_LOG_LEVEL_ERROR: - log_level_desc = " ERROR "; - break; - case QNN_LOG_LEVEL_WARN: - log_level_desc = "WARNING"; - break; - case QNN_LOG_LEVEL_INFO: - log_level_desc = " INFO "; - break; - case QNN_LOG_LEVEL_DEBUG: - log_level_desc = " DEBUG "; - break; - case QNN_LOG_LEVEL_VERBOSE: - log_level_desc = "VERBOSE"; - break; - case QNN_LOG_LEVEL_MAX: - log_level_desc = "UNKNOWN"; - break; + case QNN_LOG_LEVEL_ERROR: + log_level_desc = "ERROR"; + break; + case QNN_LOG_LEVEL_WARN: + log_level_desc = "WARNING"; + break; + case QNN_LOG_LEVEL_INFO: + log_level_desc = "INFO"; + break; + case QNN_LOG_LEVEL_DEBUG: + log_level_desc = "DEBUG"; + break; + case QNN_LOG_LEVEL_VERBOSE: + log_level_desc = "VERBOSE"; + break; + case QNN_LOG_LEVEL_MAX: + log_level_desc = "UNKNOWN"; + break; } double ms = (double) timestamp / 1000000.0; @@ -1314,12 +1348,11 @@ static void ggml_qnn_logcallback(const char * fmt, #endif } - int 
qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { BackendIdType backend_id = QNN_BACKEND_ID_NULL; QNN_LOG_DEBUG("enter qni_init\n"); - const std::lock_guard lock(_init_mutex); + std::lock_guard lock(_init_mutex); if (0 != load_system()) { QNN_LOG_WARN("can not load QNN system lib, pls check why?\n"); @@ -1328,39 +1361,43 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { QNN_LOG_DEBUG("load QNN system lib successfully\n"); } - std::string bakend_lib_path = _lib_path + _backend_name; - if (0 == _lib_path_to_backend_id.count(bakend_lib_path)) { - int is_load_ok = load_backend(bakend_lib_path, saver_config); + std::string backend_lib_path = _lib_path + _backend_name; + if (0 == _lib_path_to_backend_id.count(backend_lib_path)) { + int is_load_ok = load_backend(backend_lib_path, saver_config); if (0 != is_load_ok) { QNN_LOG_WARN("failed to load QNN backend\n"); return 2; } } - backend_id = _lib_path_to_backend_id[bakend_lib_path]; + backend_id = _lib_path_to_backend_id[backend_lib_path]; if (0 == _loaded_backend.count(backend_id) || 0 == _loaded_lib_handle.count(backend_id)) { - QNN_LOG_WARN("library %s is loaded but loaded backend count=%zu, loaded lib_handle count=%zu\n", - bakend_lib_path.c_str(), - _loaded_backend.count(backend_id), - _loaded_lib_handle.count(backend_id)); + QNN_LOG_WARN("library %s is loaded but loaded backend count=%zu, " + "loaded lib_handle count=%zu\n", + backend_lib_path.c_str(), _loaded_backend.count(backend_id), + _loaded_lib_handle.count(backend_id)); return 3; } _qnn_interface.set_qnn_interface(_loaded_backend[backend_id]); - _qnn_interface.qnn_log_create(ggml_qnn_logcallback, _qnn_log_level, &_qnn_log_handle); + _qnn_interface.qnn_log_create(ggml_qnn_logcallback, _qnn_log_level, + &_qnn_log_handle); if (nullptr == _qnn_log_handle) { - QNN_LOG_WARN("why failed to initialize qnn log\n"); //NPU backend not work on Qualcomm SoC based low-end phone + QNN_LOG_WARN( + "why failed to initialize qnn log\n"); // NPU backend not work on + // Qualcomm SoC equipped low-end phone return 4; } else { QNN_LOG_DEBUG("initialize qnn log successfully\n"); } std::vector temp_backend_config; - _qnn_interface.qnn_backend_create(_qnn_log_handle, temp_backend_config.empty() ? nullptr - : temp_backend_config.data(), - &_qnn_backend_handle); + _qnn_interface.qnn_backend_create( + _qnn_log_handle, + temp_backend_config.empty() ? 
nullptr : temp_backend_config.data(), + &_qnn_backend_handle); if (nullptr == _qnn_backend_handle) { QNN_LOG_WARN("why failed to initialize qnn backend\n"); return 5; @@ -1369,7 +1406,8 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { } if (nullptr != _qnn_raw_interface.propertyHasCapability) { - auto qnnStatus = _qnn_raw_interface.propertyHasCapability(QNN_PROPERTY_GROUP_DEVICE); + auto qnnStatus = + _qnn_raw_interface.propertyHasCapability(QNN_PROPERTY_GROUP_DEVICE); if (QNN_PROPERTY_NOT_SUPPORTED == qnnStatus) { QNN_LOG_WARN("device property is not supported\n"); } @@ -1378,8 +1416,10 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { } } - auto qnnStatus = _qnn_raw_interface.deviceCreate(_qnn_log_handle, nullptr, &_qnn_device_handle); - if (QNN_SUCCESS != qnnStatus && QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnnStatus) { + Qnn_ErrorHandle_t qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, nullptr, + &_qnn_device_handle); + if (QNN_SUCCESS != qnn_status && + QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnn_status) { QNN_LOG_WARN("failed to create QNN device\n"); } else { QNN_LOG_INFO("create device successfully\n"); @@ -1389,8 +1429,10 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { QNN_LOG_INFO("profiling turned on; level = %d", _profile_level); if (ggml_qnn_profile_level::profile_basic == _profile_level) { QNN_LOG_INFO("basic profiling requested. creating Qnn Profile object\n"); - if (QNN_PROFILE_NO_ERROR != _qnn_raw_interface.profileCreate( - _qnn_backend_handle, QNN_PROFILE_LEVEL_BASIC, &_qnn_profile_handle)) { + if (QNN_PROFILE_NO_ERROR != + _qnn_raw_interface.profileCreate(_qnn_backend_handle, + QNN_PROFILE_LEVEL_BASIC, + &_qnn_profile_handle)) { QNN_LOG_WARN("unable to create profile handle in the backend\n"); return 6; } else { @@ -1398,8 +1440,10 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { } } else if (ggml_qnn_profile_level::profile_detail == _profile_level) { QNN_LOG_INFO("detailed profiling requested. 
Creating Qnn Profile object\n"); - if (QNN_PROFILE_NO_ERROR != _qnn_raw_interface.profileCreate( - _qnn_backend_handle, QNN_PROFILE_LEVEL_DETAILED, &_qnn_profile_handle)) { + if (QNN_PROFILE_NO_ERROR != + _qnn_raw_interface.profileCreate(_qnn_backend_handle, + QNN_PROFILE_LEVEL_DETAILED, + &_qnn_profile_handle)) { QNN_LOG_WARN("unable to create profile handle in the backend\n"); return 7; } else { @@ -1416,26 +1460,32 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { QNN_LOG_DEBUG("load rpcmem lib successfully\n"); set_rpcmem_initialized(true); } - _pfn_rpc_mem_init = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_init")); - _pfn_rpc_mem_deinit = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_deinit")); - _pfn_rpc_mem_alloc = reinterpret_cast(dlsym(_rpc_lib_handle,"rpcmem_alloc")); - _pfn_rpc_mem_free = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_free")); - _pfn_rpc_mem_to_fd = reinterpret_cast(dlsym(_rpc_lib_handle,"rpcmem_to_fd")); - if (nullptr == _pfn_rpc_mem_alloc || nullptr == _pfn_rpc_mem_free - || nullptr == _pfn_rpc_mem_to_fd) { + _pfn_rpc_mem_init = reinterpret_cast( + dlsym(_rpc_lib_handle, "rpcmem_init")); + _pfn_rpc_mem_deinit = reinterpret_cast( + dlsym(_rpc_lib_handle, "rpcmem_deinit")); + _pfn_rpc_mem_alloc = reinterpret_cast( + dlsym(_rpc_lib_handle, "rpcmem_alloc")); + _pfn_rpc_mem_free = reinterpret_cast( + dlsym(_rpc_lib_handle, "rpcmem_free")); + _pfn_rpc_mem_to_fd = reinterpret_cast( + dlsym(_rpc_lib_handle, "rpcmem_to_fd")); + if (nullptr == _pfn_rpc_mem_alloc || nullptr == _pfn_rpc_mem_free || + nullptr == _pfn_rpc_mem_to_fd) { QNN_LOG_WARN("unable to access symbols in QNN RPC lib. dlerror(): %s", dlerror()); dlclose(_rpc_lib_handle); return 9; } - if (nullptr != _pfn_rpc_mem_init) // make Qualcomm's SoC based low-end phone happy + if (nullptr != + _pfn_rpc_mem_init) // make Qualcomm's SoC equipped low-end phone happy _pfn_rpc_mem_init(); std::vector temp_context_config; - _qnn_interface.qnn_context_create(_qnn_backend_handle, _qnn_device_handle, - temp_context_config.empty() ? nullptr - : temp_context_config.data(), - &_qnn_context_handle); + _qnn_interface.qnn_context_create( + _qnn_backend_handle, _qnn_device_handle, + temp_context_config.empty() ? 
nullptr : temp_context_config.data(), + &_qnn_context_handle); if (nullptr == _qnn_context_handle) { QNN_LOG_WARN("why failed to initialize qnn context\n"); return 10; @@ -1448,12 +1498,12 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { return 0; } - int qnn_instance::qnn_finalize() { - int ret_status = 0; - Qnn_ErrorHandle_t error = QNN_SUCCESS; + int ret_status = 0; + Qnn_ErrorHandle_t error = QNN_SUCCESS; - if (nullptr != _pfn_rpc_mem_deinit) // make Qualcomm's SoC based low-end phone happy + if (nullptr != + _pfn_rpc_mem_deinit) // make Qualcomm's SoC equipped low-end phone happy _pfn_rpc_mem_deinit(); if (dlclose(_rpc_lib_handle) != 0) { @@ -1463,11 +1513,12 @@ int qnn_instance::qnn_finalize() { } if (nullptr != _qnn_context_handle) { - error = _qnn_interface.qnn_context_free(_qnn_context_handle, _qnn_profile_handle); + error = _qnn_interface.qnn_context_free(_qnn_context_handle, + _qnn_profile_handle); if (error != QNN_SUCCESS) { QNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); - + _qnn_interface.get_backend_id(), + QNN_GET_ERROR_CODE(error)); } _qnn_context_handle = nullptr; } @@ -1476,8 +1527,8 @@ int qnn_instance::qnn_finalize() { error = _qnn_interface.qnn_profile_free(_qnn_profile_handle); if (error != QNN_SUCCESS) { QNN_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); - + _qnn_interface.get_backend_id(), + QNN_GET_ERROR_CODE(error)); } _qnn_profile_handle = nullptr; } @@ -1486,8 +1537,8 @@ int qnn_instance::qnn_finalize() { error = _qnn_interface.qnn_device_free(_qnn_device_handle); if (error != QNN_SUCCESS) { QNN_LOG_WARN("failed to free QNN device_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); - + _qnn_interface.get_backend_id(), + QNN_GET_ERROR_CODE(error)); } _qnn_device_handle = nullptr; } @@ -1496,17 +1547,18 @@ int qnn_instance::qnn_finalize() { error = _qnn_interface.qnn_backend_free(_qnn_backend_handle); if (error != QNN_SUCCESS) { QNN_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); + _qnn_interface.get_backend_id(), + QNN_GET_ERROR_CODE(error)); } _qnn_backend_handle = nullptr; - } if (nullptr != _qnn_log_handle) { error = _qnn_interface.qnn_log_free(_qnn_log_handle); if (error != QNN_SUCCESS) { QNN_LOG_WARN("failed to free QNN log_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); + _qnn_interface.get_backend_id(), + QNN_GET_ERROR_CODE(error)); } _qnn_log_handle = nullptr; } @@ -1518,9 +1570,9 @@ int qnn_instance::qnn_finalize() { return ret_status; } - -int qnn_instance::init_qnn_graph(const char * graph_name, bool debug, uint8_t do_node_validation, - const QnnGraph_Config_t ** graph_configs) { +int qnn_instance::init_qnn_graph(const char * graph_name, bool debug, + uint8_t do_node_validation, + const QnnGraph_Config_t ** graph_configs) { int result = 0; if (nullptr == graph_name) { @@ -1534,15 +1586,16 @@ int qnn_instance::init_qnn_graph(const char * graph_name, bool debug, uint8_t do } if (!do_node_validation) { - QNN_LOG_WARN("node validation disabled, backend will not perform op validation prior to adding node\n"); + QNN_LOG_WARN("node validation disabled, backend will not perform op " + "validation prior to adding node\n"); } - _graph_name = graph_name; - _debug_tensor = debug; + _graph_name = graph_name; + 
_debug_tensor = debug; _do_node_validations = do_node_validation; - result = _qnn_raw_interface.graphCreate(_qnn_context_handle, graph_name, graph_configs, - &_qnn_graph_handle); + result = _qnn_raw_interface.graphCreate(_qnn_context_handle, graph_name, + graph_configs, &_qnn_graph_handle); if (result != QNN_GRAPH_NO_ERROR || nullptr == _qnn_graph_handle) { QNN_LOG_WARN("failed to create graph in qnn context\n"); return 3; @@ -1553,13 +1606,12 @@ int qnn_instance::init_qnn_graph(const char * graph_name, bool debug, uint8_t do return 0; } - int qnn_instance::finalize_qnn_graph() { if (nullptr != _qnn_graph_handle) { - if (_qnn_raw_interface.graphFinalize(_qnn_graph_handle, _qnn_profile_handle, nullptr) != - QNN_GRAPH_NO_ERROR) { + if (_qnn_raw_interface.graphFinalize(_qnn_graph_handle, + _qnn_profile_handle, + nullptr) != QNN_GRAPH_NO_ERROR) { QNN_LOG_WARN("finalizing graph failure\n"); - //return 1; } } else { QNN_LOG_DEBUG("qnn graph handle is null\n"); @@ -1568,26 +1620,28 @@ int qnn_instance::finalize_qnn_graph() { return 0; } - - // ================================================================================================= // // implementation of GGML's QNN backend // // ================================================================================================= -static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor, bool b_dump_tensor_info) { - if (nullptr == tensor) - return false; - if (b_dump_tensor_info) { - QNN_LOG_DEBUG("op name:%s, tensor type:%s", ggml_op_name(tensor->op), - ggml_type_name(tensor->type)); - } - //only support the following 3 OPs currently and ensure tensor->src[0] and tensor->src[1] is not nullptr - bool supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL) || (tensor->op == GGML_OP_MUL_MAT)); +static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context *ctx, + const struct ggml_tensor *tensor, + bool b_dump_tensor_info) { + // only support the following 3 OPs currently + // provide a GENERAL approach could fix this problem in a standalone PR of refine ggml backend + // subsystem for mixed inference between CPU&GPU / CPU&NPU easily for ANY ggml backends + // which the backend's ggml_backend_xxx_buffer_is_host return true. + // this approach could be found: + // https://github.com/ggerganov/llama.cpp/pull/7641 + // + // ensure tensor->src[0] and tensor->src[1] is not nullptr. 
+ bool supported_op = + ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL) || + (tensor->op == GGML_OP_MUL_MAT)); if (!supported_op) { return false; } - const struct ggml_tensor * src0 = tensor->src[0]; const struct ggml_tensor * src1 = tensor->src[1]; @@ -1597,87 +1651,114 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor, bool b_dum const int64_t ne10 = src1->ne[0]; const int64_t ne11 = src1->ne[1]; - const int64_t ne0 = tensor->ne[0]; - const int64_t ne1 = tensor->ne[1]; + const int64_t ne20 = tensor->ne[0]; + const int64_t ne21 = tensor->ne[1]; - GGML_UNUSED(ne0); - GGML_UNUSED(ne1); + //TODO: support other quatinized data type + if (ggml_is_quantized(src0->type) && (src0->type != GGML_TYPE_Q8_0)) { + return false; + } if (b_dump_tensor_info) { - QNN_LOG_DEBUG("src0 type:%s", ggml_type_name(tensor->src[0]->type)); - QNN_LOG_DEBUG("src1 type:%s", ggml_type_name(tensor->src[1]->type)); - if (tensor->op == GGML_OP_MUL_MAT) { - QNN_LOG_DEBUG("GGML_OP_MUL_MAT"); - QNN_LOG_DEBUG( - "src0 %15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, - src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], - src0->nb[0], src0->nb[1], src0->nb[2]); - QNN_LOG_DEBUG( - "src1 %15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, - src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], - src1->nb[0], src1->nb[1], src1->nb[2]); - QNN_LOG_DEBUG( - " %15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - tensor->name, - tensor->type, ggml_type_name(tensor->type), tensor->ne[0], tensor->ne[1], tensor->ne[2], - tensor->nb[0], - tensor->nb[1], tensor->nb[2]); - - } - } - - if (ggml_is_empty(tensor) || tensor->op == GGML_OP_RESHAPE || tensor->op == GGML_OP_TRANSPOSE || tensor->op == GGML_OP_VIEW || tensor->op == GGML_OP_PERMUTE || tensor->op == GGML_OP_NONE) { + QNN_LOG_DEBUG("GGML_OP_MUL_MAT"); + QNN_LOG_DEBUG("op name:%s, tensor type:%s", + ggml_op_name(tensor->op), + ggml_type_name(tensor->type)); + QNN_LOG_DEBUG("src0 type:%s", ggml_type_name(tensor->src[0]->type)); + QNN_LOG_DEBUG("src1 type:%s", ggml_type_name(tensor->src[1]->type)); + QNN_LOG_DEBUG("src0 %15s: type = %i (%5s) ne = %5" PRIi64 + " x %5" PRIi64 " x %5" PRIi64 + ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, src0->type, ggml_type_name(src0->type), + src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], + src0->nb[1], src0->nb[2]); + QNN_LOG_DEBUG("src1 %15s: type = %i (%5s) ne = %5" PRIi64 + " x %5" PRIi64 " x %5" PRIi64 + ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, src1->type, ggml_type_name(src1->type), + src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], + src1->nb[1], src1->nb[2]); + QNN_LOG_DEBUG( + " %15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + tensor->name, tensor->type, ggml_type_name(tensor->type), + tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->nb[0], + tensor->nb[1], tensor->nb[2]); + } + } + + if (ggml_is_empty(tensor) || tensor->op == GGML_OP_RESHAPE || + tensor->op == GGML_OP_TRANSPOSE || tensor->op == GGML_OP_VIEW || + tensor->op == GGML_OP_PERMUTE || tensor->op == GGML_OP_NONE) { return false; } - //make ggml_get_tensor_rank and QNN SDK happy + // make ggml_get_tensor_rank and QNN SDK happy if ((ne00 <= 1 || ne01 <= 1 || ne10 <= 1 || ne11 <= 1)) { return false; } - // GPU/NPU inference will slower then CPU inference when 
tensor->ne[1] < min batch size - if (tensor->ne[1] < 32) { + if ((ne20 < 32) || (ne21 < 32) || (ne10 < 32)) { return false; } int qtype = src0->type; - return (qtype == GGML_TYPE_F32 || qtype == GGML_TYPE_F16 || qtype == GGML_TYPE_Q8_0) - && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); - -} + if (tensor->op == GGML_OP_ADD) { + return (qtype == GGML_TYPE_F32 || qtype == GGML_TYPE_F16 || + qtype == GGML_TYPE_Q8_0) && + (src1->type == GGML_TYPE_F32); + } + if (tensor->op == GGML_OP_MUL) { + return (qtype == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32); + } -static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - bool graph_initialized = false; - int64_t n_begin_time = 0LL; - int64_t n_end_time = 0LL; - int64_t n_duration = 0LL; + if (tensor->op == GGML_OP_MUL_MAT) { + if (ctx->device == QNN_BACKEND_GGML) { + return (ne00 == ne10) && (src1->ne[2] % src0->ne[2] == 0) && + (src1->ne[3] % src0->ne[3] == 0); + } + if ((ctx->device == QNN_BACKEND_NPU) && (qtype == GGML_TYPE_Q8_0) && + (src1->type == GGML_TYPE_F32) && (tensor->type == GGML_TYPE_F32)) { + return true; + } + if (ctx->device == QNN_BACKEND_CPU || ctx->device == QNN_BACKEND_GPU) { + return (ne00 == ne10) && (ne00 == ne01); + } + return false; + } +} - qnn_instance * instance = nullptr; +static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + bool graph_initialized = false; + int64_t n_begin_time = 0LL; + int64_t n_end_time = 0LL; + int64_t n_duration = 0LL; - std::string graph_name = "ggml_op_qnn_add"; - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Tensor_t * tensor_0 = nullptr; - Qnn_Tensor_t * tensor_1 = nullptr; - Qnn_Tensor_t * tensor_2 = nullptr; - Qnn_Param_t qnn_params[] = {}; - enum ggml_op ggmlop = GGML_OP_ADD; - Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; + qnn_instance * instance = nullptr; + std::string graph_name = "ggml_op_qnn_add"; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Tensor_t * tensor_0 = nullptr; + Qnn_Tensor_t * tensor_1 = nullptr; + Qnn_Tensor_t * tensor_2 = nullptr; + Qnn_Param_t qnn_params[] = {}; + enum ggml_op ggmlop = GGML_OP_ADD; + Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; if ((nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { QNN_LOG_WARN("pls check why GGML tensor is null"); return; } - tensor_0 = (Qnn_Tensor_t *)src0->extra; - tensor_1 = (Qnn_Tensor_t *)src1->extra; - tensor_2 = (Qnn_Tensor_t *)dst->extra; - if ((nullptr == tensor_0) || (nullptr == tensor_1) || (nullptr == tensor_2)) { + tensor_0 = (Qnn_Tensor_t *) src0->extra; + tensor_1 = (Qnn_Tensor_t *) src1->extra; + tensor_2 = (Qnn_Tensor_t *) dst->extra; + if ((nullptr == tensor_0) || (nullptr == tensor_1) || + (nullptr == tensor_2)) { QNN_LOG_WARN("pls check why QNN tensor is null"); return; } @@ -1685,53 +1766,63 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src QNN_LOG_WARN("pls check why backend ctx is null"); return; } - instance = ctx->instance; + instance = ctx->instance; if (nullptr == instance) { QNN_LOG_WARN("pls check why qnn instance is null"); return; } - QNN_INTERFACE_VER_TYPE 
qnn_raw_interface = ctx->raw_interface; - - n_begin_time = ggml_time_us(); - - QNN_LOG_DEBUG("call %s\n", __func__); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, - src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], - src0->nb[0], src0->nb[1], src0->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, - src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], - src1->nb[0], src1->nb[1], src1->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - dst->name, - dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], - dst->nb[1], dst->nb[2]); - QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); - QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); - QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1)); - QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2)); + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + + n_begin_time = ggml_time_us(); + + if (0) { + QNN_LOG_DEBUG("call %s\n", __func__); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, src0->type, ggml_type_name(src0->type), + src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], + src0->nb[1], src0->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, src1->type, ggml_type_name(src1->type), + src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], + src1->nb[1], src1->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, dst->type, ggml_type_name(dst->type), + dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], + dst->nb[1], dst->nb[2]); + QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], + src0->ne[3]); + QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); + QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1)); + QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2)); + } QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; - src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); - src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); - dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); - - uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], - (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; - uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], - (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; - uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], - (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; - - std::string map_entry = std::string(ggml_op_name(ggmlop)); - if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { + src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); + src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); + dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); + + uint32_t dimensions_input_0[] = { + (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], 
(uint32_t) src0->ne[2], + (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = { + (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], (uint32_t) src1->ne[2], + (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = { + (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], + (uint32_t) dst->ne[3]}; + + std::string map_entry = std::string(ggml_op_name(ggmlop)); + if (instance->_qnn_graph_map.find(map_entry) != + instance->_qnn_graph_map.end()) { graph_initialized = true; auto & graph_item = instance->_qnn_graph_map[map_entry]; - graph_handle = std::get<0>(graph_item); + graph_handle = std::get<0>(graph_item); } uint32_t * tensor_0_dimensions = QNN_VER_PTR(*tensor_0)->dimensions; @@ -1739,15 +1830,16 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src uint32_t * tensor_2_dimensions = QNN_VER_PTR(*tensor_2)->dimensions; if (!graph_initialized) { - graph_name = graph_name + "_" + std::to_string(ctx->threads) + src0->name + "_" + src1->name; + graph_name = graph_name + "_" + std::to_string(ctx->threads) + + src0->name + "_" + src1->name; QNN_LOG_DEBUG("graph name %s", graph_name.c_str()); - //QnnGraph_Config_t graph_config; - //graph_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - //graph_config.customConfig = strdup(graph_name.c_str()); - //const QnnGraph_Config_t * p_graph_config = &graph_config; - error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, &graph_handle); + error = qnn_raw_interface.graphCreate( + instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, + &graph_handle); if (QNN_SUCCESS != error) { - QNN_LOG_INFO("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); + QNN_LOG_INFO("can't create qnn graph handle with graph name %s, " + "error = %d\n", + graph_name.c_str(), error); return; } error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); @@ -1763,40 +1855,31 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src QNN_LOG_INFO("error = %d\n", error); } - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, + ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, + ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, + ggml_get_tensor_data_size(dst)}; QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); - QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); - QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; - QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); - QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; - - Qnn_Tensor_t tensor_inputs[] = { - *tensor_0, - *tensor_1 - }; - Qnn_Tensor_t tensor_outputs[] = { - *tensor_2 - }; - Qnn_OpConfig_t op_config = { - (Qnn_OpConfigVersion_t) 1, .v1 = { - "ggml_op_add", - 
QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_ELEMENT_WISE_ADD, - 0, - qnn_params, - 2, - tensor_inputs, - 1, - tensor_outputs - } - }; + QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + + Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; + Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; + Qnn_OpConfig_t op_config = { + (Qnn_OpConfigVersion_t) 1, + .v1 = {"ggml_op_add", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_ELEMENT_WISE_ADD, 0, qnn_params, + 2, tensor_inputs, 1, + tensor_outputs}}; error = qnn_raw_interface.graphAddNode(graph_handle, op_config); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); @@ -1805,49 +1888,57 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); } - error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); + error = + qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); } - auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); + auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); instance->_qnn_graph_map[map_entry] = graph_item; } else { auto & graph_item = instance->_qnn_graph_map[map_entry]; - graph_handle = std::get<0>(graph_item); - tensor_0 = std::get<1>(graph_item); - tensor_1 = std::get<2>(graph_item); - tensor_2 = std::get<3>(graph_item); - - //QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); - uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], - (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; - uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], - (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; - uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], - (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; - - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + graph_handle = std::get<0>(graph_item); + tensor_0 = std::get<1>(graph_item); + tensor_1 = std::get<2>(graph_item); + tensor_2 = std::get<3>(graph_item); + + // QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], + // src0->ne[2], src0->ne[3]); + uint32_t dimensions_input_0[] = { + (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = { + (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = { + (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], + (uint32_t) dst->ne[3]}; + + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, + ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, + ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, + ggml_get_tensor_data_size(dst)}; QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); - QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; 
QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); - QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; - QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); - QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; - - Qnn_Tensor_t tensor_inputs[] = { - *tensor_0, - *tensor_1 - }; - Qnn_Tensor_t tensor_outputs[] = { - *tensor_2 - }; - error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); + QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + + Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; + Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; + error = + qnn_raw_interface.graphExecute(graph_handle, tensor_inputs,2, + tensor_outputs,1, + nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); } @@ -1855,52 +1946,54 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions; QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions; QNN_VER_PTR(*tensor_2)->dimensions = tensor_2_dimensions; - n_end_time = ggml_time_us(); - n_duration = (n_end_time - n_begin_time) / 1000; + n_end_time = ggml_time_us(); + n_duration = (n_end_time - n_begin_time) / 1000; QNN_LOG_DEBUG("duration of ggml_qnn_add : %lld milliseconds\n", n_duration); } - - /* * ggml_qnn_mul_mat was re-added as a standalone function because * the following comments came from https://github.com/ggerganov/llama.cpp/pull/1632 - * MUL_MAT take most of the compute time (about 95%). So to speed up llama, we have to focus on MUL_MAT. + * MUL_MAT take most of the compute time (about 95%). + * So to speed up llama, we have to focus on MUL_MAT. + * * We have three kinds of MUL_MAT to compute: - * mul_mat_f32: both src0 and src1 are F32. + * mul_mat_f32: both src0 and src1 are F32. * mul_mat_f16_f32: src0 is F16 and src1 is F32. - * mul_mat_q_f32: src0 is quantized (Q4_0, Q4_1, ...), and src1 is F32. -*/ -static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - bool graph_initialized = false; - int64_t n_begin_time = 0LL; - int64_t n_end_time = 0LL; - int64_t n_duration = 0LL; - - qnn_instance * instance = nullptr; - - std::string graph_name = "ggml_op_qnn_mul_mat"; - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Tensor_t * tensor_0 = nullptr; - Qnn_Tensor_t * tensor_1 = nullptr; - Qnn_Tensor_t * tensor_2 = nullptr; - - Qnn_Param_t qnn_params[] = {}; - - enum ggml_op ggmlop = GGML_OP_ADD; - Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; + * mul_mat_q_f32: src0 is quantized (Q4_0, Q4_1, ...), and src1 is F32. 
+ */ +static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + bool graph_initialized = false; + int64_t n_begin_time = 0LL; + int64_t n_end_time = 0LL; + int64_t n_duration = 0LL; + + qnn_instance * instance = nullptr; + std::string graph_name = "ggml_op_qnn_mul_mat"; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Tensor_t * tensor_0 = nullptr; + Qnn_Tensor_t * tensor_1 = nullptr; + Qnn_Tensor_t * tensor_2 = nullptr; + + Qnn_Param_t qnn_params[] = {}; + + enum ggml_op ggmlop = GGML_OP_MUL_MAT; + Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; if ((nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { QNN_LOG_WARN("pls check why GGML tensor is null"); return; } - tensor_0 = (Qnn_Tensor_t *)src0->extra; - tensor_1 = (Qnn_Tensor_t *)src1->extra; - tensor_2 = (Qnn_Tensor_t *)dst->extra; - if ((nullptr == tensor_0) || (nullptr == tensor_1) || (nullptr == tensor_2)) { + tensor_0 = (Qnn_Tensor_t *) src0->extra; + tensor_1 = (Qnn_Tensor_t *) src1->extra; + tensor_2 = (Qnn_Tensor_t *) dst->extra; + if ((nullptr == tensor_0) || (nullptr == tensor_1) || + (nullptr == tensor_2)) { QNN_LOG_WARN("pls check why QNN tensor is null"); return; } @@ -1908,28 +2001,31 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, const ggml_tensor * QNN_LOG_WARN("pls check why backend ctx is null"); return; } - instance = ctx->instance; + instance = ctx->instance; if (nullptr == instance) { QNN_LOG_WARN("pls check why qnn instance is null"); return; } - QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; - n_begin_time = ggml_time_us(); + n_begin_time = ggml_time_us(); QNN_LOG_DEBUG("call %s\n", __func__); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, - src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], - src0->nb[0], src0->nb[1], src0->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, - src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], - src1->nb[0], src1->nb[1], src1->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - dst->name, - dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], - dst->nb[1], dst->nb[2]); - QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, src0->type, ggml_type_name(src0->type), + src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], + src0->nb[1], src0->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, src1->type, ggml_type_name(src1->type), + src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], + src1->nb[1], src1->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0], + dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); + QNN_LOG_DEBUG("%d, 
%d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], + src0->ne[3]); QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1)); QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2)); @@ -1938,22 +2034,26 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, const ggml_tensor * QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; - src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); - src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); - dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); - - uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], - (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; - uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], - (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; - uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], - (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; - - std::string map_entry = std::string(ggml_op_name(ggmlop)); - if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { + src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); + src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); + dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); + + uint32_t dimensions_input_0[] = { + (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], (uint32_t) src0->ne[2], + (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = { + (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], (uint32_t) src1->ne[2], + (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = { + (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], + (uint32_t) dst->ne[3]}; + + std::string map_entry = std::string(ggml_op_name(ggmlop)); + if (instance->_qnn_graph_map.find(map_entry) != + instance->_qnn_graph_map.end()) { graph_initialized = true; auto & graph_item = instance->_qnn_graph_map[map_entry]; - graph_handle = std::get<0>(graph_item); + graph_handle = std::get<0>(graph_item); } uint32_t * tensor_0_dimensions = QNN_VER_PTR(*tensor_0)->dimensions; @@ -1961,11 +2061,16 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, const ggml_tensor * uint32_t * tensor_2_dimensions = QNN_VER_PTR(*tensor_2)->dimensions; if (!graph_initialized) { - graph_name = graph_name + "_" + std::to_string(ctx->threads) + src0->name + "_" + src1->name; + graph_name = graph_name + "_" + std::to_string(ctx->threads) + + src0->name + "_" + src1->name; QNN_LOG_DEBUG("graph name %s", graph_name.c_str()); - error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, &graph_handle); + error = qnn_raw_interface.graphCreate( + instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, + &graph_handle); if (QNN_SUCCESS != error) { - QNN_LOG_INFO("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); + QNN_LOG_INFO("can't create qnn graph handle with graph name %s, " + "error = %d\n", + graph_name.c_str(), error); return; } error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); @@ -1981,40 +2086,30 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, const ggml_tensor * QNN_LOG_INFO("error = %d\n", error); } - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; - 
QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, + ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, + ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, + ggml_get_tensor_data_size(dst)}; QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); - QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); - QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; - QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); - QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; - - Qnn_Tensor_t tensor_inputs[] = { - *tensor_0, - *tensor_1 - }; - Qnn_Tensor_t tensor_outputs[] = { - *tensor_2 - }; - Qnn_OpConfig_t op_config = { - (Qnn_OpConfigVersion_t) 1, .v1 = { - "ggml_op_mul_mat", - QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_MAT_MUL, - 0, - qnn_params, - 2, - tensor_inputs, - 1, - tensor_outputs - } - }; + QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + + Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; + Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; + Qnn_OpConfig_t op_config = {(Qnn_OpConfigVersion_t) 1, + .v1 = {"ggml_op_mul_mat", + QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_MAT_MUL, 0, qnn_params, 2, + tensor_inputs, 1, tensor_outputs}}; error = qnn_raw_interface.graphAddNode(graph_handle, op_config); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); @@ -2023,48 +2118,56 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, const ggml_tensor * if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); } - error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); + error = qnn_raw_interface.graphExecute(graph_handle, + tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); } - auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); + auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); instance->_qnn_graph_map[map_entry] = graph_item; } else { - auto & graph_item = instance->_qnn_graph_map[map_entry]; - graph_handle = std::get<0>(graph_item); - tensor_0 = std::get<1>(graph_item); - tensor_1 = std::get<2>(graph_item); - tensor_2 = std::get<3>(graph_item); + auto & graph_item= instance->_qnn_graph_map[map_entry]; + graph_handle = std::get<0>(graph_item); + tensor_0 = std::get<1>(graph_item); + tensor_1 = std::get<2>(graph_item); + tensor_2 = std::get<3>(graph_item); QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); - uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], - (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; - uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], - (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; - uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], - (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + uint32_t 
dimensions_input_0[] = { + (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = { + (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = { + (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], + (uint32_t) dst->ne[3]}; QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); - QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); - QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; - QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); - QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; - - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; - - Qnn_Tensor_t tensor_inputs[] = { - *tensor_0, - *tensor_1 - }; - Qnn_Tensor_t tensor_outputs[] = { - *tensor_2 - }; - error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); + QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, + ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, + ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, + ggml_get_tensor_data_size(dst)}; + + Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; + Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; + error = + qnn_raw_interface.graphExecute(graph_handle, + tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); } @@ -2072,45 +2175,48 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, const ggml_tensor * QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions; QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions; QNN_VER_PTR(*tensor_2)->dimensions = tensor_2_dimensions; - n_end_time = ggml_time_us(); - n_duration = (n_end_time - n_begin_time) / 1000; - QNN_LOG_DEBUG("duration of ggml_qnn_mul_mat : %lld milliseconds\n", n_duration); + n_end_time = ggml_time_us(); + n_duration = (n_end_time - n_begin_time) / 1000; + QNN_LOG_DEBUG("duration of ggml_qnn_mul_mat : %lld milliseconds\n", + n_duration); QNN_LOG_DEBUG("call %s done\n", __func__); } - -//common function for GGML OPs using QNN API -static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, const enum ggml_op ggmlop, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - bool graph_initialized = false; - int64_t n_begin_time = 0LL; - int64_t n_end_time = 0LL; - int64_t n_duration = 0LL; - - qnn_instance * instance = nullptr; - - std::string qnn_graph_name = "ggml_qnn_graph"; - std::string qnn_op_config_name = "ggml_qnn_op_config"; - const char * qnn_op_name = nullptr; - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Tensor_t * 
tensor_0 = nullptr; - Qnn_Tensor_t * tensor_1 = nullptr; - Qnn_Tensor_t * tensor_2 = nullptr; - - Qnn_Param_t qnn_params[] = {}; - - Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; +// common function for GGML OPs using QNN API +static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, + const enum ggml_op ggmlop, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + bool graph_initialized = false; + int64_t n_begin_time = 0LL; + int64_t n_end_time = 0LL; + int64_t n_duration = 0LL; + + qnn_instance * instance = nullptr; + std::string qnn_graph_name = "ggml_qnn_graph"; + std::string qnn_op_config_name = "ggml_qnn_op_config"; + const char * qnn_op_name = nullptr; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Tensor_t * tensor_0 = nullptr; + Qnn_Tensor_t * tensor_1 = nullptr; + Qnn_Tensor_t * tensor_2 = nullptr; + + Qnn_Param_t qnn_params[] = {}; + + Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; if ((nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { QNN_LOG_WARN("pls check why GGML tensor is null"); return; } - tensor_0 = (Qnn_Tensor_t *)src0->extra; - tensor_1 = (Qnn_Tensor_t *)src1->extra; - tensor_2 = (Qnn_Tensor_t *)dst->extra; - if ((nullptr == tensor_0) || (nullptr == tensor_1) || (nullptr == tensor_2)) { + tensor_0 = (Qnn_Tensor_t *) src0->extra; + tensor_1 = (Qnn_Tensor_t *) src1->extra; + tensor_2 = (Qnn_Tensor_t *) dst->extra; + if ((nullptr == tensor_0) || (nullptr == tensor_1) || + (nullptr == tensor_2)) { QNN_LOG_WARN("pls check why QNN tensor is null"); return; } @@ -2118,58 +2224,66 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, const enum ggml_o QNN_LOG_WARN("pls check why backend ctx is null"); return; } - instance = ctx->instance; + instance = ctx->instance; if (nullptr == instance) { QNN_LOG_WARN("pls check why qnn instance is null"); return; } - QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; - src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); - src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); - dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); - qnn_op_name = qnn_opname_from_ggmlop(ggmlop); + src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); + src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); + dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); + qnn_op_name = qnn_opname_from_ggmlop(ggmlop); if (nullptr == qnn_op_name) { - QNN_LOG_WARN("pls check why can not get QNN OP name with ggml op %d(%s)", ggmlop, ggml_op_name(ggmlop)); + QNN_LOG_WARN( + "pls check why can not get QNN OP name with ggml op %d(%s)", ggmlop, + ggml_op_name(ggmlop)); return; } - n_begin_time = ggml_time_us(); + n_begin_time = ggml_time_us(); QNN_LOG_DEBUG("call %s\n", __func__); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, - src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], - src0->nb[0], src0->nb[1], src0->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, - src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], 
src1->ne[2], - src1->nb[0], src1->nb[1], src1->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - dst->name, - dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], - dst->nb[1], dst->nb[2]); - QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, src0->type, ggml_type_name(src0->type), + src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], + src0->nb[1], src0->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, src1->type, ggml_type_name(src1->type), + src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], + src1->nb[1], src1->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0], + dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); + QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], + src0->ne[3]); QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1)); QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2)); - QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; - QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; - QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; - uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], - (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; - uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], - (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; - uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], - (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; - - - std::string map_entry = std::string(ggml_op_name(ggmlop)); - if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { + QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; + uint32_t dimensions_input_0[] = { + (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], (uint32_t) src0->ne[2], + (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = { + (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], (uint32_t) src1->ne[2], + (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = { + (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], + (uint32_t) dst->ne[3]}; + + std::string map_entry = std::string(ggml_op_name(ggmlop)); + if (instance->_qnn_graph_map.find(map_entry) != + instance->_qnn_graph_map.end()) { graph_initialized = true; auto & graph_item = instance->_qnn_graph_map[map_entry]; - graph_handle = std::get<0>(graph_item); + graph_handle = std::get<0>(graph_item); } uint32_t * tensor_0_dimensions = QNN_VER_PTR(*tensor_0)->dimensions; @@ -2177,13 +2291,21 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, const enum ggml_o uint32_t * tensor_2_dimensions = QNN_VER_PTR(*tensor_2)->dimensions; if (!graph_initialized) { - qnn_graph_name = qnn_graph_name + "_" + ggml_op_name(ggmlop) + std::to_string(ctx->threads) + src0->name + "_" + src1->name; - qnn_op_config_name = qnn_op_config_name + "_" + ggml_op_name(ggmlop) + std::to_string(ctx->threads) + 
src0->name + "_" + src1->name; + qnn_graph_name = qnn_graph_name + "_" + ggml_op_name(ggmlop) + + std::to_string(ctx->threads) + src0->name + "_" + + src1->name; + qnn_op_config_name = qnn_op_config_name + "_" + ggml_op_name(ggmlop) + + std::to_string(ctx->threads) + src0->name + "_" + + src1->name; QNN_LOG_DEBUG("qnn graph name %s", qnn_graph_name.c_str()); QNN_LOG_DEBUG("qnn op_config name %s", qnn_op_config_name.c_str()); - error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), qnn_graph_name.c_str(), nullptr, &graph_handle); + error = qnn_raw_interface.graphCreate( + instance->get_qnn_context_handle(), qnn_graph_name.c_str(), nullptr, + &graph_handle); if (QNN_SUCCESS != error) { - QNN_LOG_INFO("can't create qnn graph handle with ggml op %s, graph name %s, error = %d\n", ggml_op_name(ggmlop), qnn_graph_name.c_str(), error); + QNN_LOG_INFO("can't create qnn graph handle with ggml op %s, graph " + "name %s, error = %d\n", + ggml_op_name(ggmlop), qnn_graph_name.c_str(), error); return; } @@ -2200,40 +2322,30 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, const enum ggml_o QNN_LOG_INFO("error = %d\n", error); } - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, + ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, + ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, + ggml_get_tensor_data_size(dst)}; QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); - QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); - QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; - QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); - QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; - - Qnn_Tensor_t tensor_inputs[] = { - *tensor_0, - *tensor_1 - }; - Qnn_Tensor_t tensor_outputs[] = { - *tensor_2 - }; - Qnn_OpConfig_t op_config = { - (Qnn_OpConfigVersion_t) 1, .v1 = { - qnn_op_config_name.c_str(), - QNN_OP_PACKAGE_NAME_QTI_AISW, - qnn_op_name, - 0, - qnn_params, - 2, - tensor_inputs, - 1, - tensor_outputs - } - }; + QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + + Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; + Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; + Qnn_OpConfig_t op_config = {(Qnn_OpConfigVersion_t) 1, + .v1 = {qnn_op_config_name.c_str(), + QNN_OP_PACKAGE_NAME_QTI_AISW, + qnn_op_name, 0, qnn_params, 2, + tensor_inputs, 1, tensor_outputs}}; error = qnn_raw_interface.graphAddNode(graph_handle, op_config); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); @@ -2242,48 +2354,56 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, const enum ggml_o if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); } - error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 
1, nullptr, nullptr); + error = qnn_raw_interface.graphExecute(graph_handle, + tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); } - auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); + auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); instance->_qnn_graph_map[map_entry] = graph_item; } else { auto & graph_item = instance->_qnn_graph_map[map_entry]; - graph_handle = std::get<0>(graph_item); - tensor_0 = std::get<1>(graph_item); - tensor_1 = std::get<2>(graph_item); - tensor_2 = std::get<3>(graph_item); + graph_handle = std::get<0>(graph_item); + tensor_0 = std::get<1>(graph_item); + tensor_1 = std::get<2>(graph_item); + tensor_2 = std::get<3>(graph_item); QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); - uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], - (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; - uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], - (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; - uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], - (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + uint32_t dimensions_input_0[] = { + (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = { + (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = { + (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], + (uint32_t) dst->ne[3]}; QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); - QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); - QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; - QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); - QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; - - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; - - Qnn_Tensor_t tensor_inputs[] = { - *tensor_0, - *tensor_1 - }; - Qnn_Tensor_t tensor_outputs[] = { - *tensor_2 - }; - error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); + QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, + ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, + ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, + ggml_get_tensor_data_size(dst)}; + + Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; + Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; + error = + qnn_raw_interface.graphExecute(graph_handle, + tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = 
%d\n", error); } @@ -2291,381 +2411,310 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, const enum ggml_o QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions; QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions; QNN_VER_PTR(*tensor_2)->dimensions = tensor_2_dimensions; - n_end_time = ggml_time_us(); - n_duration = (n_end_time - n_begin_time) / 1000; - QNN_LOG_DEBUG("duration of ggml_qnn_%s : %lld milliseconds\n", ggml_op_name(ggmlop), n_duration); + n_end_time = ggml_time_us(); + n_duration = (n_end_time - n_begin_time) / 1000; + QNN_LOG_DEBUG("duration of ggml_qnn_%s : %lld milliseconds\n", + ggml_op_name(ggmlop), n_duration); QNN_LOG_DEBUG("call %s done\n", __func__); } - -static void ggml_qnn_repeat(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_repeat(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { } - -static void ggml_qnn_get_rows(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_get_rows(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { } - -static void ggml_qnn_acc(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_acc(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { } - -static void ggml_qnn_div(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_div(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { } - -static void ggml_qnn_gelu(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_gelu(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { } - -static void ggml_qnn_silu(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_silu(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { } - -static void ggml_qnn_gelu_quick(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_gelu_quick(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { } - -static void ggml_qnn_tanh(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_tanh(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const 
ggml_tensor * src1, + ggml_tensor * dst) { } - -static void ggml_qnn_relu(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_relu(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { } - -static void ggml_qnn_hardsigmoid(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_hardsigmoid(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { } - -static void ggml_qnn_hardswish(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_hardswish(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { } - -static void ggml_qnn_leaky_relu(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_leaky_relu(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { } - -static void ggml_qnn_sqr(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_sqr(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { } - -static void ggml_qnn_norm(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_norm(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { } - -static void ggml_qnn_group_norm(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_group_norm(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { } - -static void ggml_qnn_concat(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_concat(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { } - -static void ggml_qnn_upscale(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_upscale(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { } - -static void ggml_qnn_pad(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); 
+static void ggml_qnn_pad(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { } - -static void ggml_qnn_rms_norm(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_rms_norm(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { } - -static void ggml_qnn_cpy(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_cpy(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { } - -static void ggml_qnn_dup(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_dup(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { ggml_qnn_cpy(ctx, src0, dst, nullptr); (void) src1; } - static void ggml_qnn_mul_mat_id(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, - const ggml_tensor * src1, - ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); + const ggml_tensor * src1, ggml_tensor * dst) { } - -static void ggml_qnn_scale(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_scale(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { } - -static void ggml_qnn_clamp(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_clamp(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { } - -static void ggml_qnn_diag_mask_inf(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_diag_mask_inf(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { } - -static void ggml_qnn_soft_max(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_soft_max(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { } - -static void ggml_qnn_rope(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_rope(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { GGML_ASSERT(ggml_is_contiguous(src0)); - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); - } - -static void ggml_qnn_pool2d(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static 
void ggml_qnn_pool2d(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { } - -static void ggml_qnn_im2col(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_im2col(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { } - -static void ggml_qnn_sum_rows(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_sum_rows(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { GGML_ASSERT(ggml_is_contiguous(src0)); - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); } - -static void ggml_qnn_argsort(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_argsort(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { GGML_ASSERT(ggml_is_contiguous(src0)); - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); } - -static void ggml_qnn_nop(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - (void) src0; - (void) src1; - (void) dst; - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_nop(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { + (void)src0; + (void)src1; + (void)dst; } - -bool ggml_qnn_compute_forward(ggml_backend_qnn_context * ctx, struct ggml_compute_params * params, struct ggml_tensor * tensor) { - ggml_qnn_func_t func = nullptr; - ggml_qnn_func_common_t func_common = nullptr; +bool ggml_qnn_compute_forward(ggml_backend_qnn_context * ctx, + struct ggml_compute_params * params, + struct ggml_tensor * tensor) { + ggml_qnn_func_t func = nullptr; + ggml_qnn_func_common_t func_common = nullptr; switch (tensor->op) { - case GGML_OP_ADD: - func = ggml_qnn_add; + case GGML_OP_ADD: + func = ggml_qnn_add; + break; + + case GGML_OP_MUL: + func_common = ggml_qnn_hanlde_op; + break; + + case GGML_OP_MUL_MAT: + func = ggml_qnn_mul_mat; + break; + + case GGML_OP_REPEAT: + func = ggml_qnn_repeat; + break; + case GGML_OP_GET_ROWS: + func = ggml_qnn_get_rows; + break; + case GGML_OP_DUP: + func = ggml_qnn_dup; + break; + + case GGML_OP_ACC: + func = ggml_qnn_acc; + break; + + case GGML_OP_DIV: + func = ggml_qnn_div; + break; + + case GGML_OP_UNARY: + switch (ggml_get_unary_op(tensor)) { + case GGML_UNARY_OP_GELU: + func = ggml_qnn_gelu; break; - - case GGML_OP_MUL: - func_common = ggml_qnn_hanlde_op; + case GGML_UNARY_OP_SILU: + func = ggml_qnn_silu; break; - - case GGML_OP_MUL_MAT: - func = ggml_qnn_mul_mat; + case GGML_UNARY_OP_GELU_QUICK: + func = ggml_qnn_gelu_quick; break; - - case GGML_OP_REPEAT: - func = ggml_qnn_repeat; + case GGML_UNARY_OP_TANH: + func = ggml_qnn_tanh; break; - case GGML_OP_GET_ROWS: - func = ggml_qnn_get_rows; + case GGML_UNARY_OP_RELU: + func = ggml_qnn_relu; break; - case GGML_OP_DUP: - func = ggml_qnn_dup; + case GGML_UNARY_OP_HARDSIGMOID: + func = ggml_qnn_hardsigmoid; break; - - case GGML_OP_ACC: - func = ggml_qnn_acc; - break; - - case GGML_OP_DIV: - func = ggml_qnn_div; - break; - - case GGML_OP_UNARY: - switch 
(ggml_get_unary_op(tensor)) { - case GGML_UNARY_OP_GELU: - func = ggml_qnn_gelu; - break; - case GGML_UNARY_OP_SILU: - func = ggml_qnn_silu; - break; - case GGML_UNARY_OP_GELU_QUICK: - func = ggml_qnn_gelu_quick; - break; - case GGML_UNARY_OP_TANH: - func = ggml_qnn_tanh; - break; - case GGML_UNARY_OP_RELU: - func = ggml_qnn_relu; - break; - case GGML_UNARY_OP_HARDSIGMOID: - func = ggml_qnn_hardsigmoid; - break; - case GGML_UNARY_OP_HARDSWISH: - func = ggml_qnn_hardswish; - break; - default: - return false; - } - break; - case GGML_OP_NORM: - func = ggml_qnn_norm; - break; - case GGML_OP_GROUP_NORM: - func = ggml_qnn_group_norm; - break; - case GGML_OP_CONCAT: - func = ggml_qnn_concat; - break; - case GGML_OP_UPSCALE: - func = ggml_qnn_upscale; - break; - case GGML_OP_PAD: - func = ggml_qnn_pad; - break; - case GGML_OP_LEAKY_RELU: - func = ggml_qnn_leaky_relu; - break; - case GGML_OP_RMS_NORM: - func = ggml_qnn_rms_norm; - break; - case GGML_OP_MUL_MAT_ID: - func = ggml_qnn_mul_mat_id; - break; - case GGML_OP_SCALE: - func = ggml_qnn_scale; - break; - case GGML_OP_SQR: - func = ggml_qnn_sqr; - break; - case GGML_OP_CLAMP: - func = ggml_qnn_clamp; - break; - case GGML_OP_CPY: - func = ggml_qnn_cpy; - break; - case GGML_OP_CONT: - func = ggml_qnn_dup; - break; - case GGML_OP_NONE: - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: - case GGML_OP_PERMUTE: - case GGML_OP_TRANSPOSE: - func = ggml_qnn_nop; - break; - case GGML_OP_DIAG_MASK_INF: - func = ggml_qnn_diag_mask_inf; - break; - case GGML_OP_SOFT_MAX: - func = ggml_qnn_soft_max; - break; - case GGML_OP_ROPE: - func = ggml_qnn_rope; - break; - case GGML_OP_IM2COL: - func = ggml_qnn_im2col; - break; - case GGML_OP_POOL_2D: - func = ggml_qnn_pool2d; - break; - case GGML_OP_SUM_ROWS: - func = ggml_qnn_sum_rows; - break; - case GGML_OP_ARGSORT: - func = ggml_qnn_argsort; + case GGML_UNARY_OP_HARDSWISH: + func = ggml_qnn_hardswish; break; default: return false; + } + break; + case GGML_OP_NORM: + func = ggml_qnn_norm; + break; + case GGML_OP_GROUP_NORM: + func = ggml_qnn_group_norm; + break; + case GGML_OP_CONCAT: + func = ggml_qnn_concat; + break; + case GGML_OP_UPSCALE: + func = ggml_qnn_upscale; + break; + case GGML_OP_PAD: + func = ggml_qnn_pad; + break; + case GGML_OP_LEAKY_RELU: + func = ggml_qnn_leaky_relu; + break; + case GGML_OP_RMS_NORM: + func = ggml_qnn_rms_norm; + break; + case GGML_OP_MUL_MAT_ID: + func = ggml_qnn_mul_mat_id; + break; + case GGML_OP_SCALE: + func = ggml_qnn_scale; + break; + case GGML_OP_SQR: + func = ggml_qnn_sqr; + break; + case GGML_OP_CLAMP: + func = ggml_qnn_clamp; + break; + case GGML_OP_CPY: + func = ggml_qnn_cpy; + break; + case GGML_OP_CONT: + func = ggml_qnn_dup; + break; + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + func = ggml_qnn_nop; + break; + case GGML_OP_DIAG_MASK_INF: + func = ggml_qnn_diag_mask_inf; + break; + case GGML_OP_SOFT_MAX: + func = ggml_qnn_soft_max; + break; + case GGML_OP_ROPE: + func = ggml_qnn_rope; + break; + case GGML_OP_IM2COL: + func = ggml_qnn_im2col; + break; + case GGML_OP_POOL_2D: + func = ggml_qnn_pool2d; + break; + case GGML_OP_SUM_ROWS: + func = ggml_qnn_sum_rows; + break; + case GGML_OP_ARGSORT: + func = ggml_qnn_argsort; + break; + default: + return false; } - if (nullptr != func) - func(ctx, tensor->src[0], tensor->src[1], tensor); + if (nullptr != func) func(ctx, tensor->src[0], tensor->src[1], tensor); if (nullptr != func_common) func_common(ctx, tensor->op, tensor->src[0], tensor->src[1], tensor); 
@@ -2673,12 +2722,10 @@ bool ggml_qnn_compute_forward(ggml_backend_qnn_context * ctx, struct ggml_comput return true; } - struct ggml_backend_qnn_buffer_context { - ggml_backend_qnn_buffer_context(size_t device) : - device(device), - name(GGML_QNN_NAME + std::to_string(device)) { - } + ggml_backend_qnn_buffer_context(size_t device) + : device(device) + , name(GGML_QNN_NAME + std::to_string(device)) {} ~ggml_backend_qnn_buffer_context() { if (buffer) { @@ -2697,83 +2744,82 @@ struct ggml_backend_qnn_buffer_context { sub_buffers.clear(); qnn_tensors.clear(); } - void * buffer = nullptr; + void * buffer = nullptr; struct ggml_backend_qnn_context * backend_ctx = nullptr; - size_t buffer_size = 0; - std::vector sub_buffers; + size_t buffer_size = 0; + std::vector sub_buffers; std::vector qnn_tensors; - size_t device; - std::string name; + size_t device; + std::string name; }; - struct ggml_backend_qnn_buffer_type_context { - size_t device; + size_t device; std::string name; }; - static const char * ggml_backend_qnn_buffer_get_name(ggml_backend_buffer_t buffer) { GGML_UNUSED(buffer); return "QNN"; } - GGML_CALL static bool ggml_backend_buffer_is_qnn(ggml_backend_buffer_t buffer) { return buffer->iface.get_name == ggml_backend_qnn_buffer_get_name; } - GGML_CALL static void ggml_backend_qnn_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context; delete ctx; } - GGML_CALL static void * ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t buffer) { ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context; return ctx->buffer; } +GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, + ggml_tensor * tensor) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + ggml_backend_qnn_buffer_context * ctx = + (ggml_backend_qnn_buffer_context *) buffer->context; -GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *)buffer->context; - - static int idx = 0; - char tensor_name[GGML_MAX_NAME] = { 0 }; + static int idx = 0; + char tensor_name[GGML_MAX_NAME] = {0}; snprintf(tensor_name, GGML_MAX_NAME, "tensor_%04d", idx++); - uint32_t dimensions[] = {(uint32_t) tensor->ne[0], (uint32_t) tensor->ne[1], (uint32_t) tensor->ne[2], (uint32_t) tensor->ne[3]}; - Qnn_DataType_t qnn_data_type = qnn_datatype_from_ggml_datatype(tensor->type); - Qnn_TensorType_t qnn_tensor_type= QNN_TENSOR_TYPE_APP_WRITE; + uint32_t dimensions[] = {(uint32_t) tensor->ne[0], (uint32_t) tensor->ne[1], + (uint32_t) tensor->ne[2], + (uint32_t) tensor->ne[3]}; + Qnn_DataType_t qnn_data_type = + qnn_datatype_from_ggml_datatype(tensor->type); + Qnn_TensorType_t qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; if (tensor->flags & GGML_TENSOR_FLAG_INPUT) { qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; } else if (tensor->flags & GGML_TENSOR_FLAG_OUTPUT) { qnn_tensor_type = QNN_TENSOR_TYPE_APP_READ; } - Qnn_Tensor_t qnn_tensor = { - .version= QNN_TENSOR_VERSION_1, - {.v1= { - .id = 0, - .name = tensor_name, - .type = qnn_tensor_type, - .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, - .dataType = qnn_data_type, - .quantizeParams = {QNN_DEFINITION_UNDEFINED, - QNN_QUANTIZATION_ENCODING_UNDEFINED, - {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0}}}, - .rank = ggml_get_tensor_rank(tensor), - .dimensions = dimensions, - 
.memType = QNN_TENSORMEMTYPE_RAW, - {.clientBuf = {.data = nullptr, - .dataSize = 0}}}} - }; - Qnn_Tensor_t * p_qnn_tensor = (Qnn_Tensor_t *)calloc(1, sizeof(Qnn_Tensor_t)); + Qnn_Tensor_t qnn_tensor = { + .version = QNN_TENSOR_VERSION_1, + {.v1 = {.id = 0, + .name = tensor_name, + .type = qnn_tensor_type, + .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, + .dataType = qnn_data_type, + .quantizeParams = + {QNN_DEFINITION_UNDEFINED, + QNN_QUANTIZATION_ENCODING_UNDEFINED, + {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, + .offset = 0}}}, + .rank = ggml_get_tensor_rank(tensor), + .dimensions = dimensions, + .memType = QNN_TENSORMEMTYPE_RAW, + {.clientBuf = {.data = nullptr, .dataSize = 0}}}}}; + Qnn_Tensor_t * p_qnn_tensor = + (Qnn_Tensor_t *)calloc(1, sizeof(Qnn_Tensor_t)); if (nullptr == p_qnn_tensor) { QNN_LOG_WARN("calloc failed"); return; @@ -2788,21 +2834,24 @@ GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t ctx->qnn_tensors.push_back(p_qnn_tensor); } - -GGML_CALL static void ggml_backend_qnn_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { +GGML_CALL static void ggml_backend_qnn_buffer_set_tensor(ggml_backend_buffer_t buffer, + ggml_tensor * tensor, const void * data, + size_t offset, size_t size) { GGML_UNUSED(buffer); - memcpy((char *)tensor->data + offset, data, size); + memcpy((char *) tensor->data + offset, data, size); } - -GGML_CALL static void ggml_backend_qnn_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { +GGML_CALL static void ggml_backend_qnn_buffer_get_tensor(ggml_backend_buffer_t buffer, + const ggml_tensor * tensor, void * data, + size_t offset, size_t size) { GGML_UNUSED(buffer); - memcpy(data, (const char *)tensor->data + offset, size); + memcpy(data, (const char *) tensor->data + offset, size); } - -GGML_CALL static bool ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { +GGML_CALL static bool ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t buffer, + const struct ggml_tensor * src, + struct ggml_tensor * dst) { GGML_UNUSED(buffer); if (ggml_backend_buffer_is_host(src->buffer)) { memcpy(dst->data, src->data, ggml_nbytes(src)); @@ -2812,35 +2861,31 @@ GGML_CALL static bool ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t b return false; } - GGML_CALL static void ggml_backend_qnn_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context; memset(ctx->buffer, value, ctx->buffer_size); } - static ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = { - /* .get_name = */ ggml_backend_qnn_buffer_get_name, - /* .free_buffer = */ ggml_backend_qnn_buffer_free_buffer, - /* .get_base = */ ggml_backend_qnn_buffer_get_base, - /* .init_tensor = */ ggml_backend_qnn_buffer_init_tensor, - /* .set_tensor = */ ggml_backend_qnn_buffer_set_tensor, - /* .get_tensor = */ ggml_backend_qnn_buffer_get_tensor, - /* .cpy_tensor = */ ggml_backend_qnn_buffer_cpy_tensor, - /* .clear = */ ggml_backend_qnn_buffer_clear, - /* .reset = */ nullptr, + /* .get_name = */ ggml_backend_qnn_buffer_get_name, + /* .free_buffer = */ ggml_backend_qnn_buffer_free_buffer, + /* .get_base = */ ggml_backend_qnn_buffer_get_base, + /* .init_tensor = */ ggml_backend_qnn_buffer_init_tensor, + /* .set_tensor = */ ggml_backend_qnn_buffer_set_tensor, + /* 
.get_tensor = */ ggml_backend_qnn_buffer_get_tensor, + /* .cpy_tensor = */ ggml_backend_qnn_buffer_cpy_tensor, + /* .clear = */ ggml_backend_qnn_buffer_clear, + /* .reset = */ nullptr, }; - GGML_CALL static const char * ggml_backend_qnn_buffer_type_name(ggml_backend_buffer_type_t buft) { return "QNN"; } - static void * ggml_qnn_host_malloc(size_t n) { void * data = nullptr; - const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n); + int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n); if (result != 0) { QNN_LOG_WARN("%s: error: posix_memalign failed\n", __func__); return nullptr; @@ -2849,20 +2894,20 @@ static void * ggml_qnn_host_malloc(size_t n) { return data; } - -GGML_CALL static ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +GGML_CALL static ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer( + ggml_backend_buffer_type_t buft, size_t size) { ggml_backend_qnn_buffer_type_context * buft_ctx = (ggml_backend_qnn_buffer_type_context *)buft->context; ggml_backend_qnn_buffer_context * ctx = new ggml_backend_qnn_buffer_context(buft_ctx->device); - const size_t size_page = sysconf(_SC_PAGESIZE); + size_t size_page = sysconf(_SC_PAGESIZE); size_t size_aligned = size; if ((size_aligned % size_page) != 0) { size_aligned += (size_page - (size_aligned % size_page)); } - //TODO:use pre-allocated buffer in internal memory pool - ctx->buffer = ggml_qnn_host_malloc(size_aligned); + // TODO:use pre-allocated buffer in internal memory pool + ctx->buffer = ggml_qnn_host_malloc(size_aligned); ctx->buffer_size = size_aligned; ctx->backend_ctx = &g_qnn_mgr[buft_ctx->device]; @@ -2872,53 +2917,51 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer return nullptr; } - return ggml_backend_buffer_init(buft, ggml_backend_qnn_buffer_interface, ctx, size); + return ggml_backend_buffer_init(buft, ggml_backend_qnn_buffer_interface,ctx, size); } - -GGML_CALL static size_t ggml_backend_qnn_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { +GGML_CALL static size_t ggml_backend_qnn_buffer_type_get_alignment( + ggml_backend_buffer_type_t buft) { GGML_UNUSED(buft); return 32; } - -//TODO: this value is an experimental value, works fine with whisper/llm/minicpm-v inference on Android +// TODO: this value is an experimental value, works fine with whisper/llm/minicpm-v inference on Android GGML_CALL static size_t ggml_backend_qnn_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { GGML_UNUSED(buft); return (96 * 1024 * 1024); } - -GGML_CALL static bool ggml_backend_qnn_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, - ggml_backend_t backend) { +GGML_CALL static bool ggml_backend_qnn_buffer_type_supports_backend( + ggml_backend_buffer_type_t buft, ggml_backend_t backend) { GGML_UNUSED(buft); return ggml_backend_is_qnn(backend) || ggml_backend_is_cpu(backend); } - GGML_CALL static bool ggml_backend_qnn_buffer_is_host(ggml_backend_buffer_type_t buft) { GGML_UNUSED(buft); return true; } - GGML_CALL static const char * ggml_backend_qnn_name(ggml_backend_t backend) { return "QNN"; } - GGML_CALL static void ggml_backend_qnn_free(ggml_backend_t backend) { - QNN_LOG_INFO("enter %s", __func__ ); + QNN_LOG_INFO("enter %s", __func__); ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; QNN_LOG_DEBUG("idx %d, name:%s", ctx->device, g_qnn_mgr[ctx->device].name); - qnn_instance * instance = 
(qnn_instance*)g_qnn_mgr[ctx->device].instance; + qnn_instance * instance = (qnn_instance *)g_qnn_mgr[ctx->device].instance; if (instance != nullptr) { - std::map>::iterator graph_it; - for (graph_it = instance->_qnn_graph_map.begin(); graph_it != instance->_qnn_graph_map.end(); graph_it++) { - auto & graph_item = graph_it->second; + std::map>::iterator graph_it; + for (graph_it = instance->_qnn_graph_map.begin(); + graph_it != instance->_qnn_graph_map.end(); graph_it++) { + auto & graph_item = graph_it->second; Qnn_GraphHandle_t & graph_handle = std::get<0>(graph_item); GGML_UNUSED(graph_handle); QNN_LOG_DEBUG("graph type:%s", graph_it->first.c_str()); @@ -2930,96 +2973,90 @@ GGML_CALL static void ggml_backend_qnn_free(ggml_backend_t backend) { g_qnn_mgr[ctx->device].instance = nullptr; } - if (g_qnn_mgr[ctx->device].backend != nullptr) { + if (g_qnn_mgr[ctx->device].backend != nullptr) { delete backend; g_qnn_mgr[ctx->device].backend = nullptr; } - QNN_LOG_INFO("leave %s", __func__ ); + QNN_LOG_INFO("leave %s", __func__); } - GGML_CALL static ggml_backend_buffer_type_t ggml_backend_qnn_get_default_buffer_type(ggml_backend_t backend) { ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; return ggml_backend_qnn_buffer_type(ctx->device); } - GGML_CALL static ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { - enum ggml_status result = GGML_STATUS_SUCCESS; - ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; + enum ggml_status result = GGML_STATUS_SUCCESS; + ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; GGML_UNUSED(ctx); ggml_compute_params params = {}; - params.type = GGML_TASK_TYPE_COMPUTE; - params.ith = 0; + params.type = GGML_TASK_TYPE_COMPUTE; + params.ith = 0; for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_tensor * node = cgraph->nodes[i]; - if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { + ggml_tensor *node = cgraph->nodes[i]; + if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || + node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || + node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { continue; } bool ok = ggml_qnn_compute_forward(ctx, ¶ms, node); if (!ok) { - QNN_LOG_DEBUG("%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); + QNN_LOG_DEBUG("error: op not supported %s (%s)\n", node->name, ggml_op_name(node->op)); } } return result; } +GGML_CALL static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, + const ggml_tensor * op) { + ggml_backend_qnn_context *ctx = (ggml_backend_qnn_context *) backend->context; -GGML_CALL static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const ggml_tensor * op) { - GGML_UNUSED(backend); - - return (ggml_qnn_can_handle_op(op, true)); + return (ggml_qnn_can_handle_op(ctx, op, true)); } +GGML_CALL static bool ggml_backend_qnn_offload_op(ggml_backend_t backend,const ggml_tensor * tensor) { + ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; -//note: this function be used with proposal/refined ggml backend subsystem in this PR: -// https://github.com/ggerganov/llama.cpp/pull/7641 -// new ggml backend(only using system memory: ggml_backend_xxx_buffer_is_host return true) -// can following this style for mixed inference between CPU&GPU / CPU&NPU very easily -GGML_CALL static bool 
ggml_backend_qnn_offload_op(ggml_backend_t backend, const ggml_tensor * tensor) { - ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; - - return ggml_qnn_compute_forward(ctx, nullptr, (ggml_tensor*)tensor); + return ggml_qnn_compute_forward(ctx, nullptr, (ggml_tensor *) tensor); } - static ggml_backend_i ggml_backend_qnn_interface = { - /* .get_name = */ ggml_backend_qnn_name, - /* .free = */ ggml_backend_qnn_free, - /* .get_default_buffer_type = */ ggml_backend_qnn_get_default_buffer_type, - /* .set_tensor_async = */ nullptr, - /* .get_tensor_async = */ nullptr, - /* .cpy_tensor_async = */ nullptr, - /* .synchronize = */ nullptr, - /* .graph_plan_create = */ nullptr, - /* .graph_plan_free = */ nullptr, - /* .graph_plan_compute = */ nullptr, - /* .graph_compute = */ ggml_backend_qnn_graph_compute, - /* .supports_op = */ ggml_backend_qnn_supports_op, - /* .offload_op = */ ggml_backend_qnn_offload_op, - /* .event_new = */ nullptr, - /* .event_free = */ nullptr, - /* .event_record = */ nullptr, - /* .event_wait = */ nullptr, - /* .event_synchronize = */ nullptr, + /* .get_name = */ ggml_backend_qnn_name, + /* .free = */ ggml_backend_qnn_free, + /* .get_default_buffer_type = */ ggml_backend_qnn_get_default_buffer_type, + /* .set_tensor_async = */ nullptr, + /* .get_tensor_async = */ nullptr, + /* .cpy_tensor_async = */ nullptr, + /* .synchronize = */ nullptr, + /* .graph_plan_create = */ nullptr, + /* .graph_plan_free = */ nullptr, + /* .graph_plan_compute = */ nullptr, + /* .graph_compute = */ ggml_backend_qnn_graph_compute, + /* .supports_op = */ ggml_backend_qnn_supports_op, + /* .offload_op = */ ggml_backend_qnn_offload_op, + /* .event_new = */ nullptr, + /* .event_free = */ nullptr, + /* .event_record = */ nullptr, + /* .event_wait = */ nullptr, + /* .event_synchronize = */ nullptr, }; - static ggml_guid_t ggml_backend_qnn_guid() { - static ggml_guid guid = {0x1a, 0x2b, 0x3c, 0x4d, 0x5e, 0x6f, 0x70, 0x81, 0x92, 0xa3, 0xb4, 0xc5, - 0xd6, 0xe7, 0xf8, 0x09}; + static ggml_guid guid = { + 0x1a, 0x2b, 0x3c, 0x4d, 0x5e, 0x6f, 0x70, 0x81, + 0x92, 0xa3, 0xb4, 0xc5, 0xd6, 0xe7, 0xf8, 0x09 + }; return &guid; } - static ggml_backend_t ggml_backend_qnn_reg_init(const char * params, void * user_data) { if (nullptr == params) { - //QNN library path - //can be hardcoded to "/data/local/tmp/" for Android command line application - //or specified in JNI layer for Android APK + // QNN library path + // can be hardcoded to "/data/local/tmp/" for Android command line application + // or specified in JNI layer for Android APK params = "/data/local/tmp/"; } ggml_backend_t qnn_backend = ggml_backend_qnn_init((int) (intptr_t) user_data, params); @@ -3027,30 +3064,25 @@ static ggml_backend_t ggml_backend_qnn_reg_init(const char * params, void * user return qnn_backend; } - bool ggml_backend_is_qnn(ggml_backend_t backend) { return backend != nullptr && ggml_guid_matches(backend->guid, ggml_backend_qnn_guid()); } - void ggml_backend_qnn_set_n_threads(ggml_backend_t backend, int n_threads) { GGML_ASSERT(ggml_backend_is_qnn(backend)); - struct ggml_backend_qnn_context * ctx = (struct ggml_backend_qnn_context *)backend->context; + struct ggml_backend_qnn_context * ctx = (struct ggml_backend_qnn_context *) backend->context; ctx->threads = n_threads; } - const char * ggml_backend_qnn_get_name(ggml_backend_t backend) { return backend->iface.get_name(backend); } - int ggml_backend_qnn_get_device_count() { return GGML_QNN_MAX_DEVICES; } - void ggml_backend_qnn_get_device_description(size_t 
dev_num, char * description, size_t description_size) { if (nullptr == description || 0 == description_size) { QNN_LOG_WARN("invalid param"); @@ -3063,14 +3095,13 @@ void ggml_backend_qnn_get_device_description(size_t dev_num, char * description, } snprintf(description, description_size, "%s", g_qnn_mgr[dev_num].name); - QNN_LOG_DEBUG("description:%s", description); } - ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device) { if (device >= GGML_QNN_MAX_DEVICES) { - QNN_LOG_DEBUG("ggml_backend_qnn_buffer_type error: device_index:%d is out of range [0, %d]\n", - device, GGML_QNN_MAX_DEVICES - 1); + QNN_LOG_DEBUG("ggml_backend_qnn_buffer_type error: device_index:%d is " + "out of range [0, %d]\n", + device, GGML_QNN_MAX_DEVICES - 1); return nullptr; } @@ -3086,11 +3117,12 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device) { /* .alloc_buffer = */ ggml_backend_qnn_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_qnn_buffer_type_get_alignment, /* .get_max_size = */ ggml_backend_qnn_buffer_type_get_max_size, - /* .get_alloc_size = */ nullptr,// defaults to ggml_nbytes + /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes /* .supports_backend = */ ggml_backend_qnn_buffer_type_supports_backend, /* .is_host = */ ggml_backend_qnn_buffer_is_host - }, - /* .context = */ new ggml_backend_qnn_buffer_type_context { device, GGML_QNN_NAME + std::to_string(device) }, + }, + /* .context = */ new ggml_backend_qnn_buffer_type_context { device, + GGML_QNN_NAME + std::to_string(device)}, }; } ggml_backend_qnn_buffer_type_initialized = true; @@ -3099,7 +3131,6 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device) { return &ggml_backend_qnn_buffer_types[device]; } - /** * * @param device 0: QNN_BACKEND_CPU 1: QNN_BACKEND_GPU 2: QNN_BACKEND_NPU @@ -3124,8 +3155,9 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { std::string path = qnn_lib_path; if (QNN_BACKEND_NPU == device) { if (0 == setenv("LD_LIBRARY_PATH", - (path + - ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/dsp:/vendor/dsp/images").c_str(), + (path + ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/" + "dsp:/vendor/dsp/images") + .c_str(), 1)) { QNN_LOG_INFO("QNN NPU backend setenv successfully"); } else { @@ -3133,31 +3165,35 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { } if (0 == setenv("ADSP_LIBRARY_PATH", (path + - ";/vendor/dsp/cdsp;/vendor/lib/rfsa/adsp;/system/lib/rfsa/adsp;/vendor/dsp/dsp;/vendor/dsp/images;/dsp").c_str(), + ";/vendor/dsp/cdsp;/vendor/lib/rfsa/adsp;/system/lib/" + "rfsa/adsp;/vendor/dsp/dsp;/vendor/dsp/images;/dsp") + .c_str(), 1)) { QNN_LOG_INFO("QNN NPU backend setenv successfully"); } else { QNN_LOG_ERROR("QNN NPU backend setenv failure"); } } else { - if (0 == setenv("LD_LIBRARY_PATH", - path.c_str(), - 1)) { - QNN_LOG_INFO("%s backend setenv successfully\n", get_qnn_backend_name(device)); + if (0 == setenv("LD_LIBRARY_PATH", path.c_str(), 1)) { + QNN_LOG_INFO("%s backend setenv successfully\n", + get_qnn_backend_name(device)); } else { - QNN_LOG_ERROR("%s backend setenv failure\n", get_qnn_backend_name(device)); + QNN_LOG_ERROR("%s backend setenv failure\n", + get_qnn_backend_name(device)); } } qnn_instance * instance = nullptr; instance = new qnn_instance(qnn_lib_path, g_qnn_mgr[device].lib, ""); - result = instance->qnn_init(nullptr); + result = instance->qnn_init(nullptr); if (0 != result) { - QNN_LOG_WARN("init qnn subsystem failed with qnn backend %s, pls check why\n", 
get_qnn_backend_name(device)); + QNN_LOG_WARN( + "init qnn subsystem failed with qnn backend %s, pls check why\n", + get_qnn_backend_name(device)); delete instance; return nullptr; } - qnn_interface qnn_interface = instance->get_qnn_interface(); + qnn_interface qnn_interface = instance->get_qnn_interface(); if (!qnn_interface.is_loaded()) { QNN_LOG_WARN("qnn subsystem failure\n"); delete instance; @@ -3167,29 +3203,28 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { std::string device_name = get_qnn_backend_name(device); QNN_LOG_INFO("qnn device name %s", device_name.c_str()); instance->init_qnn_graph(device_name.c_str(), false); - g_qnn_mgr[device].instance = instance; - g_qnn_mgr[device].raw_interface = instance->get_qnn_raw_interface(); - g_qnn_mgr[device].raw_system_interface = instance->get_qnn_raw_system_interface(); + g_qnn_mgr[device].instance = instance; + g_qnn_mgr[device].raw_interface = instance->get_qnn_raw_interface(); + g_qnn_mgr[device].raw_system_interface = instance->get_qnn_raw_system_interface(); - ggml_backend_t qnn_backend = new ggml_backend{ - /* .guid = */ ggml_backend_qnn_guid(), - /* .iface = */ ggml_backend_qnn_interface, - /* .context = */ &g_qnn_mgr[device] - }; - g_qnn_mgr[device].backend = qnn_backend; + ggml_backend_t qnn_backend = + new ggml_backend{/* .guid = */ ggml_backend_qnn_guid(), + /* .iface = */ ggml_backend_qnn_interface, + /* .context = */ &g_qnn_mgr[device]}; + g_qnn_mgr[device].backend = qnn_backend; return qnn_backend; } - extern "C" GGML_CALL int ggml_backend_qnn_reg_devices(void); GGML_CALL int ggml_backend_qnn_reg_devices() { for (size_t idx = 0; idx < GGML_QNN_MAX_DEVICES; idx++) { char name[GGML_MAX_NAME]; ggml_backend_qnn_get_device_description(idx, name, GGML_MAX_NAME); - ggml_backend_register(name, ggml_backend_qnn_reg_init, ggml_backend_qnn_buffer_type(idx), - (void *) (intptr_t)idx); + ggml_backend_register(name, ggml_backend_qnn_reg_init, + ggml_backend_qnn_buffer_type(idx), + (void *) (intptr_t) idx); } return GGML_QNN_MAX_DEVICES; From 3e8b61f9702a702bfe14478bdc4eb466038643dd Mon Sep 17 00:00:00 2001 From: "zhou.weiguo" Date: Sun, 9 Jun 2024 09:06:44 +0800 Subject: [PATCH 011/166] review: fix a memory leak introduced by review modification which explained in https://github.com/zhouwg/llama.cpp/pull/1 --- ggml-qnn.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index 3c5ff332a1df2..d1d69afe2eef5 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -2771,6 +2771,7 @@ GGML_CALL static bool ggml_backend_buffer_is_qnn(ggml_backend_buffer_t buffer) { GGML_CALL static void ggml_backend_qnn_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context; + delete ctx; } @@ -3105,12 +3106,14 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device) { return nullptr; } + //ref:https://github.com/zhouwg/llama.cpp/pull/1 + static ggml_backend_qnn_buffer_type_context ggml_backend_qnn_buffer_type_contexts[GGML_QNN_MAX_DEVICES]; static ggml_backend_buffer_type ggml_backend_qnn_buffer_types[GGML_QNN_MAX_DEVICES]; - static bool ggml_backend_qnn_buffer_type_initialized = false; - if (!ggml_backend_qnn_buffer_type_initialized) { - for (int i = 0; i < GGML_QNN_MAX_DEVICES; i++) { + for (size_t i = 0; i < GGML_QNN_MAX_DEVICES; i++) { + auto & context = ggml_backend_qnn_buffer_type_contexts[i]; + context = { i, std::string(GGML_QNN_NAME) + std::to_string(i) }; 
ggml_backend_qnn_buffer_types[i] = { /* .iface = */ { /* .get_name = */ ggml_backend_qnn_buffer_type_name, @@ -3121,8 +3124,7 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device) { /* .supports_backend = */ ggml_backend_qnn_buffer_type_supports_backend, /* .is_host = */ ggml_backend_qnn_buffer_is_host }, - /* .context = */ new ggml_backend_qnn_buffer_type_context { device, - GGML_QNN_NAME + std::to_string(device)}, + /* .context = */ & context, }; } ggml_backend_qnn_buffer_type_initialized = true; From d38d4a67d17570d3b3003397a50f873f5e143603 Mon Sep 17 00:00:00 2001 From: "zhou.weiguo" Date: Sun, 9 Jun 2024 23:49:54 +0800 Subject: [PATCH 012/166] npu: probe htp info and capacity of rpc ion memory --- ggml-qnn.cpp | 123 +++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 115 insertions(+), 8 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index d1d69afe2eef5..3248e244a31c2 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -152,6 +152,28 @@ enum class ggml_qnn_profile_level { profile_detail = 2 }; +enum qcom_htp_arch { + NONE = 0, + V68 = 68, + V69 = 69, + V73 = 73, + V75 = 75, +}; + +enum qcom_chipset { + UNKNOWN_SM = 0, + SM8450 = 36, // v69 + SM8475 = 42, // v69 + SM8550 = 43, // v73 + SM8650 = 57, // v75 +}; + +struct qcom_socinfo { + int soc_model; + int htp_arch; + int vtcm_size_in_mb; +}; + struct ggml_backend_qnn_context { int device; int threads; @@ -216,6 +238,29 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { .raw_system_interface = {}}, }; +static struct qcom_socinfo g_qnn_soc_info_table[] = { + /* Qualcomm SnapDragon 8 Gen 1 */ + [SM8450] = {.soc_model = SM8450, + .htp_arch = V69, + .vtcm_size_in_mb = 8}, + + /* Qualcomm SnapDragon 8 Gen 1+ */ + [SM8475] = {.soc_model = SM8475, + .htp_arch = V69, + .vtcm_size_in_mb = 8}, + + /* Qualcomm SnapDragon 8 Gen 2 */ + [SM8550] = {.soc_model = SM8550, + .htp_arch = V73, + .vtcm_size_in_mb = 8}, + + /* Qualcomm SnapDragon 8 Gen 3 */ + [SM8650] = {.soc_model = SM8650, + .htp_arch = V75, + .vtcm_size_in_mb = 8}, + +}; + // ================================================================================================= // // QNN helper functions and other internal helper functions @@ -485,6 +530,8 @@ static Qnn_DataType_t qnn_datatype_from_ggml_datatype(enum ggml_type ggmltype) { return QNN_DATATYPE_INT_8; case GGML_TYPE_Q8_0: return QNN_DATATYPE_SFIXED_POINT_8; + case GGML_TYPE_Q4_0: + return QNN_DATATYPE_SFIXED_POINT_4; default: break; } @@ -527,19 +574,34 @@ Fn load_qnn_functionpointers(void * handle, const char * function_name) { static const char * get_qnn_backend_name(int n_backend_type) { switch (n_backend_type) { - case 0: + case QNN_BACKEND_CPU: return "QNN-CPU"; - case 1: + case QNN_BACKEND_GPU: return "QNN-GPU"; - case 2: + case QNN_BACKEND_NPU: return "QNN-NPU"; - case 3: + case QNN_BACKEND_GGML: return "ggml"; //"fake" QNN backend, used for compare performance between QNN backend and original GGML default: return "unknown"; } } +static const char * qnn_get_chipset_desc(uint32_t chipset_id) { + switch (chipset_id) { + case SM8450: + return "SM8450"; + case SM8475: + return "SM8475"; + case SM8550: + return "SM8550"; + case SM8650: + return "SM8650"; + default: + return "unknown"; + } +} + static intptr_t align_to(size_t alignment, intptr_t offset) { return offset % alignment == 0 ? 
offset @@ -875,7 +937,7 @@ class qnn_instance { return 0; } - std::string &get_qnn_graph_name() { return _graph_name; } + std::string & get_qnn_graph_name() { return _graph_name; } bool is_rpcmem_initialized() { return _rpcmem_initialized; } @@ -893,6 +955,8 @@ class qnn_instance { void free_rpcmem(void * buf); + size_t get_rpcmem_capacity() { return _rpcmem_capacity; } + bool is_rpcmem_allocated(void * buf); bool is_rpcmem_registered(Qnn_MemHandle_t handle) { @@ -977,6 +1041,7 @@ class qnn_instance { pfn_rpc_mem_init _pfn_rpc_mem_init; pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; std::unordered_map _rpcmem_store_map; + size_t _rpcmem_capacity = 512; std::string _graph_name; }; @@ -1493,6 +1558,46 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { QNN_LOG_DEBUG("initialize qnn context successfully\n"); } + if (_backend_name.find("Htp") != std::variant_npos) { + const QnnDevice_PlatformInfo_t * p_info = nullptr; + _qnn_raw_interface.deviceGetPlatformInfo(nullptr, &p_info); + QNN_LOG_INFO("device counts %d", p_info->v1.numHwDevices); + QnnDevice_HardwareDeviceInfo_t * infos = p_info->v1.hwDevices; + for (int i = 0; i < p_info->v1.numHwDevices; i++) { + QNN_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d", infos[i].v1.deviceId, + infos[i].v1.deviceType, infos[i].v1.numCores); + QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; + QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = devinfo->onChipDevice; + QnnHtpDevice_Arch_t chiparch = chipinfo.arch; + QNN_LOG_INFO("htp_type:%d(%s)", devinfo->devType, (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? "ON_CHIP" : ""); + QNN_LOG_INFO("qualcomm soc_model:%d(%s), htp_arch:%d, vtcm_size_in_mb:%d MB", chipinfo.socModel, + qnn_get_chipset_desc(chipinfo.socModel), chiparch, chipinfo.vtcmSize); + } + _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info); + + + //TODO: faster approach to probe the accurate capacity of rpc ion memory + size_t candidate_size = 0; + uint8_t * rpc_buffer = nullptr; + const int SIZE_IN_MB = (1 << 20); + size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048}; + size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); + for (size_t idx = 0; idx < probe_counts; idx++) { + rpc_buffer = static_cast(alloc_rpcmem(probe_slots[idx] * SIZE_IN_MB, 4)); + if (nullptr == rpc_buffer) { + QNN_LOG_INFO("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], strerror(errno)); + break; + } else { + candidate_size = probe_slots[idx]; + free_rpcmem(rpc_buffer); + rpc_buffer = nullptr; + } + } + if (candidate_size > _rpcmem_capacity) + _rpcmem_capacity = candidate_size; + QNN_LOG_INFO("capacity of rpc ion memory %d MB\n", _rpcmem_capacity); + } + QNN_LOG_DEBUG("leave qni_init\n"); return 0; @@ -1654,9 +1759,11 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context *ctx, const int64_t ne20 = tensor->ne[0]; const int64_t ne21 = tensor->ne[1]; - //TODO: support other quatinized data type - if (ggml_is_quantized(src0->type) && (src0->type != GGML_TYPE_Q8_0)) { - return false; + //TODO: support other quantized data type + if (ggml_is_quantized(src0->type)) { + if ((src0->type != GGML_TYPE_Q8_0) && (src0->type != GGML_TYPE_Q4_0)) { + return false; + } } if (b_dump_tensor_info) { From 5f8cfe4a1eecab1504dea1451f7d4b4e7983d7b9 Mon Sep 17 00:00:00 2001 From: "zhou.weiguo" Date: Mon, 10 Jun 2024 20:07:26 +0800 Subject: [PATCH 013/166] ggml-qnn: refine source code of ggml-qnn.cpp to make reviewer more happy --- ggml-qnn.cpp | 2654 +++++++++++++++++++++++++------------------------- 1 file changed, 
1327 insertions(+), 1327 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index 3248e244a31c2..43a8fcd3ea8cb 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -32,8 +33,17 @@ #include #include #include -#include +#if (defined __ANDROID__) || (defined ANDROID) +#include +#endif + +#include "ggml-qnn.h" + +#include "ggml-backend-impl.h" + +// header file of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK +// https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct #include "QnnTypes.h" #include "QnnCommon.h" #include "QnnContext.h" @@ -46,14 +56,6 @@ #include "System/QnnSystemInterface.h" #include "HTP/QnnHtpDevice.h" -#include "ggml-qnn.h" - -#include "ggml-backend-impl.h" - -#if (defined __ANDROID__) || (defined ANDROID) -#include -#endif - // ================================================================================================= // // forward declaration @@ -61,96 +63,31 @@ // ================================================================================================= class qnn_instance; -static void ggml_qnn_log_internal(ggml_log_level level, const char * file, - const char * func, int line, - const char * format, ...); +struct ggml_backend_qnn_context; + +static int free_qnn_tensor(Qnn_Tensor_t & tensor); // ================================================================================================= // // self-defined macro / data structure // // ================================================================================================= -#define RPCMEM_DEFAULT_FLAGS 1 -#define RPCMEM_HEAP_ID_SYSTEM 25 - -#define GGML_QNN_DEBUG 1 // for troubleshooting QNN backend -#define ENABLE_QNN_LOG 0 // enable/disable QNN internal log -#define GGML_QNN_LOGBUF_LEN 4096 -#define QNN_VER_PTR(x) (&((x).v1)) -#define GGML_QNN_NAME "qnn" - -#define QNN_LOG_ERROR(...) \ - ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) - -#define QNN_LOG_WARN(...) \ - ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define ENABLE_QNNBACKEND_DEBUG 1 // for troubleshooting QNN backend +#define ENABLE_QNNSDK_LOG 0 // enable/disable QNN SDK's internal log +#define ENABLE_QNNBACKEND_PERF 0 // enable/disable op's perf info +#define QNN_LOGBUF_LEN 4096 +#define QNN_BACKEND_NAME "qnn" -#define QNN_LOG_INFO(...) \ - ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) - -#if GGML_QNN_DEBUG -#define QNN_LOG_DEBUG(...) \ - ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#else -#define QNN_LOG_DEBUG(...) 
-#endif - -#define VALIDATE(value, status) \ - do { \ - status = value; \ - if (status != QNN_SUCCESS) { \ - QNN_LOG_WARN("%s expected QNN_SUCCESS\n", #value); \ - return status; \ - } \ - } while (0) +typedef void (*ggml_qnn_func_t)(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, + const ggml_tensor * src1, + ggml_tensor * dst); -#define VALIDATE_TENSOR_VERSION(tensor, err) \ - VALIDATE(validate_tensor_version(tensor), err) - -#define QNN_TENSOR_GET_ID(tensor) get_qnn_tensorid(tensor) -#define QNN_TENSOR_GET_NAME(tensor) get_qnn_tensorname(tensor) -#define QNN_TENSOR_GET_TYPE(tensor) get_qnn_tensortype(tensor) -#define QNN_TENSOR_GET_DATA_FORMAT(tensor) get_qnn_tensor_dataformat(tensor) -#define QNN_TENSOR_GET_DATA_TYPE(tensor) get_qnn_tensor_datatype(tensor) -#define QNN_TENSOR_GET_QUANT_PARAMS(tensor) get_qnn_tensor_quantparams(tensor) -#define QNN_TENSOR_GET_RANK(tensor) get_qnn_tensor_rank(tensor) -#define QNN_TENSOR_GET_DIMENSIONS(tensor) get_qnn_tensor_dimensions(tensor) -#define QNN_TENSOR_GET_MEM_TYPE(tensor) get_qnn_tensor_memtype(tensor) - -#define QNN_TENSOR_SET_ID(tensor, value) set_qnn_tensor_id(tensor, value) -#define QNN_TENSOR_SET_NAME(tensor, value) set_qnn_tensor_name(tensor, value) -#define QNN_TENSOR_SET_TYPE(tensor, value) set_qnn_tensor_type(tensor, value) -#define QNN_TENSOR_SET_DATA_FORMAT(tensor, value) \ - set_qnn_tensor_dataformat(tensor, value) -#define QNN_TENSOR_SET_DATA_TYPE(tensor, value) \ - set_qnn_tensor_datatype(tensor, value) -#define QNN_TENSOR_SET_QUANT_PARAMS(tensor, value) \ - set_qnn_tensor_quantparams(tensor, value) -#define QNN_TENSOR_SET_RANK(tensor, value) set_qnn_tensor_rank(tensor, value) -#define QNN_TENSOR_SET_DIMENSIONS(tensor, value) \ - set_qnn_tensor_dimensions(tensor, value) -#define QNN_TENSOR_SET_MEM_TYPE(tensor, value) \ - set_qnn_tensor_memtype(tensor, value) -#define QNN_TENSOR_SET_CLIENT_BUF(tensor, value) \ - set_qnn_tensor_clientbuf(tensor, value) -#define QNN_TENSOR_SET_MEM_HANDLE(tensor, value) \ - set_qnn_tensor_memhandle(tensor, value) - -using pfn_rpc_mem_init = void (*)(void); -using pfn_rpc_mem_deinit = void (*)(void); -using pfn_rpc_mem_alloc = void *(*) (int, uint32_t, int); -using pfn_rpc_mem_free = void (*)(void *); -using pfn_rpc_mem_to_fd = int (*)(void *); - -using _pfn_QnnSaver_initialize = decltype(QnnSaver_initialize); -using _pfn_QnnInterface_getProviders = decltype(QnnInterface_getProviders); -using _pfn_QnnSystemInterface_getProviders = decltype(QnnSystemInterface_getProviders); - -enum class ggml_qnn_profile_level { - profile_off = 0, - profile_basic = 1, - profile_detail = 2 -}; +typedef void (*ggml_qnn_func_common_t)(ggml_backend_qnn_context * ctx, + const ggml_op ggml_op, + const ggml_tensor * src0, + const ggml_tensor * src1, + ggml_tensor * dst); enum qcom_htp_arch { NONE = 0, @@ -169,9 +106,36 @@ enum qcom_chipset { }; struct qcom_socinfo { - int soc_model; - int htp_arch; - int vtcm_size_in_mb; + uint32_t soc_model; + size_t htp_arch; + size_t vtcm_size_in_mb; +}; + +static struct qcom_socinfo g_qnn_soc_info_table[] = { + /* Qualcomm SnapDragon 8 Gen 1 */ + [SM8450] = { + .soc_model = SM8450, + .htp_arch = V69, + .vtcm_size_in_mb = 8}, + + /* Qualcomm SnapDragon 8 Gen 1+ */ + [SM8475] = { + .soc_model = SM8475, + .htp_arch = V69, + .vtcm_size_in_mb = 8}, + + /* Qualcomm SnapDragon 8 Gen 2 */ + [SM8550] = { + .soc_model = SM8550, + .htp_arch = V73, + .vtcm_size_in_mb = 8}, + + /* Qualcomm SnapDragon 8 Gen 3 */ + [SM8650] = { + .soc_model = SM8650, + .htp_arch = V75, + .vtcm_size_in_mb = 
8}, + }; struct ggml_backend_qnn_context { @@ -183,19 +147,9 @@ struct ggml_backend_qnn_context { struct ggml_backend * backend; QNN_INTERFACE_VER_TYPE raw_interface; QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; + struct qcom_socinfo socinfo; }; -typedef void (*ggml_qnn_func_t)(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, - const ggml_tensor * src1, - ggml_tensor * dst); - -typedef void (*ggml_qnn_func_common_t)(ggml_backend_qnn_context * ctx, - const ggml_op ggml_op, - const ggml_tensor * src0, - const ggml_tensor * src1, - ggml_tensor * dst); - // according to the QNN SDK Reference Guide, // CPU - Choose a non-quantized model.Quantized models are currently incompatible with the CPU backend // GPU - Choose a non-quantized model.Quantized models are currently incompatible with the GPU backend @@ -217,7 +171,8 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { .instance = nullptr, .backend = nullptr, .raw_interface = {}, - .raw_system_interface = {}}, + .raw_system_interface = {}, + .socinfo = {}}, [QNN_BACKEND_GPU] = {.device = 1, .threads = 1, @@ -226,7 +181,8 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { .instance = nullptr, .backend = nullptr, .raw_interface = {}, - .raw_system_interface = {}}, + .raw_system_interface = {}, + .socinfo = {}}, [QNN_BACKEND_NPU] = {.device = 2, .threads = 1, @@ -235,128 +191,425 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { .instance = nullptr, .backend = nullptr, .raw_interface = {}, - .raw_system_interface = {}}, + .raw_system_interface = {}, + .socinfo = {}}, }; -static struct qcom_socinfo g_qnn_soc_info_table[] = { - /* Qualcomm SnapDragon 8 Gen 1 */ - [SM8450] = {.soc_model = SM8450, - .htp_arch = V69, - .vtcm_size_in_mb = 8}, +struct ggml_backend_qnn_buffer_context { + ggml_backend_qnn_buffer_context(size_t device) + : device(device) + , name(QNN_BACKEND_NAME + std::to_string(device)) {} - /* Qualcomm SnapDragon 8 Gen 1+ */ - [SM8475] = {.soc_model = SM8475, - .htp_arch = V69, - .vtcm_size_in_mb = 8}, + ~ggml_backend_qnn_buffer_context() { + if (buffer) { + free(buffer); + } - /* Qualcomm SnapDragon 8 Gen 2 */ - [SM8550] = {.soc_model = SM8550, - .htp_arch = V73, - .vtcm_size_in_mb = 8}, + for (auto * sub_buffer : sub_buffers) { + free(sub_buffer); + } - /* Qualcomm SnapDragon 8 Gen 3 */ - [SM8650] = {.soc_model = SM8650, - .htp_arch = V75, - .vtcm_size_in_mb = 8}, + for (auto * qnn_tensor : qnn_tensors) { + free_qnn_tensor(*qnn_tensor); + free(qnn_tensor); + } + + sub_buffers.clear(); + qnn_tensors.clear(); + } + void * buffer = nullptr; + + struct ggml_backend_qnn_context * backend_ctx = nullptr; + + size_t buffer_size = 0; + std::vector sub_buffers; + std::vector qnn_tensors; + size_t device; + std::string name; +}; +struct ggml_backend_qnn_buffer_type_context { + size_t device; + std::string name; }; // ================================================================================================= // -// QNN helper functions and other internal helper functions +// QNN backend internal log function // // ================================================================================================= -static inline int validate_tensor_version(Qnn_Tensor_t tensor) { - if (tensor.version != QNN_TENSOR_VERSION_1) { - QNN_LOG_WARN( - "validate_tensor_version() tensor %s, got unsupported version %d\n", - tensor.v1.name, tensor.version); - return 1; +static void qnn_internal_log(ggml_log_level level, const char * file, + const char * func, 
int line, + const char * format, ...); +#define QNN_LOG_ERROR(...) \ + qnn_internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) + +#define QNN_LOG_WARN(...) \ + qnn_internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) + +#define QNN_LOG_INFO(...) \ + qnn_internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) + +#if ENABLE_QNNBACKEND_DEBUG +#define QNN_LOG_DEBUG(...) \ + qnn_internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#else +#define QNN_LOG_DEBUG(...) +#endif + +// ================================================================================================= +// +// QNN backend internal helper functions +// +// ================================================================================================= +static uint32_t qnn_get_ggml_tensor_rank(const ggml_tensor * tensor) { + uint32_t rank = 0; + for (int i = 0; i < GGML_MAX_DIMS; i++) { + if ((0 != tensor->ne[i]) && (1 != tensor->ne[i])) { + rank++; + } } - return 0; + return rank; } -static inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.id; +// TODO: mapping more ggml data type to QNN data type +// ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684 +static Qnn_DataType_t qnn_datatype_from_ggml_datatype(enum ggml_type ggmltype) { + switch (ggmltype) { + case GGML_TYPE_F16: + return QNN_DATATYPE_FLOAT_16; + case GGML_TYPE_F32: + return QNN_DATATYPE_FLOAT_32; + case GGML_TYPE_I8: + return QNN_DATATYPE_INT_8; + case GGML_TYPE_Q8_0: + return QNN_DATATYPE_SFIXED_POINT_8; + case GGML_TYPE_Q4_0: + return QNN_DATATYPE_SFIXED_POINT_4; + default: + break; } - - return 0u; + return QNN_DATATYPE_UNDEFINED; } -static inline const char * get_qnn_tensorname(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.name; +// TODO: only support GGML_OP_ADD/GGML_OP_MUL/GGML_OP_MUL_MAT +static const char * qnn_opname_from_ggmlop(enum ggml_op ggmlop) { + switch (ggmlop) { + case GGML_OP_ADD: + return QNN_OP_ELEMENT_WISE_ADD; + case GGML_OP_MUL: + return QNN_OP_ELEMENT_WISE_MULTIPLY; + case GGML_OP_MUL_MAT: + return QNN_OP_MAT_MUL; + default: + break; } return nullptr; } -static inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.type; +static uint32_t qnn_get_ggml_tensor_data_size(const ggml_tensor * tensor) { + /* + size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); + size_t n_dims = qnn_get_ggml_tensor_rank(tensor); + for (int i = 1; i < n_dims; i++) { + data_size *= tensor->ne[i]; } - return QNN_TENSOR_TYPE_UNDEFINED; + + return data_size; + */ + return ggml_nbytes(tensor); } -static inline Qnn_TensorDataFormat_t - get_qnn_tensor_dataformat(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.dataFormat; +static const char * qnn_get_backend_name(int n_backend_type) { + switch (n_backend_type) { + case QNN_BACKEND_CPU: + return "QNN-CPU"; + case QNN_BACKEND_GPU: + return "QNN-GPU"; + case QNN_BACKEND_NPU: + return "QNN-NPU"; + case QNN_BACKEND_GGML: + return "ggml"; //"fake" QNN backend, used for compare performance between QNN backend and original GGML + default: + return "unknown"; } - return QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER; } -static inline Qnn_DataType_t - get_qnn_tensor_datatype(const Qnn_Tensor_t & tensor) { - if (tensor.version 
== QNN_TENSOR_VERSION_1) { - return tensor.v1.dataType; +static const char * qnn_get_chipset_desc(uint32_t chipset_id) { + switch (chipset_id) { + case SM8450: + return "SM8450"; + case SM8475: + return "SM8475"; + case SM8550: + return "SM8550"; + case SM8650: + return "SM8650"; + default: + return "unknown"; } - return QNN_DATATYPE_UNDEFINED; } -static inline Qnn_QuantizeParams_t - get_qnn_tensor_quantparams(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.quantizeParams; +static const char * qnn_get_htparch_desc(size_t htp_arch) { + switch (htp_arch) { + case V68: + return "QCOM_HTP_V68"; + case V69: + return "QCOM_HTP_V69"; + case V73: + return "QCOM_HTP_V73"; + case V75: + return "QCOM_HTP_V75"; + default: + return "unknown"; } - return QNN_QUANTIZE_PARAMS_INIT; } -static inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.rank; +static void qnn_internal_log(ggml_log_level level, const char * file, + const char * func, int line, + const char * format, ...) { + static std::mutex qnn_internal_log_mutex; + static char s_qnn_internal_log_buf[QNN_LOGBUF_LEN]; + + { + std::lock_guard lock(qnn_internal_log_mutex); + va_list args; + + va_start(args, format); + int len_prefix = + snprintf(s_qnn_internal_log_buf, QNN_LOGBUF_LEN, + "[%s, %d]: ", func, line); + int len = vsnprintf(s_qnn_internal_log_buf + len_prefix, + QNN_LOGBUF_LEN - len_prefix, format, args); + if (len < (QNN_LOGBUF_LEN - len_prefix)) { +#if (defined __ANDROID__) || (defined ANDROID) + // for Android APK + __android_log_print(level, "ggml-qnn", "%s\n", s_qnn_internal_log_buf); +#endif + // for Android command line application or WoA(Windows on ARM) + printf("%s\n", s_qnn_internal_log_buf); + } + va_end(args); } - return 0u; } -static inline uint32_t * get_qnn_tensor_dimensions(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.dimensions; + +static bool qnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { + if ((nullptr == ctx) || (nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { + QNN_LOG_WARN("invalid params\n"); + return false; } - return nullptr; -} -static inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.memType; + qnn_instance * instance = nullptr; + Qnn_Tensor_t * tensor_0 = nullptr; + Qnn_Tensor_t * tensor_1 = nullptr; + Qnn_Tensor_t * tensor_2 = nullptr; + tensor_0 = (Qnn_Tensor_t *) src0->extra; + tensor_1 = (Qnn_Tensor_t *) src1->extra; + tensor_2 = (Qnn_Tensor_t *) dst->extra; + instance = ctx->instance; + if ((nullptr == instance) || (nullptr == tensor_0) || (nullptr == tensor_1) || (nullptr == tensor_2)) { + QNN_LOG_WARN("invalid params\n"); + return false; } - return QNN_TENSORMEMTYPE_UNDEFINED; + + return true; } -static inline void set_qnn_tensor_id(Qnn_Tensor_t & tensor, uint32_t id) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.id = id; +#define CHECK_PARAMS(ctx, src0, src1, dst) \ + do { \ + if (!qnn_is_valid_params((ctx), (src0), (src1), (dst))) { \ + return; \ + } \ + } while (0) + +#if ENABLE_QNNBACKEND_PERF +class qnn_perf { +public: + qnn_perf(const std::string & perf_name) : _perf_name(std::move(perf_name)) {}; + qnn_perf() = delete; + qnn_perf(const qnn_perf & ) = delete; + qnn_perf & operator= (const qnn_perf & ) = delete; + + 
void start() { + _begin_time = ggml_time_us(); } -} -static inline void set_qnn_tensor_name(Qnn_Tensor_t & tensor, const char * name) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.name = name; + void info() { + _end_time = ggml_time_us(); + _duration = (_end_time - _begin_time) / 1000; + QNN_LOG_DEBUG("duration of %s : %lld milliseconds\n", _perf_name.c_str(), _duration); } -} -static inline void set_qnn_tensor_type(Qnn_Tensor_t & tensor, Qnn_TensorType_t type) { - if (tensor.version == QNN_TENSOR_VERSION_1) { +private: + int64_t _begin_time = 0LL; + int64_t _end_time = 0LL; + int64_t _duration = 0LL; + std::string _perf_name; +}; +#else +class qnn_perf { +public: + qnn_perf(const std::string & perf_name) {} + qnn_perf() = delete; + qnn_perf(const qnn_perf & ) = delete; + qnn_perf & operator= (const qnn_perf & ) = delete; + + void start() {} + void info() {} +}; +#endif + +// ================================================================================================= +// +// helper data type / data structure / macros / functions of +// Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK +// ref:https://github.com/pytorch/executorch/tree/main/backends/qualcomm +// ================================================================================================= +enum qnn_sdk_profile_level { + profile_off = 0, + profile_basic = 1, + profile_detail = 2 +}; + +using _pfn_rpc_mem_init = void (*)(void); +using _pfn_rpc_mem_deinit = void (*)(void); +using _pfn_rpc_mem_alloc = void *(*) (int, uint32_t, int); +using _pfn_rpc_mem_free = void (*)(void *); +using _pfn_rpc_mem_to_fd = int (*)(void *); + +using _pfn_qnnsaver_initialize = decltype(QnnSaver_initialize); +using _pfn_qnninterface_getproviders = decltype(QnnInterface_getProviders); +using _pfn_qnnsysteminterface_getproviders = decltype(QnnSystemInterface_getProviders); + +#define QNN_VER_PTR(x) (&((x).v1)) +#define RPCMEM_DEFAULT_FLAGS 1 +#define RPCMEM_HEAP_ID_SYSTEM 25 + +#define VALIDATE(value, status) \ + do { \ + status = value; \ + if (status != QNN_SUCCESS) { \ + QNN_LOG_WARN("%s expected QNN_SUCCESS\n", #value); \ + return status; \ + } \ + } while (0) + +#define QNN_TENSOR_GET_ID(tensor) get_qnn_tensorid(tensor) +#define QNN_TENSOR_GET_NAME(tensor) get_qnn_tensorname(tensor) +#define QNN_TENSOR_GET_TYPE(tensor) get_qnn_tensortype(tensor) +#define QNN_TENSOR_GET_DATA_FORMAT(tensor) get_qnn_tensor_dataformat(tensor) +#define QNN_TENSOR_GET_DATA_TYPE(tensor) get_qnn_tensor_datatype(tensor) +#define QNN_TENSOR_GET_QUANT_PARAMS(tensor) get_qnn_tensor_quantparams(tensor) +#define QNN_TENSOR_GET_RANK(tensor) get_qnn_tensor_rank(tensor) +#define QNN_TENSOR_GET_DIMENSIONS(tensor) get_qnn_tensor_dimensions(tensor) +#define QNN_TENSOR_GET_MEM_TYPE(tensor) get_qnn_tensor_memtype(tensor) + +#define QNN_TENSOR_SET_ID(tensor, value) set_qnn_tensor_id(tensor, value) +#define QNN_TENSOR_SET_NAME(tensor, value) set_qnn_tensor_name(tensor, value) +#define QNN_TENSOR_SET_TYPE(tensor, value) set_qnn_tensor_type(tensor, value) +#define QNN_TENSOR_SET_DATA_FORMAT(tensor, value) set_qnn_tensor_dataformat(tensor, value) +#define QNN_TENSOR_SET_DATA_TYPE(tensor, value) set_qnn_tensor_datatype(tensor, value) +#define QNN_TENSOR_SET_QUANT_PARAMS(tensor, value) set_qnn_tensor_quantparams(tensor, value) +#define QNN_TENSOR_SET_RANK(tensor, value) set_qnn_tensor_rank(tensor, value) +#define QNN_TENSOR_SET_DIMENSIONS(tensor, value) set_qnn_tensor_dimensions(tensor, value) +#define QNN_TENSOR_SET_MEM_TYPE(tensor, value) 
set_qnn_tensor_memtype(tensor, value) +#define QNN_TENSOR_SET_CLIENT_BUF(tensor, value) set_qnn_tensor_clientbuf(tensor, value) +#define QNN_TENSOR_SET_MEM_HANDLE(tensor, value) set_qnn_tensor_memhandle(tensor, value) +#define VALIDATE_TENSOR_VERSION(tensor, err) VALIDATE(validate_tensor_version(tensor), err) + +static inline int validate_tensor_version(Qnn_Tensor_t tensor) { + if (tensor.version != QNN_TENSOR_VERSION_1) { + QNN_LOG_WARN( + "validate_tensor_version() tensor %s, got unsupported version %d\n", + tensor.v1.name, tensor.version); + return 1; + } + return 0; +} + +static inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.id; + } + + return 0u; +} + +static inline const char * get_qnn_tensorname(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.name; + } + return nullptr; +} + +static inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.type; + } + return QNN_TENSOR_TYPE_UNDEFINED; +} + +static inline Qnn_TensorDataFormat_t + get_qnn_tensor_dataformat(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dataFormat; + } + return QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER; +} + +static inline Qnn_DataType_t + get_qnn_tensor_datatype(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dataType; + } + return QNN_DATATYPE_UNDEFINED; +} + +static inline Qnn_QuantizeParams_t + get_qnn_tensor_quantparams(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.quantizeParams; + } + return QNN_QUANTIZE_PARAMS_INIT; +} + +static inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.rank; + } + return 0u; +} + +static inline uint32_t * get_qnn_tensor_dimensions(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dimensions; + } + return nullptr; +} + +static inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.memType; + } + return QNN_TENSORMEMTYPE_UNDEFINED; +} + +static inline void set_qnn_tensor_id(Qnn_Tensor_t & tensor, uint32_t id) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.id = id; + } +} + +static inline void set_qnn_tensor_name(Qnn_Tensor_t & tensor, const char * name) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.name = name; + } +} + +static inline void set_qnn_tensor_type(Qnn_Tensor_t & tensor, Qnn_TensorType_t type) { + if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.type = type; } } @@ -419,18 +672,13 @@ static size_t memscpy(void * dst, size_t dst_size, const void * src, size_t copy return min_size; } -static char * ggml_qnn_strndup(const char * source, size_t maxlen) { - return ::strndup(source, maxlen); -} - static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { int err = 0; VALIDATE_TENSOR_VERSION(src, err); dst.version = src.version; QNN_TENSOR_SET_NAME( - dst, ggml_qnn_strndup(QNN_TENSOR_GET_NAME(src), - std::string(QNN_TENSOR_GET_NAME(src)).size())); + dst, ::strndup(QNN_TENSOR_GET_NAME(src),std::string(QNN_TENSOR_GET_NAME(src)).size())); if (nullptr == QNN_TENSOR_GET_NAME(dst)) { return 1; } @@ -508,140 +756,61 @@ static int free_qnn_tensor(Qnn_Tensor_t & 
tensor) { return err; } -static uint32_t ggml_get_tensor_rank(const ggml_tensor * tensor) { - uint32_t rank = 0; - for (int i = 0; i < GGML_MAX_DIMS; i++) { - if ((0 != tensor->ne[i]) && (1 != tensor->ne[i])) { - rank++; - } - } - return rank; -} - -// TODO: mapping more ggml data type to QNN data type -// ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684 -static Qnn_DataType_t qnn_datatype_from_ggml_datatype(enum ggml_type ggmltype) { - switch (ggmltype) { - case GGML_TYPE_F16: - return QNN_DATATYPE_FLOAT_16; - case GGML_TYPE_F32: - return QNN_DATATYPE_FLOAT_32; - case GGML_TYPE_I8: - return QNN_DATATYPE_INT_8; - case GGML_TYPE_Q8_0: - return QNN_DATATYPE_SFIXED_POINT_8; - case GGML_TYPE_Q4_0: - return QNN_DATATYPE_SFIXED_POINT_4; - default: - break; - } - return QNN_DATATYPE_UNDEFINED; -} - -// TODO: only support GGML_OP_ADD/GGML_OP_MUL/GGML_OP_MUL_MAT -static const char * qnn_opname_from_ggmlop(enum ggml_op ggmlop) { - switch (ggmlop) { - case GGML_OP_ADD: - return QNN_OP_ELEMENT_WISE_ADD; - case GGML_OP_MUL: - return QNN_OP_ELEMENT_WISE_MULTIPLY; - case GGML_OP_MUL_MAT: - return QNN_OP_MAT_MUL; - default: - break; - } - - return nullptr; -} - -static uint32_t ggml_get_tensor_data_size(const ggml_tensor * tensor) { - /* - size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); - size_t n_dims = ggml_get_tensor_rank(tensor); - for (int i = 1; i < n_dims; i++) { - data_size *= tensor->ne[i]; - } - - return data_size; - */ - return ggml_nbytes(tensor); -} - -template -Fn load_qnn_functionpointers(void * handle, const char * function_name) { +template Fn load_qnn_functionpointers(void * handle, const char * function_name) { return reinterpret_cast(dlsym(handle, function_name)); } -static const char * get_qnn_backend_name(int n_backend_type) { - switch (n_backend_type) { - case QNN_BACKEND_CPU: - return "QNN-CPU"; - case QNN_BACKEND_GPU: - return "QNN-GPU"; - case QNN_BACKEND_NPU: - return "QNN-NPU"; - case QNN_BACKEND_GGML: - return "ggml"; //"fake" QNN backend, used for compare performance between QNN backend and original GGML - default: - return "unknown"; - } -} - -static const char * qnn_get_chipset_desc(uint32_t chipset_id) { - switch (chipset_id) { - case SM8450: - return "SM8450"; - case SM8475: - return "SM8475"; - case SM8550: - return "SM8550"; - case SM8650: - return "SM8650"; - default: - return "unknown"; - } -} - static intptr_t align_to(size_t alignment, intptr_t offset) { return offset % alignment == 0 - ? offset - : offset + (static_cast(alignment) - - offset % static_cast(alignment)); + ? offset + : offset + (static_cast(alignment) - + offset % static_cast(alignment)); } -static void ggml_qnn_log_internal(ggml_log_level level, const char * file, - const char * func, int line, - const char * format, ...) 
{ - static std::mutex ggml_qnn_log_internal_mutex; - static char s_ggml_qnn_log_internal_buf[GGML_QNN_LOGBUF_LEN]; +static void qnn_sdk_logcallback(const char * fmt, QnnLog_Level_t level, + uint64_t timestamp, va_list argp) { + +#if ENABLE_QNNSDK_LOG + static std::mutex log_mutex; + static unsigned char s_ggml_qnn_logbuf[QNN_LOGBUF_LEN]; + + const char * log_level_desc = ""; + switch (level) { + case QNN_LOG_LEVEL_ERROR: + log_level_desc = "ERROR"; + break; + case QNN_LOG_LEVEL_WARN: + log_level_desc = "WARNING"; + break; + case QNN_LOG_LEVEL_INFO: + log_level_desc = "INFO"; + break; + case QNN_LOG_LEVEL_DEBUG: + log_level_desc = "DEBUG"; + break; + case QNN_LOG_LEVEL_VERBOSE: + log_level_desc = "VERBOSE"; + break; + case QNN_LOG_LEVEL_MAX: + log_level_desc = "UNKNOWN"; + break; + } + double ms = (double) timestamp / 1000000.0; { - std::lock_guard lock(ggml_qnn_log_internal_mutex); - va_list args; + std::lock_guard lock(log_mutex); - va_start(args, format); - int len_prefix = - snprintf(s_ggml_qnn_log_internal_buf, GGML_QNN_LOGBUF_LEN, - "[%s, %d]: ", func, line); - int len = vsnprintf(s_ggml_qnn_log_internal_buf + len_prefix, - GGML_QNN_LOGBUF_LEN - len_prefix, format, args); - if (len < (GGML_QNN_LOGBUF_LEN - len_prefix)) { -#if (defined __ANDROID__) || (defined ANDROID) - // for Android APK - __android_log_print(level, "ggml-qnn", "%s\n", s_ggml_qnn_log_internal_buf); -#endif - // for Android command line application or WoA - printf("%s\n", s_ggml_qnn_log_internal_buf); - } - va_end(args); + memset(s_ggml_qnn_logbuf, 0, QNN_LOGBUF_LEN); + vsnprintf(reinterpret_cast(s_ggml_qnn_logbuf), QNN_LOGBUF_LEN, fmt, argp); + QNN_LOG_DEBUG("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggml_qnn_logbuf); } +#endif } // ================================================================================================= // -// wrapper class of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI -// Engine Direct) SDK -// +// wrapper class of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK +// ref:https://github.com/pytorch/executorch/tree/main/backends/qualcomm // ================================================================================================= class qnn_interface { @@ -778,11 +947,6 @@ class qnn_interface { const QnnSystemInterface_t * _qnn_sys_interface = nullptr; }; -// ================================================================================================= -// -// wrapper class of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK -// -// ================================================================================================= class qnn_instance { public: using BackendIdType = decltype(QnnInterface_t{}.backendId); @@ -796,44 +960,354 @@ class qnn_instance { ~qnn_instance() {} - int qnn_init(const QnnSaver_Config_t ** saver_config); + int qnn_init(const QnnSaver_Config_t ** saver_config) { + BackendIdType backend_id = QNN_BACKEND_ID_NULL; + QNN_LOG_DEBUG("enter qni_init\n"); - int qnn_finalize(); + std::lock_guard lock(_init_mutex); - const qnn_interface & get_qnn_interface() { - if (!_qnn_interface.is_loaded()) { - QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + if (0 != load_system()) { + QNN_LOG_WARN("can not load QNN system lib, pls check why?\n"); + return 1; + } else { + QNN_LOG_DEBUG("load QNN system lib successfully\n"); } - return _qnn_interface; - } - const QNN_INTERFACE_VER_TYPE & get_qnn_raw_interface() { - if (!_qnn_interface.is_loaded()) { - QNN_LOG_WARN("pls check why _qnn_interface is not 
loaded\n"); + std::string backend_lib_path = _lib_path + _backend_name; + if (0 == _lib_path_to_backend_id.count(backend_lib_path)) { + int is_load_ok = load_backend(backend_lib_path, saver_config); + if (0 != is_load_ok) { + QNN_LOG_WARN("failed to load QNN backend\n"); + return 2; + } } - return _qnn_raw_interface; - } - const QNN_SYSTEM_INTERFACE_VER_TYPE & get_qnn_raw_system_interface() { - if (!_qnn_interface.is_loaded()) { - QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + backend_id = _lib_path_to_backend_id[backend_lib_path]; + if (0 == _loaded_backend.count(backend_id) || + 0 == _loaded_lib_handle.count(backend_id)) { + QNN_LOG_WARN("library %s is loaded but loaded backend count=%zu, " + "loaded lib_handle count=%zu\n", + backend_lib_path.c_str(), _loaded_backend.count(backend_id), + _loaded_lib_handle.count(backend_id)); + return 3; } - return _qnn_raw_system_interface; - } - const Qnn_LogHandle_t get_qnn_log_handle() { return _qnn_log_handle; } + _qnn_interface.set_qnn_interface(_loaded_backend[backend_id]); - const Qnn_ProfileHandle_t get_qnn_profile_handle() { - return _qnn_profile_handle; - } + _qnn_interface.qnn_log_create(qnn_sdk_logcallback, _qnn_log_level, + &_qnn_log_handle); + if (nullptr == _qnn_log_handle) { + QNN_LOG_WARN( + "why failed to initialize qnn log\n"); // NPU backend not work on + // Qualcomm SoC equipped low-end phone + return 4; + } else { + QNN_LOG_DEBUG("initialize qnn log successfully\n"); + } - const Qnn_DeviceHandle_t get_qnn_device_handle() { - return _qnn_device_handle; - } + std::vector temp_backend_config; + _qnn_interface.qnn_backend_create( + _qnn_log_handle, + temp_backend_config.empty() ? nullptr : temp_backend_config.data(), + &_qnn_backend_handle); + if (nullptr == _qnn_backend_handle) { + QNN_LOG_WARN("why failed to initialize qnn backend\n"); + return 5; + } else { + QNN_LOG_DEBUG("initialize qnn backend successfully\n"); + } - const Qnn_BackendHandle_t get_qnn_backend_handle() { - return _qnn_backend_handle; - } + if (nullptr != _qnn_raw_interface.propertyHasCapability) { + auto qnnStatus = + _qnn_raw_interface.propertyHasCapability(QNN_PROPERTY_GROUP_DEVICE); + if (QNN_PROPERTY_NOT_SUPPORTED == qnnStatus) { + QNN_LOG_WARN("device property is not supported\n"); + } + if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnnStatus) { + QNN_LOG_WARN("device property is not known to backend\n"); + } + } + + Qnn_ErrorHandle_t qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, nullptr, + &_qnn_device_handle); + if (QNN_SUCCESS != qnn_status && + QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnn_status) { + QNN_LOG_WARN("failed to create QNN device\n"); + } else { + QNN_LOG_INFO("create device successfully\n"); + } + + if (qnn_sdk_profile_level::profile_off != _profile_level) { + QNN_LOG_INFO("profiling turned on; level = %d", _profile_level); + if (qnn_sdk_profile_level::profile_basic == _profile_level) { + QNN_LOG_INFO("basic profiling requested. creating Qnn Profile object\n"); + if (QNN_PROFILE_NO_ERROR != + _qnn_raw_interface.profileCreate(_qnn_backend_handle, + QNN_PROFILE_LEVEL_BASIC, + &_qnn_profile_handle)) { + QNN_LOG_WARN("unable to create profile handle in the backend\n"); + return 6; + } else { + QNN_LOG_DEBUG("initialize qnn profile successfully\n"); + } + } else if (qnn_sdk_profile_level::profile_detail == _profile_level) { + QNN_LOG_INFO("detailed profiling requested. 
Creating Qnn Profile object\n"); + if (QNN_PROFILE_NO_ERROR != + _qnn_raw_interface.profileCreate(_qnn_backend_handle, + QNN_PROFILE_LEVEL_DETAILED, + &_qnn_profile_handle)) { + QNN_LOG_WARN("unable to create profile handle in the backend\n"); + return 7; + } else { + QNN_LOG_DEBUG("initialize qnn profile successfully\n"); + } + } + } + + _rpc_lib_handle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL); + if (nullptr == _rpc_lib_handle) { + QNN_LOG_WARN("failed to load qualcomm's rpc lib, error:%s\n", dlerror()); + return 8; + } else { + QNN_LOG_DEBUG("load rpcmem lib successfully\n"); + set_rpcmem_initialized(true); + } + __pfn_rpc_mem_init = reinterpret_cast<_pfn_rpc_mem_init>( + dlsym(_rpc_lib_handle, "rpcmem_init")); + __pfn_rpc_mem_deinit = reinterpret_cast<_pfn_rpc_mem_deinit>( + dlsym(_rpc_lib_handle, "rpcmem_deinit")); + __pfn_rpc_mem_alloc = reinterpret_cast<_pfn_rpc_mem_alloc>( + dlsym(_rpc_lib_handle, "rpcmem_alloc")); + __pfn_rpc_mem_free = reinterpret_cast<_pfn_rpc_mem_free>( + dlsym(_rpc_lib_handle, "rpcmem_free")); + __pfn_rpc_mem_to_fd = reinterpret_cast<_pfn_rpc_mem_to_fd>( + dlsym(_rpc_lib_handle, "rpcmem_to_fd")); + if (nullptr == __pfn_rpc_mem_alloc || nullptr == __pfn_rpc_mem_free || + nullptr == __pfn_rpc_mem_to_fd) { + QNN_LOG_WARN("unable to access symbols in QNN RPC lib. dlerror(): %s", dlerror()); + dlclose(_rpc_lib_handle); + return 9; + } + + if (nullptr != + __pfn_rpc_mem_init) // make Qualcomm's SoC equipped low-end phone happy + __pfn_rpc_mem_init(); + + std::vector temp_context_config; + _qnn_interface.qnn_context_create( + _qnn_backend_handle, _qnn_device_handle, + temp_context_config.empty() ? nullptr : temp_context_config.data(), + &_qnn_context_handle); + if (nullptr == _qnn_context_handle) { + QNN_LOG_WARN("why failed to initialize qnn context\n"); + return 10; + } else { + QNN_LOG_DEBUG("initialize qnn context successfully\n"); + } + + if (_backend_name.find("Htp") != std::variant_npos) { + const QnnDevice_PlatformInfo_t * p_info = nullptr; + _qnn_raw_interface.deviceGetPlatformInfo(nullptr, &p_info); + QNN_LOG_INFO("device counts %d", p_info->v1.numHwDevices); + QnnDevice_HardwareDeviceInfo_t * infos = p_info->v1.hwDevices; + for (int i = 0; i < p_info->v1.numHwDevices; i++) { + QNN_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d", infos[i].v1.deviceId, + infos[i].v1.deviceType, infos[i].v1.numCores); + QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; + QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = devinfo->onChipDevice; + QnnHtpDevice_Arch_t htp_arch = chipinfo.arch; + QNN_LOG_INFO("htp_type:%d(%s)", devinfo->devType, (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? 
"ON_CHIP" : ""); + QNN_LOG_INFO("qualcomm soc_model:%d(%s), htp_arch:%d(%s), vtcm_size:%d MB", \ + chipinfo.socModel, qnn_get_chipset_desc(chipinfo.socModel), \ + htp_arch, qnn_get_htparch_desc(htp_arch), chipinfo.vtcmSize); + g_qnn_mgr[QNN_BACKEND_NPU].socinfo = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize }; + } + _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info); + + + //TODO: faster approach to probe the accurate capacity of rpc ion memory + size_t candidate_size = 0; + uint8_t * rpc_buffer = nullptr; + const int SIZE_IN_MB = (1 << 20); + size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048}; + size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); + for (size_t idx = 0; idx < probe_counts; idx++) { + rpc_buffer = static_cast(alloc_rpcmem(probe_slots[idx] * SIZE_IN_MB, 4)); + if (nullptr == rpc_buffer) { + QNN_LOG_INFO("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], strerror(errno)); + break; + } else { + candidate_size = probe_slots[idx]; + free_rpcmem(rpc_buffer); + rpc_buffer = nullptr; + } + } + if (candidate_size > _rpcmem_capacity) + _rpcmem_capacity = candidate_size; + QNN_LOG_INFO("capacity of rpc ion memory %d MB\n", _rpcmem_capacity); + } + + QNN_LOG_DEBUG("leave qni_init\n"); + + return 0; + } + + int qnn_finalize() { + int ret_status = 0; + Qnn_ErrorHandle_t error = QNN_SUCCESS; + + if (nullptr != + __pfn_rpc_mem_deinit) // make Qualcomm's SoC equipped low-end phone happy + __pfn_rpc_mem_deinit(); + + if (dlclose(_rpc_lib_handle) != 0) { + QNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s\n", dlerror()); + } else { + QNN_LOG_DEBUG("succeed to close rpcmem lib\n"); + } + + if (nullptr != _qnn_context_handle) { + error = _qnn_interface.qnn_context_free(_qnn_context_handle, + _qnn_profile_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), + QNN_GET_ERROR_CODE(error)); + } + _qnn_context_handle = nullptr; + } + + if (nullptr != _qnn_profile_handle) { + error = _qnn_interface.qnn_profile_free(_qnn_profile_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), + QNN_GET_ERROR_CODE(error)); + } + _qnn_profile_handle = nullptr; + } + + if (nullptr != _qnn_device_handle) { + error = _qnn_interface.qnn_device_free(_qnn_device_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN device_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), + QNN_GET_ERROR_CODE(error)); + } + _qnn_device_handle = nullptr; + } + + if (nullptr != _qnn_backend_handle) { + error = _qnn_interface.qnn_backend_free(_qnn_backend_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), + QNN_GET_ERROR_CODE(error)); + } + _qnn_backend_handle = nullptr; + } + + if (nullptr != _qnn_log_handle) { + error = _qnn_interface.qnn_log_free(_qnn_log_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN log_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), + QNN_GET_ERROR_CODE(error)); + } + _qnn_log_handle = nullptr; + } + + unload_backend(); + + unload_system(); + + return ret_status; + } + + int init_qnn_graph(const char * graph_name, bool debug, + uint8_t do_node_validation = true, + const QnnGraph_Config_t ** graph_configs = nullptr) { + int result = 0; + + if (nullptr == graph_name) { + QNN_LOG_WARN("graph name is null\n"); + return 1; + } + + if 
(!_graph_name.empty()) { + QNN_LOG_WARN("qnn model for graph %s already initialized\n", graph_name); + return 2; + } + + if (!do_node_validation) { + QNN_LOG_WARN("node validation disabled, backend will not perform op " + "validation prior to adding node\n"); + } + + _graph_name = graph_name; + _debug_tensor = debug; + _do_node_validations = do_node_validation; + + result = _qnn_raw_interface.graphCreate(_qnn_context_handle, graph_name, + graph_configs, &_qnn_graph_handle); + if (result != QNN_GRAPH_NO_ERROR || nullptr == _qnn_graph_handle) { + QNN_LOG_WARN("failed to create graph in qnn context\n"); + return 3; + } else { + QNN_LOG_INFO("succeed to create graph %s, %p\n", graph_name, _qnn_graph_handle); + } + + return 0; + } + + int finalize_qnn_graph() { + if (nullptr != _qnn_graph_handle) { + if (_qnn_raw_interface.graphFinalize(_qnn_graph_handle, + _qnn_profile_handle, + nullptr) != QNN_GRAPH_NO_ERROR) { + QNN_LOG_WARN("finalizing graph failure\n"); + } + } else { + QNN_LOG_DEBUG("qnn graph handle is null\n"); + } + + return 0; + } + + const qnn_interface & get_qnn_interface() { + if (!_qnn_interface.is_loaded()) { + QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_interface; + } + + const QNN_INTERFACE_VER_TYPE & get_qnn_raw_interface() { + if (!_qnn_interface.is_loaded()) { + QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_raw_interface; + } + + const QNN_SYSTEM_INTERFACE_VER_TYPE & get_qnn_raw_system_interface() { + if (!_qnn_interface.is_loaded()) { + QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_raw_system_interface; + } + + const Qnn_LogHandle_t get_qnn_log_handle() { return _qnn_log_handle; } + + const Qnn_ProfileHandle_t get_qnn_profile_handle() { + return _qnn_profile_handle; + } + + const Qnn_DeviceHandle_t get_qnn_device_handle() { + return _qnn_device_handle; + } + + const Qnn_BackendHandle_t get_qnn_backend_handle() { + return _qnn_backend_handle; + } const Qnn_ContextHandle_t get_qnn_context_handle() { return _qnn_context_handle; @@ -845,12 +1319,6 @@ class qnn_instance { const Qnn_GraphHandle_t get_qnn_graph_handle() { return _qnn_graph_handle; } - int init_qnn_graph(const char * graph_name, bool debug, - uint8_t do_node_validation = 1, - const QnnGraph_Config_t ** graph_configs = nullptr); - - int finalize_qnn_graph(); - int init_htp_perfinfra() { QnnDevice_Infrastructure_t device_infra = nullptr; int error = _qnn_raw_interface.deviceGetInfrastructure(&device_infra); @@ -945,793 +1413,416 @@ class qnn_instance { _rpcmem_initialized = initialized; } - int32_t rpcmem_to_fd(void * buf); - - int register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor); - - void unregister_rpcmem(); - - void * alloc_rpcmem(size_t bytes, size_t alignment); - - void free_rpcmem(void * buf); - size_t get_rpcmem_capacity() { return _rpcmem_capacity; } - bool is_rpcmem_allocated(void * buf); - bool is_rpcmem_registered(Qnn_MemHandle_t handle) { return _qnn_mem_set.count(handle) != 0U; } - public: - std::map> - _qnn_graph_map; - - private: - int load_system(); - - int unload_system(); - - int load_backend(std::string & lib_path, const QnnSaver_Config_t ** saver_config); - - int unload_backend(); - - void set_qnn_raw_interface(QNN_INTERFACE_VER_TYPE & raw_interface) { - _qnn_raw_interface = raw_interface; - } - - void set_qnn_raw_system_interface(QNN_SYSTEM_INTERFACE_VER_TYPE & raw_interface) { - _qnn_raw_system_interface = raw_interface; - } - - private: - static constexpr const int 
_required_num_providers = 1; - - private: - std::string _lib_path; - std::string _backend_name; - std::string _model_name; // prebuilt QNN model name, not used currently - BackendIdType _backend_id; - - bool _debug_tensor = false; // flag to indicate if requested graph is to be run in debug mode - bool _do_node_validations = true; // flag to indicate whether all add_node - // calls need to be validated - QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; - - ggml_qnn_profile_level _profile_level = ggml_qnn_profile_level::profile_detail; - - qnn_interface _qnn_interface; - - void * _system_lib_handle = nullptr; - - Qnn_GraphHandle_t _qnn_graph_handle = nullptr; - - Qnn_LogHandle_t _qnn_log_handle = nullptr; - - Qnn_ProfileHandle_t _qnn_profile_handle = nullptr; - - Qnn_DeviceHandle_t _qnn_device_handle = nullptr; - - Qnn_BackendHandle_t _qnn_backend_handle = nullptr; - - Qnn_ContextHandle_t _qnn_context_handle = nullptr; - - QnnSystemContext_Handle_t _qnn_system_handle = nullptr; - - QnnHtpDevice_PerfInfrastructure_t * _qnn_htp_perfinfra = nullptr; - uint32_t _qnn_power_configid = 1; - uint32_t _qnn_rpc_pollingtime = 9999; // 0-10000 us for high performing - - QNN_INTERFACE_VER_TYPE _qnn_raw_interface; - QNN_SYSTEM_INTERFACE_VER_TYPE _qnn_raw_system_interface; - - std::unordered_set _qnn_mem_set; - - std::mutex _init_mutex; - std::unordered_map _loaded_lib_handle; - std::unordered_map _lib_path_to_backend_id; - std::unordered_map _loaded_backend; - - void * _rpc_lib_handle = nullptr; - std::atomic_bool _rpcmem_initialized{false}; - pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; - pfn_rpc_mem_free _pfn_rpc_mem_free; - pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd; - pfn_rpc_mem_init _pfn_rpc_mem_init; - pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; - std::unordered_map _rpcmem_store_map; - size_t _rpcmem_capacity = 512; - - std::string _graph_name; -}; - -// ================================================================================================= -// -// implementation of QNN wrapper class -// -// ================================================================================================= -void * qnn_instance::alloc_rpcmem(size_t bytes, size_t alignment) { - if (!_rpcmem_initialized) { - QNN_LOG_WARN("rpc memory not initialized\n"); - return nullptr; - } - - auto allocate_bytes = static_cast(bytes + alignment); - void * buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, - allocate_bytes); - if (buf == nullptr) { - QNN_LOG_WARN("failed to allocate rpc memory\n"); - return nullptr; - } - - auto aligned_buf = reinterpret_cast( - align_to(alignment, reinterpret_cast(buf))); - bool status = - _rpcmem_store_map.insert(std::pair(aligned_buf, buf)).second; - if (!status) { - QNN_LOG_WARN("failed to allocate rpc memory\n"); - _pfn_rpc_mem_free(buf); - } - - return aligned_buf; -} - -void qnn_instance::free_rpcmem(void * buf) { - if (!_rpcmem_initialized) { - QNN_LOG_WARN("rpc memory not initialized\n"); - } else if (0 == _rpcmem_store_map.count(buf)) { - QNN_LOG_WARN("no allocated tensor\n"); - } else { - _pfn_rpc_mem_free(_rpcmem_store_map[buf]); - _rpcmem_store_map.erase(buf); - } -} + void * alloc_rpcmem(size_t bytes, size_t alignment) { + if (!_rpcmem_initialized) { + QNN_LOG_WARN("rpc memory not initialized\n"); + return nullptr; + } -int32_t qnn_instance::rpcmem_to_fd(void * buf) { - int32_t mem_fd = -1; - if (!is_rpcmem_initialized()) { - QNN_LOG_WARN("rpc memory not initialized\n"); - } else { - mem_fd = _pfn_rpc_mem_to_fd(buf); - } + auto allocate_bytes = static_cast(bytes + 
alignment); + void * buf = __pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, + allocate_bytes); + if (buf == nullptr) { + QNN_LOG_WARN("failed to allocate rpc memory\n"); + return nullptr; + } - return mem_fd; -} + auto aligned_buf = reinterpret_cast( + align_to(alignment, reinterpret_cast(buf))); + bool status = + _rpcmem_store_map.insert(std::pair(aligned_buf, buf)).second; + if (!status) { + QNN_LOG_WARN("failed to allocate rpc memory\n"); + __pfn_rpc_mem_free(buf); + } -int qnn_instance::register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor) { - if (nullptr == p_data || (nullptr == p_tensor)) { - QNN_LOG_WARN("invalid param\n"); - return 1; + return aligned_buf; } - if (!is_rpcmem_initialized()) { - QNN_LOG_WARN("rpc memory not initialized\n"); - return 2; + void free_rpcmem(void * buf) { + if (!_rpcmem_initialized) { + QNN_LOG_WARN("rpc memory not initialized\n"); + } else if (0 == _rpcmem_store_map.count(buf)) { + QNN_LOG_WARN("no allocated tensor\n"); + } else { + __pfn_rpc_mem_free(_rpcmem_store_map[buf]); + _rpcmem_store_map.erase(buf); + } } - if (is_rpcmem_allocated(p_data)) { - QNN_LOG_WARN("rpc memory already allocated\n"); - // return 3; - } - if (is_rpcmem_registered((QNN_VER_PTR(*p_tensor)->memHandle))) { - QNN_LOG_WARN("tensor %s has been registered shared memory\n", - (QNN_VER_PTR(*p_tensor)->name)); - return 4; - } + int32_t rpcmem_to_fd(void * buf) { + int32_t mem_fd = -1; + if (!is_rpcmem_initialized()) { + QNN_LOG_WARN("rpc memory not initialized\n"); + } else { + mem_fd = __pfn_rpc_mem_to_fd(buf); + } - int32_t mem_fd = rpcmem_to_fd(p_data); - if (-1 == mem_fd) { - QNN_LOG_WARN("failed to get file descriptor\n"); - return 5; + return mem_fd; } - QNN_LOG_DEBUG("mem_fd %d\n", mem_fd); - Qnn_MemDescriptor_t descriptor = {{QNN_VER_PTR(*p_tensor)->rank, - QNN_VER_PTR(*p_tensor)->dimensions, - nullptr}, - QNN_VER_PTR(*p_tensor)->dataType, - QNN_MEM_TYPE_ION, - {{mem_fd}}}; - Qnn_MemHandle_t handle = nullptr; - int error = QNN_SUCCESS; - error = _qnn_interface.qnn_mem_register(_qnn_context_handle, &descriptor, - /*numDescriptors=*/1, &handle); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to register shared memory, error %d, %s\n", - QNN_GET_ERROR_CODE(error), strerror(error)); - return 6; - } else { - QNN_LOG_INFO("tensor %s successfully register shared memory\n", - (QNN_VER_PTR(*p_tensor)->name)); - } - QNN_VER_PTR(*p_tensor)->memHandle = handle; - _qnn_mem_set.insert(handle); - - return 0; -} -void qnn_instance::unregister_rpcmem() { - Qnn_ErrorHandle_t error = QNN_SUCCESS; + int register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor) { + if (nullptr == p_data || (nullptr == p_tensor)) { + QNN_LOG_WARN("invalid param\n"); + return 1; + } - if (_qnn_mem_set.empty()) { - QNN_LOG_WARN("no rpcmem registered\n"); - } + if (!is_rpcmem_initialized()) { + QNN_LOG_WARN("rpc memory not initialized\n"); + return 2; + } - for (auto &mem_handle : _qnn_mem_set) { - error = _qnn_interface.qnn_mem_de_register(&mem_handle, 1); + if (is_rpcmem_allocated(p_data)) { + QNN_LOG_WARN("rpc memory already allocated\n"); + // return 3; + } + if (is_rpcmem_registered((QNN_VER_PTR(*p_tensor)->memHandle))) { + QNN_LOG_WARN("tensor %s has been registered shared memory\n", + (QNN_VER_PTR(*p_tensor)->name)); + return 4; + } + + int32_t mem_fd = rpcmem_to_fd(p_data); + if (-1 == mem_fd) { + QNN_LOG_WARN("failed to get file descriptor\n"); + return 5; + } + QNN_LOG_DEBUG("mem_fd %d\n", mem_fd); + Qnn_MemDescriptor_t descriptor = {{QNN_VER_PTR(*p_tensor)->rank, + 
QNN_VER_PTR(*p_tensor)->dimensions, + nullptr}, + QNN_VER_PTR(*p_tensor)->dataType, + QNN_MEM_TYPE_ION, + {{mem_fd}}}; + Qnn_MemHandle_t handle = nullptr; + int error = QNN_SUCCESS; + error = _qnn_interface.qnn_mem_register(_qnn_context_handle, &descriptor, + /*numDescriptors=*/1, &handle); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to unregister shared memory, error %d\n", - QNN_GET_ERROR_CODE(error)); + QNN_LOG_WARN("failed to register shared memory, error %d, %s\n", + QNN_GET_ERROR_CODE(error), strerror(error)); + return 6; + } else { + QNN_LOG_INFO("tensor %s successfully register shared memory\n", + (QNN_VER_PTR(*p_tensor)->name)); } - } - _qnn_mem_set.clear(); -} - -bool qnn_instance::is_rpcmem_allocated(void * buf) { - return _rpcmem_store_map.count(buf) != 0U; -} - -int qnn_instance::load_backend(std::string & lib_path, - const QnnSaver_Config_t ** saver_config) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - QNN_LOG_DEBUG("lib_path:%s\n", lib_path.c_str()); + QNN_VER_PTR(*p_tensor)->memHandle = handle; + _qnn_mem_set.insert(handle); - void * lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL); - if (nullptr == lib_handle) { - QNN_LOG_WARN("can not open QNN library %s, with error: %s", - lib_path.c_str(), dlerror()); - return 1; - } - - auto get_providers = load_qnn_functionpointers<_pfn_QnnInterface_getProviders *>( - lib_handle, "QnnInterface_getProviders"); - if (nullptr == get_providers) { - QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", - dlerror()); - return 2; + return 0; } - std::uint32_t num_providers = 0; - const QnnInterface_t ** provider_list = nullptr; - error = get_providers(&provider_list, &num_providers); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to get providers, error %d", - QNN_GET_ERROR_CODE(error)); - return 3; - } - QNN_LOG_DEBUG("num_providers=%d\n", num_providers); - if (num_providers != _required_num_providers) { - QNN_LOG_WARN("providers is %d instead of required %d", num_providers, - _required_num_providers); - return 4; - } + void unregister_rpcmem() { + Qnn_ErrorHandle_t error = QNN_SUCCESS; - if (nullptr == provider_list) { - QNN_LOG_WARN("failed to get qnn interface providers\n"); - return 5; - } - bool found_valid_interface = false; - QNN_INTERFACE_VER_TYPE qnn_interface; - for (size_t idx = 0; idx < num_providers; idx++) { - if (QNN_API_VERSION_MAJOR == - provider_list[idx]->apiVersion.coreApiVersion.major && - QNN_API_VERSION_MINOR <= - provider_list[idx]->apiVersion.coreApiVersion.minor) { - found_valid_interface = true; - qnn_interface = provider_list[idx]->QNN_INTERFACE_VER_NAME; - break; + if (_qnn_mem_set.empty()) { + QNN_LOG_WARN("no rpcmem registered\n"); } - } - if (!found_valid_interface) { - QNN_LOG_WARN("unable to find a valid qnn interface\n"); - return 6; - } else { - QNN_LOG_INFO("find a valid qnn interface\n"); - } - set_qnn_raw_interface(qnn_interface); - - BackendIdType backend_id = provider_list[0]->backendId; - _lib_path_to_backend_id[lib_path] = backend_id; - if (_loaded_backend.count(backend_id) > 0) { - QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists\n", - lib_path.c_str(), backend_id); - } - _loaded_backend[backend_id] = provider_list[0]; - if (_loaded_lib_handle.count(backend_id) > 0) { - QNN_LOG_WARN("closing %p\n", _loaded_lib_handle[backend_id]); - int dlclose_error = dlclose(_loaded_lib_handle[backend_id]); - if (dlclose_error != 0) { - QNN_LOG_WARN("fail to close %p with error %s\n", - _loaded_lib_handle[backend_id], dlerror()); + for (auto & mem_handle : 
_qnn_mem_set) { + error = _qnn_interface.qnn_mem_de_register(&mem_handle, 1); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to unregister shared memory, error %d\n", + QNN_GET_ERROR_CODE(error)); + } } + _qnn_mem_set.clear(); } - _loaded_lib_handle[backend_id] = lib_handle; - _backend_id = backend_id; - return 0; -} - -int qnn_instance::unload_backend() { - int dlclose_error = 0; - for (auto & it : _loaded_lib_handle) { - dlclose_error = dlclose(it.second); - if (dlclose_error != 0) { - QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, - dlerror()); - } + bool is_rpcmem_allocated(void * buf) { + return _rpcmem_store_map.count(buf) != 0U; } - _loaded_lib_handle.clear(); - _lib_path_to_backend_id.clear(); - _loaded_backend.clear(); - return 0; -} + public: + std::map> + _qnn_graph_map; -int qnn_instance::load_system() { - Qnn_ErrorHandle_t error = QNN_SUCCESS; + private: + int load_system() { + Qnn_ErrorHandle_t error = QNN_SUCCESS; - std::string system_lib_path = _lib_path + "libQnnSystem.so"; - QNN_LOG_DEBUG("system_lib_path:%s\n", system_lib_path.c_str()); + std::string system_lib_path = _lib_path + "libQnnSystem.so"; + QNN_LOG_DEBUG("system_lib_path:%s\n", system_lib_path.c_str()); - _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); - if (nullptr == _system_lib_handle) { - QNN_LOG_WARN("can not open QNN library %s, error: %s\n", - system_lib_path.c_str(), dlerror()); - return 1; - } + _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); + if (nullptr == _system_lib_handle) { + QNN_LOG_WARN("can not open QNN library %s, error: %s\n", + system_lib_path.c_str(), dlerror()); + return 1; + } - auto * get_providers = - reinterpret_cast<_pfn_QnnSystemInterface_getProviders *>( - dlsym(_system_lib_handle, "QnnSystemInterface_getProviders")); - if (nullptr == get_providers) { - QNN_LOG_WARN( - "can not load QNN symbol QnnSystemInterface_getProviders: %s\n", - dlerror()); - return 2; - } + auto * get_providers = + reinterpret_cast<_pfn_qnnsysteminterface_getproviders *>( + dlsym(_system_lib_handle, "QnnSystemInterface_getProviders")); + if (nullptr == get_providers) { + QNN_LOG_WARN( + "can not load QNN symbol QnnSystemInterface_getProviders: %s\n", + dlerror()); + return 2; + } - uint32_t num_providers = 0; - const QnnSystemInterface_t ** provider_list = nullptr; - error = get_providers(&provider_list, &num_providers); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to get providers, error %d\n", - QNN_GET_ERROR_CODE(error)); - return 3; - } + uint32_t num_providers = 0; + const QnnSystemInterface_t ** provider_list = nullptr; + error = get_providers(&provider_list, &num_providers); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to get providers, error %d\n", + QNN_GET_ERROR_CODE(error)); + return 3; + } - if (num_providers != _required_num_providers) { - QNN_LOG_WARN("providers is %d instead of required %d\n", num_providers, - _required_num_providers); - return 4; - } + if (num_providers != _required_num_providers) { + QNN_LOG_WARN("providers is %d instead of required %d\n", num_providers, + _required_num_providers); + return 4; + } - if (nullptr == provider_list) { - QNN_LOG_WARN("can not get providers\n"); - return 5; - } + if (nullptr == provider_list) { + QNN_LOG_WARN("can not get providers\n"); + return 5; + } - QNN_SYSTEM_INTERFACE_VER_TYPE qnn_system_interface; - bool found_valid_system_interface = false; - for (size_t idx = 0; idx < num_providers; idx++) { - if (QNN_SYSTEM_API_VERSION_MAJOR == + 
QNN_SYSTEM_INTERFACE_VER_TYPE qnn_system_interface; + bool found_valid_system_interface = false; + for (size_t idx = 0; idx < num_providers; idx++) { + if (QNN_SYSTEM_API_VERSION_MAJOR == provider_list[idx]->systemApiVersion.major && - QNN_SYSTEM_API_VERSION_MINOR <= + QNN_SYSTEM_API_VERSION_MINOR <= provider_list[idx]->systemApiVersion.minor) { - found_valid_system_interface = true; - qnn_system_interface = - provider_list[idx]->QNN_SYSTEM_INTERFACE_VER_NAME; - break; + found_valid_system_interface = true; + qnn_system_interface = + provider_list[idx]->QNN_SYSTEM_INTERFACE_VER_NAME; + break; + } } - } - if (!found_valid_system_interface) { - QNN_LOG_WARN("unable to find a valid qnn system interface\n"); - return 6; - } else { - QNN_LOG_INFO("find a valid qnn system interface\n"); - } - set_qnn_raw_system_interface(qnn_system_interface); - - _qnn_interface.set_qnn_system_interface(provider_list[0]); - - _qnn_interface.qnn_system_context_create(&_qnn_system_handle); - if (nullptr == _qnn_system_handle) { - QNN_LOG_WARN("can not create QNN system contenxt\n"); - } else { - QNN_LOG_INFO("initialize qnn system successfully\n"); - } - - return 0; -} - -int qnn_instance::unload_system() { - int result = 0; + if (!found_valid_system_interface) { + QNN_LOG_WARN("unable to find a valid qnn system interface\n"); + return 6; + } else { + QNN_LOG_INFO("find a valid qnn system interface\n"); + } + set_qnn_raw_system_interface(qnn_system_interface); - if (nullptr == _system_lib_handle) { - QNN_LOG_DEBUG("system lib handle is null\n"); - return 1; - } + _qnn_interface.set_qnn_system_interface(provider_list[0]); - if (nullptr != _qnn_system_handle) { - result = _qnn_interface.qnn_system_context_free(_qnn_system_handle); - if (result != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN system context\n"); + _qnn_interface.qnn_system_context_create(&_qnn_system_handle); + if (nullptr == _qnn_system_handle) { + QNN_LOG_WARN("can not create QNN system contenxt\n"); + } else { + QNN_LOG_INFO("initialize qnn system successfully\n"); } - _qnn_system_handle = nullptr; - } - int dlclose_error = dlclose(_system_lib_handle); - if (dlclose_error != 0) { - QNN_LOG_WARN("failed to close QnnSystem library, error %s\n", - dlerror()); - return 2; + return 0; } - _system_lib_handle = nullptr; - - return result; -} + int unload_system() { + int result = 0; -static void ggml_qnn_logcallback(const char * fmt, QnnLog_Level_t level, - uint64_t timestamp, va_list argp) { + if (nullptr == _system_lib_handle) { + QNN_LOG_DEBUG("system lib handle is null\n"); + return 1; + } -#if ENABLE_QNN_LOG - static std::mutex log_mutex; - static unsigned char s_ggml_qnn_logbuf[GGML_QNN_LOGBUF_LEN]; + if (nullptr != _qnn_system_handle) { + result = _qnn_interface.qnn_system_context_free(_qnn_system_handle); + if (result != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN system context\n"); + } + _qnn_system_handle = nullptr; + } - const char * log_level_desc = ""; - switch (level) { - case QNN_LOG_LEVEL_ERROR: - log_level_desc = "ERROR"; - break; - case QNN_LOG_LEVEL_WARN: - log_level_desc = "WARNING"; - break; - case QNN_LOG_LEVEL_INFO: - log_level_desc = "INFO"; - break; - case QNN_LOG_LEVEL_DEBUG: - log_level_desc = "DEBUG"; - break; - case QNN_LOG_LEVEL_VERBOSE: - log_level_desc = "VERBOSE"; - break; - case QNN_LOG_LEVEL_MAX: - log_level_desc = "UNKNOWN"; - break; - } + int dlclose_error = dlclose(_system_lib_handle); + if (dlclose_error != 0) { + QNN_LOG_WARN("failed to close QnnSystem library, error %s\n", + dlerror()); + return 2; + } 
- double ms = (double) timestamp / 1000000.0; - { - std::lock_guard lock(log_mutex); + _system_lib_handle = nullptr; - memset(s_ggml_qnn_logbuf, 0, GGML_QNN_LOGBUF_LEN); - vsnprintf(reinterpret_cast(s_ggml_qnn_logbuf), GGML_QNN_LOGBUF_LEN, fmt, argp); - QNN_LOG_DEBUG("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggml_qnn_logbuf); + return result; } -#endif -} -int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { - BackendIdType backend_id = QNN_BACKEND_ID_NULL; - QNN_LOG_DEBUG("enter qni_init\n"); + int load_backend(std::string & lib_path, const QnnSaver_Config_t ** saver_config) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + QNN_LOG_DEBUG("lib_path:%s\n", lib_path.c_str()); - std::lock_guard lock(_init_mutex); - - if (0 != load_system()) { - QNN_LOG_WARN("can not load QNN system lib, pls check why?\n"); - return 1; - } else { - QNN_LOG_DEBUG("load QNN system lib successfully\n"); - } + void * lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL); + if (nullptr == lib_handle) { + QNN_LOG_WARN("can not open QNN library %s, with error: %s", + lib_path.c_str(), dlerror()); + return 1; + } - std::string backend_lib_path = _lib_path + _backend_name; - if (0 == _lib_path_to_backend_id.count(backend_lib_path)) { - int is_load_ok = load_backend(backend_lib_path, saver_config); - if (0 != is_load_ok) { - QNN_LOG_WARN("failed to load QNN backend\n"); + auto get_providers = load_qnn_functionpointers<_pfn_qnninterface_getproviders *>( + lib_handle, "QnnInterface_getProviders"); + if (nullptr == get_providers) { + QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", + dlerror()); return 2; } - } - - backend_id = _lib_path_to_backend_id[backend_lib_path]; - if (0 == _loaded_backend.count(backend_id) || - 0 == _loaded_lib_handle.count(backend_id)) { - QNN_LOG_WARN("library %s is loaded but loaded backend count=%zu, " - "loaded lib_handle count=%zu\n", - backend_lib_path.c_str(), _loaded_backend.count(backend_id), - _loaded_lib_handle.count(backend_id)); - return 3; - } - - _qnn_interface.set_qnn_interface(_loaded_backend[backend_id]); - - _qnn_interface.qnn_log_create(ggml_qnn_logcallback, _qnn_log_level, - &_qnn_log_handle); - if (nullptr == _qnn_log_handle) { - QNN_LOG_WARN( - "why failed to initialize qnn log\n"); // NPU backend not work on - // Qualcomm SoC equipped low-end phone - return 4; - } else { - QNN_LOG_DEBUG("initialize qnn log successfully\n"); - } - std::vector temp_backend_config; - _qnn_interface.qnn_backend_create( - _qnn_log_handle, - temp_backend_config.empty() ? 
nullptr : temp_backend_config.data(), - &_qnn_backend_handle); - if (nullptr == _qnn_backend_handle) { - QNN_LOG_WARN("why failed to initialize qnn backend\n"); - return 5; - } else { - QNN_LOG_DEBUG("initialize qnn backend successfully\n"); - } - - if (nullptr != _qnn_raw_interface.propertyHasCapability) { - auto qnnStatus = - _qnn_raw_interface.propertyHasCapability(QNN_PROPERTY_GROUP_DEVICE); - if (QNN_PROPERTY_NOT_SUPPORTED == qnnStatus) { - QNN_LOG_WARN("device property is not supported\n"); + std::uint32_t num_providers = 0; + const QnnInterface_t ** provider_list = nullptr; + error = get_providers(&provider_list, &num_providers); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to get providers, error %d", + QNN_GET_ERROR_CODE(error)); + return 3; } - if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnnStatus) { - QNN_LOG_WARN("device property is not known to backend\n"); + QNN_LOG_DEBUG("num_providers=%d\n", num_providers); + if (num_providers != _required_num_providers) { + QNN_LOG_WARN("providers is %d instead of required %d", num_providers, + _required_num_providers); + return 4; } - } - Qnn_ErrorHandle_t qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, nullptr, - &_qnn_device_handle); - if (QNN_SUCCESS != qnn_status && - QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnn_status) { - QNN_LOG_WARN("failed to create QNN device\n"); - } else { - QNN_LOG_INFO("create device successfully\n"); - } - - if (ggml_qnn_profile_level::profile_off != _profile_level) { - QNN_LOG_INFO("profiling turned on; level = %d", _profile_level); - if (ggml_qnn_profile_level::profile_basic == _profile_level) { - QNN_LOG_INFO("basic profiling requested. creating Qnn Profile object\n"); - if (QNN_PROFILE_NO_ERROR != - _qnn_raw_interface.profileCreate(_qnn_backend_handle, - QNN_PROFILE_LEVEL_BASIC, - &_qnn_profile_handle)) { - QNN_LOG_WARN("unable to create profile handle in the backend\n"); - return 6; - } else { - QNN_LOG_DEBUG("initialize qnn profile successfully\n"); - } - } else if (ggml_qnn_profile_level::profile_detail == _profile_level) { - QNN_LOG_INFO("detailed profiling requested. Creating Qnn Profile object\n"); - if (QNN_PROFILE_NO_ERROR != - _qnn_raw_interface.profileCreate(_qnn_backend_handle, - QNN_PROFILE_LEVEL_DETAILED, - &_qnn_profile_handle)) { - QNN_LOG_WARN("unable to create profile handle in the backend\n"); - return 7; - } else { - QNN_LOG_DEBUG("initialize qnn profile successfully\n"); - } + if (nullptr == provider_list) { + QNN_LOG_WARN("failed to get qnn interface providers\n"); + return 5; } - } - - _rpc_lib_handle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL); - if (nullptr == _rpc_lib_handle) { - QNN_LOG_WARN("failed to load qualcomm's rpc lib, error:%s\n", dlerror()); - return 8; - } else { - QNN_LOG_DEBUG("load rpcmem lib successfully\n"); - set_rpcmem_initialized(true); - } - _pfn_rpc_mem_init = reinterpret_cast( - dlsym(_rpc_lib_handle, "rpcmem_init")); - _pfn_rpc_mem_deinit = reinterpret_cast( - dlsym(_rpc_lib_handle, "rpcmem_deinit")); - _pfn_rpc_mem_alloc = reinterpret_cast( - dlsym(_rpc_lib_handle, "rpcmem_alloc")); - _pfn_rpc_mem_free = reinterpret_cast( - dlsym(_rpc_lib_handle, "rpcmem_free")); - _pfn_rpc_mem_to_fd = reinterpret_cast( - dlsym(_rpc_lib_handle, "rpcmem_to_fd")); - if (nullptr == _pfn_rpc_mem_alloc || nullptr == _pfn_rpc_mem_free || - nullptr == _pfn_rpc_mem_to_fd) { - QNN_LOG_WARN("unable to access symbols in QNN RPC lib. 
dlerror(): %s", dlerror()); - dlclose(_rpc_lib_handle); - return 9; - } - - if (nullptr != - _pfn_rpc_mem_init) // make Qualcomm's SoC equipped low-end phone happy - _pfn_rpc_mem_init(); - - std::vector temp_context_config; - _qnn_interface.qnn_context_create( - _qnn_backend_handle, _qnn_device_handle, - temp_context_config.empty() ? nullptr : temp_context_config.data(), - &_qnn_context_handle); - if (nullptr == _qnn_context_handle) { - QNN_LOG_WARN("why failed to initialize qnn context\n"); - return 10; - } else { - QNN_LOG_DEBUG("initialize qnn context successfully\n"); - } - - if (_backend_name.find("Htp") != std::variant_npos) { - const QnnDevice_PlatformInfo_t * p_info = nullptr; - _qnn_raw_interface.deviceGetPlatformInfo(nullptr, &p_info); - QNN_LOG_INFO("device counts %d", p_info->v1.numHwDevices); - QnnDevice_HardwareDeviceInfo_t * infos = p_info->v1.hwDevices; - for (int i = 0; i < p_info->v1.numHwDevices; i++) { - QNN_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d", infos[i].v1.deviceId, - infos[i].v1.deviceType, infos[i].v1.numCores); - QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; - QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = devinfo->onChipDevice; - QnnHtpDevice_Arch_t chiparch = chipinfo.arch; - QNN_LOG_INFO("htp_type:%d(%s)", devinfo->devType, (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? "ON_CHIP" : ""); - QNN_LOG_INFO("qualcomm soc_model:%d(%s), htp_arch:%d, vtcm_size_in_mb:%d MB", chipinfo.socModel, - qnn_get_chipset_desc(chipinfo.socModel), chiparch, chipinfo.vtcmSize); - } - _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info); - - - //TODO: faster approach to probe the accurate capacity of rpc ion memory - size_t candidate_size = 0; - uint8_t * rpc_buffer = nullptr; - const int SIZE_IN_MB = (1 << 20); - size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048}; - size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); - for (size_t idx = 0; idx < probe_counts; idx++) { - rpc_buffer = static_cast(alloc_rpcmem(probe_slots[idx] * SIZE_IN_MB, 4)); - if (nullptr == rpc_buffer) { - QNN_LOG_INFO("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], strerror(errno)); + bool found_valid_interface = false; + QNN_INTERFACE_VER_TYPE qnn_interface; + for (size_t idx = 0; idx < num_providers; idx++) { + if (QNN_API_VERSION_MAJOR == + provider_list[idx]->apiVersion.coreApiVersion.major && + QNN_API_VERSION_MINOR <= + provider_list[idx]->apiVersion.coreApiVersion.minor) { + found_valid_interface = true; + qnn_interface = provider_list[idx]->QNN_INTERFACE_VER_NAME; break; - } else { - candidate_size = probe_slots[idx]; - free_rpcmem(rpc_buffer); - rpc_buffer = nullptr; } } - if (candidate_size > _rpcmem_capacity) - _rpcmem_capacity = candidate_size; - QNN_LOG_INFO("capacity of rpc ion memory %d MB\n", _rpcmem_capacity); - } - - QNN_LOG_DEBUG("leave qni_init\n"); - - return 0; -} - -int qnn_instance::qnn_finalize() { - int ret_status = 0; - Qnn_ErrorHandle_t error = QNN_SUCCESS; - - if (nullptr != - _pfn_rpc_mem_deinit) // make Qualcomm's SoC equipped low-end phone happy - _pfn_rpc_mem_deinit(); - - if (dlclose(_rpc_lib_handle) != 0) { - QNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s\n", dlerror()); - } else { - QNN_LOG_DEBUG("succeed to close rpcmem lib\n"); - } - - if (nullptr != _qnn_context_handle) { - error = _qnn_interface.qnn_context_free(_qnn_context_handle, - _qnn_profile_handle); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d\n", - 
_qnn_interface.get_backend_id(), - QNN_GET_ERROR_CODE(error)); - } - _qnn_context_handle = nullptr; - } - if (nullptr != _qnn_profile_handle) { - error = _qnn_interface.qnn_profile_free(_qnn_profile_handle); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), - QNN_GET_ERROR_CODE(error)); + if (!found_valid_interface) { + QNN_LOG_WARN("unable to find a valid qnn interface\n"); + return 6; + } else { + QNN_LOG_INFO("find a valid qnn interface\n"); + } + set_qnn_raw_interface(qnn_interface); + + BackendIdType backend_id = provider_list[0]->backendId; + _lib_path_to_backend_id[lib_path] = backend_id; + if (_loaded_backend.count(backend_id) > 0) { + QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists\n", + lib_path.c_str(), backend_id); + } + _loaded_backend[backend_id] = provider_list[0]; + if (_loaded_lib_handle.count(backend_id) > 0) { + QNN_LOG_WARN("closing %p\n", _loaded_lib_handle[backend_id]); + int dlclose_error = dlclose(_loaded_lib_handle[backend_id]); + if (dlclose_error != 0) { + QNN_LOG_WARN("fail to close %p with error %s\n", + _loaded_lib_handle[backend_id], dlerror()); + } } - _qnn_profile_handle = nullptr; + _loaded_lib_handle[backend_id] = lib_handle; + _backend_id = backend_id; + + return 0; } - if (nullptr != _qnn_device_handle) { - error = _qnn_interface.qnn_device_free(_qnn_device_handle); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN device_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), - QNN_GET_ERROR_CODE(error)); + int unload_backend() { + int dlclose_error = 0; + for (auto & it : _loaded_lib_handle) { + dlclose_error = dlclose(it.second); + if (dlclose_error != 0) { + QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, + dlerror()); + } } - _qnn_device_handle = nullptr; + + _loaded_lib_handle.clear(); + _lib_path_to_backend_id.clear(); + _loaded_backend.clear(); + + return 0; } - if (nullptr != _qnn_backend_handle) { - error = _qnn_interface.qnn_backend_free(_qnn_backend_handle); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), - QNN_GET_ERROR_CODE(error)); - } - _qnn_backend_handle = nullptr; + void set_qnn_raw_interface(QNN_INTERFACE_VER_TYPE & raw_interface) { + _qnn_raw_interface = raw_interface; } - if (nullptr != _qnn_log_handle) { - error = _qnn_interface.qnn_log_free(_qnn_log_handle); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN log_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), - QNN_GET_ERROR_CODE(error)); - } - _qnn_log_handle = nullptr; + void set_qnn_raw_system_interface(QNN_SYSTEM_INTERFACE_VER_TYPE & raw_interface) { + _qnn_raw_system_interface = raw_interface; } - unload_backend(); + private: + static constexpr const int _required_num_providers = 1; - unload_system(); + private: + std::string _lib_path; + std::string _backend_name; + std::string _model_name; // prebuilt QNN model name, not used currently + BackendIdType _backend_id; - return ret_status; -} + bool _debug_tensor = false; + bool _do_node_validations = true; -int qnn_instance::init_qnn_graph(const char * graph_name, bool debug, - uint8_t do_node_validation, - const QnnGraph_Config_t ** graph_configs) { - int result = 0; + QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; - if (nullptr == graph_name) { - QNN_LOG_WARN("graph name is null\n"); - return 1; - } + qnn_sdk_profile_level _profile_level = 
qnn_sdk_profile_level::profile_detail; - if (!_graph_name.empty()) { - QNN_LOG_WARN("qnn model for graph %s already initialized\n", graph_name); - return 2; - } + qnn_interface _qnn_interface; - if (!do_node_validation) { - QNN_LOG_WARN("node validation disabled, backend will not perform op " - "validation prior to adding node\n"); - } + void * _system_lib_handle = nullptr; - _graph_name = graph_name; - _debug_tensor = debug; - _do_node_validations = do_node_validation; + Qnn_GraphHandle_t _qnn_graph_handle = nullptr; - result = _qnn_raw_interface.graphCreate(_qnn_context_handle, graph_name, - graph_configs, &_qnn_graph_handle); - if (result != QNN_GRAPH_NO_ERROR || nullptr == _qnn_graph_handle) { - QNN_LOG_WARN("failed to create graph in qnn context\n"); - return 3; - } else { - QNN_LOG_INFO("succeed to create graph %s, %p\n", graph_name, _qnn_graph_handle); - } + Qnn_LogHandle_t _qnn_log_handle = nullptr; - return 0; -} + Qnn_ProfileHandle_t _qnn_profile_handle = nullptr; -int qnn_instance::finalize_qnn_graph() { - if (nullptr != _qnn_graph_handle) { - if (_qnn_raw_interface.graphFinalize(_qnn_graph_handle, - _qnn_profile_handle, - nullptr) != QNN_GRAPH_NO_ERROR) { - QNN_LOG_WARN("finalizing graph failure\n"); - } - } else { - QNN_LOG_DEBUG("qnn graph handle is null\n"); - } + Qnn_DeviceHandle_t _qnn_device_handle = nullptr; - return 0; -} + Qnn_BackendHandle_t _qnn_backend_handle = nullptr; + + Qnn_ContextHandle_t _qnn_context_handle = nullptr; + + QnnSystemContext_Handle_t _qnn_system_handle = nullptr; + + QnnHtpDevice_PerfInfrastructure_t * _qnn_htp_perfinfra = nullptr; + uint32_t _qnn_power_configid = 1; + uint32_t _qnn_rpc_pollingtime = 9999; // 0-10000 us for high performing + + QNN_INTERFACE_VER_TYPE _qnn_raw_interface; + QNN_SYSTEM_INTERFACE_VER_TYPE _qnn_raw_system_interface; + + std::unordered_set _qnn_mem_set; + + std::mutex _init_mutex; + std::unordered_map _loaded_lib_handle; + std::unordered_map _lib_path_to_backend_id; + std::unordered_map _loaded_backend; + + void * _rpc_lib_handle = nullptr; + std::atomic_bool _rpcmem_initialized{false}; + _pfn_rpc_mem_alloc __pfn_rpc_mem_alloc; + _pfn_rpc_mem_free __pfn_rpc_mem_free; + _pfn_rpc_mem_to_fd __pfn_rpc_mem_to_fd; + _pfn_rpc_mem_init __pfn_rpc_mem_init; + _pfn_rpc_mem_deinit __pfn_rpc_mem_deinit; + std::unordered_map _rpcmem_store_map; + size_t _rpcmem_capacity = 512; + + std::string _graph_name; +}; // ================================================================================================= // -// implementation of GGML's QNN backend +// implementation of QNN backend for GGML // // ================================================================================================= -static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context *ctx, - const struct ggml_tensor *tensor, +static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx, + const struct ggml_tensor * tensor, bool b_dump_tensor_info) { // only support the following 3 OPs currently // provide a GENERAL approach could fix this problem in a standalone PR of refine ggml backend @@ -1739,23 +1830,18 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context *ctx, // which the backend's ggml_backend_xxx_buffer_is_host return true. // this approach could be found: // https://github.com/ggerganov/llama.cpp/pull/7641 - // - // ensure tensor->src[0] and tensor->src[1] is not nullptr. 
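[reviewer note] The support check being reworked in this hunk boils down to a small predicate. The sketch below restates it in one place for readability; it keeps the constraints visible in the patch (only ADD/MUL/MUL_MAT, non-null sources, non-degenerate leading dimensions, FP32/FP16 sources plus the Q8_0/Q4_0 quantized cases), but the function name and the exact ordering of the early exits are illustrative, not the literal body of ggml_qnn_can_handle_op.

// hedged sketch, not code from the patch
static bool qnn_op_supported_sketch(const ggml_tensor * tensor) {
    // only these three GGML ops are currently offloaded to QNN
    if (tensor->op != GGML_OP_ADD && tensor->op != GGML_OP_MUL &&
        tensor->op != GGML_OP_MUL_MAT) {
        return false;
    }
    const ggml_tensor * src0 = tensor->src[0];
    const ggml_tensor * src1 = tensor->src[1];
    if (nullptr == src0 || nullptr == src1) {
        return false;
    }
    // degenerate shapes make the rank computation (and the QNN SDK) unhappy
    if (src0->ne[0] <= 1 || src0->ne[1] <= 1 || src1->ne[0] <= 1 || src1->ne[1] <= 1) {
        return false;
    }
    // quantized sources are limited to Q8_0 / Q4_0 for now
    if (ggml_is_quantized(src0->type)) {
        return (src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_Q4_0) &&
               (src1->type == GGML_TYPE_F32);
    }
    return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) &&
           (src1->type == GGML_TYPE_F32);
}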
- bool supported_op = - ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL) || - (tensor->op == GGML_OP_MUL_MAT)); + bool supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL) + || (tensor->op == GGML_OP_MUL_MAT)); if (!supported_op) { return false; } + const struct ggml_tensor * src0 = tensor->src[0]; const struct ggml_tensor * src1 = tensor->src[1]; - const int64_t ne00 = src0->ne[0]; const int64_t ne01 = src0->ne[1]; - const int64_t ne10 = src1->ne[0]; const int64_t ne11 = src1->ne[1]; - const int64_t ne20 = tensor->ne[0]; const int64_t ne21 = tensor->ne[1]; @@ -1801,15 +1887,11 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context *ctx, return false; } - // make ggml_get_tensor_rank and QNN SDK happy + // make qnn_get_ggml_tensor_rank and QNN SDK happy if ((ne00 <= 1 || ne01 <= 1 || ne10 <= 1 || ne11 <= 1)) { return false; } - if ((ne20 < 32) || (ne21 < 32) || (ne10 < 32)) { - return false; - } - int qtype = src0->type; if (tensor->op == GGML_OP_ADD) { return (qtype == GGML_TYPE_F32 || qtype == GGML_TYPE_F16 || @@ -1837,75 +1919,32 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context *ctx, } } + static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { Qnn_ErrorHandle_t error = QNN_SUCCESS; bool graph_initialized = false; - int64_t n_begin_time = 0LL; - int64_t n_end_time = 0LL; - int64_t n_duration = 0LL; - - qnn_instance * instance = nullptr; - std::string graph_name = "ggml_op_qnn_add"; - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Tensor_t * tensor_0 = nullptr; - Qnn_Tensor_t * tensor_1 = nullptr; - Qnn_Tensor_t * tensor_2 = nullptr; - Qnn_Param_t qnn_params[] = {}; - enum ggml_op ggmlop = GGML_OP_ADD; - Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; - - if ((nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { - QNN_LOG_WARN("pls check why GGML tensor is null"); - return; - } + qnn_instance * instance = nullptr; + std::string graph_name = "ggml_op_qnn_add"; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Tensor_t * tensor_0 = nullptr; + Qnn_Tensor_t * tensor_1 = nullptr; + Qnn_Tensor_t * tensor_2 = nullptr; + Qnn_Param_t qnn_params[] = {}; + enum ggml_op ggmlop = GGML_OP_ADD; + Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; + + CHECK_PARAMS(ctx, src0, src1, dst); tensor_0 = (Qnn_Tensor_t *) src0->extra; tensor_1 = (Qnn_Tensor_t *) src1->extra; tensor_2 = (Qnn_Tensor_t *) dst->extra; - if ((nullptr == tensor_0) || (nullptr == tensor_1) || - (nullptr == tensor_2)) { - QNN_LOG_WARN("pls check why QNN tensor is null"); - return; - } - if (nullptr == ctx) { - QNN_LOG_WARN("pls check why backend ctx is null"); - return; - } instance = ctx->instance; - if (nullptr == instance) { - QNN_LOG_WARN("pls check why qnn instance is null"); - return; - } - QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; - - n_begin_time = ggml_time_us(); - - if (0) { - QNN_LOG_DEBUG("call %s\n", __func__); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, src0->type, ggml_type_name(src0->type), - src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], - src0->nb[1], src0->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" 
PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, src1->type, ggml_type_name(src1->type), - src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], - src1->nb[1], src1->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - dst->name, dst->type, ggml_type_name(dst->type), - dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], - dst->nb[1], dst->nb[2]); - QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], - src0->ne[3]); - QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); - QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1)); - QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2)); - } + qnn_perf perf("ggml_qnn_add"); + perf.start(); + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; @@ -1947,36 +1986,39 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src QNN_LOG_INFO("can't create qnn graph handle with graph name %s, " "error = %d\n", graph_name.c_str(), error); - return; + goto failure; } error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, - ggml_get_tensor_data_size(src0)}; + qnn_get_ggml_tensor_data_size(src0)}; QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, - ggml_get_tensor_data_size(src1)}; + qnn_get_ggml_tensor_data_size(src1)}; QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, - ggml_get_tensor_data_size(dst)}; + qnn_get_ggml_tensor_data_size(dst)}; QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->rank = qnn_get_ggml_tensor_rank(src0); QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1); QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; - QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst); QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; @@ -1990,17 +2032,19 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src error = qnn_raw_interface.graphAddNode(graph_handle, op_config); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } - error = - qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, + error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } auto graph_item = 
std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); instance->_qnn_graph_map[map_entry] = graph_item; @@ -2011,8 +2055,6 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src tensor_1 = std::get<2>(graph_item); tensor_2 = std::get<3>(graph_item); - // QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], - // src0->ne[2], src0->ne[3]); uint32_t dimensions_input_0[] = { (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; @@ -2024,38 +2066,61 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src (uint32_t) dst->ne[3]}; QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, - ggml_get_tensor_data_size(src0)}; + qnn_get_ggml_tensor_data_size(src0)}; QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, - ggml_get_tensor_data_size(src1)}; + qnn_get_ggml_tensor_data_size(src1)}; QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, - ggml_get_tensor_data_size(dst)}; + qnn_get_ggml_tensor_data_size(dst)}; QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->rank = qnn_get_ggml_tensor_rank(src0); QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1); QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; - QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst); QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; - error = - qnn_raw_interface.graphExecute(graph_handle, tensor_inputs,2, + error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs,2, tensor_outputs,1, nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } } +failure: + if (QNN_SUCCESS != error) { + QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); + QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1)); + QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2)); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, src0->type, ggml_type_name(src0->type), + src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], + src0->nb[1], src0->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, src1->type, ggml_type_name(src1->type), + src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], + src1->nb[1], src1->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, dst->type, ggml_type_name(dst->type), + dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], + dst->nb[1], dst->nb[2]); + QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], + src0->ne[3]); + } + QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions; QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions; QNN_VER_PTR(*tensor_2)->dimensions = tensor_2_dimensions; - n_end_time = ggml_time_us(); - n_duration = (n_end_time - n_begin_time) / 1000; - QNN_LOG_DEBUG("duration of ggml_qnn_add : %lld milliseconds\n", n_duration); + + perf.info(); } /* @@ -2074,69 +2139,32 @@ 
static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { Qnn_ErrorHandle_t error = QNN_SUCCESS; bool graph_initialized = false; - int64_t n_begin_time = 0LL; - int64_t n_end_time = 0LL; - int64_t n_duration = 0LL; - - qnn_instance * instance = nullptr; - std::string graph_name = "ggml_op_qnn_mul_mat"; - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Tensor_t * tensor_0 = nullptr; - Qnn_Tensor_t * tensor_1 = nullptr; - Qnn_Tensor_t * tensor_2 = nullptr; - - Qnn_Param_t qnn_params[] = {}; - - enum ggml_op ggmlop = GGML_OP_MUL_MAT; - Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; - - if ((nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { - QNN_LOG_WARN("pls check why GGML tensor is null"); - return; - } + qnn_instance * instance = nullptr; + std::string graph_name = "ggml_op_qnn_mul_mat"; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Tensor_t * tensor_0 = nullptr; + Qnn_Tensor_t * tensor_1 = nullptr; + Qnn_Tensor_t * tensor_2 = nullptr; + Qnn_Param_t qnn_params[] = {}; + enum ggml_op ggmlop = GGML_OP_MUL_MAT; + Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; + + CHECK_PARAMS(ctx, src0, src1, dst); tensor_0 = (Qnn_Tensor_t *) src0->extra; tensor_1 = (Qnn_Tensor_t *) src1->extra; tensor_2 = (Qnn_Tensor_t *) dst->extra; - if ((nullptr == tensor_0) || (nullptr == tensor_1) || - (nullptr == tensor_2)) { - QNN_LOG_WARN("pls check why QNN tensor is null"); - return; - } - if (nullptr == ctx) { - QNN_LOG_WARN("pls check why backend ctx is null"); - return; - } instance = ctx->instance; - if (nullptr == instance) { - QNN_LOG_WARN("pls check why qnn instance is null"); - return; - } - QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + qnn_perf perf("ggml_qnn_mul_mat"); + perf.start(); - n_begin_time = ggml_time_us(); - QNN_LOG_DEBUG("call %s\n", __func__); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, src0->type, ggml_type_name(src0->type), - src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], - src0->nb[1], src0->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, src1->type, ggml_type_name(src1->type), - src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], - src1->nb[1], src1->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0], - dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); - QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], - src0->ne[3]); - QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); - QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1)); - QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2)); + tensor_0 = (Qnn_Tensor_t *) src0->extra; + tensor_1 = (Qnn_Tensor_t *) src1->extra; + tensor_2 = (Qnn_Tensor_t *) dst->extra; + instance = ctx->instance; + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; @@ -2178,36 +2206,39 @@ static void 
ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, QNN_LOG_INFO("can't create qnn graph handle with graph name %s, " "error = %d\n", graph_name.c_str(), error); - return; + goto failure; } error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, - ggml_get_tensor_data_size(src0)}; + qnn_get_ggml_tensor_data_size(src0)}; QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, - ggml_get_tensor_data_size(src1)}; + qnn_get_ggml_tensor_data_size(src1)}; QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, - ggml_get_tensor_data_size(dst)}; + qnn_get_ggml_tensor_data_size(dst)}; QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->rank = qnn_get_ggml_tensor_rank(src0); QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1); QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; - QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst); QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; @@ -2220,10 +2251,12 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, error = qnn_raw_interface.graphAddNode(graph_handle, op_config); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, @@ -2231,6 +2264,7 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); instance->_qnn_graph_map[map_entry] = graph_item; @@ -2241,7 +2275,6 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, tensor_1 = std::get<2>(graph_item); tensor_2 = std::get<3>(graph_item); - QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); uint32_t dimensions_input_0[] = { (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; @@ -2252,41 +2285,60 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->rank = qnn_get_ggml_tensor_rank(src0); QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1); 
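[reviewer note] The same three-field re-binding (dimensions, rank, clientBuf) is repeated for src0/src1/dst in both the first-run and the cached branch of every op handler. A minimal sketch of a shared helper for that step is shown below; bind_ggml_to_qnn_tensor is a hypothetical name used only for illustration, while the field accesses and the qnn_get_ggml_tensor_rank / qnn_get_ggml_tensor_data_size helpers mirror the patched code.

// hypothetical helper, not part of the patch: refresh a cached Qnn_Tensor_t so
// it points at the current ggml tensor's shape and data before graphExecute
static void bind_ggml_to_qnn_tensor(Qnn_Tensor_t * qnn_tensor,
                                    const ggml_tensor * t,
                                    uint32_t dims[GGML_MAX_DIMS],
                                    Qnn_DataType_t qnn_type) {
    for (int i = 0; i < GGML_MAX_DIMS; i++) {
        dims[i] = (uint32_t) t->ne[i];
    }
    QNN_VER_PTR(*qnn_tensor)->dimensions = dims;
    QNN_VER_PTR(*qnn_tensor)->rank       = qnn_get_ggml_tensor_rank(t);
    QNN_VER_PTR(*qnn_tensor)->dataType   = qnn_type;
    QNN_VER_PTR(*qnn_tensor)->clientBuf  = {t->data, qnn_get_ggml_tensor_data_size(t)};
}

With such a helper, each cached-graph branch would reduce to three calls (one per tensor) followed by graphExecute, instead of the duplicated field assignments seen in these hunks.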
QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; - QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst); QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, - ggml_get_tensor_data_size(src0)}; + qnn_get_ggml_tensor_data_size(src0)}; QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, - ggml_get_tensor_data_size(src1)}; + qnn_get_ggml_tensor_data_size(src1)}; QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, - ggml_get_tensor_data_size(dst)}; + qnn_get_ggml_tensor_data_size(dst)}; Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; - error = - qnn_raw_interface.graphExecute(graph_handle, + error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } } +failure: + if (QNN_SUCCESS != error) { + QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); + QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1)); + QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2)); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, src0->type, ggml_type_name(src0->type), + src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], + src0->nb[1], src0->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, src1->type, ggml_type_name(src1->type), + src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], + src1->nb[1], src1->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0], + dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); + QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], + src0->ne[3]); + } + QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions; QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions; QNN_VER_PTR(*tensor_2)->dimensions = tensor_2_dimensions; - n_end_time = ggml_time_us(); - n_duration = (n_end_time - n_begin_time) / 1000; - QNN_LOG_DEBUG("duration of ggml_qnn_mul_mat : %lld milliseconds\n", - n_duration); - QNN_LOG_DEBUG("call %s done\n", __func__); + perf.info(); } // common function for GGML OPs using QNN API @@ -2296,10 +2348,6 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { Qnn_ErrorHandle_t error = QNN_SUCCESS; bool graph_initialized = false; - int64_t n_begin_time = 0LL; - int64_t n_end_time = 0LL; - int64_t n_duration = 0LL; - qnn_instance * instance = nullptr; std::string qnn_graph_name = "ggml_qnn_graph"; std::string qnn_op_config_name = "ggml_qnn_op_config"; @@ -2308,73 +2356,39 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, Qnn_Tensor_t * tensor_0 = nullptr; Qnn_Tensor_t * tensor_1 = nullptr; Qnn_Tensor_t * tensor_2 = nullptr; - Qnn_Param_t qnn_params[] = {}; - Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; - if ((nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { - QNN_LOG_WARN("pls check why GGML tensor is null"); - return; - } + CHECK_PARAMS(ctx, src0, src1, dst); tensor_0 = (Qnn_Tensor_t *) src0->extra; 
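[reviewer note] ggml_qnn_add, ggml_qnn_mul_mat and this common op handler all follow the same build-once / run-many pattern around the instance's graph map. The outline below is a hedged sketch of that flow, not code from the patch: graph_cache stands in for instance->_qnn_graph_map, and the tuple layout mirrors the (handle, tensor_0, tensor_1, tensor_2) entries the handlers store.

#include <map>
#include <string>
#include <tuple>

// sketch only: cached item layout mirrors what the op handlers store
using qnn_graph_item_sketch =
    std::tuple<Qnn_GraphHandle_t, Qnn_Tensor_t *, Qnn_Tensor_t *, Qnn_Tensor_t *>;

static std::map<std::string, qnn_graph_item_sketch> graph_cache; // stands in for _qnn_graph_map

static void run_op_graph_sketch(const std::string & graph_name) {
    auto it = graph_cache.find(graph_name);
    if (it == graph_cache.end()) {
        // first call for this graph name:
        //   graphCreate -> tensorCreateGraphTensor (x3) -> graphAddNode
        //   -> graphFinalize -> graphExecute, then cache the handles:
        //   graph_cache[graph_name] = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2);
    } else {
        // later calls: re-bind dimensions/rank/clientBuf on the cached tensors
        // and call graphExecute only, skipping graph construction entirely
    }
}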
tensor_1 = (Qnn_Tensor_t *) src1->extra; tensor_2 = (Qnn_Tensor_t *) dst->extra; - if ((nullptr == tensor_0) || (nullptr == tensor_1) || - (nullptr == tensor_2)) { - QNN_LOG_WARN("pls check why QNN tensor is null"); - return; - } - if (nullptr == ctx) { - QNN_LOG_WARN("pls check why backend ctx is null"); - return; - } instance = ctx->instance; - if (nullptr == instance) { - QNN_LOG_WARN("pls check why qnn instance is null"); + qnn_perf perf(ggml_op_name(ggmlop)); + perf.start(); + + qnn_op_name = qnn_opname_from_ggmlop(ggmlop); + if (nullptr == qnn_op_name) { + QNN_LOG_WARN("ggml op %d(%s) not supported currently", ggmlop, ggml_op_name(ggmlop)); return; } + + tensor_0 = (Qnn_Tensor_t *) src0->extra; + tensor_1 = (Qnn_Tensor_t *) src1->extra; + tensor_2 = (Qnn_Tensor_t *) dst->extra; + instance = ctx->instance; QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); - qnn_op_name = qnn_opname_from_ggmlop(ggmlop); - if (nullptr == qnn_op_name) { - QNN_LOG_WARN( - "pls check why can not get QNN OP name with ggml op %d(%s)", ggmlop, - ggml_op_name(ggmlop)); - return; - } - - n_begin_time = ggml_time_us(); - - QNN_LOG_DEBUG("call %s\n", __func__); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, src0->type, ggml_type_name(src0->type), - src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], - src0->nb[1], src0->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, src1->type, ggml_type_name(src1->type), - src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], - src1->nb[1], src1->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0], - dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); - QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], - src0->ne[3]); - QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); - QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1)); - QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2)); QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; + uint32_t dimensions_input_0[] = { (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; @@ -2413,37 +2427,40 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, QNN_LOG_INFO("can't create qnn graph handle with ggml op %s, graph " "name %s, error = %d\n", ggml_op_name(ggmlop), qnn_graph_name.c_str(), error); - return; + goto failure; } error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, - ggml_get_tensor_data_size(src0)}; + 
qnn_get_ggml_tensor_data_size(src0)}; QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, - ggml_get_tensor_data_size(src1)}; + qnn_get_ggml_tensor_data_size(src1)}; QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, - ggml_get_tensor_data_size(dst)}; + qnn_get_ggml_tensor_data_size(dst)}; QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->rank = qnn_get_ggml_tensor_rank(src0); QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1); QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; - QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst); QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; @@ -2456,10 +2473,12 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, error = qnn_raw_interface.graphAddNode(graph_handle, op_config); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, @@ -2467,6 +2486,7 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); instance->_qnn_graph_map[map_entry] = graph_item; @@ -2477,7 +2497,6 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, tensor_1 = std::get<2>(graph_item); tensor_2 = std::get<3>(graph_item); - QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); uint32_t dimensions_input_0[] = { (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; @@ -2488,21 +2507,21 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->rank = qnn_get_ggml_tensor_rank(src0); QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1); QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; - QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst); QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, - ggml_get_tensor_data_size(src0)}; + qnn_get_ggml_tensor_data_size(src0)}; QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, - ggml_get_tensor_data_size(src1)}; + qnn_get_ggml_tensor_data_size(src1)}; QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, - ggml_get_tensor_data_size(dst)}; + qnn_get_ggml_tensor_data_size(dst)}; Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; @@ -2513,16 
+2532,36 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } } +failure: + if (QNN_SUCCESS != error) { + QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); + QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1)); + QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2)); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, src0->type, ggml_type_name(src0->type), + src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], + src0->nb[1], src0->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, src1->type, ggml_type_name(src1->type), + src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], + src1->nb[1], src1->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0], + dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); + QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], + src0->ne[3]); + } + QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions; QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions; QNN_VER_PTR(*tensor_2)->dimensions = tensor_2_dimensions; - n_end_time = ggml_time_us(); - n_duration = (n_end_time - n_begin_time) / 1000; - QNN_LOG_DEBUG("duration of ggml_qnn_%s : %lld milliseconds\n", - ggml_op_name(ggmlop), n_duration); - QNN_LOG_DEBUG("call %s done\n", __func__); + perf.info(); } static void ggml_qnn_repeat(ggml_backend_qnn_context * ctx, @@ -2829,44 +2868,6 @@ bool ggml_qnn_compute_forward(ggml_backend_qnn_context * ctx, return true; } -struct ggml_backend_qnn_buffer_context { - ggml_backend_qnn_buffer_context(size_t device) - : device(device) - , name(GGML_QNN_NAME + std::to_string(device)) {} - - ~ggml_backend_qnn_buffer_context() { - if (buffer) { - free(buffer); - } - - for (auto * sub_buffer : sub_buffers) { - free(sub_buffer); - } - - for (auto * qnn_tensor : qnn_tensors) { - free_qnn_tensor(*qnn_tensor); - free(qnn_tensor); - } - - sub_buffers.clear(); - qnn_tensors.clear(); - } - void * buffer = nullptr; - - struct ggml_backend_qnn_context * backend_ctx = nullptr; - - size_t buffer_size = 0; - std::vector sub_buffers; - std::vector qnn_tensors; - size_t device; - std::string name; -}; - -struct ggml_backend_qnn_buffer_type_context { - size_t device; - std::string name; -}; - static const char * ggml_backend_qnn_buffer_get_name(ggml_backend_buffer_t buffer) { GGML_UNUSED(buffer); return "QNN"; @@ -2922,7 +2923,7 @@ GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t QNN_QUANTIZATION_ENCODING_UNDEFINED, {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0}}}, - .rank = ggml_get_tensor_rank(tensor), + .rank = qnn_get_ggml_tensor_rank(tensor), .dimensions = dimensions, .memType = QNN_TENSORMEMTYPE_RAW, {.clientBuf = {.data = nullptr, .dataSize = 0}}}}}; @@ -3122,7 +3123,7 @@ GGML_CALL static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const ggml_tensor * op) { ggml_backend_qnn_context *ctx = (ggml_backend_qnn_context *) backend->context; - return (ggml_qnn_can_handle_op(ctx, op, true)); + return (ggml_qnn_can_handle_op(ctx, op, false)); } GGML_CALL static bool ggml_backend_qnn_offload_op(ggml_backend_t backend,const ggml_tensor * tensor) { @@ 
-3213,14 +3214,13 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device) { return nullptr; } - //ref:https://github.com/zhouwg/llama.cpp/pull/1 static ggml_backend_qnn_buffer_type_context ggml_backend_qnn_buffer_type_contexts[GGML_QNN_MAX_DEVICES]; static ggml_backend_buffer_type ggml_backend_qnn_buffer_types[GGML_QNN_MAX_DEVICES]; static bool ggml_backend_qnn_buffer_type_initialized = false; if (!ggml_backend_qnn_buffer_type_initialized) { for (size_t i = 0; i < GGML_QNN_MAX_DEVICES; i++) { auto & context = ggml_backend_qnn_buffer_type_contexts[i]; - context = { i, std::string(GGML_QNN_NAME) + std::to_string(i) }; + context = { i, std::string(QNN_BACKEND_NAME) + std::to_string(i) }; ggml_backend_qnn_buffer_types[i] = { /* .iface = */ { /* .get_name = */ ggml_backend_qnn_buffer_type_name, @@ -3285,10 +3285,10 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { } else { if (0 == setenv("LD_LIBRARY_PATH", path.c_str(), 1)) { QNN_LOG_INFO("%s backend setenv successfully\n", - get_qnn_backend_name(device)); + qnn_get_backend_name(device)); } else { QNN_LOG_ERROR("%s backend setenv failure\n", - get_qnn_backend_name(device)); + qnn_get_backend_name(device)); } } @@ -3298,7 +3298,7 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { if (0 != result) { QNN_LOG_WARN( "init qnn subsystem failed with qnn backend %s, pls check why\n", - get_qnn_backend_name(device)); + qnn_get_backend_name(device)); delete instance; return nullptr; } @@ -3309,7 +3309,7 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { return nullptr; } - std::string device_name = get_qnn_backend_name(device); + std::string device_name = qnn_get_backend_name(device); QNN_LOG_INFO("qnn device name %s", device_name.c_str()); instance->init_qnn_graph(device_name.c_str(), false); g_qnn_mgr[device].instance = instance; From 5269e082aa479de382fefde7518a84036c1b6b7f Mon Sep 17 00:00:00 2001 From: "zhou.weiguo" Date: Tue, 11 Jun 2024 23:05:00 +0800 Subject: [PATCH 014/166] ggml-qnn: refine ggml inference using QNN NPU --- ggml-qnn.cpp | 250 ++++++++++++------------ tests/ggml-qnn/ggml-qnn-ut-build-run.sh | 10 +- tests/ggml-qnn/ggml-qnn-ut.cpp | 42 ++-- 3 files changed, 149 insertions(+), 153 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index 43a8fcd3ea8cb..4700e145112d6 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -55,6 +55,7 @@ #include "Saver/QnnSaver.h" #include "System/QnnSystemInterface.h" #include "HTP/QnnHtpDevice.h" +#include // ================================================================================================= // @@ -72,9 +73,16 @@ static int free_qnn_tensor(Qnn_Tensor_t & tensor); // self-defined macro / data structure // // ================================================================================================= -#define ENABLE_QNNBACKEND_DEBUG 1 // for troubleshooting QNN backend +#ifdef NDEBUG +#define ENABLE_QNNBACKEND_DEBUG 0 // for troubleshooting QNN backend #define ENABLE_QNNSDK_LOG 0 // enable/disable QNN SDK's internal log #define ENABLE_QNNBACKEND_PERF 0 // enable/disable op's perf info +#else +#define ENABLE_QNNBACKEND_DEBUG 1 // for troubleshooting QNN backend +#define ENABLE_QNNSDK_LOG 1 // enable/disable QNN SDK's internal log +#define ENABLE_QNNBACKEND_PERF 1 // enable/disable op's perf info +#endif + #define QNN_LOGBUF_LEN 4096 #define QNN_BACKEND_NAME "qnn" @@ -393,7 +401,6 @@ static void qnn_internal_log(ggml_log_level level, const char * file, } } - static bool 
qnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { if ((nullptr == ctx) || (nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { @@ -438,8 +445,8 @@ class qnn_perf { void info() { _end_time = ggml_time_us(); - _duration = (_end_time - _begin_time) / 1000; - QNN_LOG_DEBUG("duration of %s : %lld milliseconds\n", _perf_name.c_str(), _duration); + _duration = (_end_time - _begin_time); + QNN_LOG_DEBUG("duration of %s : %lld microseconds\n", _perf_name.c_str(), _duration); } private: @@ -473,15 +480,15 @@ enum qnn_sdk_profile_level { profile_detail = 2 }; -using _pfn_rpc_mem_init = void (*)(void); -using _pfn_rpc_mem_deinit = void (*)(void); -using _pfn_rpc_mem_alloc = void *(*) (int, uint32_t, int); -using _pfn_rpc_mem_free = void (*)(void *); -using _pfn_rpc_mem_to_fd = int (*)(void *); +using pfn_rpc_mem_init = void (*)(void); +using pfn_rpc_mem_deinit = void (*)(void); +using pfn_rpc_mem_alloc = void *(*) (int, uint32_t, int); +using pfn_rpc_mem_free = void (*)(void *); +using pfn_rpc_mem_to_fd = int (*)(void *); -using _pfn_qnnsaver_initialize = decltype(QnnSaver_initialize); -using _pfn_qnninterface_getproviders = decltype(QnnInterface_getProviders); -using _pfn_qnnsysteminterface_getproviders = decltype(QnnSystemInterface_getProviders); +using pfn_qnnsaver_initialize = decltype(QnnSaver_initialize); +using pfn_qnninterface_getproviders = decltype(QnnInterface_getProviders); +using pfn_qnnsysteminterface_getproviders = decltype(QnnSystemInterface_getProviders); #define QNN_VER_PTR(x) (&((x).v1)) #define RPCMEM_DEFAULT_FLAGS 1 @@ -702,7 +709,7 @@ static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { if (encoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) { Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; Qnn_AxisScaleOffset_t & axis_scale_offset = src_qparam_cpy.axisScaleOffsetEncoding; - Qnn_ScaleOffset_t **scaleOffset = & axis_scale_offset.scaleOffset; + Qnn_ScaleOffset_t ** scaleOffset = & axis_scale_offset.scaleOffset; size_t scaleOffsetSize = axis_scale_offset.numScaleOffsets * sizeof(Qnn_ScaleOffset_t); *scaleOffset = (Qnn_ScaleOffset_t *) malloc(scaleOffsetSize); memscpy(*scaleOffset, scaleOffsetSize, @@ -732,8 +739,8 @@ static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { uint32_t rank = QNN_TENSOR_GET_RANK(src); QNN_TENSOR_SET_RANK(dst, rank); - size_t dim_size = rank * sizeof(uint32_t); - uint32_t *dimensions = (uint32_t *) malloc(dim_size); + size_t dim_size = rank * sizeof(uint32_t); + uint32_t * dimensions = (uint32_t *) malloc(dim_size); if (dimensions == nullptr) { QNN_LOG_WARN("deep_copy_qnn_tensors() allocation error while copying " "tensor %s\n", @@ -1072,26 +1079,26 @@ class qnn_instance { QNN_LOG_DEBUG("load rpcmem lib successfully\n"); set_rpcmem_initialized(true); } - __pfn_rpc_mem_init = reinterpret_cast<_pfn_rpc_mem_init>( + _pfn_rpc_mem_init = reinterpret_cast( dlsym(_rpc_lib_handle, "rpcmem_init")); - __pfn_rpc_mem_deinit = reinterpret_cast<_pfn_rpc_mem_deinit>( + _pfn_rpc_mem_deinit = reinterpret_cast( dlsym(_rpc_lib_handle, "rpcmem_deinit")); - __pfn_rpc_mem_alloc = reinterpret_cast<_pfn_rpc_mem_alloc>( + _pfn_rpc_mem_alloc = reinterpret_cast( dlsym(_rpc_lib_handle, "rpcmem_alloc")); - __pfn_rpc_mem_free = reinterpret_cast<_pfn_rpc_mem_free>( + _pfn_rpc_mem_free = reinterpret_cast( dlsym(_rpc_lib_handle, "rpcmem_free")); - __pfn_rpc_mem_to_fd = reinterpret_cast<_pfn_rpc_mem_to_fd>( + _pfn_rpc_mem_to_fd = reinterpret_cast( 
dlsym(_rpc_lib_handle, "rpcmem_to_fd")); - if (nullptr == __pfn_rpc_mem_alloc || nullptr == __pfn_rpc_mem_free || - nullptr == __pfn_rpc_mem_to_fd) { + if (nullptr == _pfn_rpc_mem_alloc || nullptr == _pfn_rpc_mem_free || + nullptr == _pfn_rpc_mem_to_fd) { QNN_LOG_WARN("unable to access symbols in QNN RPC lib. dlerror(): %s", dlerror()); dlclose(_rpc_lib_handle); return 9; } if (nullptr != - __pfn_rpc_mem_init) // make Qualcomm's SoC equipped low-end phone happy - __pfn_rpc_mem_init(); + _pfn_rpc_mem_init) // make Qualcomm's SoC equipped low-end phone happy + _pfn_rpc_mem_init(); std::vector temp_context_config; _qnn_interface.qnn_context_create( @@ -1124,7 +1131,6 @@ class qnn_instance { } _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info); - //TODO: faster approach to probe the accurate capacity of rpc ion memory size_t candidate_size = 0; uint8_t * rpc_buffer = nullptr; @@ -1145,6 +1151,16 @@ class qnn_instance { if (candidate_size > _rpcmem_capacity) _rpcmem_capacity = candidate_size; QNN_LOG_INFO("capacity of rpc ion memory %d MB\n", _rpcmem_capacity); + + if (0 != init_htp_perfinfra()) { + QNN_LOG_WARN("initialize HTP performance failure"); + } + if (0 != set_rpc_polling()) { + QNN_LOG_WARN("set RPC polling failure"); + } + if (0 != set_high_performance_mode()) { + QNN_LOG_WARN("set HTP high performance mode failure"); + } } QNN_LOG_DEBUG("leave qni_init\n"); @@ -1156,9 +1172,8 @@ class qnn_instance { int ret_status = 0; Qnn_ErrorHandle_t error = QNN_SUCCESS; - if (nullptr != - __pfn_rpc_mem_deinit) // make Qualcomm's SoC equipped low-end phone happy - __pfn_rpc_mem_deinit(); + if (nullptr != _pfn_rpc_mem_deinit) // make Qualcomm's SoC equipped low-end phone happy + _pfn_rpc_mem_deinit(); if (dlclose(_rpc_lib_handle) != 0) { QNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s\n", dlerror()); @@ -1325,6 +1340,8 @@ class qnn_instance { if (error != QNN_SUCCESS) { QNN_LOG_WARN("failed to get qnn device infra\n"); return 1; + } else { + QNN_LOG_INFO("HTP backend perf_infrastructure creation ok\n"); } QnnHtpDevice_Infrastructure_t * htp_infra = static_cast(device_infra); @@ -1333,6 +1350,11 @@ class qnn_instance { uint32_t device_id = 0; uint32_t core_id = 0; htp_perfinfra->createPowerConfigId(device_id, core_id, &power_configid); + if (htp_infra->infraType != QNN_HTP_DEVICE_INFRASTRUCTURE_TYPE_PERF) { + QNN_LOG_INFO("HTP infra type = %d, which is not perf infra type", htp_infra->infraType); + } else { + QNN_LOG_INFO("HTP infra type = %d, which is perf infra type\n", htp_infra->infraType); + } _qnn_htp_perfinfra = htp_perfinfra; _qnn_power_configid = power_configid; @@ -1343,14 +1365,17 @@ class qnn_instance { if (_qnn_rpc_pollingtime > 0) { QnnHtpPerfInfrastructure_PowerConfig_t rpc_pollingTime; memset(&rpc_pollingTime, 0, sizeof(rpc_pollingTime)); - rpc_pollingTime.option = - QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; + rpc_pollingTime.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; rpc_pollingTime.rpcPollingTimeConfig = _qnn_rpc_pollingtime; - const QnnHtpPerfInfrastructure_PowerConfig_t * powerConfigs[] = { - &rpc_pollingTime, nullptr}; + + QnnHtpPerfInfrastructure_PowerConfig_t rpc_ControlLatency; + memset(&rpc_ControlLatency, 0, sizeof(rpc_ControlLatency)); + rpc_ControlLatency.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_CONTROL_LATENCY; + rpc_ControlLatency.rpcControlLatencyConfig = 40; + + const QnnHtpPerfInfrastructure_PowerConfig_t * powerConfigs[] = {&rpc_pollingTime, &rpc_ControlLatency, nullptr}; 
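[reviewer note] The two RPC-related power configs added in this hunk (polling time plus control latency) are the core of the NPU tuning introduced by this patch, so a condensed sketch of that step is worth spelling out. The option enums, the setPowerConfig call and the 40 us latency value are taken from the hunk itself; the wrapper name, its signature and the early-return error handling are illustrative only, and the snippet assumes the QnnHtpDevice / QnnHtpPerfInfrastructure headers already included by ggml-qnn.cpp.

// illustrative wrapper around the power-config sequence in this hunk
static int set_rpc_power_configs_sketch(QnnHtpDevice_PerfInfrastructure_t * perfinfra,
                                        uint32_t power_configid,
                                        uint32_t polling_time_us) {
    if (nullptr == perfinfra) {
        return 1;
    }

    QnnHtpPerfInfrastructure_PowerConfig_t polling;
    memset(&polling, 0, sizeof(polling));
    polling.option               = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME;
    polling.rpcPollingTimeConfig = polling_time_us;   // 0-10000 us, 9999 in the patch

    QnnHtpPerfInfrastructure_PowerConfig_t latency;
    memset(&latency, 0, sizeof(latency));
    latency.option                  = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_CONTROL_LATENCY;
    latency.rpcControlLatencyConfig = 40;             // value used by the patch

    // the array handed to setPowerConfig is null-terminated, as in the patch
    const QnnHtpPerfInfrastructure_PowerConfig_t * configs[] = {&polling, &latency, nullptr};
    perfinfra->setPowerConfig(power_configid, configs);
    return 0;
}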
if (_qnn_htp_perfinfra) { - _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, - powerConfigs); + _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, powerConfigs); } } return 0; @@ -1426,7 +1451,7 @@ class qnn_instance { } auto allocate_bytes = static_cast(bytes + alignment); - void * buf = __pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, + void * buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, allocate_bytes); if (buf == nullptr) { QNN_LOG_WARN("failed to allocate rpc memory\n"); @@ -1439,7 +1464,7 @@ class qnn_instance { _rpcmem_store_map.insert(std::pair(aligned_buf, buf)).second; if (!status) { QNN_LOG_WARN("failed to allocate rpc memory\n"); - __pfn_rpc_mem_free(buf); + _pfn_rpc_mem_free(buf); } return aligned_buf; @@ -1451,7 +1476,7 @@ class qnn_instance { } else if (0 == _rpcmem_store_map.count(buf)) { QNN_LOG_WARN("no allocated tensor\n"); } else { - __pfn_rpc_mem_free(_rpcmem_store_map[buf]); + _pfn_rpc_mem_free(_rpcmem_store_map[buf]); _rpcmem_store_map.erase(buf); } } @@ -1461,7 +1486,7 @@ class qnn_instance { if (!is_rpcmem_initialized()) { QNN_LOG_WARN("rpc memory not initialized\n"); } else { - mem_fd = __pfn_rpc_mem_to_fd(buf); + mem_fd = _pfn_rpc_mem_to_fd(buf); } return mem_fd; @@ -1560,7 +1585,7 @@ class qnn_instance { } auto * get_providers = - reinterpret_cast<_pfn_qnnsysteminterface_getproviders *>( + reinterpret_cast( dlsym(_system_lib_handle, "QnnSystemInterface_getProviders")); if (nullptr == get_providers) { QNN_LOG_WARN( @@ -1661,7 +1686,7 @@ class qnn_instance { return 1; } - auto get_providers = load_qnn_functionpointers<_pfn_qnninterface_getproviders *>( + auto get_providers = load_qnn_functionpointers( lib_handle, "QnnInterface_getProviders"); if (nullptr == get_providers) { QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", @@ -1805,11 +1830,11 @@ class qnn_instance { void * _rpc_lib_handle = nullptr; std::atomic_bool _rpcmem_initialized{false}; - _pfn_rpc_mem_alloc __pfn_rpc_mem_alloc; - _pfn_rpc_mem_free __pfn_rpc_mem_free; - _pfn_rpc_mem_to_fd __pfn_rpc_mem_to_fd; - _pfn_rpc_mem_init __pfn_rpc_mem_init; - _pfn_rpc_mem_deinit __pfn_rpc_mem_deinit; + pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; + pfn_rpc_mem_free _pfn_rpc_mem_free; + pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd; + pfn_rpc_mem_init _pfn_rpc_mem_init; + pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; std::unordered_map _rpcmem_store_map; size_t _rpcmem_capacity = 512; @@ -1824,101 +1849,63 @@ class qnn_instance { static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx, const struct ggml_tensor * tensor, bool b_dump_tensor_info) { - // only support the following 3 OPs currently - // provide a GENERAL approach could fix this problem in a standalone PR of refine ggml backend - // subsystem for mixed inference between CPU&GPU / CPU&NPU easily for ANY ggml backends - // which the backend's ggml_backend_xxx_buffer_is_host return true. 
- // this approach could be found: - // https://github.com/ggerganov/llama.cpp/pull/7641 - bool supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL) - || (tensor->op == GGML_OP_MUL_MAT)); - if (!supported_op) { + if (ggml_is_empty(tensor) || tensor->op == GGML_OP_RESHAPE || + tensor->op == GGML_OP_TRANSPOSE || tensor->op == GGML_OP_VIEW || + tensor->op == GGML_OP_PERMUTE || tensor->op == GGML_OP_NONE) { return false; } const struct ggml_tensor * src0 = tensor->src[0]; const struct ggml_tensor * src1 = tensor->src[1]; + if (nullptr == src0 || nullptr == src1) { + return false; + } + const int64_t ne00 = src0->ne[0]; const int64_t ne01 = src0->ne[1]; const int64_t ne10 = src1->ne[0]; const int64_t ne11 = src1->ne[1]; - const int64_t ne20 = tensor->ne[0]; - const int64_t ne21 = tensor->ne[1]; - - //TODO: support other quantized data type - if (ggml_is_quantized(src0->type)) { - if ((src0->type != GGML_TYPE_Q8_0) && (src0->type != GGML_TYPE_Q4_0)) { - return false; - } - } - - if (b_dump_tensor_info) { - if (tensor->op == GGML_OP_MUL_MAT) { - QNN_LOG_DEBUG("GGML_OP_MUL_MAT"); - QNN_LOG_DEBUG("op name:%s, tensor type:%s", - ggml_op_name(tensor->op), - ggml_type_name(tensor->type)); - QNN_LOG_DEBUG("src0 type:%s", ggml_type_name(tensor->src[0]->type)); - QNN_LOG_DEBUG("src1 type:%s", ggml_type_name(tensor->src[1]->type)); - QNN_LOG_DEBUG("src0 %15s: type = %i (%5s) ne = %5" PRIi64 - " x %5" PRIi64 " x %5" PRIi64 - ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, src0->type, ggml_type_name(src0->type), - src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], - src0->nb[1], src0->nb[2]); - QNN_LOG_DEBUG("src1 %15s: type = %i (%5s) ne = %5" PRIi64 - " x %5" PRIi64 " x %5" PRIi64 - ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, src1->type, ggml_type_name(src1->type), - src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], - src1->nb[1], src1->nb[2]); - QNN_LOG_DEBUG( - " %15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - tensor->name, tensor->type, ggml_type_name(tensor->type), - tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->nb[0], - tensor->nb[1], tensor->nb[2]); - } - } - if (ggml_is_empty(tensor) || tensor->op == GGML_OP_RESHAPE || - tensor->op == GGML_OP_TRANSPOSE || tensor->op == GGML_OP_VIEW || - tensor->op == GGML_OP_PERMUTE || tensor->op == GGML_OP_NONE) { + // make qnn_get_ggml_tensor_rank and QNN SDK happy + if (ne00 <= 1 || ne01 <= 1 || ne10 <= 1 || ne11 <= 1) { return false; } - // make qnn_get_ggml_tensor_rank and QNN SDK happy - if ((ne00 <= 1 || ne01 <= 1 || ne10 <= 1 || ne11 <= 1)) { + // TODO: support other GGML OPs using QNN API + // a GENERAL approach could fix this problem in a standalone PR of refine ggml backend + // subsystem for mixed inference between CPU&GPU / CPU&NPU easily for ANY ggml backends + // which the backend's ggml_backend_xxx_buffer_is_host return true. 
+ // this approach could be found: + // https://github.com/ggerganov/llama.cpp/pull/7641 + bool supported_op = false; + supported_op = (tensor->op == GGML_OP_ADD); + supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL) || (tensor->op == GGML_OP_MUL_MAT)); + if (!supported_op) { return false; } - int qtype = src0->type; - if (tensor->op == GGML_OP_ADD) { - return (qtype == GGML_TYPE_F32 || qtype == GGML_TYPE_F16 || - qtype == GGML_TYPE_Q8_0) && - (src1->type == GGML_TYPE_F32); + //TODO: support other quantized data type + if (ggml_is_quantized(src0->type)) { + if (src0->type != GGML_TYPE_Q8_0 && src0->type != GGML_TYPE_Q4_0) { + return false; + } } + int qtype = src0->type; if (tensor->op == GGML_OP_MUL) { return (qtype == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32); } if (tensor->op == GGML_OP_MUL_MAT) { - if (ctx->device == QNN_BACKEND_GGML) { - return (ne00 == ne10) && (src1->ne[2] % src0->ne[2] == 0) && - (src1->ne[3] % src0->ne[3] == 0); - } - if ((ctx->device == QNN_BACKEND_NPU) && (qtype == GGML_TYPE_Q8_0) && - (src1->type == GGML_TYPE_F32) && (tensor->type == GGML_TYPE_F32)) { + if (ne00 <= 32 || ne01 <= 32 || ne10 <= 32 || ne11 <= 32) { + return false; + } else { return true; } - if (ctx->device == QNN_BACKEND_CPU || ctx->device == QNN_BACKEND_GPU) { - return (ne00 == ne10) && (ne00 == ne01); - } - return false; } -} + return true; +} static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -1978,10 +1965,25 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src if (!graph_initialized) { graph_name = graph_name + "_" + std::to_string(ctx->threads) + src0->name + "_" + src1->name; - QNN_LOG_DEBUG("graph name %s", graph_name.c_str()); - error = qnn_raw_interface.graphCreate( - instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, - &graph_handle); + QNN_LOG_INFO("graph name %s", graph_name.c_str()); + if (ctx->device == QNN_BACKEND_NPU) { + QnnHtpGraph_CustomConfig_t custom_config; + custom_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; + custom_config.numHvxThreads = 8; + + QnnGraph_Config_t graph_config; + graph_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_config.customConfig = &custom_config; + const QnnGraph_Config_t * p_graphconfig[] = {&graph_config, NULL}; + error = qnn_raw_interface.graphCreate( + instance->get_qnn_context_handle(), graph_name.c_str(), p_graphconfig, + &graph_handle); + } else { + error = qnn_raw_interface.graphCreate( + instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, + &graph_handle); + } + if (QNN_SUCCESS != error) { QNN_LOG_INFO("can't create qnn graph handle with graph name %s, " "error = %d\n", @@ -2112,8 +2114,6 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); - QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], - src0->ne[3]); } QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions; @@ -2198,7 +2198,7 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, if (!graph_initialized) { graph_name = graph_name + "_" + std::to_string(ctx->threads) + src0->name + "_" + src1->name; - QNN_LOG_DEBUG("graph name %s", graph_name.c_str()); + QNN_LOG_INFO("graph name %s", graph_name.c_str()); error = qnn_raw_interface.graphCreate( instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, 
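// [editor's note] ggml_qnn_add() here and ggml_qnn_mul_mat() below follow a
// build-once / run-many pattern: the first call for a given graph name creates
// the QNN graph and its three tensors, finalizes it, and caches the handles in
// instance->_qnn_graph_map; later calls only refresh tensor dimensions and
// data, then call graphExecute. A condensed sketch of that control flow
// (identifiers follow the surrounding code; error handling omitted):

auto it = instance->_qnn_graph_map.find(map_entry);
if (it == instance->_qnn_graph_map.end()) {
    // first run: build, finalize and cache the graph
    qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(),
                                  graph_name.c_str(), nullptr, &graph_handle);
    // ... tensorCreateGraphTensor() x3, graphAddNode(), graphFinalize() ...
    instance->_qnn_graph_map[map_entry] =
            std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2);
} else {
    // later runs: reuse the cached graph and tensors
    std::tie(graph_handle, tensor_0, tensor_1, tensor_2) = it->second;
}
qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1,
                               nullptr, nullptr);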
&graph_handle); @@ -2331,8 +2331,6 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); - QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], - src0->ne[3]); } QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions; @@ -2894,7 +2892,6 @@ GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t Qnn_ErrorHandle_t error = QNN_SUCCESS; ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context; - static int idx = 0; char tensor_name[GGML_MAX_NAME] = {0}; snprintf(tensor_name, GGML_MAX_NAME, "tensor_%04d", idx++); @@ -3061,7 +3058,7 @@ GGML_CALL static const char * ggml_backend_qnn_name(ggml_backend_t backend) { GGML_CALL static void ggml_backend_qnn_free(ggml_backend_t backend) { QNN_LOG_INFO("enter %s", __func__); ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; - QNN_LOG_DEBUG("idx %d, name:%s", ctx->device, g_qnn_mgr[ctx->device].name); + QNN_LOG_INFO("idx %d, name:%s", ctx->device, g_qnn_mgr[ctx->device].name); qnn_instance * instance = (qnn_instance *)g_qnn_mgr[ctx->device].instance; if (instance != nullptr) { @@ -3073,7 +3070,7 @@ GGML_CALL static void ggml_backend_qnn_free(ggml_backend_t backend) { auto & graph_item = graph_it->second; Qnn_GraphHandle_t & graph_handle = std::get<0>(graph_item); GGML_UNUSED(graph_handle); - QNN_LOG_DEBUG("graph type:%s", graph_it->first.c_str()); + QNN_LOG_INFO("graph type:%s", graph_it->first.c_str()); } instance->_qnn_graph_map.clear(); @@ -3104,7 +3101,7 @@ GGML_CALL static ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backe params.type = GGML_TASK_TYPE_COMPUTE; params.ith = 0; for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_tensor *node = cgraph->nodes[i]; + ggml_tensor * node = cgraph->nodes[i]; if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { @@ -3213,7 +3210,6 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device) { device, GGML_QNN_MAX_DEVICES - 1); return nullptr; } - static ggml_backend_qnn_buffer_type_context ggml_backend_qnn_buffer_type_contexts[GGML_QNN_MAX_DEVICES]; static ggml_backend_buffer_type ggml_backend_qnn_buffer_types[GGML_QNN_MAX_DEVICES]; static bool ggml_backend_qnn_buffer_type_initialized = false; diff --git a/tests/ggml-qnn/ggml-qnn-ut-build-run.sh b/tests/ggml-qnn/ggml-qnn-ut-build-run.sh index 192f2f4bda2f5..4c21be5a41fa2 100755 --- a/tests/ggml-qnn/ggml-qnn-ut-build-run.sh +++ b/tests/ggml-qnn/ggml-qnn-ut-build-run.sh @@ -12,6 +12,8 @@ ANDROID_PLATFORM=android-34 GGML_QNN_UT=ggml-qnn-ut REMOTE_PATH=/data/local/tmp/ +BUILDTYPE=Debug +BUILDTYPE=Release function dump_vars() @@ -70,7 +72,7 @@ function check_and_download_ndk() function build_arm64 { - cmake -H. -B./out/arm64-v8a -DTARGET_NAME=${GGML_QNN_UT} -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=${ANDROID_PLATFORM} -DANDROID_NDK=${ANDROID_NDK} -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake -DQNN_SDK_PATH=${QNN_SDK_PATH} + cmake -H. 
-B./out/arm64-v8a -DTARGET_NAME=${GGML_QNN_UT} -DCMAKE_BUILD_TYPE=${BUILDTYPE} -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=${ANDROID_PLATFORM} -DANDROID_NDK=${ANDROID_NDK} -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake -DQNN_SDK_PATH=${QNN_SDK_PATH} cd ./out/arm64-v8a make @@ -166,9 +168,9 @@ function show_usage() echo "Usage:" echo " $0 build (build Android command line UT program)" echo " $0 updateqnnlibs (upload the latest QNN libs to Android phone)" - echo " $0 GGML_OP_ADD 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU)" - echo " $0 GGML_OP_MUL 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU)" - echo " $0 GGML_OP_MUL_MAT 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU)" + echo " $0 GGML_OP_ADD 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU) / 3(ggml)" + echo " $0 GGML_OP_MUL 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU) / 3(ggml)" + echo " $0 GGML_OP_MUL_MAT 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU) / 3(ggml)" echo -e "\n\n\n" } diff --git a/tests/ggml-qnn/ggml-qnn-ut.cpp b/tests/ggml-qnn/ggml-qnn-ut.cpp index eb072beae6bd4..9af433ceb6690 100644 --- a/tests/ggml-qnn/ggml-qnn-ut.cpp +++ b/tests/ggml-qnn/ggml-qnn-ut.cpp @@ -72,14 +72,12 @@ static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const int len_prefix = snprintf(s_ggml_qnn_log_internal_buf, GGML_QNN_LOGBUF_LEN, "[%s, %d]: ", func, line); int len = vsnprintf(s_ggml_qnn_log_internal_buf + len_prefix, GGML_QNN_LOGBUF_LEN - len_prefix, format, args); if (len < (GGML_QNN_LOGBUF_LEN - len_prefix)) { - //for Android command line application or WoA printf("%s\n", s_ggml_qnn_log_internal_buf); } va_end(args); } } - static const char * get_qnn_backend_name(int n_backend_type) { switch (n_backend_type) { case 0: @@ -95,7 +93,6 @@ static const char * get_qnn_backend_name(int n_backend_type) { } } - static bool ggml_graph_compute_helper( struct ggml_backend * backend, struct ggml_cgraph * graph, @@ -123,26 +120,25 @@ static bool ggml_graph_compute_helper( } #endif - //a new approch of mixed inference if (nullptr != backend) return ggml_backend_graph_compute(backend, graph) == GGML_STATUS_SUCCESS; else return ggml_graph_compute(graph, &plan); } - #define QK8_0 32 + typedef struct { uint16_t d; // delta int8_t qs[QK8_0]; // quants } block_q8_0; - static inline float ggml_compute_fp16_to_fp32(uint16_t h) { __fp16 tmp; memcpy(&tmp, &h, sizeof(uint16_t)); return (float)tmp; } + #define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) static void tensor_dump(const ggml_tensor * tensor, const char * name) { @@ -245,7 +241,6 @@ static void tensor_dump(const ggml_tensor * tensor, const char * name) { } } - static uint32_t get_tensor_rank(const ggml_tensor * tensor) { uint32_t rank = 0; for (int i = 0; i < GGML_MAX_DIMS; i++) { @@ -256,7 +251,6 @@ static uint32_t get_tensor_rank(const ggml_tensor * tensor) { return rank; } - static uint32_t get_tensor_data_size(const ggml_tensor * tensor) { size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); size_t n_dims = get_tensor_rank(tensor); @@ -270,7 +264,6 @@ static uint32_t get_tensor_data_size(const ggml_tensor * tensor) { return ggml_nbytes(tensor); } - //ref: https://github.com/ggerganov/llama.cpp/blob/master/tests/test-backend-ops.cpp#L20 static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) { // static RNG initialization (revisit if n_threads stops being constant) @@ -305,8 +298,11 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m t.join(); } if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) { - 
//ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float)); +#ifdef GGML_USE_QNN memcpy((char*)tensor->data, data.data(), size * sizeof(float)); +#else + ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float)); +#endif } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) { GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0); std::vector dataq(ggml_row_size(tensor->type, size)); @@ -321,18 +317,23 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m } ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], im); GGML_ASSERT(ggml_validate_row_data(tensor->type, dataq.data(), dataq.size())); - //ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size()); +#ifdef GGML_USE_QNN memcpy((char*)tensor->data, dataq.data(), dataq.size()); +#else + ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size()); +#endif } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) { // This is going to create some weird integers though. - //ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor)); +#ifdef GGML_USE_QNN memcpy((char*)tensor->data, data.data(), ggml_nbytes(tensor)); +#else + ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor)); +#endif } else { GGML_ASSERT(false); } } - //ref: https://github.com/ggerganov/llama.cpp/blob/master/tests/test-backend-ops.cpp#L310 static void initialize_tensors(ggml_context * ctx) { for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { @@ -340,19 +341,17 @@ static void initialize_tensors(ggml_context * ctx) { } } - static void show_usage() { printf(" " \ "\nUsage: test_qnn_ops [options]\n" \ "\n" \ "Options:\n" \ " -t GGML_OP_ADD / GGML_OP_MUL / GGML_OP_MULMAT\n" \ - " -b 0(QNN_CPU) 1(QNN_GPU) 2(QNN_NPU)\n" \ + " -b 0(QNN_CPU) 1(QNN_GPU) 2(QNN_NPU) 3(ggml)\n" \ " ?/h print usage infomation\n\n" ); } - static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { int64_t n_begin_time = 0LL; int64_t n_end_time = 0LL; @@ -369,16 +368,15 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { ggml_backend_t backend = nullptr; ggml_backend_buffer_t buffer= nullptr; - ggml_type qtype = GGML_TYPE_I8; - qtype = GGML_TYPE_F32; + ggml_type qtype = GGML_TYPE_I8; qtype = GGML_TYPE_F16; qtype = GGML_TYPE_Q8_0; + qtype = GGML_TYPE_F32; std::vector work_buffer; QNN_LOG_DEBUG("enter qnn_ggml_op\n"); QNN_LOG_DEBUG("ggml op:%d(%s)\n", n_ggml_op_type, ggml_op_name((enum ggml_op) n_ggml_op_type)); - n_begin_time = ggml_time_us(); srand(time(NULL)); @@ -473,7 +471,6 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { initialize_tensors(ctx); } ggml_set_f32(src1, (rand() % 100 + 1)); - //ggml_set_f32(dst, 0.0f); } ggml_graph_compute_helper(backend, gf, work_buffer, num_threads, nullptr, nullptr); @@ -501,13 +498,13 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { ggml_free(ctx); ggml_backend_buffer_free(buffer); ggml_backend_free(backend); + n_end_time = ggml_time_us(); n_duration = (n_end_time - n_begin_time) / 1000; QNN_LOG_DEBUG("duration of ut GGML_OP_%s using QNN backend %s: %lld milliseconds\n", ggml_op_name((enum ggml_op)n_ggml_op_type), get_qnn_backend_name(n_backend_type), n_duration); return 0; } - int main(int argc, char * argv[]) { int num_threads = 4; int n_backend_type = 
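// [editor's note] the block_q8_0 struct and GGML_FP16_TO_FP32 helper defined
// above are there so this test can inspect quantized tensors. For reference, a
// minimal sketch of dequantizing one Q8_0 block (32 int8 weights sharing one
// fp16 scale), matching the layout shown above:

static void dequantize_block_q8_0(const block_q8_0 * b, float * y) {
    const float d = GGML_FP16_TO_FP32(b->d);   // per-block scale
    for (int i = 0; i < QK8_0; i++) {
        y[i] = d * b->qs[i];                   // scale each int8 quant
    }
}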
QNN_BACKEND_CPU; @@ -531,7 +528,7 @@ int main(int argc, char * argv[]) { } else if (0 == strcmp(argv[i], "-b")) { if (i + 1 < argc) { int backend = atoi(argv[i + 1]); - if (backend <= QNN_BACKEND_NPU) + if (backend <= QNN_BACKEND_GGML) n_backend_type = backend; else { show_usage(); @@ -549,5 +546,6 @@ int main(int argc, char * argv[]) { QNN_LOG_DEBUG("backend %d, ggml op:%d(%s)", n_backend_type, n_ggml_op_type, ggml_op_name((enum ggml_op) n_ggml_op_type)); qnn_op_ut(num_threads, n_backend_type, n_ggml_op_type); + return 0; } From faaa86b7e4925c0ea38480cc1b88e1a52097e221 Mon Sep 17 00:00:00 2001 From: "zhou.weiguo" Date: Wed, 12 Jun 2024 16:30:50 +0800 Subject: [PATCH 015/166] ggml-qnn: refine ggml inference using QNN NPU --- ggml-qnn.cpp | 668 ++++++++++++++++++++++++--------- tests/ggml-qnn/CMakeLists.txt | 8 +- tests/ggml-qnn/ggml-qnn-ut.cpp | 3 +- 3 files changed, 507 insertions(+), 172 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index 4700e145112d6..f59c54fcacd97 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -1001,12 +1001,10 @@ class qnn_instance { _qnn_interface.set_qnn_interface(_loaded_backend[backend_id]); - _qnn_interface.qnn_log_create(qnn_sdk_logcallback, _qnn_log_level, - &_qnn_log_handle); + _qnn_interface.qnn_log_create(qnn_sdk_logcallback, _qnn_log_level, &_qnn_log_handle); if (nullptr == _qnn_log_handle) { - QNN_LOG_WARN( - "why failed to initialize qnn log\n"); // NPU backend not work on - // Qualcomm SoC equipped low-end phone + // NPU backend not work on Qualcomm SoC equipped low-end phone + QNN_LOG_WARN("why failed to initialize qnn log\n"); return 4; } else { QNN_LOG_DEBUG("initialize qnn log successfully\n"); @@ -1025,23 +1023,62 @@ class qnn_instance { } if (nullptr != _qnn_raw_interface.propertyHasCapability) { - auto qnnStatus = + Qnn_ErrorHandle_t qnn_status = _qnn_raw_interface.propertyHasCapability(QNN_PROPERTY_GROUP_DEVICE); - if (QNN_PROPERTY_NOT_SUPPORTED == qnnStatus) { + if (QNN_PROPERTY_NOT_SUPPORTED == qnn_status) { QNN_LOG_WARN("device property is not supported\n"); } - if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnnStatus) { + if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnn_status) { QNN_LOG_WARN("device property is not known to backend\n"); } } - Qnn_ErrorHandle_t qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, nullptr, - &_qnn_device_handle); + Qnn_ErrorHandle_t qnn_status = QNN_SUCCESS; + if (_backend_name.find("Htp") != std::variant_npos) { + const QnnDevice_PlatformInfo_t * p_info = nullptr; + _qnn_raw_interface.deviceGetPlatformInfo(nullptr, &p_info); + QNN_LOG_INFO("device counts %d", p_info->v1.numHwDevices); + QnnDevice_HardwareDeviceInfo_t * infos = p_info->v1.hwDevices; + QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = { }; + for (int i = 0; i < p_info->v1.numHwDevices; i++) { + QNN_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d", infos[i].v1.deviceId, + infos[i].v1.deviceType, infos[i].v1.numCores); + QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; + chipinfo = devinfo->onChipDevice; + QnnHtpDevice_Arch_t htp_arch = chipinfo.arch; + QNN_LOG_INFO("htp_type:%d(%s)", devinfo->devType, (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? 
"ON_CHIP" : ""); + QNN_LOG_INFO("qualcomm soc_model:%d(%s), htp_arch:%d(%s), vtcm_size:%d MB", \ + chipinfo.socModel, qnn_get_chipset_desc(chipinfo.socModel), \ + htp_arch, qnn_get_htparch_desc(htp_arch), chipinfo.vtcmSize); + g_qnn_mgr[QNN_BACKEND_NPU].socinfo = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize }; + } + _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info); + + QnnHtpDevice_CustomConfig_t soc_customconfig; + soc_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_SOC; + soc_customconfig.socModel = chipinfo.socModel; + QnnDevice_Config_t soc_devconfig; + soc_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; + soc_devconfig.customConfig = &soc_customconfig; + + QnnHtpDevice_CustomConfig_t arch_customconfig; + arch_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_ARCH; + arch_customconfig.arch.arch = chipinfo.arch; + arch_customconfig.arch.deviceId = 0; // Id of device to be used. If single device is used by default 0. + QnnDevice_Config_t arch_devconfig; + arch_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; + arch_devconfig.customConfig = &arch_customconfig; + + const QnnDevice_Config_t * p_deviceconfig[] = {&soc_devconfig, &arch_devconfig, NULL}; + qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, p_deviceconfig, &_qnn_device_handle); + } else { + qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, nullptr, &_qnn_device_handle); + } if (QNN_SUCCESS != qnn_status && QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnn_status) { QNN_LOG_WARN("failed to create QNN device\n"); } else { - QNN_LOG_INFO("create device successfully\n"); + QNN_LOG_INFO("create QNN device successfully\n"); } if (qnn_sdk_profile_level::profile_off != _profile_level) { @@ -1096,9 +1133,9 @@ class qnn_instance { return 9; } - if (nullptr != - _pfn_rpc_mem_init) // make Qualcomm's SoC equipped low-end phone happy + if (nullptr != _pfn_rpc_mem_init) { // make Qualcomm's SoC equipped low-end phone happy _pfn_rpc_mem_init(); + } std::vector temp_context_config; _qnn_interface.qnn_context_create( @@ -1113,32 +1150,14 @@ class qnn_instance { } if (_backend_name.find("Htp") != std::variant_npos) { - const QnnDevice_PlatformInfo_t * p_info = nullptr; - _qnn_raw_interface.deviceGetPlatformInfo(nullptr, &p_info); - QNN_LOG_INFO("device counts %d", p_info->v1.numHwDevices); - QnnDevice_HardwareDeviceInfo_t * infos = p_info->v1.hwDevices; - for (int i = 0; i < p_info->v1.numHwDevices; i++) { - QNN_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d", infos[i].v1.deviceId, - infos[i].v1.deviceType, infos[i].v1.numCores); - QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; - QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = devinfo->onChipDevice; - QnnHtpDevice_Arch_t htp_arch = chipinfo.arch; - QNN_LOG_INFO("htp_type:%d(%s)", devinfo->devType, (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? 
"ON_CHIP" : ""); - QNN_LOG_INFO("qualcomm soc_model:%d(%s), htp_arch:%d(%s), vtcm_size:%d MB", \ - chipinfo.socModel, qnn_get_chipset_desc(chipinfo.socModel), \ - htp_arch, qnn_get_htparch_desc(htp_arch), chipinfo.vtcmSize); - g_qnn_mgr[QNN_BACKEND_NPU].socinfo = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize }; - } - _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info); - //TODO: faster approach to probe the accurate capacity of rpc ion memory size_t candidate_size = 0; uint8_t * rpc_buffer = nullptr; - const int SIZE_IN_MB = (1 << 20); + const int size_in_mb = (1 << 20); size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048}; size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); for (size_t idx = 0; idx < probe_counts; idx++) { - rpc_buffer = static_cast(alloc_rpcmem(probe_slots[idx] * SIZE_IN_MB, 4)); + rpc_buffer = static_cast(alloc_rpcmem(probe_slots[idx] * size_in_mb, 4)); if (nullptr == rpc_buffer) { QNN_LOG_INFO("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], strerror(errno)); break; @@ -1150,7 +1169,7 @@ class qnn_instance { } if (candidate_size > _rpcmem_capacity) _rpcmem_capacity = candidate_size; - QNN_LOG_INFO("capacity of rpc ion memory %d MB\n", _rpcmem_capacity); + QNN_LOG_INFO("capacity of QNN rpc ion memory is about %d MB\n", _rpcmem_capacity); if (0 != init_htp_perfinfra()) { QNN_LOG_WARN("initialize HTP performance failure"); @@ -1181,6 +1200,10 @@ class qnn_instance { QNN_LOG_DEBUG("succeed to close rpcmem lib\n"); } + if (_backend_name.find("Htp") != std::variant_npos) { + _qnn_htp_perfinfra->destroyPowerConfigId(_qnn_power_configid); + } + if (nullptr != _qnn_context_handle) { error = _qnn_interface.qnn_context_free(_qnn_context_handle, _qnn_profile_handle); @@ -1239,6 +1262,9 @@ class qnn_instance { return ret_status; } + //keep it for further usage of offload the entire cgraph to a single QNN DAG directly + //which was used in Qualcomm's dedicated AI technology +#if 0 int init_qnn_graph(const char * graph_name, bool debug, uint8_t do_node_validation = true, const QnnGraph_Config_t ** graph_configs = nullptr) { @@ -1288,6 +1314,7 @@ class qnn_instance { return 0; } +#endif const qnn_interface & get_qnn_interface() { if (!_qnn_interface.is_loaded()) { @@ -1362,70 +1389,86 @@ class qnn_instance { } int set_rpc_polling() { - if (_qnn_rpc_pollingtime > 0) { - QnnHtpPerfInfrastructure_PowerConfig_t rpc_pollingTime; - memset(&rpc_pollingTime, 0, sizeof(rpc_pollingTime)); - rpc_pollingTime.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; - rpc_pollingTime.rpcPollingTimeConfig = _qnn_rpc_pollingtime; - - QnnHtpPerfInfrastructure_PowerConfig_t rpc_ControlLatency; - memset(&rpc_ControlLatency, 0, sizeof(rpc_ControlLatency)); - rpc_ControlLatency.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_CONTROL_LATENCY; - rpc_ControlLatency.rpcControlLatencyConfig = 40; - - const QnnHtpPerfInfrastructure_PowerConfig_t * powerConfigs[] = {&rpc_pollingTime, &rpc_ControlLatency, nullptr}; - if (_qnn_htp_perfinfra) { - _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, powerConfigs); + if (_qnn_htp_perfinfra) { + QnnHtpPerfInfrastructure_PowerConfig_t rpc_polling_time; + memset(&rpc_polling_time, 0, sizeof(rpc_polling_time)); + rpc_polling_time.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; + //use rpc polling time recommended 0-10000 us + rpc_polling_time.rpcPollingTimeConfig = 9999; + + QnnHtpPerfInfrastructure_PowerConfig_t rpc_control_latency; + memset(&rpc_control_latency, 0, 
sizeof(rpc_control_latency)); + rpc_control_latency.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_CONTROL_LATENCY; + //use rpc control latency recommended 100 us, refer hexagon sdk + rpc_control_latency.rpcControlLatencyConfig = 100; + + const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = { + &rpc_polling_time, + &rpc_control_latency, + nullptr}; + Qnn_ErrorHandle_t qnn_status = _qnn_htp_perfinfra->setPowerConfig( + _qnn_power_configid, + power_configs); + if (qnn_status != QNN_SUCCESS) { + QNN_LOG_WARN("set htp perf failed\n"); + } else { + QNN_LOG_INFO("set htp perf ok\n"); } + } else { + QNN_LOG_WARN("can't set htp perf\n"); } + return 0; } int set_high_performance_mode() { if (nullptr == _qnn_htp_perfinfra) { - QNN_LOG_DEBUG("perf intra is null\n"); + QNN_LOG_WARN("perf intra is null\n"); return 1; } - QnnHtpPerfInfrastructure_PowerConfig_t powerConfig; - memset(&powerConfig, 0, sizeof(powerConfig)); - powerConfig.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3; - powerConfig.dcvsV3Config.dcvsEnable = 0; - powerConfig.dcvsV3Config.setDcvsEnable = 1; - powerConfig.dcvsV3Config.contextId = _qnn_power_configid; - powerConfig.dcvsV3Config.powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE; - powerConfig.dcvsV3Config.setSleepLatency = - 1; // true to consider Latency parameter otherwise False - powerConfig.dcvsV3Config.setBusParams = - 1; // true to consider Bus parameter otherwise False - powerConfig.dcvsV3Config.setCoreParams = - 1; // true to consider Core parameter otherwise False - powerConfig.dcvsV3Config.sleepDisable = - 0; // true to consider sleep/LPM modes, False to enable - powerConfig.dcvsV3Config.setSleepDisable = - 0; // true to consider sleep disable/enable parameter otherwise False set sleep latency parameter - uint32_t latencyValue = 40; - powerConfig.dcvsV3Config.sleepLatency = - latencyValue; // range 40-2000 micro sec + QnnHtpPerfInfrastructure_PowerConfig_t power_config; + memset(&power_config, 0, sizeof(power_config)); + power_config.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3; + power_config.dcvsV3Config.dcvsEnable = 0; + power_config.dcvsV3Config.setDcvsEnable = 1; + power_config.dcvsV3Config.contextId = _qnn_power_configid; + power_config.dcvsV3Config.powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE; + power_config.dcvsV3Config.setSleepLatency = + 1; // true to consider Latency parameter otherwise false + power_config.dcvsV3Config.sleepLatency = 10; + power_config.dcvsV3Config.setBusParams = + 1; // true to consider Bus parameter otherwise false + power_config.dcvsV3Config.setCoreParams = + 1; // true to consider Core parameter otherwise false + power_config.dcvsV3Config.sleepDisable = + 1; // true to consider sleep/LPM modes, false to enable + power_config.dcvsV3Config.setSleepDisable = + 1; // true to consider sleep disable/enable parameter otherwise false set sleep latency parameter // set Bus Clock Parameters - powerConfig.dcvsV3Config.busVoltageCornerMin = + power_config.dcvsV3Config.busVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - powerConfig.dcvsV3Config.busVoltageCornerTarget = + power_config.dcvsV3Config.busVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - powerConfig.dcvsV3Config.busVoltageCornerMax = + power_config.dcvsV3Config.busVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; // set Core Clock Parameters - powerConfig.dcvsV3Config.coreVoltageCornerMin = + power_config.dcvsV3Config.coreVoltageCornerMin = 
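// [editor's note] the rpc memory capacity probe above and the NPU data path
// below both go through alloc_rpcmem(), shown earlier in this class: it asks
// librpcmem for bytes + alignment, rounds the returned pointer up, and keeps
// the original pointer in _rpcmem_store_map so free_rpcmem() can release it.
// A minimal sketch of that over-allocate-and-align step, assuming the
// alignment is a power of two:

static void * align_ptr_up(void * p, size_t alignment) {
    uintptr_t addr = reinterpret_cast<uintptr_t>(p);
    addr = (addr + alignment - 1) & ~(uintptr_t)(alignment - 1);  // round up
    return reinterpret_cast<void *>(addr);
}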
DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - powerConfig.dcvsV3Config.coreVoltageCornerTarget = + power_config.dcvsV3Config.coreVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - powerConfig.dcvsV3Config.coreVoltageCornerMax = + power_config.dcvsV3Config.coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; // set power config with different performance parameters - const QnnHtpPerfInfrastructure_PowerConfig_t *powerConfigs[] = { - &powerConfig, nullptr}; - - _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, powerConfigs); + const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = { + &power_config, nullptr}; + Qnn_ErrorHandle_t qnn_status = QNN_SUCCESS; + qnn_status = _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs); + if (qnn_status != QNN_SUCCESS) { + QNN_LOG_WARN("set htp high performance mode failed\n"); + } else { + QNN_LOG_INFO("set htp high performance mode ok\n"); + } return 0; } @@ -1505,7 +1548,7 @@ class qnn_instance { if (is_rpcmem_allocated(p_data)) { QNN_LOG_WARN("rpc memory already allocated\n"); - // return 3; + return 3; } if (is_rpcmem_registered((QNN_VER_PTR(*p_tensor)->memHandle))) { QNN_LOG_WARN("tensor %s has been registered shared memory\n", @@ -1518,7 +1561,7 @@ class qnn_instance { QNN_LOG_WARN("failed to get file descriptor\n"); return 5; } - QNN_LOG_DEBUG("mem_fd %d\n", mem_fd); + QNN_LOG_INFO("mem_fd %d\n", mem_fd); Qnn_MemDescriptor_t descriptor = {{QNN_VER_PTR(*p_tensor)->rank, QNN_VER_PTR(*p_tensor)->dimensions, nullptr}, @@ -1538,11 +1581,24 @@ class qnn_instance { (QNN_VER_PTR(*p_tensor)->name)); } QNN_VER_PTR(*p_tensor)->memHandle = handle; - _qnn_mem_set.insert(handle); + _qnn_mem_set.insert((std::pair(p_data, handle))); return 0; } + void * get_rpcmem_from_memhandle(Qnn_MemHandle_t mem_handle) { + for (std::unordered_map::iterator it = _qnn_mem_set.begin(); + it != _qnn_mem_set.end(); + it++) { + Qnn_MemHandle_t mem_handle = it->second; + if (it->second == mem_handle) { + return it->first; + } + } + QNN_LOG_WARN("can't find rpcmem from qnn mem handle %p", mem_handle); + return nullptr; + } + void unregister_rpcmem() { Qnn_ErrorHandle_t error = QNN_SUCCESS; @@ -1550,7 +1606,10 @@ class qnn_instance { QNN_LOG_WARN("no rpcmem registered\n"); } - for (auto & mem_handle : _qnn_mem_set) { + for (std::unordered_map::iterator it = _qnn_mem_set.begin(); + it != _qnn_mem_set.end(); + it++) { + Qnn_MemHandle_t mem_handle = it->second; error = _qnn_interface.qnn_mem_de_register(&mem_handle, 1); if (error != QNN_SUCCESS) { QNN_LOG_WARN("failed to unregister shared memory, error %d\n", @@ -1561,7 +1620,7 @@ class qnn_instance { } bool is_rpcmem_allocated(void * buf) { - return _rpcmem_store_map.count(buf) != 0U; + return _qnn_mem_set.count(buf) != 0U; } @@ -1686,8 +1745,9 @@ class qnn_instance { return 1; } - auto get_providers = load_qnn_functionpointers( - lib_handle, "QnnInterface_getProviders"); + auto get_providers = + load_qnn_functionpointers( + lib_handle, "QnnInterface_getProviders"); if (nullptr == get_providers) { QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", dlerror()); @@ -1786,7 +1846,7 @@ class qnn_instance { private: std::string _lib_path; std::string _backend_name; - std::string _model_name; // prebuilt QNN model name, not used currently + std::string _model_name; // Qualcomm's dedicated prebuilt model name, keep it for further usage BackendIdType _backend_id; bool _debug_tensor = false; @@ -1816,12 +1876,11 @@ class qnn_instance { QnnHtpDevice_PerfInfrastructure_t * 
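// [editor's note] in get_rpcmem_from_memhandle() above, the loop declares a
// local "Qnn_MemHandle_t mem_handle = it->second;" that shadows the function
// parameter, so "it->second == mem_handle" compares the entry with itself and
// the first element of _qnn_mem_set is always returned. A corrected sketch of
// the intended lookup over the same map:

void * get_rpcmem_from_memhandle(Qnn_MemHandle_t mem_handle) {
    for (auto it = _qnn_mem_set.begin(); it != _qnn_mem_set.end(); it++) {
        if (it->second == mem_handle) {   // compare against the parameter, not a shadowed copy
            return it->first;             // the rpc buffer registered for this handle
        }
    }
    QNN_LOG_WARN("can't find rpcmem from qnn mem handle %p", mem_handle);
    return nullptr;
}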
_qnn_htp_perfinfra = nullptr; uint32_t _qnn_power_configid = 1; - uint32_t _qnn_rpc_pollingtime = 9999; // 0-10000 us for high performing QNN_INTERFACE_VER_TYPE _qnn_raw_interface; QNN_SYSTEM_INTERFACE_VER_TYPE _qnn_raw_system_interface; - std::unordered_set _qnn_mem_set; + std::unordered_map _qnn_mem_set; std::mutex _init_mutex; std::unordered_map _loaded_lib_handle; @@ -1898,9 +1957,8 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx, if (tensor->op == GGML_OP_MUL_MAT) { if (ne00 <= 32 || ne01 <= 32 || ne10 <= 32 || ne11 <= 32) { - return false; - } else { - return true; + //make mul_mat with QNN RPC happy + //return false; } } @@ -1964,17 +2022,29 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src if (!graph_initialized) { graph_name = graph_name + "_" + std::to_string(ctx->threads) + - src0->name + "_" + src1->name; + "_" + src0->name + "_" + src1->name; QNN_LOG_INFO("graph name %s", graph_name.c_str()); if (ctx->device == QNN_BACKEND_NPU) { - QnnHtpGraph_CustomConfig_t custom_config; - custom_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; - custom_config.numHvxThreads = 8; - - QnnGraph_Config_t graph_config; - graph_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_config.customConfig = &custom_config; - const QnnGraph_Config_t * p_graphconfig[] = {&graph_config, NULL}; + QnnHtpGraph_CustomConfig_t hvx_config; + hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; + hvx_config.numHvxThreads = 8; + QnnGraph_Config_t graph_hvx_config; + graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_hvx_config.customConfig = &hvx_config; + + QnnHtpGraph_CustomConfig_t dlbc_config; + dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; + /* + dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; + dlbc_config.optimizationOption.floatValue = 1; // set to 0 to turn off DLBC + */ + dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; + + QnnGraph_Config_t graph_dlbc_config; + graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_dlbc_config.customConfig = &dlbc_config; + + const QnnGraph_Config_t * p_graphconfig[] = {&graph_hvx_config, &graph_dlbc_config, NULL}; error = qnn_raw_interface.graphCreate( instance->get_qnn_context_handle(), graph_name.c_str(), p_graphconfig, &graph_handle); @@ -1989,7 +2059,21 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src "error = %d\n", graph_name.c_str(), error); goto failure; + } else { + QNN_LOG_INFO("create qnn graph handle with graph name %s ok\n", graph_name.c_str()); } + + if (ctx->device == QNN_BACKEND_NPU) { + QNN_VER_PTR(*tensor_0)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*tensor_0)->clientBuf= {.data=nullptr, .dataSize=0}; + + QNN_VER_PTR(*tensor_1)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*tensor_1)->clientBuf= {.data=nullptr, .dataSize=0}; + + QNN_VER_PTR(*tensor_2)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*tensor_2)->clientBuf= {.data=nullptr, .dataSize=0}; + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); @@ -2006,13 +2090,6 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src goto failure; } - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, - qnn_get_ggml_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, - 
qnn_get_ggml_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, - qnn_get_ggml_tensor_data_size(dst)}; - QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; QNN_VER_PTR(*tensor_0)->rank = qnn_get_ggml_tensor_rank(src0); QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; @@ -2023,6 +2100,46 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst); QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + if (ctx->device != QNN_BACKEND_NPU) { + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, + qnn_get_ggml_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, + qnn_get_ggml_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, + qnn_get_ggml_tensor_data_size(dst)}; + } else { + uint8_t * qnn_buffer_0 = nullptr; + uint8_t * qnn_buffer_1 = nullptr; + uint8_t * qnn_buffer_2 = nullptr; + qnn_instance * instance = ctx->instance; + + qnn_buffer_0 = static_cast(instance->alloc_rpcmem(ggml_nbytes(src0), 4)); + if (nullptr == qnn_buffer_0) { + QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + } else { + QNN_LOG_INFO("alloc rpcmem successfully\n"); + } + instance->register_rpcmem(qnn_buffer_0, tensor_0); + memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); + + qnn_buffer_1 = static_cast(instance->alloc_rpcmem(ggml_nbytes(src1), 4)); + if (nullptr == qnn_buffer_1) { + QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + } else { + QNN_LOG_INFO("alloc rpcmem successfully\n"); + } + instance->register_rpcmem(qnn_buffer_1, tensor_1); + memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); + + qnn_buffer_2 = static_cast(instance->alloc_rpcmem(ggml_nbytes(dst), 4)); + if (nullptr == qnn_buffer_2) { + QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + } else { + QNN_LOG_INFO("alloc rpcmem successfully\n"); + } + instance->register_rpcmem(qnn_buffer_2, tensor_2); + } + Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; Qnn_OpConfig_t op_config = { @@ -2048,6 +2165,12 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src QNN_LOG_INFO("error = %d\n", error); goto failure; } + + if (ctx->device == QNN_BACKEND_NPU) { + uint8_t * qnn_buffer_2 = static_cast(ctx->instance->get_rpcmem_from_memhandle( + QNN_VER_PTR(*tensor_2)->memHandle)); + memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); + } auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); instance->_qnn_graph_map[map_entry] = graph_item; } else { @@ -2067,13 +2190,6 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, - qnn_get_ggml_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, - qnn_get_ggml_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, - qnn_get_ggml_tensor_data_size(dst)}; - QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; QNN_VER_PTR(*tensor_0)->rank = qnn_get_ggml_tensor_rank(src0); QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; @@ -2084,6 +2200,25 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst); QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + if (ctx->device != QNN_BACKEND_NPU) { + QNN_VER_PTR(*tensor_0)->clientBuf = 
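// [editor's note] summary of the NPU (HTP) data path used in the hunks above
// and below, as opposed to the CPU/GPU path that simply points clientBuf at
// the ggml tensor data:
//   1. mark the graph tensor as QNN_TENSORMEMTYPE_MEMHANDLE (clientBuf unused),
//   2. alloc_rpcmem() an ION buffer of ggml_nbytes(t) and register_rpcmem() it,
//      which binds a Qnn_MemHandle_t to the tensor,
//   3. memcpy the ggml input data into the rpc buffer before graphExecute,
//   4. after graphExecute, recover the output buffer via
//      get_rpcmem_from_memhandle() and memcpy it back into dst->data.
// A condensed sketch for one input tensor:

uint8_t * qnn_buffer = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(src0), 4));
if (qnn_buffer != nullptr) {
    instance->register_rpcmem(qnn_buffer, tensor_0);   // QNN memRegister + handle bookkeeping
    memcpy(qnn_buffer, src0->data, ggml_nbytes(src0)); // stage input into shared ION memory
}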
{src0->data, + qnn_get_ggml_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, + qnn_get_ggml_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, + qnn_get_ggml_tensor_data_size(dst)}; + } else { + uint8_t * qnn_buffer_0 = static_cast(ctx->instance->get_rpcmem_from_memhandle( + QNN_VER_PTR(*tensor_0)->memHandle)); + if (nullptr != qnn_buffer_0) + memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); + + uint8_t * qnn_buffer_1 = static_cast(ctx->instance->get_rpcmem_from_memhandle( + QNN_VER_PTR(*tensor_1)->memHandle)); + if (nullptr != qnn_buffer_1) + memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); + } + Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs,2, @@ -2093,7 +2228,15 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src QNN_LOG_INFO("error = %d\n", error); goto failure; } + + if (ctx->device == QNN_BACKEND_NPU) { + uint8_t * qnn_buffer_2 = static_cast(ctx->instance->get_rpcmem_from_memhandle( + QNN_VER_PTR(*tensor_2)->memHandle)); + if (nullptr != qnn_buffer_2) + memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); + } } + failure: if (QNN_SUCCESS != error) { QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); @@ -2197,17 +2340,55 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, if (!graph_initialized) { graph_name = graph_name + "_" + std::to_string(ctx->threads) + - src0->name + "_" + src1->name; + "_" + src0->name + "_" + src1->name; QNN_LOG_INFO("graph name %s", graph_name.c_str()); - error = qnn_raw_interface.graphCreate( - instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, - &graph_handle); + if (ctx->device == QNN_BACKEND_NPU) { + QnnHtpGraph_CustomConfig_t hvx_config; + hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; + hvx_config.numHvxThreads = 8; + QnnGraph_Config_t graph_hvx_config; + graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_hvx_config.customConfig = &hvx_config; + + QnnHtpGraph_CustomConfig_t dlbc_config; + dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; + /* + dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; + dlbc_config.optimizationOption.floatValue = 1; // set to 0 to turn off DLBC + */ + dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; + + QnnGraph_Config_t graph_dlbc_config; + graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_dlbc_config.customConfig = &dlbc_config; + + const QnnGraph_Config_t * p_graphconfig[] = {&graph_hvx_config, &graph_dlbc_config, NULL}; + error = qnn_raw_interface.graphCreate( + instance->get_qnn_context_handle(), graph_name.c_str(), p_graphconfig, + &graph_handle); + } else { + error = qnn_raw_interface.graphCreate( + instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, + &graph_handle); + } if (QNN_SUCCESS != error) { QNN_LOG_INFO("can't create qnn graph handle with graph name %s, " "error = %d\n", graph_name.c_str(), error); goto failure; } + + if (ctx->device == QNN_BACKEND_NPU) { + QNN_VER_PTR(*tensor_0)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*tensor_0)->clientBuf= {.data=nullptr, .dataSize=0}; + + QNN_VER_PTR(*tensor_1)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*tensor_1)->clientBuf= {.data=nullptr, .dataSize=0}; + + QNN_VER_PTR(*tensor_2)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + 
QNN_VER_PTR(*tensor_2)->clientBuf= {.data=nullptr, .dataSize=0}; + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); @@ -2224,13 +2405,6 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, goto failure; } - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, - qnn_get_ggml_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, - qnn_get_ggml_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, - qnn_get_ggml_tensor_data_size(dst)}; - QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; QNN_VER_PTR(*tensor_0)->rank = qnn_get_ggml_tensor_rank(src0); QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; @@ -2241,6 +2415,46 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst); QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + if (ctx->device != QNN_BACKEND_NPU) { + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, + qnn_get_ggml_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, + qnn_get_ggml_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, + qnn_get_ggml_tensor_data_size(dst)}; + } else { + uint8_t * qnn_buffer_0 = nullptr; + uint8_t * qnn_buffer_1 = nullptr; + uint8_t * qnn_buffer_2 = nullptr; + qnn_instance * instance = ctx->instance; + + qnn_buffer_0 = static_cast(instance->alloc_rpcmem(ggml_nbytes(src0), 4)); + if (nullptr == qnn_buffer_0) { + QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + } else { + QNN_LOG_INFO("alloc rpcmem successfully\n"); + } + instance->register_rpcmem(qnn_buffer_0, tensor_0); + memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); + + qnn_buffer_1 = static_cast(instance->alloc_rpcmem(ggml_nbytes(src1), 4)); + if (nullptr == qnn_buffer_1) { + QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + } else { + QNN_LOG_INFO("alloc rpcmem successfully\n"); + } + instance->register_rpcmem(qnn_buffer_1, tensor_1); + memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); + + qnn_buffer_2 = static_cast(instance->alloc_rpcmem(ggml_nbytes(dst), 4)); + if (nullptr == qnn_buffer_2) { + QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + } else { + QNN_LOG_INFO("alloc rpcmem successfully\n"); + } + instance->register_rpcmem(qnn_buffer_2, tensor_2); + } + Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; Qnn_OpConfig_t op_config = {(Qnn_OpConfigVersion_t) 1, @@ -2266,6 +2480,13 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, QNN_LOG_INFO("error = %d\n", error); goto failure; } + + if (ctx->device == QNN_BACKEND_NPU) { + uint8_t * qnn_buffer_2 = static_cast(ctx->instance->get_rpcmem_from_memhandle( + QNN_VER_PTR(*tensor_2)->memHandle)); + memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); + } + auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); instance->_qnn_graph_map[map_entry] = graph_item; } else { @@ -2294,12 +2515,24 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst); QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, - qnn_get_ggml_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, - qnn_get_ggml_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, - qnn_get_ggml_tensor_data_size(dst)}; + if (ctx->device != 
QNN_BACKEND_NPU) { + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, + qnn_get_ggml_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, + qnn_get_ggml_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, + qnn_get_ggml_tensor_data_size(dst)}; + } else { + uint8_t * qnn_buffer_0 = static_cast(ctx->instance->get_rpcmem_from_memhandle( + QNN_VER_PTR(*tensor_0)->memHandle)); + if (nullptr != qnn_buffer_0) + memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); + + uint8_t * qnn_buffer_1 = static_cast(ctx->instance->get_rpcmem_from_memhandle( + QNN_VER_PTR(*tensor_1)->memHandle)); + if (nullptr != qnn_buffer_1) + memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); + } Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; @@ -2311,7 +2544,15 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, QNN_LOG_INFO("error = %d\n", error); goto failure; } + + if (ctx->device == QNN_BACKEND_NPU) { + uint8_t * qnn_buffer_2 = static_cast(ctx->instance->get_rpcmem_from_memhandle( + QNN_VER_PTR(*tensor_2)->memHandle)); + if (nullptr != qnn_buffer_2) + memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); + } } + failure: if (QNN_SUCCESS != error) { QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); @@ -2428,6 +2669,17 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, goto failure; } + if (ctx->device == QNN_BACKEND_NPU) { + QNN_VER_PTR(*tensor_0)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*tensor_0)->clientBuf= {.data=nullptr, .dataSize=0}; + + QNN_VER_PTR(*tensor_1)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*tensor_1)->clientBuf= {.data=nullptr, .dataSize=0}; + + QNN_VER_PTR(*tensor_2)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*tensor_2)->clientBuf= {.data=nullptr, .dataSize=0}; + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); @@ -2444,13 +2696,6 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, goto failure; } - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, - qnn_get_ggml_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, - qnn_get_ggml_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, - qnn_get_ggml_tensor_data_size(dst)}; - QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; QNN_VER_PTR(*tensor_0)->rank = qnn_get_ggml_tensor_rank(src0); QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; @@ -2461,6 +2706,46 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst); QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + if (ctx->device != QNN_BACKEND_NPU) { + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, + qnn_get_ggml_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, + qnn_get_ggml_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, + qnn_get_ggml_tensor_data_size(dst)}; + } else { + uint8_t * qnn_buffer_0 = nullptr; + uint8_t * qnn_buffer_1 = nullptr; + uint8_t * qnn_buffer_2 = nullptr; + qnn_instance * instance = ctx->instance; + + qnn_buffer_0 = static_cast(instance->alloc_rpcmem(ggml_nbytes(src0), 4)); + if (nullptr == qnn_buffer_0) { + QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + } else { + QNN_LOG_INFO("alloc rpcmem successfully\n"); + } + instance->register_rpcmem(qnn_buffer_0, tensor_0); + memcpy(qnn_buffer_0, src0->data, 
ggml_nbytes(src0)); + + qnn_buffer_1 = static_cast(instance->alloc_rpcmem(ggml_nbytes(src1), 4)); + if (nullptr == qnn_buffer_1) { + QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + } else { + QNN_LOG_INFO("alloc rpcmem successfully\n"); + } + instance->register_rpcmem(qnn_buffer_1, tensor_1); + memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); + + qnn_buffer_2 = static_cast(instance->alloc_rpcmem(ggml_nbytes(dst), 4)); + if (nullptr == qnn_buffer_2) { + QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + } else { + QNN_LOG_INFO("alloc rpcmem successfully\n"); + } + instance->register_rpcmem(qnn_buffer_2, tensor_2); + } + Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; Qnn_OpConfig_t op_config = {(Qnn_OpConfigVersion_t) 1, @@ -2486,6 +2771,13 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, QNN_LOG_INFO("error = %d\n", error); goto failure; } + + if (ctx->device == QNN_BACKEND_NPU) { + uint8_t * qnn_buffer_2 = static_cast(ctx->instance->get_rpcmem_from_memhandle( + QNN_VER_PTR(*tensor_2)->memHandle)); + memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); + } + auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); instance->_qnn_graph_map[map_entry] = graph_item; } else { @@ -2514,17 +2806,28 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst); QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, - qnn_get_ggml_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, - qnn_get_ggml_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, - qnn_get_ggml_tensor_data_size(dst)}; + if (ctx->device != QNN_BACKEND_NPU) { + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, + qnn_get_ggml_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, + qnn_get_ggml_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, + qnn_get_ggml_tensor_data_size(dst)}; + } else { + uint8_t * qnn_buffer_0 = static_cast(ctx->instance->get_rpcmem_from_memhandle( + QNN_VER_PTR(*tensor_0)->memHandle)); + if (nullptr != qnn_buffer_0) + memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); + + uint8_t * qnn_buffer_1 = static_cast(ctx->instance->get_rpcmem_from_memhandle( + QNN_VER_PTR(*tensor_1)->memHandle)); + if (nullptr != qnn_buffer_1) + memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); + } Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; - error = - qnn_raw_interface.graphExecute(graph_handle, + error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); @@ -2532,7 +2835,15 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, QNN_LOG_INFO("error = %d\n", error); goto failure; } + + if (ctx->device == QNN_BACKEND_NPU) { + uint8_t * qnn_buffer_2 = static_cast(ctx->instance->get_rpcmem_from_memhandle( + QNN_VER_PTR(*tensor_2)->memHandle)); + if (nullptr != qnn_buffer_2) + memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); + } } + failure: if (QNN_SUCCESS != error) { QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); @@ -2889,9 +3200,9 @@ GGML_CALL static void * ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t b GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - 
ggml_backend_qnn_buffer_context * ctx = - (ggml_backend_qnn_buffer_context *) buffer->context; + Qnn_ErrorHandle_t error = QNN_SUCCESS; + ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context; + static int idx = 0; char tensor_name[GGML_MAX_NAME] = {0}; snprintf(tensor_name, GGML_MAX_NAME, "tensor_%04d", idx++); @@ -2908,22 +3219,43 @@ GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t } else if (tensor->flags & GGML_TENSOR_FLAG_OUTPUT) { qnn_tensor_type = QNN_TENSOR_TYPE_APP_READ; } - Qnn_Tensor_t qnn_tensor = { - .version = QNN_TENSOR_VERSION_1, - {.v1 = {.id = 0, - .name = tensor_name, - .type = qnn_tensor_type, - .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, - .dataType = qnn_data_type, - .quantizeParams = - {QNN_DEFINITION_UNDEFINED, - QNN_QUANTIZATION_ENCODING_UNDEFINED, - {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, - .offset = 0}}}, - .rank = qnn_get_ggml_tensor_rank(tensor), - .dimensions = dimensions, - .memType = QNN_TENSORMEMTYPE_RAW, - {.clientBuf = {.data = nullptr, .dataSize = 0}}}}}; + Qnn_Tensor_t qnn_tensor = QNN_TENSOR_INIT; + + if (ctx->device != QNN_BACKEND_GPU) { + qnn_tensor = { + .version = QNN_TENSOR_VERSION_1, + {.v1 = {.id = 0, + .name = tensor_name, + .type = qnn_tensor_type, + .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, + .dataType = qnn_data_type, + .quantizeParams = + {QNN_DEFINITION_UNDEFINED, + QNN_QUANTIZATION_ENCODING_UNDEFINED, + {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, + .offset = 0}}}, + .rank = qnn_get_ggml_tensor_rank(tensor), + .dimensions = dimensions, + .memType = QNN_TENSORMEMTYPE_RAW, + {.clientBuf = {.data = nullptr, .dataSize = 0}}}}}; + } else { + qnn_tensor = { + .version = QNN_TENSOR_VERSION_1, + {.v1 = {.id = 0, + .name = tensor_name, + .type = qnn_tensor_type, + .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, + .dataType = qnn_data_type, + .quantizeParams = + {QNN_DEFINITION_UNDEFINED, + QNN_QUANTIZATION_ENCODING_UNDEFINED, + {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, + .offset = 0}}}, + .rank = qnn_get_ggml_tensor_rank(tensor), + .dimensions = dimensions, + .memType = QNN_TENSORMEMTYPE_MEMHANDLE, + {.clientBuf = {.data = nullptr, .dataSize = 0}}}}}; + } Qnn_Tensor_t * p_qnn_tensor = (Qnn_Tensor_t *)calloc(1, sizeof(Qnn_Tensor_t)); if (nullptr == p_qnn_tensor) { @@ -2933,7 +3265,7 @@ GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t error = deep_copy_qnn_tensors(qnn_tensor, *p_qnn_tensor); if (error != QNN_SUCCESS) { free(p_qnn_tensor); - QNN_LOG_DEBUG("init tensor failed"); + QNN_LOG_WARN("init tensor failed"); return; } tensor->extra = p_qnn_tensor; @@ -3210,6 +3542,7 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device) { device, GGML_QNN_MAX_DEVICES - 1); return nullptr; } + static ggml_backend_qnn_buffer_type_context ggml_backend_qnn_buffer_type_contexts[GGML_QNN_MAX_DEVICES]; static ggml_backend_buffer_type ggml_backend_qnn_buffer_types[GGML_QNN_MAX_DEVICES]; static bool ggml_backend_qnn_buffer_type_initialized = false; @@ -3307,7 +3640,6 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { std::string device_name = qnn_get_backend_name(device); QNN_LOG_INFO("qnn device name %s", device_name.c_str()); - instance->init_qnn_graph(device_name.c_str(), false); g_qnn_mgr[device].instance = instance; g_qnn_mgr[device].raw_interface = instance->get_qnn_raw_interface(); g_qnn_mgr[device].raw_system_interface = 
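// [editor's note] ggml_backend_qnn_buffer_init_tensor() above attaches the
// deep-copied Qnn_Tensor_t to each ggml tensor through tensor->extra. The op
// implementations earlier in this file then recover it before refreshing
// dimensions/data and executing the graph; a minimal sketch of that retrieval
// (assuming the tensors were allocated from this buffer type):

Qnn_Tensor_t * tensor_0 = (Qnn_Tensor_t *) src0->extra;
Qnn_Tensor_t * tensor_1 = (Qnn_Tensor_t *) src1->extra;
Qnn_Tensor_t * tensor_2 = (Qnn_Tensor_t *) dst->extra;
// QNN_VER_PTR(*tensor_0)->dimensions / ->rank / ->dataType are then updated
// from the ggml tensor before each graphExecute() call.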
instance->get_qnn_raw_system_interface(); diff --git a/tests/ggml-qnn/CMakeLists.txt b/tests/ggml-qnn/CMakeLists.txt index a78bdaeaf8009..bf061e6c7c3a1 100644 --- a/tests/ggml-qnn/CMakeLists.txt +++ b/tests/ggml-qnn/CMakeLists.txt @@ -6,8 +6,8 @@ set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_POSITION_INDEPENDENT_CODE ON) -#set to ON if target Android phone is based on Qualcomm Snapdragon 8 Gen 3 -set(TARGET_SNAPDRAGON_8_GEN3 OFF) +#set to OFF if target Android phone is not equipped with Qualcomm Snapdragon 8 Gen 3 +set(TARGET_SNAPDRAGON_8_GEN3 ON) set(QNN_INC_PATH ${QNN_SDK_PATH}/include/QNN) set(QNN_LIB_PATH ${QNN_SDK_PATH}/lib/aarch64-android) @@ -35,6 +35,8 @@ add_definitions(-DGGML_USE_QNN) if(CMAKE_BUILD_TYPE STREQUAL "Release") add_definitions(-DNDEBUG) add_definitions(-O3) +else() +add_definitions(-O3) endif() if (TARGET_SNAPDRAGON_8_GEN3) @@ -44,7 +46,7 @@ add_definitions(-mcpu=cortex-x1) add_definitions(-mtune=cortex-x1) else() -# the below build optimization might be works well on ALL mainstream Android phone based on Qualcomm mobile SoC +# the below build optimization might be works well on ALL Android phone equipped with Qualcomm mainstream mobile SoC add_definitions(-mcpu=cortex-a72) endif() diff --git a/tests/ggml-qnn/ggml-qnn-ut.cpp b/tests/ggml-qnn/ggml-qnn-ut.cpp index 9af433ceb6690..0abfc62073f08 100644 --- a/tests/ggml-qnn/ggml-qnn-ut.cpp +++ b/tests/ggml-qnn/ggml-qnn-ut.cpp @@ -415,7 +415,8 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { sizex = ggml_blck_size(qtype) * 2; } } - QNN_LOG_DEBUG("sizex %d\n", sizex); + QNN_LOG_DEBUG("sizex: %d\n", sizex); + QNN_LOG_DEBUG("sizey: %d\n", sizey); if (n_ggml_op_type == GGML_OP_MUL) { src0 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); From 5598fbd15dfd7e0483ca544c4c8a86aca6c79ea2 Mon Sep 17 00:00:00 2001 From: "zhou.weiguo" Date: Thu, 13 Jun 2024 15:41:53 +0800 Subject: [PATCH 016/166] review: make a MVP(Minimum Viable PR) style PR in upstream --- ggml-qnn.cpp | 597 +++++++----------------- tests/ggml-qnn/ggml-qnn-ut-build-run.sh | 10 +- tests/ggml-qnn/ggml-qnn-ut.cpp | 17 +- 3 files changed, 183 insertions(+), 441 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index f59c54fcacd97..f268c7f0e825a 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -55,7 +55,7 @@ #include "Saver/QnnSaver.h" #include "System/QnnSystemInterface.h" #include "HTP/QnnHtpDevice.h" -#include +#include "HTP/QnnHtpGraph.h" // ================================================================================================= // @@ -91,12 +91,6 @@ typedef void (*ggml_qnn_func_t)(ggml_backend_qnn_context * ctx, const ggml_tensor * src1, ggml_tensor * dst); -typedef void (*ggml_qnn_func_common_t)(ggml_backend_qnn_context * ctx, - const ggml_op ggml_op, - const ggml_tensor * src0, - const ggml_tensor * src1, - ggml_tensor * dst); - enum qcom_htp_arch { NONE = 0, V68 = 68, @@ -424,6 +418,7 @@ static bool qnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tenso return true; } +#ifndef NDEBUG #define CHECK_PARAMS(ctx, src0, src1, dst) \ do { \ if (!qnn_is_valid_params((ctx), (src0), (src1), (dst))) { \ @@ -431,6 +426,10 @@ static bool qnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tenso } \ } while (0) +#else +#define CHECK_PARAMS(ctx, src0, src1, dst) +#endif + #if ENABLE_QNNBACKEND_PERF class qnn_perf { public: @@ -446,7 +445,7 @@ class qnn_perf { void info() { _end_time = ggml_time_us(); _duration = (_end_time - _begin_time); - QNN_LOG_DEBUG("duration of %s : 
%lld microseconds\n", _perf_name.c_str(), _duration); + QNN_LOG_INFO("duration of %s : %lld microseconds\n", _perf_name.c_str(), _duration); } private: @@ -809,7 +808,7 @@ static void qnn_sdk_logcallback(const char * fmt, QnnLog_Level_t level, memset(s_ggml_qnn_logbuf, 0, QNN_LOGBUF_LEN); vsnprintf(reinterpret_cast(s_ggml_qnn_logbuf), QNN_LOGBUF_LEN, fmt, argp); - QNN_LOG_DEBUG("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggml_qnn_logbuf); + QNN_LOG_INFO("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggml_qnn_logbuf); } #endif } @@ -1069,7 +1068,7 @@ class qnn_instance { arch_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; arch_devconfig.customConfig = &arch_customconfig; - const QnnDevice_Config_t * p_deviceconfig[] = {&soc_devconfig, &arch_devconfig, NULL}; + const QnnDevice_Config_t * p_deviceconfig[] = {&soc_devconfig, &arch_devconfig, nullptr}; qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, p_deviceconfig, &_qnn_device_handle); } else { qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, nullptr, &_qnn_device_handle); @@ -1137,10 +1136,14 @@ class qnn_instance { _pfn_rpc_mem_init(); } - std::vector temp_context_config; + /* TODO: not used, keep it for further usage + QnnContext_Config_t qnn_context_config = QNN_CONTEXT_CONFIG_INIT; + qnn_context_config.priority = QNN_PRIORITY_DEFAULT; + const QnnContext_Config_t * context_configs[] = {&qnn_context_config, nullptr}; + */ _qnn_interface.qnn_context_create( _qnn_backend_handle, _qnn_device_handle, - temp_context_config.empty() ? nullptr : temp_context_config.data(), + nullptr, &_qnn_context_handle); if (nullptr == _qnn_context_handle) { QNN_LOG_WARN("why failed to initialize qnn context\n"); @@ -1157,9 +1160,11 @@ class qnn_instance { size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048}; size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); for (size_t idx = 0; idx < probe_counts; idx++) { - rpc_buffer = static_cast(alloc_rpcmem(probe_slots[idx] * size_in_mb, 4)); + rpc_buffer = static_cast(alloc_rpcmem( + probe_slots[idx] * size_in_mb, 4)); if (nullptr == rpc_buffer) { - QNN_LOG_INFO("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], strerror(errno)); + QNN_LOG_INFO("alloc rpcmem %d (MB) failure, %s\n", + probe_slots[idx], strerror(errno)); break; } else { candidate_size = probe_slots[idx]; @@ -1262,8 +1267,8 @@ class qnn_instance { return ret_status; } - //keep it for further usage of offload the entire cgraph to a single QNN DAG directly - //which was used in Qualcomm's dedicated AI technology + //TODO:keep it for further usage of offload the entire cgraph to a single QNN DAG directly + // which was used in Qualcomm's dedicated AI technology #if 0 int init_qnn_graph(const char * graph_name, bool debug, uint8_t do_node_validation = true, @@ -1430,13 +1435,14 @@ class qnn_instance { QnnHtpPerfInfrastructure_PowerConfig_t power_config; memset(&power_config, 0, sizeof(power_config)); power_config.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3; - power_config.dcvsV3Config.dcvsEnable = 0; + power_config.dcvsV3Config.setDcvsEnable = 1; + power_config.dcvsV3Config.dcvsEnable = 0; power_config.dcvsV3Config.contextId = _qnn_power_configid; power_config.dcvsV3Config.powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE; power_config.dcvsV3Config.setSleepLatency = 1; // true to consider Latency parameter otherwise false - power_config.dcvsV3Config.sleepLatency = 10; + power_config.dcvsV3Config.sleepLatency = 40; power_config.dcvsV3Config.setBusParams = 1; // true to consider Bus 
parameter otherwise false power_config.dcvsV3Config.setCoreParams = @@ -1459,6 +1465,7 @@ class qnn_instance { DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; power_config.dcvsV3Config.coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + // set power config with different performance parameters const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = { &power_config, nullptr}; @@ -1550,6 +1557,7 @@ class qnn_instance { QNN_LOG_WARN("rpc memory already allocated\n"); return 3; } + if (is_rpcmem_registered((QNN_VER_PTR(*p_tensor)->memHandle))) { QNN_LOG_WARN("tensor %s has been registered shared memory\n", (QNN_VER_PTR(*p_tensor)->name)); @@ -1710,7 +1718,7 @@ class qnn_instance { int result = 0; if (nullptr == _system_lib_handle) { - QNN_LOG_DEBUG("system lib handle is null\n"); + QNN_LOG_WARN("system lib handle is null\n"); return 1; } @@ -1724,8 +1732,7 @@ class qnn_instance { int dlclose_error = dlclose(_system_lib_handle); if (dlclose_error != 0) { - QNN_LOG_WARN("failed to close QnnSystem library, error %s\n", - dlerror()); + QNN_LOG_WARN("failed to close QnnSystem library, error %s\n", dlerror()); return 2; } @@ -1740,8 +1747,7 @@ class qnn_instance { void * lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL); if (nullptr == lib_handle) { - QNN_LOG_WARN("can not open QNN library %s, with error: %s", - lib_path.c_str(), dlerror()); + QNN_LOG_WARN("can not open QNN library %s, with error: %s", lib_path.c_str(), dlerror()); return 1; } @@ -1749,8 +1755,7 @@ class qnn_instance { load_qnn_functionpointers( lib_handle, "QnnInterface_getProviders"); if (nullptr == get_providers) { - QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", - dlerror()); + QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", dlerror()); return 2; } @@ -1758,14 +1763,12 @@ class qnn_instance { const QnnInterface_t ** provider_list = nullptr; error = get_providers(&provider_list, &num_providers); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to get providers, error %d", - QNN_GET_ERROR_CODE(error)); + QNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error)); return 3; } QNN_LOG_DEBUG("num_providers=%d\n", num_providers); if (num_providers != _required_num_providers) { - QNN_LOG_WARN("providers is %d instead of required %d", num_providers, - _required_num_providers); + QNN_LOG_WARN("providers is %d instead of required %d", num_providers, _required_num_providers); return 4; } @@ -1797,16 +1800,14 @@ class qnn_instance { BackendIdType backend_id = provider_list[0]->backendId; _lib_path_to_backend_id[lib_path] = backend_id; if (_loaded_backend.count(backend_id) > 0) { - QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists\n", - lib_path.c_str(), backend_id); + QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists\n", lib_path.c_str(), backend_id); } _loaded_backend[backend_id] = provider_list[0]; if (_loaded_lib_handle.count(backend_id) > 0) { QNN_LOG_WARN("closing %p\n", _loaded_lib_handle[backend_id]); int dlclose_error = dlclose(_loaded_lib_handle[backend_id]); if (dlclose_error != 0) { - QNN_LOG_WARN("fail to close %p with error %s\n", - _loaded_lib_handle[backend_id], dlerror()); + QNN_LOG_WARN("fail to close %p with error %s\n", _loaded_lib_handle[backend_id], dlerror()); } } _loaded_lib_handle[backend_id] = lib_handle; @@ -1820,8 +1821,7 @@ class qnn_instance { for (auto & it : _loaded_lib_handle) { dlclose_error = dlclose(it.second); if (dlclose_error != 0) { - QNN_LOG_WARN("failed to close QNN backend %d, 
error %s\n", it.first, - dlerror()); + QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, dlerror()); } } @@ -1924,7 +1924,6 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx, const int64_t ne01 = src0->ne[1]; const int64_t ne10 = src1->ne[0]; const int64_t ne11 = src1->ne[1]; - // make qnn_get_ggml_tensor_rank and QNN SDK happy if (ne00 <= 1 || ne01 <= 1 || ne10 <= 1 || ne11 <= 1) { return false; @@ -1932,13 +1931,13 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx, // TODO: support other GGML OPs using QNN API // a GENERAL approach could fix this problem in a standalone PR of refine ggml backend - // subsystem for mixed inference between CPU&GPU / CPU&NPU easily for ANY ggml backends - // which the backend's ggml_backend_xxx_buffer_is_host return true. - // this approach could be found: + // subsystem for hybrid inference between CPU&GPU / CPU&NPU easily(less the 100 LoC and no + // side-effect to the existing codes) for ANY ggml backends which the backend's + // ggml_backend_xxx_buffer_is_host return true. this approach could be found at: // https://github.com/ggerganov/llama.cpp/pull/7641 bool supported_op = false; supported_op = (tensor->op == GGML_OP_ADD); - supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL) || (tensor->op == GGML_OP_MUL_MAT)); + supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL_MAT)); if (!supported_op) { return false; } @@ -1950,14 +1949,9 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx, } } - int qtype = src0->type; - if (tensor->op == GGML_OP_MUL) { - return (qtype == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32); - } - if (tensor->op == GGML_OP_MUL_MAT) { if (ne00 <= 32 || ne01 <= 32 || ne10 <= 32 || ne11 <= 32) { - //make mul_mat with QNN RPC happy + //comment it for make UT of mul_mat with QNN RPC happy //return false; } } @@ -1965,6 +1959,8 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx, return true; } +//TODO: this function can be removed later because there are duplicated codes with ggml_qnn_mul_mat +// keep it for illustrate how to implement a specified GGMPL OP using QNN API + QNN RPC static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { Qnn_ErrorHandle_t error = QNN_SUCCESS; @@ -1986,10 +1982,11 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src tensor_1 = (Qnn_Tensor_t *) src1->extra; tensor_2 = (Qnn_Tensor_t *) dst->extra; instance = ctx->instance; + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + qnn_perf perf("ggml_qnn_add"); perf.start(); - QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; @@ -2034,17 +2031,31 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src QnnHtpGraph_CustomConfig_t dlbc_config; dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; - /* dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; - dlbc_config.optimizationOption.floatValue = 1; // set to 0 to turn off DLBC - */ - dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; - + dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC QnnGraph_Config_t graph_dlbc_config; 
graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; graph_dlbc_config.customConfig = &dlbc_config; - const QnnGraph_Config_t * p_graphconfig[] = {&graph_hvx_config, &graph_dlbc_config, NULL}; + QnnHtpGraph_CustomConfig_t opt_config; + opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; + opt_config.optimizationOption.floatValue = 1; // 1 / 3 + QnnGraph_Config_t graph_opt_config; + graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_opt_config.customConfig = &opt_config; + + QnnHtpGraph_CustomConfig_t vtcm_config; + vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; + vtcm_config.vtcmSizeInMB = ctx->socinfo.vtcm_size_in_mb; + QnnGraph_Config_t graph_vtcm_config; + graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_vtcm_config.customConfig = &vtcm_config; + + const QnnGraph_Config_t * p_graphconfig[] = {&graph_hvx_config, + &graph_dlbc_config, + &graph_vtcm_config, + &graph_opt_config, + NULL}; error = qnn_raw_interface.graphCreate( instance->get_qnn_context_handle(), graph_name.c_str(), p_graphconfig, &graph_handle); @@ -2113,27 +2124,33 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src uint8_t * qnn_buffer_2 = nullptr; qnn_instance * instance = ctx->instance; - qnn_buffer_0 = static_cast(instance->alloc_rpcmem(ggml_nbytes(src0), 4)); + qnn_buffer_0 = static_cast(instance->alloc_rpcmem( + ggml_nbytes(src0), 4)); if (nullptr == qnn_buffer_0) { QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + goto failure; } else { QNN_LOG_INFO("alloc rpcmem successfully\n"); } instance->register_rpcmem(qnn_buffer_0, tensor_0); memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); - qnn_buffer_1 = static_cast(instance->alloc_rpcmem(ggml_nbytes(src1), 4)); + qnn_buffer_1 = static_cast(instance->alloc_rpcmem( + ggml_nbytes(src1), 4)); if (nullptr == qnn_buffer_1) { QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + goto failure; } else { QNN_LOG_INFO("alloc rpcmem successfully\n"); } instance->register_rpcmem(qnn_buffer_1, tensor_1); memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); - qnn_buffer_2 = static_cast(instance->alloc_rpcmem(ggml_nbytes(dst), 4)); + qnn_buffer_2 = static_cast(instance->alloc_rpcmem( + ggml_nbytes(dst), 4)); if (nullptr == qnn_buffer_2) { QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + goto failure; } else { QNN_LOG_INFO("alloc rpcmem successfully\n"); } @@ -2144,23 +2161,33 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; Qnn_OpConfig_t op_config = { (Qnn_OpConfigVersion_t) 1, - .v1 = {"ggml_op_add", QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_ELEMENT_WISE_ADD, 0, qnn_params, - 2, tensor_inputs, 1, - tensor_outputs}}; + .v1 = {"ggml_op_add", + QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_ELEMENT_WISE_ADD, + 0, qnn_params, + 2, tensor_inputs, + 1,tensor_outputs} + }; error = qnn_raw_interface.graphAddNode(graph_handle, op_config); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); goto failure; } - error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr); + error = qnn_raw_interface.graphFinalize(graph_handle, + nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); goto failure; } - error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, + error = qnn_raw_interface.graphExecute(graph_handle, + tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); + if (ctx->device == 
QNN_BACKEND_NPU) { + if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) { + QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n"); + } + } if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); goto failure; @@ -2221,9 +2248,15 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; - error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs,2, + error = qnn_raw_interface.graphExecute(graph_handle, + tensor_inputs,2, tensor_outputs,1, nullptr, nullptr); + if (ctx->device == QNN_BACKEND_NPU) { + if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) { + QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n"); + } + } if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); goto failure; @@ -2299,6 +2332,8 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, tensor_1 = (Qnn_Tensor_t *) src1->extra; tensor_2 = (Qnn_Tensor_t *) dst->extra; instance = ctx->instance; + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + qnn_perf perf("ggml_qnn_mul_mat"); perf.start(); @@ -2307,7 +2342,6 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, tensor_2 = (Qnn_Tensor_t *) dst->extra; instance = ctx->instance; - QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; @@ -2338,6 +2372,11 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, uint32_t * tensor_1_dimensions = QNN_VER_PTR(*tensor_1)->dimensions; uint32_t * tensor_2_dimensions = QNN_VER_PTR(*tensor_2)->dimensions; + //TODO: for scenarios of quantized data in src0 + // pass-1: dequantize src0 to FP32 + // pass-2: dq-src0 * src1 + // the performance gains is worth although there is performance loss in pass-1 + if (!graph_initialized) { graph_name = graph_name + "_" + std::to_string(ctx->threads) + "_" + src0->name + "_" + src1->name; @@ -2352,17 +2391,31 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, QnnHtpGraph_CustomConfig_t dlbc_config; dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; - /* dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; - dlbc_config.optimizationOption.floatValue = 1; // set to 0 to turn off DLBC - */ - dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; - + dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC QnnGraph_Config_t graph_dlbc_config; graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; graph_dlbc_config.customConfig = &dlbc_config; - const QnnGraph_Config_t * p_graphconfig[] = {&graph_hvx_config, &graph_dlbc_config, NULL}; + QnnHtpGraph_CustomConfig_t opt_config; + opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; + opt_config.optimizationOption.floatValue = 1; //1 / 3 + QnnGraph_Config_t graph_opt_config; + graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_opt_config.customConfig = &opt_config; + + QnnHtpGraph_CustomConfig_t vtcm_config; + vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; + vtcm_config.vtcmSizeInMB = ctx->socinfo.vtcm_size_in_mb; + QnnGraph_Config_t graph_vtcm_config; + graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_vtcm_config.customConfig = 
&vtcm_config; + + const QnnGraph_Config_t * p_graphconfig[] = {&graph_hvx_config, + &graph_dlbc_config, + &graph_vtcm_config, + &graph_opt_config, + NULL}; error = qnn_raw_interface.graphCreate( instance->get_qnn_context_handle(), graph_name.c_str(), p_graphconfig, &graph_handle); @@ -2428,27 +2481,33 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, uint8_t * qnn_buffer_2 = nullptr; qnn_instance * instance = ctx->instance; - qnn_buffer_0 = static_cast(instance->alloc_rpcmem(ggml_nbytes(src0), 4)); + qnn_buffer_0 = static_cast(instance->alloc_rpcmem( + ggml_nbytes(src0), 4)); if (nullptr == qnn_buffer_0) { QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + goto failure; } else { QNN_LOG_INFO("alloc rpcmem successfully\n"); } instance->register_rpcmem(qnn_buffer_0, tensor_0); memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); - qnn_buffer_1 = static_cast(instance->alloc_rpcmem(ggml_nbytes(src1), 4)); + qnn_buffer_1 = static_cast(instance->alloc_rpcmem( + ggml_nbytes(src1), 4)); if (nullptr == qnn_buffer_1) { QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + goto failure; } else { QNN_LOG_INFO("alloc rpcmem successfully\n"); } instance->register_rpcmem(qnn_buffer_1, tensor_1); memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); - qnn_buffer_2 = static_cast(instance->alloc_rpcmem(ggml_nbytes(dst), 4)); + qnn_buffer_2 = static_cast(instance->alloc_rpcmem( + ggml_nbytes(dst), 4)); if (nullptr == qnn_buffer_2) { QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + goto failure; } else { QNN_LOG_INFO("alloc rpcmem successfully\n"); } @@ -2457,25 +2516,35 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; - Qnn_OpConfig_t op_config = {(Qnn_OpConfigVersion_t) 1, - .v1 = {"ggml_op_mul_mat", - QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_MAT_MUL, 0, qnn_params, 2, - tensor_inputs, 1, tensor_outputs}}; + Qnn_OpConfig_t op_config = { + (Qnn_OpConfigVersion_t) 1, + .v1 = {"ggml_op_mul_mat", + QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_MAT_MUL, + 0, qnn_params, + 2, tensor_inputs, + 1, tensor_outputs} + }; error = qnn_raw_interface.graphAddNode(graph_handle, op_config); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); goto failure; } - error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr); + error = qnn_raw_interface.graphFinalize(graph_handle, + nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); goto failure; } error = qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, + tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); + if (ctx->device == QNN_BACKEND_NPU) { + if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) { + QNN_LOG_WARN("NPU crashed. SSR detected. 
Caused QNN graph execute error\n"); + } + } if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); goto failure; @@ -2537,300 +2606,14 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; error = qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, + tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); - goto failure; - } - if (ctx->device == QNN_BACKEND_NPU) { - uint8_t * qnn_buffer_2 = static_cast(ctx->instance->get_rpcmem_from_memhandle( - QNN_VER_PTR(*tensor_2)->memHandle)); - if (nullptr != qnn_buffer_2) - memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); - } - } - -failure: - if (QNN_SUCCESS != error) { - QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); - QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1)); - QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2)); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, src0->type, ggml_type_name(src0->type), - src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], - src0->nb[1], src0->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, src1->type, ggml_type_name(src1->type), - src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], - src1->nb[1], src1->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0], - dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); - } - - QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions; - QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions; - QNN_VER_PTR(*tensor_2)->dimensions = tensor_2_dimensions; - perf.info(); -} - -// common function for GGML OPs using QNN API -static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, - const enum ggml_op ggmlop, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - bool graph_initialized = false; - qnn_instance * instance = nullptr; - std::string qnn_graph_name = "ggml_qnn_graph"; - std::string qnn_op_config_name = "ggml_qnn_op_config"; - const char * qnn_op_name = nullptr; - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Tensor_t * tensor_0 = nullptr; - Qnn_Tensor_t * tensor_1 = nullptr; - Qnn_Tensor_t * tensor_2 = nullptr; - Qnn_Param_t qnn_params[] = {}; - Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; - - CHECK_PARAMS(ctx, src0, src1, dst); - tensor_0 = (Qnn_Tensor_t *) src0->extra; - tensor_1 = (Qnn_Tensor_t *) src1->extra; - tensor_2 = (Qnn_Tensor_t *) dst->extra; - instance = ctx->instance; - qnn_perf perf(ggml_op_name(ggmlop)); - perf.start(); - - qnn_op_name = qnn_opname_from_ggmlop(ggmlop); - if (nullptr == qnn_op_name) { - QNN_LOG_WARN("ggml op %d(%s) not supported currently", ggmlop, ggml_op_name(ggmlop)); - return; - } - - tensor_0 = (Qnn_Tensor_t *) src0->extra; - tensor_1 = (Qnn_Tensor_t *) src1->extra; - tensor_2 = (Qnn_Tensor_t *) dst->extra; - instance = ctx->instance; - QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; - - src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); - src1_qnn_type 
= qnn_datatype_from_ggml_datatype(src1->type); - dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); - - QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; - QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; - QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; - - uint32_t dimensions_input_0[] = { - (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], (uint32_t) src0->ne[2], - (uint32_t) src0->ne[3]}; - uint32_t dimensions_input_1[] = { - (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], (uint32_t) src1->ne[2], - (uint32_t) src1->ne[3]}; - uint32_t dimensions_output[] = { - (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], - (uint32_t) dst->ne[3]}; - - std::string map_entry = std::string(ggml_op_name(ggmlop)); - if (instance->_qnn_graph_map.find(map_entry) != - instance->_qnn_graph_map.end()) { - graph_initialized = true; - auto & graph_item = instance->_qnn_graph_map[map_entry]; - graph_handle = std::get<0>(graph_item); - } - - uint32_t * tensor_0_dimensions = QNN_VER_PTR(*tensor_0)->dimensions; - uint32_t * tensor_1_dimensions = QNN_VER_PTR(*tensor_1)->dimensions; - uint32_t * tensor_2_dimensions = QNN_VER_PTR(*tensor_2)->dimensions; - - if (!graph_initialized) { - qnn_graph_name = qnn_graph_name + "_" + ggml_op_name(ggmlop) + - std::to_string(ctx->threads) + src0->name + "_" + - src1->name; - qnn_op_config_name = qnn_op_config_name + "_" + ggml_op_name(ggmlop) + - std::to_string(ctx->threads) + src0->name + "_" + - src1->name; - QNN_LOG_DEBUG("qnn graph name %s", qnn_graph_name.c_str()); - QNN_LOG_DEBUG("qnn op_config name %s", qnn_op_config_name.c_str()); - error = qnn_raw_interface.graphCreate( - instance->get_qnn_context_handle(), qnn_graph_name.c_str(), nullptr, - &graph_handle); - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("can't create qnn graph handle with ggml op %s, graph " - "name %s, error = %d\n", - ggml_op_name(ggmlop), qnn_graph_name.c_str(), error); - goto failure; - } - - if (ctx->device == QNN_BACKEND_NPU) { - QNN_VER_PTR(*tensor_0)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; - QNN_VER_PTR(*tensor_0)->clientBuf= {.data=nullptr, .dataSize=0}; - - QNN_VER_PTR(*tensor_1)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; - QNN_VER_PTR(*tensor_1)->clientBuf= {.data=nullptr, .dataSize=0}; - - QNN_VER_PTR(*tensor_2)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; - QNN_VER_PTR(*tensor_2)->clientBuf= {.data=nullptr, .dataSize=0}; - } - - error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); - goto failure; - } - error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1); - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); - goto failure; - } - error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2); - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); - goto failure; - } - - QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*tensor_0)->rank = qnn_get_ggml_tensor_rank(src0); - QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; - QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1); - QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; - QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; - QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst); - QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; - - if (ctx->device != QNN_BACKEND_NPU) { - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, - 
qnn_get_ggml_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, - qnn_get_ggml_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, - qnn_get_ggml_tensor_data_size(dst)}; - } else { - uint8_t * qnn_buffer_0 = nullptr; - uint8_t * qnn_buffer_1 = nullptr; - uint8_t * qnn_buffer_2 = nullptr; - qnn_instance * instance = ctx->instance; - - qnn_buffer_0 = static_cast(instance->alloc_rpcmem(ggml_nbytes(src0), 4)); - if (nullptr == qnn_buffer_0) { - QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); - } else { - QNN_LOG_INFO("alloc rpcmem successfully\n"); + if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) { + QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n"); } - instance->register_rpcmem(qnn_buffer_0, tensor_0); - memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); - - qnn_buffer_1 = static_cast(instance->alloc_rpcmem(ggml_nbytes(src1), 4)); - if (nullptr == qnn_buffer_1) { - QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); - } else { - QNN_LOG_INFO("alloc rpcmem successfully\n"); - } - instance->register_rpcmem(qnn_buffer_1, tensor_1); - memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); - - qnn_buffer_2 = static_cast(instance->alloc_rpcmem(ggml_nbytes(dst), 4)); - if (nullptr == qnn_buffer_2) { - QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); - } else { - QNN_LOG_INFO("alloc rpcmem successfully\n"); - } - instance->register_rpcmem(qnn_buffer_2, tensor_2); - } - - Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; - Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; - Qnn_OpConfig_t op_config = {(Qnn_OpConfigVersion_t) 1, - .v1 = {qnn_op_config_name.c_str(), - QNN_OP_PACKAGE_NAME_QTI_AISW, - qnn_op_name, 0, qnn_params, 2, - tensor_inputs, 1, tensor_outputs}}; - error = qnn_raw_interface.graphAddNode(graph_handle, op_config); - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); - goto failure; - } - error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr); - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); - goto failure; - } - error = qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, - tensor_outputs, 1, - nullptr, nullptr); - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); - goto failure; } - - if (ctx->device == QNN_BACKEND_NPU) { - uint8_t * qnn_buffer_2 = static_cast(ctx->instance->get_rpcmem_from_memhandle( - QNN_VER_PTR(*tensor_2)->memHandle)); - memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); - } - - auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); - instance->_qnn_graph_map[map_entry] = graph_item; - } else { - auto & graph_item = instance->_qnn_graph_map[map_entry]; - graph_handle = std::get<0>(graph_item); - tensor_0 = std::get<1>(graph_item); - tensor_1 = std::get<2>(graph_item); - tensor_2 = std::get<3>(graph_item); - - uint32_t dimensions_input_0[] = { - (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], - (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; - uint32_t dimensions_input_1[] = { - (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], - (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; - uint32_t dimensions_output[] = { - (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], - (uint32_t) dst->ne[3]}; - QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*tensor_0)->rank = qnn_get_ggml_tensor_rank(src0); - QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; - QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; - 
QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1); - QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; - QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; - QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst); - QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; - - if (ctx->device != QNN_BACKEND_NPU) { - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, - qnn_get_ggml_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, - qnn_get_ggml_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, - qnn_get_ggml_tensor_data_size(dst)}; - } else { - uint8_t * qnn_buffer_0 = static_cast(ctx->instance->get_rpcmem_from_memhandle( - QNN_VER_PTR(*tensor_0)->memHandle)); - if (nullptr != qnn_buffer_0) - memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); - - uint8_t * qnn_buffer_1 = static_cast(ctx->instance->get_rpcmem_from_memhandle( - QNN_VER_PTR(*tensor_1)->memHandle)); - if (nullptr != qnn_buffer_1) - memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); - } - - Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; - Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; - error = qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, - tensor_outputs, 1, - nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); goto failure; @@ -2863,8 +2646,6 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); - QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], - src0->ne[3]); } QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions; @@ -3038,21 +2819,14 @@ bool ggml_qnn_compute_forward(ggml_backend_qnn_context * ctx, struct ggml_compute_params * params, struct ggml_tensor * tensor) { ggml_qnn_func_t func = nullptr; - ggml_qnn_func_common_t func_common = nullptr; switch (tensor->op) { case GGML_OP_ADD: func = ggml_qnn_add; break; - - case GGML_OP_MUL: - func_common = ggml_qnn_hanlde_op; - break; - case GGML_OP_MUL_MAT: func = ggml_qnn_mul_mat; break; - case GGML_OP_REPEAT: func = ggml_qnn_repeat; break; @@ -3062,15 +2836,12 @@ bool ggml_qnn_compute_forward(ggml_backend_qnn_context * ctx, case GGML_OP_DUP: func = ggml_qnn_dup; break; - case GGML_OP_ACC: func = ggml_qnn_acc; break; - case GGML_OP_DIV: func = ggml_qnn_div; break; - case GGML_OP_UNARY: switch (ggml_get_unary_op(tensor)) { case GGML_UNARY_OP_GELU: @@ -3169,10 +2940,9 @@ bool ggml_qnn_compute_forward(ggml_backend_qnn_context * ctx, return false; } - if (nullptr != func) func(ctx, tensor->src[0], tensor->src[1], tensor); - - if (nullptr != func_common) - func_common(ctx, tensor->op, tensor->src[0], tensor->src[1], tensor); + if (nullptr != func) { + func(ctx, tensor->src[0], tensor->src[1], tensor); + } return true; } @@ -3221,41 +2991,28 @@ GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t } Qnn_Tensor_t qnn_tensor = QNN_TENSOR_INIT; - if (ctx->device != QNN_BACKEND_GPU) { - qnn_tensor = { - .version = QNN_TENSOR_VERSION_1, - {.v1 = {.id = 0, - .name = tensor_name, - .type = qnn_tensor_type, - .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, - .dataType = qnn_data_type, - .quantizeParams = - {QNN_DEFINITION_UNDEFINED, - QNN_QUANTIZATION_ENCODING_UNDEFINED, - {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, - .offset = 0}}}, - .rank = qnn_get_ggml_tensor_rank(tensor), - .dimensions = dimensions, - .memType = 
QNN_TENSORMEMTYPE_RAW, - {.clientBuf = {.data = nullptr, .dataSize = 0}}}}}; - } else { - qnn_tensor = { - .version = QNN_TENSOR_VERSION_1, - {.v1 = {.id = 0, - .name = tensor_name, - .type = qnn_tensor_type, - .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, - .dataType = qnn_data_type, - .quantizeParams = - {QNN_DEFINITION_UNDEFINED, - QNN_QUANTIZATION_ENCODING_UNDEFINED, - {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, - .offset = 0}}}, - .rank = qnn_get_ggml_tensor_rank(tensor), - .dimensions = dimensions, - .memType = QNN_TENSORMEMTYPE_MEMHANDLE, - {.clientBuf = {.data = nullptr, .dataSize = 0}}}}}; - } + Qnn_TensorMemType_t qnn_mem_type = QNN_TENSORMEMTYPE_RAW; + if (ctx->device == QNN_BACKEND_GPU) { + qnn_mem_type = QNN_TENSORMEMTYPE_MEMHANDLE; + } + + qnn_tensor = { + .version = QNN_TENSOR_VERSION_1, + {.v1 = {.id = 0, + .name = tensor_name, + .type = qnn_tensor_type, + .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, + .dataType = qnn_data_type, + .quantizeParams = + {QNN_DEFINITION_UNDEFINED, + QNN_QUANTIZATION_ENCODING_UNDEFINED, + {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, + .offset = 0}}}, + .rank = qnn_get_ggml_tensor_rank(tensor), + .dimensions = dimensions, + .memType = qnn_mem_type, + {.clientBuf = {.data = nullptr, .dataSize = 0}}}}}; + Qnn_Tensor_t * p_qnn_tensor = (Qnn_Tensor_t *)calloc(1, sizeof(Qnn_Tensor_t)); if (nullptr == p_qnn_tensor) { diff --git a/tests/ggml-qnn/ggml-qnn-ut-build-run.sh b/tests/ggml-qnn/ggml-qnn-ut-build-run.sh index 4c21be5a41fa2..e12b987b8d69d 100755 --- a/tests/ggml-qnn/ggml-qnn-ut-build-run.sh +++ b/tests/ggml-qnn/ggml-qnn-ut-build-run.sh @@ -12,8 +12,8 @@ ANDROID_PLATFORM=android-34 GGML_QNN_UT=ggml-qnn-ut REMOTE_PATH=/data/local/tmp/ -BUILDTYPE=Debug BUILDTYPE=Release +BUILDTYPE=Debug function dump_vars() @@ -100,7 +100,7 @@ function update_qnn_libs() adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnCpu.so ${REMOTE_PATH}/ adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnGpu.so ${REMOTE_PATH}/ - #the QNN NPU(aka HTP/DSP) backend only verified on Xiaomi14(Qualcomm SM8650-AB Snapdragon 8 Gen 3) successfully + #the QNN NPU(aka HTP) backend only verified on Qualcomm Snapdragon 8 Gen 3 equipped Android phone adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtp.so ${REMOTE_PATH}/ adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpNetRunExtensions.so ${REMOTE_PATH}/ adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpPrepare.so ${REMOTE_PATH}/ @@ -142,14 +142,9 @@ function run_ggml_qnn_ut() case "$ggmlop" in GGML_OP_ADD) - echo "adb shell ${REMOTE_PATH}/${GGML_QNN_UT} -t GGML_OP_ADD -b $qnnbackend" adb shell ${REMOTE_PATH}/${GGML_QNN_UT} -t GGML_OP_ADD -b $qnnbackend ;; - GGML_OP_MUL) - adb shell ${REMOTE_PATH}/${GGML_QNN_UT} -t GGML_OP_MUL -b $qnnbackend - ;; - GGML_OP_MUL_MAT) adb shell ${REMOTE_PATH}/${GGML_QNN_UT} -t GGML_OP_MUL_MAT -b $qnnbackend ;; @@ -169,7 +164,6 @@ function show_usage() echo " $0 build (build Android command line UT program)" echo " $0 updateqnnlibs (upload the latest QNN libs to Android phone)" echo " $0 GGML_OP_ADD 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU) / 3(ggml)" - echo " $0 GGML_OP_MUL 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU) / 3(ggml)" echo " $0 GGML_OP_MUL_MAT 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU) / 3(ggml)" echo -e "\n\n\n" } diff --git a/tests/ggml-qnn/ggml-qnn-ut.cpp b/tests/ggml-qnn/ggml-qnn-ut.cpp index 0abfc62073f08..fa0883af8993e 100644 --- a/tests/ggml-qnn/ggml-qnn-ut.cpp +++ b/tests/ggml-qnn/ggml-qnn-ut.cpp @@ -346,7 +346,7 @@ static void show_usage() { "\nUsage: test_qnn_ops 
[options]\n" \ "\n" \ "Options:\n" \ - " -t GGML_OP_ADD / GGML_OP_MUL / GGML_OP_MULMAT\n" \ + " -t GGML_OP_ADD / GGML_OP_MULMAT\n" \ " -b 0(QNN_CPU) 1(QNN_GPU) 2(QNN_NPU) 3(ggml)\n" \ " ?/h print usage infomation\n\n" ); @@ -418,13 +418,9 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { QNN_LOG_DEBUG("sizex: %d\n", sizex); QNN_LOG_DEBUG("sizey: %d\n", sizey); - if (n_ggml_op_type == GGML_OP_MUL) { - src0 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); - src1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); - } else { - src0 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey); - src1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); - } + src0 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey); + src1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); + ggml_set_input(src0); ggml_set_input(src1); @@ -432,9 +428,6 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { case GGML_OP_ADD: dst = ggml_add(ctx, src0, src1); break; - case GGML_OP_MUL: - dst = ggml_mul(ctx, src0, src1); - break; case GGML_OP_MUL_MAT: dst = ggml_mul_mat(ctx, src0, src1); break; @@ -518,8 +511,6 @@ int main(int argc, char * argv[]) { n_ggml_op_type = GGML_OP_ADD; } else if (0 == memcmp(argv[i + 1], "GGML_OP_MUL_MAT", 15)) { n_ggml_op_type = GGML_OP_MUL_MAT; - } else if (0 == memcmp(argv[i + 1], "GGML_OP_MUL", 11)) { - n_ggml_op_type = GGML_OP_MUL; } else { show_usage(); return 1; From 5e18cdc2689523ea28b829e8ed09db262453023c Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sat, 15 Jun 2024 12:55:06 +0800 Subject: [PATCH 017/166] init the test array with const values --- tests/ggml-qnn/ggml-qnn-ut.cpp | 36 +++++----------------------------- 1 file changed, 5 insertions(+), 31 deletions(-) diff --git a/tests/ggml-qnn/ggml-qnn-ut.cpp b/tests/ggml-qnn/ggml-qnn-ut.cpp index fa0883af8993e..ff01e62f983c7 100644 --- a/tests/ggml-qnn/ggml-qnn-ut.cpp +++ b/tests/ggml-qnn/ggml-qnn-ut.cpp @@ -266,37 +266,12 @@ static uint32_t get_tensor_data_size(const ggml_tensor * tensor) { //ref: https://github.com/ggerganov/llama.cpp/blob/master/tests/test-backend-ops.cpp#L20 static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) { - // static RNG initialization (revisit if n_threads stops being constant) - static const size_t n_threads = std::thread::hardware_concurrency(); - static std::vector generators = []() { - std::random_device rd; - std::vector vec; - vec.reserve(n_threads); - //for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(1234 + i); } // fixed seed - for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(rd()); } - return vec; - }(); - size_t size = ggml_nelements(tensor); std::vector data(size); - - auto init_thread = [&](size_t ith, size_t start, size_t end) { - std::uniform_real_distribution distribution(min, max); - for (size_t i = start; i < end; i++) { - data[i] = distribution(generators[ith]); - } - }; - - std::vector threads; - threads.reserve(n_threads); - for (size_t i = 0; i < n_threads; i++) { - size_t start = i*size/n_threads; - size_t end = (i+1)*size/n_threads; - threads.emplace_back(init_thread, i, start, end); - } - for (auto & t : threads) { - t.join(); + for (size_t i = 0; i < size; i++) { + data[i] = i + 1; } + if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) { #ifdef GGML_USE_QNN memcpy((char*)tensor->data, data.data(), size * sizeof(float)); @@ -378,7 +353,6 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { QNN_LOG_DEBUG("ggml 
op:%d(%s)\n", n_ggml_op_type, ggml_op_name((enum ggml_op) n_ggml_op_type)); n_begin_time = ggml_time_us(); - srand(time(NULL)); ctx_size += 1024 * 1024 * 32; QNN_LOG_DEBUG("Allocating Memory of size %zi bytes, %zi MB\n", ctx_size, @@ -460,11 +434,11 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { initialize_tensors(ctx); } else { if (qtype == GGML_TYPE_F32) { - ggml_set_f32(src0, (rand() % 100 + 1)); + ggml_set_f32(src0, 2.f); } else { initialize_tensors(ctx); } - ggml_set_f32(src1, (rand() % 100 + 1)); + ggml_set_f32(src1, 3.f); } ggml_graph_compute_helper(backend, gf, work_buffer, num_threads, nullptr, nullptr); From 6c68adc1d942a5a0173b537237656a4220e7487b Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 14 Jun 2024 18:52:54 +0800 Subject: [PATCH 018/166] add ggml_qnn_tensor_binder --- ggml-qnn.cpp | 110 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 110 insertions(+) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index f268c7f0e825a..62fee4281d1f0 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -1959,6 +1959,116 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx, return true; } +template +class ggml_qnn_tensor_binder +{ +public: + ggml_qnn_tensor_binder(const ggml_tensor *tensor, ggml_backend_qnn_context * ctx, Qnn_GraphHandle_t graph_handle) + : _tensor(tensor) + , _qnn_tensor(reinterpret_cast(tensor->extra)) + , _context(ctx) { + _old_dimensions = QNN_VER_PTR(*_qnn_tensor)->dimensions; + const auto qnn_data_type = qnn_datatype_from_ggml_datatype(tensor->type); + const bool is_npu = ctx->device == QNN_BACKEND_NPU; + QNN_VER_PTR(*_qnn_tensor)->type = _tensorType; + if (is_npu) { + QNN_VER_PTR(*_qnn_tensor)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*_qnn_tensor)->clientBuf= {.data=nullptr, .dataSize=0}; + } + + auto err = ctx->raw_interface.tensorCreateGraphTensor(graph_handle, _qnn_tensor); + if (err != QNN_SUCCESS) { + QNN_LOG_INFO("error = %d\n", err); + _context = nullptr; + return; + } + + _dimensions[0] = (uint32_t)tensor->ne[0]; + _dimensions[1] = (uint32_t)tensor->ne[1]; + _dimensions[2] = (uint32_t)tensor->ne[2]; + _dimensions[3] = (uint32_t)tensor->ne[3]; + QNN_VER_PTR(*_qnn_tensor)->dimensions = _dimensions; + QNN_VER_PTR(*_qnn_tensor)->rank = qnn_get_ggml_tensor_rank(tensor); + QNN_VER_PTR(*_qnn_tensor)->dataType = qnn_data_type; + + + if (is_npu) { + qnn_instance * instance = ctx->instance; + uint8_t *qnn_buffer = static_cast(instance->alloc_rpcmem( + ggml_nbytes(tensor), 4)); // TODO: should we get the align param from device here? 
+ if (!qnn_buffer) { + QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + _context = nullptr; + return; + } else { + QNN_LOG_INFO("alloc rpcmem successfully\n"); + } + + instance->register_rpcmem(qnn_buffer, _qnn_tensor); + if (_tensorType == QNN_TENSOR_TYPE_APP_WRITE || _tensorType == QNN_TENSOR_TYPE_APP_READWRITE) { + memcpy(qnn_buffer, tensor->data, ggml_nbytes(tensor)); + } + } else { + QNN_VER_PTR(*_qnn_tensor)->clientBuf = {tensor->data, + qnn_get_ggml_tensor_data_size(tensor)}; + } + } + + ggml_qnn_tensor_binder(const ggml_tensor *tensor, Qnn_Tensor_t *qnn_tensor, ggml_backend_qnn_context * ctx) + : _tensor(tensor) + , _qnn_tensor(qnn_tensor) + , _context(ctx) { + _old_dimensions = QNN_VER_PTR(*_qnn_tensor)->dimensions; + const auto qnn_data_type = qnn_datatype_from_ggml_datatype(tensor->type); + const bool is_npu = ctx->device == QNN_BACKEND_NPU; + + _dimensions[0] = (uint32_t)tensor->ne[0]; + _dimensions[1] = (uint32_t)tensor->ne[1]; + _dimensions[2] = (uint32_t)tensor->ne[2]; + _dimensions[3] = (uint32_t)tensor->ne[3]; + QNN_VER_PTR(*_qnn_tensor)->dimensions = _dimensions; + QNN_VER_PTR(*_qnn_tensor)->rank = qnn_get_ggml_tensor_rank(tensor); + QNN_VER_PTR(*_qnn_tensor)->dataType = qnn_data_type; + + + if (is_npu) { + uint8_t * qnn_buffer = static_cast(ctx->instance->get_rpcmem_from_memhandle( + QNN_VER_PTR(*_qnn_tensor)->memHandle)); + if (qnn_buffer) { + memcpy(qnn_buffer, tensor->data, ggml_nbytes(tensor)); + } else { + QNN_LOG_WARN("can't find rpcmem from qnn mem handle\n"); + } + } else { + QNN_VER_PTR(*_qnn_tensor)->clientBuf = {tensor->data, + qnn_get_ggml_tensor_data_size(tensor)}; + } + } + + ~ggml_qnn_tensor_binder() { + if (_context && _context->device == QNN_BACKEND_NPU && + (_tensorType == QNN_TENSOR_TYPE_APP_READWRITE || _tensorType == QNN_TENSOR_TYPE_APP_READ)) { + uint8_t * qnn_buffer = static_cast(_context->instance->get_rpcmem_from_memhandle( + QNN_VER_PTR(*_qnn_tensor)->memHandle)); + memcpy(_tensor->data, qnn_buffer, ggml_nbytes(_tensor)); + } + + QNN_VER_PTR(*_qnn_tensor)->dimensions = _old_dimensions; + } + +private: + const ggml_tensor *_tensor; + Qnn_Tensor_t *_qnn_tensor; + ggml_backend_qnn_context *_context; + uint32_t *_old_dimensions; + uint32_t _dimensions[4] = {}; + + ggml_qnn_tensor_binder(const ggml_qnn_tensor_binder&) = delete; + ggml_qnn_tensor_binder(ggml_qnn_tensor_binder&&) = delete; + void operator=(const ggml_qnn_tensor_binder&) = delete; + void operator=(ggml_qnn_tensor_binder&&) = delete; +}; + //TODO: this function can be removed later because there are duplicated codes with ggml_qnn_mul_mat // keep it for illustrate how to implement a specified GGMPL OP using QNN API + QNN RPC static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, From 37bb9263dd1687601c7dad0f3fc0332b82f3901c Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sat, 15 Jun 2024 11:13:30 +0800 Subject: [PATCH 019/166] use tensor wrapper in add --- ggml-qnn.cpp | 86 ++++++++++++++++------------------------------------ 1 file changed, 26 insertions(+), 60 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index 62fee4281d1f0..ab28a2daec725 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -1960,10 +1960,10 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx, } template -class ggml_qnn_tensor_binder +class ggml_qnn_tensor_readwrite { public: - ggml_qnn_tensor_binder(const ggml_tensor *tensor, ggml_backend_qnn_context * ctx, Qnn_GraphHandle_t graph_handle) + ggml_qnn_tensor_readwrite(const ggml_tensor *tensor, 
Qnn_GraphHandle_t graph_handle, ggml_backend_qnn_context * ctx) : _tensor(tensor) , _qnn_tensor(reinterpret_cast(tensor->extra)) , _context(ctx) { @@ -1979,6 +1979,7 @@ class ggml_qnn_tensor_binder auto err = ctx->raw_interface.tensorCreateGraphTensor(graph_handle, _qnn_tensor); if (err != QNN_SUCCESS) { QNN_LOG_INFO("error = %d\n", err); + QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, QNN_TENSOR_GET_NAME(*_qnn_tensor)); _context = nullptr; return; } @@ -1998,7 +1999,9 @@ class ggml_qnn_tensor_binder ggml_nbytes(tensor), 4)); // TODO: should we get the align param from device here? if (!qnn_buffer) { QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, QNN_TENSOR_GET_NAME(*_qnn_tensor)); _context = nullptr; + // TODO: should we free the tensor here? return; } else { QNN_LOG_INFO("alloc rpcmem successfully\n"); @@ -2014,7 +2017,7 @@ class ggml_qnn_tensor_binder } } - ggml_qnn_tensor_binder(const ggml_tensor *tensor, Qnn_Tensor_t *qnn_tensor, ggml_backend_qnn_context * ctx) + ggml_qnn_tensor_readwrite(const ggml_tensor *tensor, Qnn_Tensor_t *qnn_tensor, ggml_backend_qnn_context * ctx) : _tensor(tensor) , _qnn_tensor(qnn_tensor) , _context(ctx) { @@ -2038,6 +2041,9 @@ class ggml_qnn_tensor_binder memcpy(qnn_buffer, tensor->data, ggml_nbytes(tensor)); } else { QNN_LOG_WARN("can't find rpcmem from qnn mem handle\n"); + QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, QNN_TENSOR_GET_NAME(*_qnn_tensor)); + _context = nullptr; + return; } } else { QNN_VER_PTR(*_qnn_tensor)->clientBuf = {tensor->data, @@ -2045,7 +2051,7 @@ class ggml_qnn_tensor_binder } } - ~ggml_qnn_tensor_binder() { + ~ggml_qnn_tensor_readwrite() { if (_context && _context->device == QNN_BACKEND_NPU && (_tensorType == QNN_TENSOR_TYPE_APP_READWRITE || _tensorType == QNN_TENSOR_TYPE_APP_READ)) { uint8_t * qnn_buffer = static_cast(_context->instance->get_rpcmem_from_memhandle( @@ -2056,6 +2062,9 @@ class ggml_qnn_tensor_binder QNN_VER_PTR(*_qnn_tensor)->dimensions = _old_dimensions; } + bool is_valid() const { return _context; } + Qnn_Tensor_t * get_qnn_tensor() const { return _qnn_tensor; } + private: const ggml_tensor *_tensor; Qnn_Tensor_t *_qnn_tensor; @@ -2063,12 +2072,15 @@ class ggml_qnn_tensor_binder uint32_t *_old_dimensions; uint32_t _dimensions[4] = {}; - ggml_qnn_tensor_binder(const ggml_qnn_tensor_binder&) = delete; - ggml_qnn_tensor_binder(ggml_qnn_tensor_binder&&) = delete; - void operator=(const ggml_qnn_tensor_binder&) = delete; - void operator=(ggml_qnn_tensor_binder&&) = delete; + ggml_qnn_tensor_readwrite(const ggml_qnn_tensor_readwrite&) = delete; + void operator=(const ggml_qnn_tensor_readwrite&) = delete; + ggml_qnn_tensor_readwrite(ggml_qnn_tensor_readwrite&&) = delete; + void operator=(ggml_qnn_tensor_readwrite&&) = delete; }; +using ggml_qnn_tensor_reader = ggml_qnn_tensor_readwrite; +using ggml_qnn_tensor_writer = ggml_qnn_tensor_readwrite; + //TODO: this function can be removed later because there are duplicated codes with ggml_qnn_mul_mat // keep it for illustrate how to implement a specified GGMPL OP using QNN API + QNN RPC static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, @@ -2078,17 +2090,14 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src qnn_instance * instance = nullptr; std::string graph_name = "ggml_op_qnn_add"; Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Tensor_t * tensor_0 = nullptr; Qnn_Tensor_t * tensor_1 = nullptr; Qnn_Tensor_t * tensor_2 = nullptr; Qnn_Param_t 
qnn_params[] = {}; enum ggml_op ggmlop = GGML_OP_ADD; - Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; CHECK_PARAMS(ctx, src0, src1, dst); - tensor_0 = (Qnn_Tensor_t *) src0->extra; tensor_1 = (Qnn_Tensor_t *) src1->extra; tensor_2 = (Qnn_Tensor_t *) dst->extra; instance = ctx->instance; @@ -2097,17 +2106,12 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src qnn_perf perf("ggml_qnn_add"); perf.start(); - QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; - src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); - uint32_t dimensions_input_0[] = { - (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], (uint32_t) src0->ne[2], - (uint32_t) src0->ne[3]}; uint32_t dimensions_input_1[] = { (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; @@ -2123,7 +2127,6 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src graph_handle = std::get<0>(graph_item); } - uint32_t * tensor_0_dimensions = QNN_VER_PTR(*tensor_0)->dimensions; uint32_t * tensor_1_dimensions = QNN_VER_PTR(*tensor_1)->dimensions; uint32_t * tensor_2_dimensions = QNN_VER_PTR(*tensor_2)->dimensions; @@ -2185,9 +2188,6 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src } if (ctx->device == QNN_BACKEND_NPU) { - QNN_VER_PTR(*tensor_0)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; - QNN_VER_PTR(*tensor_0)->clientBuf= {.data=nullptr, .dataSize=0}; - QNN_VER_PTR(*tensor_1)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; QNN_VER_PTR(*tensor_1)->clientBuf= {.data=nullptr, .dataSize=0}; @@ -2195,9 +2195,8 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src QNN_VER_PTR(*tensor_2)->clientBuf= {.data=nullptr, .dataSize=0}; } - error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); + ggml_qnn_tensor_writer tensor_writer0(src0, graph_handle, ctx); + if (!tensor_writer0.is_valid()) { goto failure; } error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1); @@ -2211,9 +2210,6 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src goto failure; } - QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*tensor_0)->rank = qnn_get_ggml_tensor_rank(src0); - QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1); QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; @@ -2222,29 +2218,15 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; if (ctx->device != QNN_BACKEND_NPU) { - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, - qnn_get_ggml_tensor_data_size(src0)}; QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, qnn_get_ggml_tensor_data_size(src1)}; QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, qnn_get_ggml_tensor_data_size(dst)}; } else { - uint8_t * qnn_buffer_0 = nullptr; uint8_t * qnn_buffer_1 = nullptr; uint8_t * qnn_buffer_2 = nullptr; qnn_instance * instance = ctx->instance; - qnn_buffer_0 = 
static_cast(instance->alloc_rpcmem( - ggml_nbytes(src0), 4)); - if (nullptr == qnn_buffer_0) { - QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); - goto failure; - } else { - QNN_LOG_INFO("alloc rpcmem successfully\n"); - } - instance->register_rpcmem(qnn_buffer_0, tensor_0); - memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); - qnn_buffer_1 = static_cast(instance->alloc_rpcmem( ggml_nbytes(src1), 4)); if (nullptr == qnn_buffer_1) { @@ -2267,7 +2249,7 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src instance->register_rpcmem(qnn_buffer_2, tensor_2); } - Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; + Qnn_Tensor_t tensor_inputs[] = {*tensor_writer0.get_qnn_tensor(), *tensor_1}; Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; Qnn_OpConfig_t op_config = { (Qnn_OpConfigVersion_t) 1, @@ -2308,18 +2290,14 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src QNN_VER_PTR(*tensor_2)->memHandle)); memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); } - auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); + auto graph_item = std::make_tuple(graph_handle, tensor_writer0.get_qnn_tensor(), tensor_1, tensor_2); instance->_qnn_graph_map[map_entry] = graph_item; } else { auto & graph_item = instance->_qnn_graph_map[map_entry]; - graph_handle = std::get<0>(graph_item); - tensor_0 = std::get<1>(graph_item); + ggml_qnn_tensor_writer tensor_writer0(src0, std::get<1>(graph_item), ctx); tensor_1 = std::get<2>(graph_item); tensor_2 = std::get<3>(graph_item); - uint32_t dimensions_input_0[] = { - (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], - (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; uint32_t dimensions_input_1[] = { (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; @@ -2327,9 +2305,6 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; - QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*tensor_0)->rank = qnn_get_ggml_tensor_rank(src0); - QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1); QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; @@ -2338,25 +2313,18 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; if (ctx->device != QNN_BACKEND_NPU) { - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, - qnn_get_ggml_tensor_data_size(src0)}; QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, qnn_get_ggml_tensor_data_size(src1)}; QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, qnn_get_ggml_tensor_data_size(dst)}; } else { - uint8_t * qnn_buffer_0 = static_cast(ctx->instance->get_rpcmem_from_memhandle( - QNN_VER_PTR(*tensor_0)->memHandle)); - if (nullptr != qnn_buffer_0) - memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); - uint8_t * qnn_buffer_1 = static_cast(ctx->instance->get_rpcmem_from_memhandle( QNN_VER_PTR(*tensor_1)->memHandle)); if (nullptr != qnn_buffer_1) memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); } - Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; + Qnn_Tensor_t tensor_inputs[] = {*tensor_writer0.get_qnn_tensor(), *tensor_1}; Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs,2, @@ -2382,7 +2350,6 @@ static void 
ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src failure: if (QNN_SUCCESS != error) { - QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1)); QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2)); QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 @@ -2402,7 +2369,6 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src dst->nb[1], dst->nb[2]); } - QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions; QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions; QNN_VER_PTR(*tensor_2)->dimensions = tensor_2_dimensions; From 36e41a1055a85eee98a72f0a29c2c636f476c150 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sun, 16 Jun 2024 21:46:15 +0800 Subject: [PATCH 020/166] use tensor wrapper in matmul --- ggml-qnn.cpp | 59 ++++++---------------------------------------------- 1 file changed, 6 insertions(+), 53 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index ab28a2daec725..8d65b6a4e59ea 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -2394,17 +2394,14 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, qnn_instance * instance = nullptr; std::string graph_name = "ggml_op_qnn_mul_mat"; Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Tensor_t * tensor_0 = nullptr; Qnn_Tensor_t * tensor_1 = nullptr; Qnn_Tensor_t * tensor_2 = nullptr; Qnn_Param_t qnn_params[] = {}; enum ggml_op ggmlop = GGML_OP_MUL_MAT; - Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; CHECK_PARAMS(ctx, src0, src1, dst); - tensor_0 = (Qnn_Tensor_t *) src0->extra; tensor_1 = (Qnn_Tensor_t *) src1->extra; tensor_2 = (Qnn_Tensor_t *) dst->extra; instance = ctx->instance; @@ -2413,22 +2410,16 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, qnn_perf perf("ggml_qnn_mul_mat"); perf.start(); - tensor_0 = (Qnn_Tensor_t *) src0->extra; tensor_1 = (Qnn_Tensor_t *) src1->extra; tensor_2 = (Qnn_Tensor_t *) dst->extra; instance = ctx->instance; - QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; - src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); - uint32_t dimensions_input_0[] = { - (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], (uint32_t) src0->ne[2], - (uint32_t) src0->ne[3]}; uint32_t dimensions_input_1[] = { (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; @@ -2444,7 +2435,6 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, graph_handle = std::get<0>(graph_item); } - uint32_t * tensor_0_dimensions = QNN_VER_PTR(*tensor_0)->dimensions; uint32_t * tensor_1_dimensions = QNN_VER_PTR(*tensor_1)->dimensions; uint32_t * tensor_2_dimensions = QNN_VER_PTR(*tensor_2)->dimensions; @@ -2508,9 +2498,6 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, } if (ctx->device == QNN_BACKEND_NPU) { - QNN_VER_PTR(*tensor_0)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; - QNN_VER_PTR(*tensor_0)->clientBuf= {.data=nullptr, .dataSize=0}; - QNN_VER_PTR(*tensor_1)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; QNN_VER_PTR(*tensor_1)->clientBuf= {.data=nullptr, .dataSize=0}; @@ -2518,9 +2505,8 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, 
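
The lines removed in these hunks are the per-device plumbing the wrapper now hides. Condensed from that removed code (so a sketch, with `qnn_tensor`, `tensor` and `instance` standing in for the concrete variables):

    if (ctx->device != QNN_BACKEND_NPU) {
        // CPU/GPU: QNN reads directly out of the ggml buffer.
        QNN_VER_PTR(*qnn_tensor)->clientBuf = {tensor->data,
                                               qnn_get_ggml_tensor_data_size(tensor)};
    } else {
        // NPU: data must live in rpcmem shared memory and be registered as a
        // memory handle; clientBuf is left deliberately empty.
        QNN_VER_PTR(*qnn_tensor)->memType   = QNN_TENSORMEMTYPE_MEMHANDLE;
        QNN_VER_PTR(*qnn_tensor)->clientBuf = {.data = nullptr, .dataSize = 0};

        uint8_t * qnn_buffer = static_cast<uint8_t *>(
            instance->alloc_rpcmem(ggml_nbytes(tensor), 4));
        if (nullptr == qnn_buffer) {
            QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
        } else {
            instance->register_rpcmem(qnn_buffer, qnn_tensor);
            memcpy(qnn_buffer, tensor->data, ggml_nbytes(tensor));
        }
    }
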
QNN_VER_PTR(*tensor_2)->clientBuf= {.data=nullptr, .dataSize=0}; } - error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); + ggml_qnn_tensor_writer tensor_writer0(src0, graph_handle, ctx); + if (!tensor_writer0.is_valid()) { goto failure; } error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1); @@ -2534,9 +2520,6 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, goto failure; } - QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*tensor_0)->rank = qnn_get_ggml_tensor_rank(src0); - QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1); QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; @@ -2545,29 +2528,15 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; if (ctx->device != QNN_BACKEND_NPU) { - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, - qnn_get_ggml_tensor_data_size(src0)}; QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, qnn_get_ggml_tensor_data_size(src1)}; QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, qnn_get_ggml_tensor_data_size(dst)}; } else { - uint8_t * qnn_buffer_0 = nullptr; uint8_t * qnn_buffer_1 = nullptr; uint8_t * qnn_buffer_2 = nullptr; qnn_instance * instance = ctx->instance; - qnn_buffer_0 = static_cast(instance->alloc_rpcmem( - ggml_nbytes(src0), 4)); - if (nullptr == qnn_buffer_0) { - QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); - goto failure; - } else { - QNN_LOG_INFO("alloc rpcmem successfully\n"); - } - instance->register_rpcmem(qnn_buffer_0, tensor_0); - memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); - qnn_buffer_1 = static_cast(instance->alloc_rpcmem( ggml_nbytes(src1), 4)); if (nullptr == qnn_buffer_1) { @@ -2590,7 +2559,7 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, instance->register_rpcmem(qnn_buffer_2, tensor_2); } - Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; + Qnn_Tensor_t tensor_inputs[] = {*tensor_writer0.get_qnn_tensor(), *tensor_1}; Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; Qnn_OpConfig_t op_config = { (Qnn_OpConfigVersion_t) 1, @@ -2632,27 +2601,20 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); } - auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); + auto graph_item = std::make_tuple(graph_handle, tensor_writer0.get_qnn_tensor(), tensor_1, tensor_2); instance->_qnn_graph_map[map_entry] = graph_item; } else { auto & graph_item= instance->_qnn_graph_map[map_entry]; - graph_handle = std::get<0>(graph_item); - tensor_0 = std::get<1>(graph_item); + ggml_qnn_tensor_writer tensor_writer0(src0, std::get<1>(graph_item), ctx); tensor_1 = std::get<2>(graph_item); tensor_2 = std::get<3>(graph_item); - uint32_t dimensions_input_0[] = { - (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], - (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; uint32_t dimensions_input_1[] = { (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; uint32_t dimensions_output[] = { (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; - QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*tensor_0)->rank = qnn_get_ggml_tensor_rank(src0); - QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; QNN_VER_PTR(*tensor_1)->dimensions 
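
The cached-graph branch above keys the finalized graph by op name and stitches the current ggml buffers back onto the stored tensor handles. The container itself is declared elsewhere in qnn_instance, so the typedef below is hypothetical; only the tuple layout is implied by the std::get indices used here.

    // Hypothetical declaration matching the std::get<0..3> usage in these hunks:
    using qnn_graph_entry = std::tuple<Qnn_GraphHandle_t,  // finalized graph
                                       Qnn_Tensor_t *,     // src0 graph tensor
                                       Qnn_Tensor_t *,     // src1 graph tensor
                                       Qnn_Tensor_t *>;    // dst graph tensor
    std::map<std::string, qnn_graph_entry> _qnn_graph_map;

    // Cached path: look the graph up and rebind the current data through the
    // wrapper's "existing tensor" constructor:
    //   auto & graph_item = instance->_qnn_graph_map[map_entry];
    //   ggml_qnn_tensor_writer tensor_writer0(src0, std::get<1>(graph_item), ctx);
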
= dimensions_input_1; QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1); QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; @@ -2661,25 +2623,18 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; if (ctx->device != QNN_BACKEND_NPU) { - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, - qnn_get_ggml_tensor_data_size(src0)}; QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, qnn_get_ggml_tensor_data_size(src1)}; QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, qnn_get_ggml_tensor_data_size(dst)}; } else { - uint8_t * qnn_buffer_0 = static_cast(ctx->instance->get_rpcmem_from_memhandle( - QNN_VER_PTR(*tensor_0)->memHandle)); - if (nullptr != qnn_buffer_0) - memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); - uint8_t * qnn_buffer_1 = static_cast(ctx->instance->get_rpcmem_from_memhandle( QNN_VER_PTR(*tensor_1)->memHandle)); if (nullptr != qnn_buffer_1) memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); } - Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; + Qnn_Tensor_t tensor_inputs[] = {*tensor_writer0.get_qnn_tensor(), *tensor_1}; Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, @@ -2705,7 +2660,6 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, failure: if (QNN_SUCCESS != error) { - QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1)); QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2)); QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 @@ -2724,7 +2678,6 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); } - QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions; QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions; QNN_VER_PTR(*tensor_2)->dimensions = tensor_2_dimensions; perf.info(); From a5679ddd8e6f1b0ebfe2b876e6720e5d793e9bb5 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sun, 16 Jun 2024 22:01:14 +0800 Subject: [PATCH 021/166] use ggml_qnn_tensor_reader for output tensor --- ggml-qnn.cpp | 128 +++++---------------------------------------------- 1 file changed, 12 insertions(+), 116 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index 8d65b6a4e59ea..eda83597f53b1 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -2091,15 +2091,12 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src std::string graph_name = "ggml_op_qnn_add"; Qnn_GraphHandle_t graph_handle = nullptr; Qnn_Tensor_t * tensor_1 = nullptr; - Qnn_Tensor_t * tensor_2 = nullptr; Qnn_Param_t qnn_params[] = {}; enum ggml_op ggmlop = GGML_OP_ADD; Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; CHECK_PARAMS(ctx, src0, src1, dst); tensor_1 = (Qnn_Tensor_t *) src1->extra; - tensor_2 = (Qnn_Tensor_t *) dst->extra; instance = ctx->instance; QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; @@ -2107,17 +2104,12 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src perf.start(); QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; - QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); - dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); uint32_t dimensions_input_1[] = { (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; - uint32_t 
dimensions_output[] = { - (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], - (uint32_t) dst->ne[3]}; std::string map_entry = std::string(ggml_op_name(ggmlop)); if (instance->_qnn_graph_map.find(map_entry) != @@ -2128,7 +2120,6 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src } uint32_t * tensor_1_dimensions = QNN_VER_PTR(*tensor_1)->dimensions; - uint32_t * tensor_2_dimensions = QNN_VER_PTR(*tensor_2)->dimensions; if (!graph_initialized) { graph_name = graph_name + "_" + std::to_string(ctx->threads) + @@ -2190,9 +2181,6 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src if (ctx->device == QNN_BACKEND_NPU) { QNN_VER_PTR(*tensor_1)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; QNN_VER_PTR(*tensor_1)->clientBuf= {.data=nullptr, .dataSize=0}; - - QNN_VER_PTR(*tensor_2)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; - QNN_VER_PTR(*tensor_2)->clientBuf= {.data=nullptr, .dataSize=0}; } ggml_qnn_tensor_writer tensor_writer0(src0, graph_handle, ctx); @@ -2204,27 +2192,20 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src QNN_LOG_INFO("error = %d\n", error); goto failure; } - error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2); - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); + ggml_qnn_tensor_reader tensor_reader(dst, graph_handle, ctx); + if (!tensor_writer0.is_valid()) { goto failure; } QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1); QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; - QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; - QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst); - QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; if (ctx->device != QNN_BACKEND_NPU) { QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, qnn_get_ggml_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, - qnn_get_ggml_tensor_data_size(dst)}; } else { uint8_t * qnn_buffer_1 = nullptr; - uint8_t * qnn_buffer_2 = nullptr; qnn_instance * instance = ctx->instance; qnn_buffer_1 = static_cast(instance->alloc_rpcmem( @@ -2237,20 +2218,10 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src } instance->register_rpcmem(qnn_buffer_1, tensor_1); memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); - - qnn_buffer_2 = static_cast(instance->alloc_rpcmem( - ggml_nbytes(dst), 4)); - if (nullptr == qnn_buffer_2) { - QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); - goto failure; - } else { - QNN_LOG_INFO("alloc rpcmem successfully\n"); - } - instance->register_rpcmem(qnn_buffer_2, tensor_2); } Qnn_Tensor_t tensor_inputs[] = {*tensor_writer0.get_qnn_tensor(), *tensor_1}; - Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; + Qnn_Tensor_t tensor_outputs[] = {*tensor_reader.get_qnn_tensor()}; Qnn_OpConfig_t op_config = { (Qnn_OpConfigVersion_t) 1, .v1 = {"ggml_op_add", @@ -2285,38 +2256,25 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src goto failure; } - if (ctx->device == QNN_BACKEND_NPU) { - uint8_t * qnn_buffer_2 = static_cast(ctx->instance->get_rpcmem_from_memhandle( - QNN_VER_PTR(*tensor_2)->memHandle)); - memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); - } - auto graph_item = std::make_tuple(graph_handle, tensor_writer0.get_qnn_tensor(), tensor_1, tensor_2); + auto graph_item = std::make_tuple(graph_handle, tensor_writer0.get_qnn_tensor(), tensor_1, tensor_reader.get_qnn_tensor()); 
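
With the output wrapped, the explicit "copy dst back from rpcmem" block after graphExecute can disappear: the reader performs the copy-back when it goes out of scope. Abbreviated from the destructor quoted later in this series (the "make the constant condition first" patch), so the member names are the ones used there:

    ~ggml_qnn_tensor_readwrite() {
        if ((_tensorType == QNN_TENSOR_TYPE_APP_READWRITE ||
             _tensorType == QNN_TENSOR_TYPE_APP_READ) &&
            _context && _context->device == QNN_BACKEND_NPU) {
            uint8_t * qnn_buffer = static_cast<uint8_t *>(
                _context->instance->get_rpcmem_from_memhandle(
                    QNN_VER_PTR(*_qnn_tensor)->memHandle));
            memcpy(_tensor->data, qnn_buffer, ggml_nbytes(_tensor));
        }
    }
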
instance->_qnn_graph_map[map_entry] = graph_item; } else { auto & graph_item = instance->_qnn_graph_map[map_entry]; ggml_qnn_tensor_writer tensor_writer0(src0, std::get<1>(graph_item), ctx); tensor_1 = std::get<2>(graph_item); - tensor_2 = std::get<3>(graph_item); + ggml_qnn_tensor_reader tensor_reader(dst, std::get<3>(graph_item), ctx); uint32_t dimensions_input_1[] = { (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; - uint32_t dimensions_output[] = { - (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], - (uint32_t) dst->ne[3]}; QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1); QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; - QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; - QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst); - QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; if (ctx->device != QNN_BACKEND_NPU) { QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, qnn_get_ggml_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, - qnn_get_ggml_tensor_data_size(dst)}; } else { uint8_t * qnn_buffer_1 = static_cast(ctx->instance->get_rpcmem_from_memhandle( QNN_VER_PTR(*tensor_1)->memHandle)); @@ -2325,7 +2283,7 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src } Qnn_Tensor_t tensor_inputs[] = {*tensor_writer0.get_qnn_tensor(), *tensor_1}; - Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; + Qnn_Tensor_t tensor_outputs[] = {*tensor_reader.get_qnn_tensor()}; error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs,2, tensor_outputs,1, @@ -2339,19 +2297,11 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src QNN_LOG_INFO("error = %d\n", error); goto failure; } - - if (ctx->device == QNN_BACKEND_NPU) { - uint8_t * qnn_buffer_2 = static_cast(ctx->instance->get_rpcmem_from_memhandle( - QNN_VER_PTR(*tensor_2)->memHandle)); - if (nullptr != qnn_buffer_2) - memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); - } } failure: if (QNN_SUCCESS != error) { QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1)); - QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2)); QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", src0->name, src0->type, ggml_type_name(src0->type), @@ -2370,7 +2320,6 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src } QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions; - QNN_VER_PTR(*tensor_2)->dimensions = tensor_2_dimensions; perf.info(); } @@ -2395,15 +2344,12 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, std::string graph_name = "ggml_op_qnn_mul_mat"; Qnn_GraphHandle_t graph_handle = nullptr; Qnn_Tensor_t * tensor_1 = nullptr; - Qnn_Tensor_t * tensor_2 = nullptr; Qnn_Param_t qnn_params[] = {}; enum ggml_op ggmlop = GGML_OP_MUL_MAT; Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; CHECK_PARAMS(ctx, src0, src1, dst); tensor_1 = (Qnn_Tensor_t *) src1->extra; - tensor_2 = (Qnn_Tensor_t *) dst->extra; instance = ctx->instance; QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; @@ -2411,21 +2357,15 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, perf.start(); tensor_1 = (Qnn_Tensor_t *) src1->extra; - tensor_2 = (Qnn_Tensor_t *) dst->extra; instance = ctx->instance; QNN_VER_PTR(*tensor_1)->type = 
QNN_TENSOR_TYPE_APP_WRITE; - QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); - dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); uint32_t dimensions_input_1[] = { (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; - uint32_t dimensions_output[] = { - (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], - (uint32_t) dst->ne[3]}; std::string map_entry = std::string(ggml_op_name(ggmlop)); if (instance->_qnn_graph_map.find(map_entry) != @@ -2436,7 +2376,6 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, } uint32_t * tensor_1_dimensions = QNN_VER_PTR(*tensor_1)->dimensions; - uint32_t * tensor_2_dimensions = QNN_VER_PTR(*tensor_2)->dimensions; //TODO: for scenarios of quantized data in src0 // pass-1: dequantize src0 to FP32 @@ -2500,9 +2439,6 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, if (ctx->device == QNN_BACKEND_NPU) { QNN_VER_PTR(*tensor_1)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; QNN_VER_PTR(*tensor_1)->clientBuf= {.data=nullptr, .dataSize=0}; - - QNN_VER_PTR(*tensor_2)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; - QNN_VER_PTR(*tensor_2)->clientBuf= {.data=nullptr, .dataSize=0}; } ggml_qnn_tensor_writer tensor_writer0(src0, graph_handle, ctx); @@ -2514,27 +2450,20 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, QNN_LOG_INFO("error = %d\n", error); goto failure; } - error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2); - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); + ggml_qnn_tensor_reader tensor_reader(dst, graph_handle, ctx); + if (!tensor_writer0.is_valid()) { goto failure; } QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1); QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; - QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; - QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst); - QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; if (ctx->device != QNN_BACKEND_NPU) { QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, qnn_get_ggml_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, - qnn_get_ggml_tensor_data_size(dst)}; } else { uint8_t * qnn_buffer_1 = nullptr; - uint8_t * qnn_buffer_2 = nullptr; qnn_instance * instance = ctx->instance; qnn_buffer_1 = static_cast(instance->alloc_rpcmem( @@ -2547,20 +2476,10 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, } instance->register_rpcmem(qnn_buffer_1, tensor_1); memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); - - qnn_buffer_2 = static_cast(instance->alloc_rpcmem( - ggml_nbytes(dst), 4)); - if (nullptr == qnn_buffer_2) { - QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); - goto failure; - } else { - QNN_LOG_INFO("alloc rpcmem successfully\n"); - } - instance->register_rpcmem(qnn_buffer_2, tensor_2); } Qnn_Tensor_t tensor_inputs[] = {*tensor_writer0.get_qnn_tensor(), *tensor_1}; - Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; + Qnn_Tensor_t tensor_outputs[] = {*tensor_reader.get_qnn_tensor()}; Qnn_OpConfig_t op_config = { (Qnn_OpConfigVersion_t) 1, .v1 = {"ggml_op_mul_mat", @@ -2595,38 +2514,24 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, goto failure; } - if (ctx->device == QNN_BACKEND_NPU) { - uint8_t * qnn_buffer_2 = static_cast(ctx->instance->get_rpcmem_from_memhandle( - QNN_VER_PTR(*tensor_2)->memHandle)); - memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); - 
} - - auto graph_item = std::make_tuple(graph_handle, tensor_writer0.get_qnn_tensor(), tensor_1, tensor_2); + auto graph_item = std::make_tuple(graph_handle, tensor_writer0.get_qnn_tensor(), tensor_1, tensor_reader.get_qnn_tensor()); instance->_qnn_graph_map[map_entry] = graph_item; } else { auto & graph_item= instance->_qnn_graph_map[map_entry]; ggml_qnn_tensor_writer tensor_writer0(src0, std::get<1>(graph_item), ctx); tensor_1 = std::get<2>(graph_item); - tensor_2 = std::get<3>(graph_item); + ggml_qnn_tensor_reader tensor_reader(dst, std::get<3>(graph_item), ctx); uint32_t dimensions_input_1[] = { (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; - uint32_t dimensions_output[] = { - (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], - (uint32_t) dst->ne[3]}; QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1); QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; - QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; - QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst); - QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; if (ctx->device != QNN_BACKEND_NPU) { QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, qnn_get_ggml_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, - qnn_get_ggml_tensor_data_size(dst)}; } else { uint8_t * qnn_buffer_1 = static_cast(ctx->instance->get_rpcmem_from_memhandle( QNN_VER_PTR(*tensor_1)->memHandle)); @@ -2635,7 +2540,7 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, } Qnn_Tensor_t tensor_inputs[] = {*tensor_writer0.get_qnn_tensor(), *tensor_1}; - Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; + Qnn_Tensor_t tensor_outputs[] = {*tensor_reader.get_qnn_tensor()}; error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, @@ -2649,19 +2554,11 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, QNN_LOG_INFO("error = %d\n", error); goto failure; } - - if (ctx->device == QNN_BACKEND_NPU) { - uint8_t * qnn_buffer_2 = static_cast(ctx->instance->get_rpcmem_from_memhandle( - QNN_VER_PTR(*tensor_2)->memHandle)); - if (nullptr != qnn_buffer_2) - memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); - } } failure: if (QNN_SUCCESS != error) { QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1)); - QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2)); QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", src0->name, src0->type, ggml_type_name(src0->type), @@ -2679,7 +2576,6 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, } QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions; - QNN_VER_PTR(*tensor_2)->dimensions = tensor_2_dimensions; perf.info(); } From 5fe7b87ba1b850ddf896ea3ce48acf5c892b56d0 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sun, 16 Jun 2024 23:54:00 +0800 Subject: [PATCH 022/166] use ggml_qnn_tensor_writer for all parameters --- ggml-qnn.cpp | 161 +++++++-------------------------------------------- 1 file changed, 20 insertions(+), 141 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index eda83597f53b1..c23d67bb3affc 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -1991,7 +1991,6 @@ class ggml_qnn_tensor_readwrite QNN_VER_PTR(*_qnn_tensor)->dimensions = _dimensions; QNN_VER_PTR(*_qnn_tensor)->rank = qnn_get_ggml_tensor_rank(tensor); QNN_VER_PTR(*_qnn_tensor)->dataType = qnn_data_type; - if (is_npu) { qnn_instance * instance = 
ctx->instance; @@ -2090,27 +2089,16 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src qnn_instance * instance = nullptr; std::string graph_name = "ggml_op_qnn_add"; Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Tensor_t * tensor_1 = nullptr; Qnn_Param_t qnn_params[] = {}; enum ggml_op ggmlop = GGML_OP_ADD; - Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; CHECK_PARAMS(ctx, src0, src1, dst); - tensor_1 = (Qnn_Tensor_t *) src1->extra; instance = ctx->instance; QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; qnn_perf perf("ggml_qnn_add"); perf.start(); - QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; - - src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); - - uint32_t dimensions_input_1[] = { - (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], (uint32_t) src1->ne[2], - (uint32_t) src1->ne[3]}; - std::string map_entry = std::string(ggml_op_name(ggmlop)); if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { @@ -2119,8 +2107,6 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src graph_handle = std::get<0>(graph_item); } - uint32_t * tensor_1_dimensions = QNN_VER_PTR(*tensor_1)->dimensions; - if (!graph_initialized) { graph_name = graph_name + "_" + std::to_string(ctx->threads) + "_" + src0->name + "_" + src1->name; @@ -2178,49 +2164,21 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src QNN_LOG_INFO("create qnn graph handle with graph name %s ok\n", graph_name.c_str()); } - if (ctx->device == QNN_BACKEND_NPU) { - QNN_VER_PTR(*tensor_1)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; - QNN_VER_PTR(*tensor_1)->clientBuf= {.data=nullptr, .dataSize=0}; - } - ggml_qnn_tensor_writer tensor_writer0(src0, graph_handle, ctx); if (!tensor_writer0.is_valid()) { goto failure; } - error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1); - if (QNN_SUCCESS != error) { + ggml_qnn_tensor_writer tensor_writer1(src1, graph_handle, ctx); + if (!tensor_writer1.is_valid()) { QNN_LOG_INFO("error = %d\n", error); goto failure; } ggml_qnn_tensor_reader tensor_reader(dst, graph_handle, ctx); - if (!tensor_writer0.is_valid()) { + if (!tensor_reader.is_valid()) { goto failure; } - QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1); - QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; - - if (ctx->device != QNN_BACKEND_NPU) { - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, - qnn_get_ggml_tensor_data_size(src1)}; - } else { - uint8_t * qnn_buffer_1 = nullptr; - qnn_instance * instance = ctx->instance; - - qnn_buffer_1 = static_cast(instance->alloc_rpcmem( - ggml_nbytes(src1), 4)); - if (nullptr == qnn_buffer_1) { - QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); - goto failure; - } else { - QNN_LOG_INFO("alloc rpcmem successfully\n"); - } - instance->register_rpcmem(qnn_buffer_1, tensor_1); - memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); - } - - Qnn_Tensor_t tensor_inputs[] = {*tensor_writer0.get_qnn_tensor(), *tensor_1}; + Qnn_Tensor_t tensor_inputs[] = {*tensor_writer0.get_qnn_tensor(), *tensor_writer1.get_qnn_tensor()}; Qnn_Tensor_t tensor_outputs[] = {*tensor_reader.get_qnn_tensor()}; Qnn_OpConfig_t op_config = { (Qnn_OpConfigVersion_t) 1, @@ -2256,33 +2214,18 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src goto failure; } - auto graph_item = std::make_tuple(graph_handle, tensor_writer0.get_qnn_tensor(), tensor_1, 
tensor_reader.get_qnn_tensor()); + auto graph_item = std::make_tuple(graph_handle, + tensor_writer0.get_qnn_tensor(), + tensor_writer1.get_qnn_tensor(), + tensor_reader.get_qnn_tensor()); instance->_qnn_graph_map[map_entry] = graph_item; } else { auto & graph_item = instance->_qnn_graph_map[map_entry]; ggml_qnn_tensor_writer tensor_writer0(src0, std::get<1>(graph_item), ctx); - tensor_1 = std::get<2>(graph_item); + ggml_qnn_tensor_writer tensor_writer1(src1, std::get<2>(graph_item), ctx); ggml_qnn_tensor_reader tensor_reader(dst, std::get<3>(graph_item), ctx); - uint32_t dimensions_input_1[] = { - (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], - (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; - - QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1); - QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; - - if (ctx->device != QNN_BACKEND_NPU) { - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, - qnn_get_ggml_tensor_data_size(src1)}; - } else { - uint8_t * qnn_buffer_1 = static_cast(ctx->instance->get_rpcmem_from_memhandle( - QNN_VER_PTR(*tensor_1)->memHandle)); - if (nullptr != qnn_buffer_1) - memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); - } - - Qnn_Tensor_t tensor_inputs[] = {*tensor_writer0.get_qnn_tensor(), *tensor_1}; + Qnn_Tensor_t tensor_inputs[] = {*tensor_writer0.get_qnn_tensor(), *tensor_writer1.get_qnn_tensor()}; Qnn_Tensor_t tensor_outputs[] = {*tensor_reader.get_qnn_tensor()}; error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs,2, @@ -2301,7 +2244,6 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src failure: if (QNN_SUCCESS != error) { - QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1)); QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", src0->name, src0->type, ggml_type_name(src0->type), @@ -2319,8 +2261,6 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src dst->nb[1], dst->nb[2]); } - QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions; - perf.info(); } @@ -2343,30 +2283,16 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, qnn_instance * instance = nullptr; std::string graph_name = "ggml_op_qnn_mul_mat"; Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Tensor_t * tensor_1 = nullptr; Qnn_Param_t qnn_params[] = {}; enum ggml_op ggmlop = GGML_OP_MUL_MAT; - Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; CHECK_PARAMS(ctx, src0, src1, dst); - tensor_1 = (Qnn_Tensor_t *) src1->extra; instance = ctx->instance; QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; qnn_perf perf("ggml_qnn_mul_mat"); perf.start(); - tensor_1 = (Qnn_Tensor_t *) src1->extra; - instance = ctx->instance; - - QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; - - src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); - - uint32_t dimensions_input_1[] = { - (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], (uint32_t) src1->ne[2], - (uint32_t) src1->ne[3]}; - std::string map_entry = std::string(ggml_op_name(ggmlop)); if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { @@ -2375,8 +2301,6 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, graph_handle = std::get<0>(graph_item); } - uint32_t * tensor_1_dimensions = QNN_VER_PTR(*tensor_1)->dimensions; - //TODO: for scenarios of quantized data in src0 // pass-1: dequantize src0 to FP32 // pass-2: dq-src0 * src1 @@ -2436,49 +2360,20 @@ static void 
ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, goto failure; } - if (ctx->device == QNN_BACKEND_NPU) { - QNN_VER_PTR(*tensor_1)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; - QNN_VER_PTR(*tensor_1)->clientBuf= {.data=nullptr, .dataSize=0}; - } - ggml_qnn_tensor_writer tensor_writer0(src0, graph_handle, ctx); if (!tensor_writer0.is_valid()) { goto failure; } - error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1); - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); + ggml_qnn_tensor_writer tensor_writer1(src1, graph_handle, ctx); + if (!tensor_writer1.is_valid()) { goto failure; } ggml_qnn_tensor_reader tensor_reader(dst, graph_handle, ctx); - if (!tensor_writer0.is_valid()) { + if (!tensor_reader.is_valid()) { goto failure; } - QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1); - QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; - - if (ctx->device != QNN_BACKEND_NPU) { - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, - qnn_get_ggml_tensor_data_size(src1)}; - } else { - uint8_t * qnn_buffer_1 = nullptr; - qnn_instance * instance = ctx->instance; - - qnn_buffer_1 = static_cast(instance->alloc_rpcmem( - ggml_nbytes(src1), 4)); - if (nullptr == qnn_buffer_1) { - QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); - goto failure; - } else { - QNN_LOG_INFO("alloc rpcmem successfully\n"); - } - instance->register_rpcmem(qnn_buffer_1, tensor_1); - memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); - } - - Qnn_Tensor_t tensor_inputs[] = {*tensor_writer0.get_qnn_tensor(), *tensor_1}; + Qnn_Tensor_t tensor_inputs[] = {*tensor_writer0.get_qnn_tensor(), *tensor_writer1.get_qnn_tensor()}; Qnn_Tensor_t tensor_outputs[] = {*tensor_reader.get_qnn_tensor()}; Qnn_OpConfig_t op_config = { (Qnn_OpConfigVersion_t) 1, @@ -2514,32 +2409,18 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, goto failure; } - auto graph_item = std::make_tuple(graph_handle, tensor_writer0.get_qnn_tensor(), tensor_1, tensor_reader.get_qnn_tensor()); + auto graph_item = std::make_tuple(graph_handle, + tensor_writer0.get_qnn_tensor(), + tensor_writer1.get_qnn_tensor(), + tensor_reader.get_qnn_tensor()); instance->_qnn_graph_map[map_entry] = graph_item; } else { auto & graph_item= instance->_qnn_graph_map[map_entry]; ggml_qnn_tensor_writer tensor_writer0(src0, std::get<1>(graph_item), ctx); - tensor_1 = std::get<2>(graph_item); + ggml_qnn_tensor_writer tensor_writer1(src1, std::get<2>(graph_item), ctx); ggml_qnn_tensor_reader tensor_reader(dst, std::get<3>(graph_item), ctx); - uint32_t dimensions_input_1[] = { - (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], - (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; - QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1); - QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; - - if (ctx->device != QNN_BACKEND_NPU) { - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, - qnn_get_ggml_tensor_data_size(src1)}; - } else { - uint8_t * qnn_buffer_1 = static_cast(ctx->instance->get_rpcmem_from_memhandle( - QNN_VER_PTR(*tensor_1)->memHandle)); - if (nullptr != qnn_buffer_1) - memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); - } - - Qnn_Tensor_t tensor_inputs[] = {*tensor_writer0.get_qnn_tensor(), *tensor_1}; + Qnn_Tensor_t tensor_inputs[] = {*tensor_writer0.get_qnn_tensor(), *tensor_writer1.get_qnn_tensor()}; Qnn_Tensor_t tensor_outputs[] = {*tensor_reader.get_qnn_tensor()}; error = 
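
Once src0, src1 and dst all go through wrappers, the first-run body of ggml_qnn_add and ggml_qnn_mul_mat collapses to the same shape. Distilled from the hunks around this point (op_config construction, graphAddNode and graphFinalize omitted), so a summary rather than a verbatim quote:

    ggml_qnn_tensor_writer tensor_writer0(src0, graph_handle, ctx);
    ggml_qnn_tensor_writer tensor_writer1(src1, graph_handle, ctx);
    ggml_qnn_tensor_reader tensor_reader (dst,  graph_handle, ctx);
    if (!tensor_writer0.is_valid() || !tensor_writer1.is_valid() || !tensor_reader.is_valid()) {
        goto failure;
    }

    Qnn_Tensor_t tensor_inputs[]  = {*tensor_writer0.get_qnn_tensor(), *tensor_writer1.get_qnn_tensor()};
    Qnn_Tensor_t tensor_outputs[] = {*tensor_reader.get_qnn_tensor()};
    // ... build op_config, graphAddNode, graphFinalize ...
    error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2,
                                           tensor_outputs, 1, nullptr, nullptr);
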
qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, @@ -2558,7 +2439,6 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, failure: if (QNN_SUCCESS != error) { - QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1)); QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", src0->name, src0->type, ggml_type_name(src0->type), @@ -2575,7 +2455,6 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); } - QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions; perf.info(); } From 9456bba1210a6ec95f96adf92ff8b263d7786253 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Mon, 17 Jun 2024 18:44:19 +0800 Subject: [PATCH 023/166] rename --- ggml-qnn.cpp | 68 ++++++++++++++++++++++++++-------------------------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index c23d67bb3affc..b97b202453fa0 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -2077,8 +2077,8 @@ class ggml_qnn_tensor_readwrite void operator=(ggml_qnn_tensor_readwrite&&) = delete; }; -using ggml_qnn_tensor_reader = ggml_qnn_tensor_readwrite; -using ggml_qnn_tensor_writer = ggml_qnn_tensor_readwrite; +using ggml_qnn_tensor_output = ggml_qnn_tensor_readwrite; +using ggml_qnn_tensor_input = ggml_qnn_tensor_readwrite; //TODO: this function can be removed later because there are duplicated codes with ggml_qnn_mul_mat // keep it for illustrate how to implement a specified GGMPL OP using QNN API + QNN RPC @@ -2164,22 +2164,22 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src QNN_LOG_INFO("create qnn graph handle with graph name %s ok\n", graph_name.c_str()); } - ggml_qnn_tensor_writer tensor_writer0(src0, graph_handle, ctx); - if (!tensor_writer0.is_valid()) { + ggml_qnn_tensor_input tensor_input0(src0, graph_handle, ctx); + if (!tensor_input0.is_valid()) { goto failure; } - ggml_qnn_tensor_writer tensor_writer1(src1, graph_handle, ctx); - if (!tensor_writer1.is_valid()) { + ggml_qnn_tensor_input tensor_input1(src1, graph_handle, ctx); + if (!tensor_input1.is_valid()) { QNN_LOG_INFO("error = %d\n", error); goto failure; } - ggml_qnn_tensor_reader tensor_reader(dst, graph_handle, ctx); - if (!tensor_reader.is_valid()) { + ggml_qnn_tensor_output tensor_output(dst, graph_handle, ctx); + if (!tensor_output.is_valid()) { goto failure; } - Qnn_Tensor_t tensor_inputs[] = {*tensor_writer0.get_qnn_tensor(), *tensor_writer1.get_qnn_tensor()}; - Qnn_Tensor_t tensor_outputs[] = {*tensor_reader.get_qnn_tensor()}; + Qnn_Tensor_t tensor_inputs[] = {*tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor()}; + Qnn_Tensor_t tensor_outputs[] = {*tensor_output.get_qnn_tensor()}; Qnn_OpConfig_t op_config = { (Qnn_OpConfigVersion_t) 1, .v1 = {"ggml_op_add", @@ -2215,18 +2215,18 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src } auto graph_item = std::make_tuple(graph_handle, - tensor_writer0.get_qnn_tensor(), - tensor_writer1.get_qnn_tensor(), - tensor_reader.get_qnn_tensor()); + tensor_input0.get_qnn_tensor(), + tensor_input1.get_qnn_tensor(), + tensor_output.get_qnn_tensor()); instance->_qnn_graph_map[map_entry] = graph_item; } else { auto & graph_item = instance->_qnn_graph_map[map_entry]; - ggml_qnn_tensor_writer tensor_writer0(src0, std::get<1>(graph_item), ctx); - ggml_qnn_tensor_writer tensor_writer1(src1, std::get<2>(graph_item), ctx); - ggml_qnn_tensor_reader tensor_reader(dst, 
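
The rename commit above only changes names: writer becomes input, reader becomes output, both still aliases of the same ggml_qnn_tensor_readwrite template. The template arguments of those aliases have been lost from this copy of the patch; a plausible reconstruction, assuming the parameter is the Qnn_TensorType_t that selects APP_WRITE versus APP_READ behaviour, would be:

    template <Qnn_TensorType_t _tensorType>
    class ggml_qnn_tensor_readwrite;  // defined earlier in ggml-qnn.cpp

    using ggml_qnn_tensor_output = ggml_qnn_tensor_readwrite<QNN_TENSOR_TYPE_APP_READ>;
    using ggml_qnn_tensor_input  = ggml_qnn_tensor_readwrite<QNN_TENSOR_TYPE_APP_WRITE>;
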
std::get<3>(graph_item), ctx); + ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx); + ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx); + ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx); - Qnn_Tensor_t tensor_inputs[] = {*tensor_writer0.get_qnn_tensor(), *tensor_writer1.get_qnn_tensor()}; - Qnn_Tensor_t tensor_outputs[] = {*tensor_reader.get_qnn_tensor()}; + Qnn_Tensor_t tensor_inputs[] = {*tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor()}; + Qnn_Tensor_t tensor_outputs[] = {*tensor_output.get_qnn_tensor()}; error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs,2, tensor_outputs,1, @@ -2360,21 +2360,21 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, goto failure; } - ggml_qnn_tensor_writer tensor_writer0(src0, graph_handle, ctx); - if (!tensor_writer0.is_valid()) { + ggml_qnn_tensor_input tensor_input0(src0, graph_handle, ctx); + if (!tensor_input0.is_valid()) { goto failure; } - ggml_qnn_tensor_writer tensor_writer1(src1, graph_handle, ctx); - if (!tensor_writer1.is_valid()) { + ggml_qnn_tensor_input tensor_input1(src1, graph_handle, ctx); + if (!tensor_input1.is_valid()) { goto failure; } - ggml_qnn_tensor_reader tensor_reader(dst, graph_handle, ctx); - if (!tensor_reader.is_valid()) { + ggml_qnn_tensor_output tensor_output(dst, graph_handle, ctx); + if (!tensor_output.is_valid()) { goto failure; } - Qnn_Tensor_t tensor_inputs[] = {*tensor_writer0.get_qnn_tensor(), *tensor_writer1.get_qnn_tensor()}; - Qnn_Tensor_t tensor_outputs[] = {*tensor_reader.get_qnn_tensor()}; + Qnn_Tensor_t tensor_inputs[] = {*tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor()}; + Qnn_Tensor_t tensor_outputs[] = {*tensor_output.get_qnn_tensor()}; Qnn_OpConfig_t op_config = { (Qnn_OpConfigVersion_t) 1, .v1 = {"ggml_op_mul_mat", @@ -2410,18 +2410,18 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, } auto graph_item = std::make_tuple(graph_handle, - tensor_writer0.get_qnn_tensor(), - tensor_writer1.get_qnn_tensor(), - tensor_reader.get_qnn_tensor()); + tensor_input0.get_qnn_tensor(), + tensor_input1.get_qnn_tensor(), + tensor_output.get_qnn_tensor()); instance->_qnn_graph_map[map_entry] = graph_item; } else { auto & graph_item= instance->_qnn_graph_map[map_entry]; - ggml_qnn_tensor_writer tensor_writer0(src0, std::get<1>(graph_item), ctx); - ggml_qnn_tensor_writer tensor_writer1(src1, std::get<2>(graph_item), ctx); - ggml_qnn_tensor_reader tensor_reader(dst, std::get<3>(graph_item), ctx); + ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx); + ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx); + ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx); - Qnn_Tensor_t tensor_inputs[] = {*tensor_writer0.get_qnn_tensor(), *tensor_writer1.get_qnn_tensor()}; - Qnn_Tensor_t tensor_outputs[] = {*tensor_reader.get_qnn_tensor()}; + Qnn_Tensor_t tensor_inputs[] = {*tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor()}; + Qnn_Tensor_t tensor_outputs[] = {*tensor_output.get_qnn_tensor()}; error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, From 65a14d9e9a6977edd154e844b90108ef0d0725f0 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Tue, 18 Jun 2024 23:07:01 +0800 Subject: [PATCH 024/166] fix todo --- ggml-qnn.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index b97b202453fa0..47810c933ab75 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp 
@@ -1995,7 +1995,7 @@ class ggml_qnn_tensor_readwrite if (is_npu) { qnn_instance * instance = ctx->instance; uint8_t *qnn_buffer = static_cast(instance->alloc_rpcmem( - ggml_nbytes(tensor), 4)); // TODO: should we get the align param from device here? + ggml_nbytes(tensor), alignof(void*))); if (!qnn_buffer) { QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, QNN_TENSOR_GET_NAME(*_qnn_tensor)); From aeef0c68f498001495a18e251c16a4a3fcad2e88 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 19 Jun 2024 10:29:53 +0800 Subject: [PATCH 025/166] make the constant condition first --- ggml-qnn.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index 47810c933ab75..5b4d665dcecba 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -2051,8 +2051,8 @@ class ggml_qnn_tensor_readwrite } ~ggml_qnn_tensor_readwrite() { - if (_context && _context->device == QNN_BACKEND_NPU && - (_tensorType == QNN_TENSOR_TYPE_APP_READWRITE || _tensorType == QNN_TENSOR_TYPE_APP_READ)) { + if ((_tensorType == QNN_TENSOR_TYPE_APP_READWRITE || _tensorType == QNN_TENSOR_TYPE_APP_READ) && + _context && _context->device == QNN_BACKEND_NPU) { uint8_t * qnn_buffer = static_cast(_context->instance->get_rpcmem_from_memhandle( QNN_VER_PTR(*_qnn_tensor)->memHandle)); memcpy(_tensor->data, qnn_buffer, ggml_nbytes(_tensor)); From dfe159ffffcc82484f00d701a2076859ac0f88be Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 19 Jun 2024 10:58:12 +0800 Subject: [PATCH 026/166] remove TODO --- ggml-qnn.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index 5b4d665dcecba..f40efd72915df 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -2000,7 +2000,7 @@ class ggml_qnn_tensor_readwrite QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, QNN_TENSOR_GET_NAME(*_qnn_tensor)); _context = nullptr; - // TODO: should we free the tensor here? + // No free for _qnn_tensor, because it's not registered. 
return; } else { QNN_LOG_INFO("alloc rpcmem successfully\n"); From 99320620b07a344c701e0c574922ed061c4257c9 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 19 Jun 2024 12:25:32 +0800 Subject: [PATCH 027/166] split logger function, tensors and backend from main qnn source --- ggml-qnn.cpp | 1511 +-------------------------------- ggml-qnn/backend.hpp | 24 + ggml-qnn/logger.cpp | 78 ++ ggml-qnn/logger.hpp | 49 ++ ggml-qnn/qnn-types.hpp | 46 + ggml-qnn/qnn.hpp | 1139 +++++++++++++++++++++++++ ggml-qnn/tensor.hpp | 145 ++++ ggml-qnn/utils.hpp | 99 +++ tests/ggml-qnn/CMakeLists.txt | 1 + 9 files changed, 1606 insertions(+), 1486 deletions(-) create mode 100644 ggml-qnn/backend.hpp create mode 100644 ggml-qnn/logger.cpp create mode 100644 ggml-qnn/logger.hpp create mode 100644 ggml-qnn/qnn-types.hpp create mode 100644 ggml-qnn/qnn.hpp create mode 100644 ggml-qnn/tensor.hpp create mode 100644 ggml-qnn/utils.hpp diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index f40efd72915df..a552fd5ec935e 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -34,38 +34,20 @@ #include #include -#if (defined __ANDROID__) || (defined ANDROID) -#include -#endif - #include "ggml-qnn.h" #include "ggml-backend-impl.h" -// header file of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK -// https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct -#include "QnnTypes.h" -#include "QnnCommon.h" -#include "QnnContext.h" -#include "QnnBackend.h" -#include "QnnGraph.h" -#include "QnnProperty.h" -#include "QnnTensor.h" -#include "QnnInterface.h" -#include "Saver/QnnSaver.h" -#include "System/QnnSystemInterface.h" -#include "HTP/QnnHtpDevice.h" -#include "HTP/QnnHtpGraph.h" +#include "ggml-qnn/logger.hpp" +#include "ggml-qnn/utils.hpp" +#include "ggml-qnn/backend.hpp" +#include "ggml-qnn/tensor.hpp" // ================================================================================================= // // forward declaration // // ================================================================================================= -class qnn_instance; - -struct ggml_backend_qnn_context; - static int free_qnn_tensor(Qnn_Tensor_t & tensor); // ================================================================================================= @@ -74,16 +56,11 @@ static int free_qnn_tensor(Qnn_Tensor_t & tensor); // // ================================================================================================= #ifdef NDEBUG -#define ENABLE_QNNBACKEND_DEBUG 0 // for troubleshooting QNN backend -#define ENABLE_QNNSDK_LOG 0 // enable/disable QNN SDK's internal log #define ENABLE_QNNBACKEND_PERF 0 // enable/disable op's perf info #else -#define ENABLE_QNNBACKEND_DEBUG 1 // for troubleshooting QNN backend -#define ENABLE_QNNSDK_LOG 1 // enable/disable QNN SDK's internal log #define ENABLE_QNNBACKEND_PERF 1 // enable/disable op's perf info #endif -#define QNN_LOGBUF_LEN 4096 #define QNN_BACKEND_NAME "qnn" typedef void (*ggml_qnn_func_t)(ggml_backend_qnn_context * ctx, @@ -91,29 +68,7 @@ typedef void (*ggml_qnn_func_t)(ggml_backend_qnn_context * ctx, const ggml_tensor * src1, ggml_tensor * dst); -enum qcom_htp_arch { - NONE = 0, - V68 = 68, - V69 = 69, - V73 = 73, - V75 = 75, -}; - -enum qcom_chipset { - UNKNOWN_SM = 0, - SM8450 = 36, // v69 - SM8475 = 42, // v69 - SM8550 = 43, // v73 - SM8650 = 57, // v75 -}; - -struct qcom_socinfo { - uint32_t soc_model; - size_t htp_arch; - size_t vtcm_size_in_mb; -}; - -static struct qcom_socinfo g_qnn_soc_info_table[] = { +static struct qnn::qcom_socinfo 
g_qnn_soc_info_table[] = { /* Qualcomm SnapDragon 8 Gen 1 */ [SM8450] = { .soc_model = SM8450, @@ -140,18 +95,6 @@ static struct qcom_socinfo g_qnn_soc_info_table[] = { }; -struct ggml_backend_qnn_context { - int device; - int threads; - char name[GGML_MAX_NAME]; - char lib[GGML_MAX_NAME]; - qnn_instance * instance; - struct ggml_backend * backend; - QNN_INTERFACE_VER_TYPE raw_interface; - QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; - struct qcom_socinfo socinfo; -}; - // according to the QNN SDK Reference Guide, // CPU - Choose a non-quantized model.Quantized models are currently incompatible with the CPU backend // GPU - Choose a non-quantized model.Quantized models are currently incompatible with the GPU backend @@ -235,65 +178,11 @@ struct ggml_backend_qnn_buffer_type_context { std::string name; }; -// ================================================================================================= -// -// QNN backend internal log function -// -// ================================================================================================= -static void qnn_internal_log(ggml_log_level level, const char * file, - const char * func, int line, - const char * format, ...); -#define QNN_LOG_ERROR(...) \ - qnn_internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) - -#define QNN_LOG_WARN(...) \ - qnn_internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) - -#define QNN_LOG_INFO(...) \ - qnn_internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) - -#if ENABLE_QNNBACKEND_DEBUG -#define QNN_LOG_DEBUG(...) \ - qnn_internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#else -#define QNN_LOG_DEBUG(...) -#endif - // ================================================================================================= // // QNN backend internal helper functions // // ================================================================================================= -static uint32_t qnn_get_ggml_tensor_rank(const ggml_tensor * tensor) { - uint32_t rank = 0; - for (int i = 0; i < GGML_MAX_DIMS; i++) { - if ((0 != tensor->ne[i]) && (1 != tensor->ne[i])) { - rank++; - } - } - return rank; -} - -// TODO: mapping more ggml data type to QNN data type -// ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684 -static Qnn_DataType_t qnn_datatype_from_ggml_datatype(enum ggml_type ggmltype) { - switch (ggmltype) { - case GGML_TYPE_F16: - return QNN_DATATYPE_FLOAT_16; - case GGML_TYPE_F32: - return QNN_DATATYPE_FLOAT_32; - case GGML_TYPE_I8: - return QNN_DATATYPE_INT_8; - case GGML_TYPE_Q8_0: - return QNN_DATATYPE_SFIXED_POINT_8; - case GGML_TYPE_Q4_0: - return QNN_DATATYPE_SFIXED_POINT_4; - default: - break; - } - return QNN_DATATYPE_UNDEFINED; -} - // TODO: only support GGML_OP_ADD/GGML_OP_MUL/GGML_OP_MUL_MAT static const char * qnn_opname_from_ggmlop(enum ggml_op ggmlop) { switch (ggmlop) { @@ -322,79 +211,6 @@ static uint32_t qnn_get_ggml_tensor_data_size(const ggml_tensor * tensor) { return ggml_nbytes(tensor); } -static const char * qnn_get_backend_name(int n_backend_type) { - switch (n_backend_type) { - case QNN_BACKEND_CPU: - return "QNN-CPU"; - case QNN_BACKEND_GPU: - return "QNN-GPU"; - case QNN_BACKEND_NPU: - return "QNN-NPU"; - case QNN_BACKEND_GGML: - return "ggml"; //"fake" QNN backend, used for compare performance between QNN backend and original GGML - default: - return "unknown"; - } -} - -static const char * qnn_get_chipset_desc(uint32_t chipset_id) { - 
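
The final commit in this excerpt only moves code. The new headers themselves are not shown here, so the mapping below is inferred from the new include lines and the file list, and the backend context struct is simply the one deleted above (with the socinfo type namespace-qualified to match the updated table declaration):

    // Plausible split of the pieces removed from ggml-qnn.cpp:
    //   ggml-qnn/logger.hpp, logger.cpp : QNN_LOG_* macros, internal logger, QNN SDK log callback
    //   ggml-qnn/qnn-types.hpp          : qcom_chipset / qcom_htp_arch / qcom_socinfo, rpcmem typedefs
    //   ggml-qnn/utils.hpp              : small helpers (tensor rank, ggml->QNN datatype mapping, name lookups)
    //   ggml-qnn/qnn.hpp                : qnn_interface shims and qnn_instance
    //   ggml-qnn/tensor.hpp             : ggml_qnn_tensor_readwrite and its input/output aliases
    //   ggml-qnn/backend.hpp            : the backend context, roughly:
    struct ggml_backend_qnn_context {
        int device;
        int threads;
        char name[GGML_MAX_NAME];
        char lib[GGML_MAX_NAME];
        qnn_instance * instance;
        struct ggml_backend * backend;
        QNN_INTERFACE_VER_TYPE raw_interface;
        QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface;
        qnn::qcom_socinfo socinfo;
    };
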
switch (chipset_id) { - case SM8450: - return "SM8450"; - case SM8475: - return "SM8475"; - case SM8550: - return "SM8550"; - case SM8650: - return "SM8650"; - default: - return "unknown"; - } -} - -static const char * qnn_get_htparch_desc(size_t htp_arch) { - switch (htp_arch) { - case V68: - return "QCOM_HTP_V68"; - case V69: - return "QCOM_HTP_V69"; - case V73: - return "QCOM_HTP_V73"; - case V75: - return "QCOM_HTP_V75"; - default: - return "unknown"; - } -} - -static void qnn_internal_log(ggml_log_level level, const char * file, - const char * func, int line, - const char * format, ...) { - static std::mutex qnn_internal_log_mutex; - static char s_qnn_internal_log_buf[QNN_LOGBUF_LEN]; - - { - std::lock_guard lock(qnn_internal_log_mutex); - va_list args; - - va_start(args, format); - int len_prefix = - snprintf(s_qnn_internal_log_buf, QNN_LOGBUF_LEN, - "[%s, %d]: ", func, line); - int len = vsnprintf(s_qnn_internal_log_buf + len_prefix, - QNN_LOGBUF_LEN - len_prefix, format, args); - if (len < (QNN_LOGBUF_LEN - len_prefix)) { -#if (defined __ANDROID__) || (defined ANDROID) - // for Android APK - __android_log_print(level, "ggml-qnn", "%s\n", s_qnn_internal_log_buf); -#endif - // for Android command line application or WoA(Windows on ARM) - printf("%s\n", s_qnn_internal_log_buf); - } - va_end(args); - } -} - static bool qnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { if ((nullptr == ctx) || (nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { @@ -467,29 +283,10 @@ class qnn_perf { }; #endif -// ================================================================================================= -// -// helper data type / data structure / macros / functions of -// Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK -// ref:https://github.com/pytorch/executorch/tree/main/backends/qualcomm -// ================================================================================================= -enum qnn_sdk_profile_level { - profile_off = 0, - profile_basic = 1, - profile_detail = 2 -}; - -using pfn_rpc_mem_init = void (*)(void); -using pfn_rpc_mem_deinit = void (*)(void); -using pfn_rpc_mem_alloc = void *(*) (int, uint32_t, int); -using pfn_rpc_mem_free = void (*)(void *); -using pfn_rpc_mem_to_fd = int (*)(void *); - using pfn_qnnsaver_initialize = decltype(QnnSaver_initialize); using pfn_qnninterface_getproviders = decltype(QnnInterface_getProviders); using pfn_qnnsysteminterface_getproviders = decltype(QnnSystemInterface_getProviders); -#define QNN_VER_PTR(x) (&((x).v1)) #define RPCMEM_DEFAULT_FLAGS 1 #define RPCMEM_HEAP_ID_SYSTEM 25 @@ -762,1144 +559,6 @@ static int free_qnn_tensor(Qnn_Tensor_t & tensor) { return err; } -template Fn load_qnn_functionpointers(void * handle, const char * function_name) { - return reinterpret_cast(dlsym(handle, function_name)); -} - -static intptr_t align_to(size_t alignment, intptr_t offset) { - return offset % alignment == 0 - ? 
offset - : offset + (static_cast(alignment) - - offset % static_cast(alignment)); -} - -static void qnn_sdk_logcallback(const char * fmt, QnnLog_Level_t level, - uint64_t timestamp, va_list argp) { - -#if ENABLE_QNNSDK_LOG - static std::mutex log_mutex; - static unsigned char s_ggml_qnn_logbuf[QNN_LOGBUF_LEN]; - - const char * log_level_desc = ""; - switch (level) { - case QNN_LOG_LEVEL_ERROR: - log_level_desc = "ERROR"; - break; - case QNN_LOG_LEVEL_WARN: - log_level_desc = "WARNING"; - break; - case QNN_LOG_LEVEL_INFO: - log_level_desc = "INFO"; - break; - case QNN_LOG_LEVEL_DEBUG: - log_level_desc = "DEBUG"; - break; - case QNN_LOG_LEVEL_VERBOSE: - log_level_desc = "VERBOSE"; - break; - case QNN_LOG_LEVEL_MAX: - log_level_desc = "UNKNOWN"; - break; - } - - double ms = (double) timestamp / 1000000.0; - { - std::lock_guard lock(log_mutex); - - memset(s_ggml_qnn_logbuf, 0, QNN_LOGBUF_LEN); - vsnprintf(reinterpret_cast(s_ggml_qnn_logbuf), QNN_LOGBUF_LEN, fmt, argp); - QNN_LOG_INFO("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggml_qnn_logbuf); - } -#endif -} - -// ================================================================================================= -// -// wrapper class of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK -// ref:https://github.com/pytorch/executorch/tree/main/backends/qualcomm -// ================================================================================================= -class qnn_interface { - -#define DEFINE_SHIM_FUNCTION_INTERFACE(F, pointer_name) \ - template inline auto qnn_##F(Args... args) const { \ - return (_qnn_interface->QNN_INTERFACE_VER_NAME.pointer_name)( \ - std::forward(args)...); \ - } - -#define DEFINE_SHIM_FUNCTION_SYS_INTERFACE(F, pointer_name) \ - template inline auto qnn_##F(Args... 
args) const { \ - return ( \ - _qnn_sys_interface->QNN_SYSTEM_INTERFACE_VER_NAME.pointer_name)( \ - std::forward(args)...); \ - } - - friend class qnn_instance; - - public: - qnn_interface() = default; - - // QnnBackend - DEFINE_SHIM_FUNCTION_INTERFACE(backend_create, backendCreate); - - DEFINE_SHIM_FUNCTION_INTERFACE(backend_free, backendFree); - - DEFINE_SHIM_FUNCTION_INTERFACE(backend_register_op_package, - backendRegisterOpPackage); - - DEFINE_SHIM_FUNCTION_INTERFACE(backend_validate_op_config, - backendValidateOpConfig); - - DEFINE_SHIM_FUNCTION_INTERFACE(backend_get_api_version, - backendGetApiVersion); - - // QnnDevice - DEFINE_SHIM_FUNCTION_INTERFACE(device_create, deviceCreate); - - DEFINE_SHIM_FUNCTION_INTERFACE(device_free, deviceFree); - - DEFINE_SHIM_FUNCTION_INTERFACE(device_get_infrastructure, - deviceGetInfrastructure); - - DEFINE_SHIM_FUNCTION_INTERFACE(device_get_platform_info, - deviceGetPlatformInfo); - - DEFINE_SHIM_FUNCTION_INTERFACE(device_get_info, deviceGetInfo); - - // QnnContext - DEFINE_SHIM_FUNCTION_INTERFACE(context_create, contextCreate); - - DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary_size, - contextGetBinarySize); - - DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary, contextGetBinary); - - DEFINE_SHIM_FUNCTION_INTERFACE(context_create_from_binary, - contextCreateFromBinary); - - DEFINE_SHIM_FUNCTION_INTERFACE(context_free, contextFree); - - // QnnGraph - DEFINE_SHIM_FUNCTION_INTERFACE(graph_create, graphCreate); - - DEFINE_SHIM_FUNCTION_INTERFACE(graph_add_node, graphAddNode); - - DEFINE_SHIM_FUNCTION_INTERFACE(graph_finalize, graphFinalize); - - DEFINE_SHIM_FUNCTION_INTERFACE(graph_execute, graphExecute); - - DEFINE_SHIM_FUNCTION_INTERFACE(graph_retrieve, graphRetrieve); - - // QnnLog - DEFINE_SHIM_FUNCTION_INTERFACE(log_create, logCreate); - - DEFINE_SHIM_FUNCTION_INTERFACE(log_free, logFree); - - DEFINE_SHIM_FUNCTION_INTERFACE(log_set_log_level, logSetLogLevel); - - // QnnProfile - DEFINE_SHIM_FUNCTION_INTERFACE(profile_create, profileCreate); - - DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_events, profileGetEvents); - - DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_sub_events, profileGetSubEvents); - - DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_event_data, profileGetEventData); - - DEFINE_SHIM_FUNCTION_INTERFACE(profile_free, profileFree); - - // QnnMem - DEFINE_SHIM_FUNCTION_INTERFACE(mem_register, memRegister); - - DEFINE_SHIM_FUNCTION_INTERFACE(mem_de_register, memDeRegister); - - // QnnProperty - DEFINE_SHIM_FUNCTION_INTERFACE(property_has_capability, - propertyHasCapability); - - // QnnTensor - DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_context_tensor, - tensorCreateContextTensor); - - DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_graph_tensor, - tensorCreateGraphTensor); - - // QnnSystem - DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_create, - systemContextCreate); - - DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_get_binary_info, - systemContextGetBinaryInfo); - - DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_free, systemContextFree); - - void set_qnn_interface(const QnnInterface_t * qnn_interface) { - _qnn_interface = qnn_interface; - } - - void set_qnn_system_interface( - const QnnSystemInterface_t * qnn_sys_interface) { - _qnn_sys_interface = qnn_sys_interface; - } - - uint32_t get_backend_id() const { return _qnn_interface->backendId; } - - bool is_loaded() const { - return ((_qnn_sys_interface != nullptr) && (_qnn_interface != nullptr)); - } - - private: - const QnnInterface_t * _qnn_interface = nullptr; - - const 
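
The shim class being removed here wraps every QNN C entry point in a uniform member function via DEFINE_SHIM_FUNCTION_INTERFACE / DEFINE_SHIM_FUNCTION_SYS_INTERFACE. The macro's template header has lost its angle-bracket text in this copy of the patch, so the expansion below is a reconstruction rather than a quote; it shows roughly what one entry from the list above, DEFINE_SHIM_FUNCTION_INTERFACE(graph_execute, graphExecute), resolves to:

    template <typename... Args>
    inline auto qnn_graph_execute(Args... args) const {
        return (_qnn_interface->QNN_INTERFACE_VER_NAME.graphExecute)(
            std::forward<Args>(args)...);
    }
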
QnnSystemInterface_t * _qnn_sys_interface = nullptr; -}; - -class qnn_instance { - public: - using BackendIdType = decltype(QnnInterface_t{}.backendId); - - explicit qnn_instance(const std::string & lib_path, - const std::string & backend_name, - const std::string & model_name) - : _lib_path(std::move(lib_path)) - , _backend_name(std::move(backend_name)) - , _model_name(std::move(model_name)){}; - - ~qnn_instance() {} - - int qnn_init(const QnnSaver_Config_t ** saver_config) { - BackendIdType backend_id = QNN_BACKEND_ID_NULL; - QNN_LOG_DEBUG("enter qni_init\n"); - - std::lock_guard lock(_init_mutex); - - if (0 != load_system()) { - QNN_LOG_WARN("can not load QNN system lib, pls check why?\n"); - return 1; - } else { - QNN_LOG_DEBUG("load QNN system lib successfully\n"); - } - - std::string backend_lib_path = _lib_path + _backend_name; - if (0 == _lib_path_to_backend_id.count(backend_lib_path)) { - int is_load_ok = load_backend(backend_lib_path, saver_config); - if (0 != is_load_ok) { - QNN_LOG_WARN("failed to load QNN backend\n"); - return 2; - } - } - - backend_id = _lib_path_to_backend_id[backend_lib_path]; - if (0 == _loaded_backend.count(backend_id) || - 0 == _loaded_lib_handle.count(backend_id)) { - QNN_LOG_WARN("library %s is loaded but loaded backend count=%zu, " - "loaded lib_handle count=%zu\n", - backend_lib_path.c_str(), _loaded_backend.count(backend_id), - _loaded_lib_handle.count(backend_id)); - return 3; - } - - _qnn_interface.set_qnn_interface(_loaded_backend[backend_id]); - - _qnn_interface.qnn_log_create(qnn_sdk_logcallback, _qnn_log_level, &_qnn_log_handle); - if (nullptr == _qnn_log_handle) { - // NPU backend not work on Qualcomm SoC equipped low-end phone - QNN_LOG_WARN("why failed to initialize qnn log\n"); - return 4; - } else { - QNN_LOG_DEBUG("initialize qnn log successfully\n"); - } - - std::vector temp_backend_config; - _qnn_interface.qnn_backend_create( - _qnn_log_handle, - temp_backend_config.empty() ? nullptr : temp_backend_config.data(), - &_qnn_backend_handle); - if (nullptr == _qnn_backend_handle) { - QNN_LOG_WARN("why failed to initialize qnn backend\n"); - return 5; - } else { - QNN_LOG_DEBUG("initialize qnn backend successfully\n"); - } - - if (nullptr != _qnn_raw_interface.propertyHasCapability) { - Qnn_ErrorHandle_t qnn_status = - _qnn_raw_interface.propertyHasCapability(QNN_PROPERTY_GROUP_DEVICE); - if (QNN_PROPERTY_NOT_SUPPORTED == qnn_status) { - QNN_LOG_WARN("device property is not supported\n"); - } - if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnn_status) { - QNN_LOG_WARN("device property is not known to backend\n"); - } - } - - Qnn_ErrorHandle_t qnn_status = QNN_SUCCESS; - if (_backend_name.find("Htp") != std::variant_npos) { - const QnnDevice_PlatformInfo_t * p_info = nullptr; - _qnn_raw_interface.deviceGetPlatformInfo(nullptr, &p_info); - QNN_LOG_INFO("device counts %d", p_info->v1.numHwDevices); - QnnDevice_HardwareDeviceInfo_t * infos = p_info->v1.hwDevices; - QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = { }; - for (int i = 0; i < p_info->v1.numHwDevices; i++) { - QNN_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d", infos[i].v1.deviceId, - infos[i].v1.deviceType, infos[i].v1.numCores); - QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; - chipinfo = devinfo->onChipDevice; - QnnHtpDevice_Arch_t htp_arch = chipinfo.arch; - QNN_LOG_INFO("htp_type:%d(%s)", devinfo->devType, (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? 
"ON_CHIP" : ""); - QNN_LOG_INFO("qualcomm soc_model:%d(%s), htp_arch:%d(%s), vtcm_size:%d MB", \ - chipinfo.socModel, qnn_get_chipset_desc(chipinfo.socModel), \ - htp_arch, qnn_get_htparch_desc(htp_arch), chipinfo.vtcmSize); - g_qnn_mgr[QNN_BACKEND_NPU].socinfo = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize }; - } - _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info); - - QnnHtpDevice_CustomConfig_t soc_customconfig; - soc_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_SOC; - soc_customconfig.socModel = chipinfo.socModel; - QnnDevice_Config_t soc_devconfig; - soc_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; - soc_devconfig.customConfig = &soc_customconfig; - - QnnHtpDevice_CustomConfig_t arch_customconfig; - arch_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_ARCH; - arch_customconfig.arch.arch = chipinfo.arch; - arch_customconfig.arch.deviceId = 0; // Id of device to be used. If single device is used by default 0. - QnnDevice_Config_t arch_devconfig; - arch_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; - arch_devconfig.customConfig = &arch_customconfig; - - const QnnDevice_Config_t * p_deviceconfig[] = {&soc_devconfig, &arch_devconfig, nullptr}; - qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, p_deviceconfig, &_qnn_device_handle); - } else { - qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, nullptr, &_qnn_device_handle); - } - if (QNN_SUCCESS != qnn_status && - QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnn_status) { - QNN_LOG_WARN("failed to create QNN device\n"); - } else { - QNN_LOG_INFO("create QNN device successfully\n"); - } - - if (qnn_sdk_profile_level::profile_off != _profile_level) { - QNN_LOG_INFO("profiling turned on; level = %d", _profile_level); - if (qnn_sdk_profile_level::profile_basic == _profile_level) { - QNN_LOG_INFO("basic profiling requested. creating Qnn Profile object\n"); - if (QNN_PROFILE_NO_ERROR != - _qnn_raw_interface.profileCreate(_qnn_backend_handle, - QNN_PROFILE_LEVEL_BASIC, - &_qnn_profile_handle)) { - QNN_LOG_WARN("unable to create profile handle in the backend\n"); - return 6; - } else { - QNN_LOG_DEBUG("initialize qnn profile successfully\n"); - } - } else if (qnn_sdk_profile_level::profile_detail == _profile_level) { - QNN_LOG_INFO("detailed profiling requested. Creating Qnn Profile object\n"); - if (QNN_PROFILE_NO_ERROR != - _qnn_raw_interface.profileCreate(_qnn_backend_handle, - QNN_PROFILE_LEVEL_DETAILED, - &_qnn_profile_handle)) { - QNN_LOG_WARN("unable to create profile handle in the backend\n"); - return 7; - } else { - QNN_LOG_DEBUG("initialize qnn profile successfully\n"); - } - } - } - - _rpc_lib_handle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL); - if (nullptr == _rpc_lib_handle) { - QNN_LOG_WARN("failed to load qualcomm's rpc lib, error:%s\n", dlerror()); - return 8; - } else { - QNN_LOG_DEBUG("load rpcmem lib successfully\n"); - set_rpcmem_initialized(true); - } - _pfn_rpc_mem_init = reinterpret_cast( - dlsym(_rpc_lib_handle, "rpcmem_init")); - _pfn_rpc_mem_deinit = reinterpret_cast( - dlsym(_rpc_lib_handle, "rpcmem_deinit")); - _pfn_rpc_mem_alloc = reinterpret_cast( - dlsym(_rpc_lib_handle, "rpcmem_alloc")); - _pfn_rpc_mem_free = reinterpret_cast( - dlsym(_rpc_lib_handle, "rpcmem_free")); - _pfn_rpc_mem_to_fd = reinterpret_cast( - dlsym(_rpc_lib_handle, "rpcmem_to_fd")); - if (nullptr == _pfn_rpc_mem_alloc || nullptr == _pfn_rpc_mem_free || - nullptr == _pfn_rpc_mem_to_fd) { - QNN_LOG_WARN("unable to access symbols in QNN RPC lib. 
dlerror(): %s", dlerror()); - dlclose(_rpc_lib_handle); - return 9; - } - - if (nullptr != _pfn_rpc_mem_init) { // make Qualcomm's SoC equipped low-end phone happy - _pfn_rpc_mem_init(); - } - - /* TODO: not used, keep it for further usage - QnnContext_Config_t qnn_context_config = QNN_CONTEXT_CONFIG_INIT; - qnn_context_config.priority = QNN_PRIORITY_DEFAULT; - const QnnContext_Config_t * context_configs[] = {&qnn_context_config, nullptr}; - */ - _qnn_interface.qnn_context_create( - _qnn_backend_handle, _qnn_device_handle, - nullptr, - &_qnn_context_handle); - if (nullptr == _qnn_context_handle) { - QNN_LOG_WARN("why failed to initialize qnn context\n"); - return 10; - } else { - QNN_LOG_DEBUG("initialize qnn context successfully\n"); - } - - if (_backend_name.find("Htp") != std::variant_npos) { - //TODO: faster approach to probe the accurate capacity of rpc ion memory - size_t candidate_size = 0; - uint8_t * rpc_buffer = nullptr; - const int size_in_mb = (1 << 20); - size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048}; - size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); - for (size_t idx = 0; idx < probe_counts; idx++) { - rpc_buffer = static_cast(alloc_rpcmem( - probe_slots[idx] * size_in_mb, 4)); - if (nullptr == rpc_buffer) { - QNN_LOG_INFO("alloc rpcmem %d (MB) failure, %s\n", - probe_slots[idx], strerror(errno)); - break; - } else { - candidate_size = probe_slots[idx]; - free_rpcmem(rpc_buffer); - rpc_buffer = nullptr; - } - } - if (candidate_size > _rpcmem_capacity) - _rpcmem_capacity = candidate_size; - QNN_LOG_INFO("capacity of QNN rpc ion memory is about %d MB\n", _rpcmem_capacity); - - if (0 != init_htp_perfinfra()) { - QNN_LOG_WARN("initialize HTP performance failure"); - } - if (0 != set_rpc_polling()) { - QNN_LOG_WARN("set RPC polling failure"); - } - if (0 != set_high_performance_mode()) { - QNN_LOG_WARN("set HTP high performance mode failure"); - } - } - - QNN_LOG_DEBUG("leave qni_init\n"); - - return 0; - } - - int qnn_finalize() { - int ret_status = 0; - Qnn_ErrorHandle_t error = QNN_SUCCESS; - - if (nullptr != _pfn_rpc_mem_deinit) // make Qualcomm's SoC equipped low-end phone happy - _pfn_rpc_mem_deinit(); - - if (dlclose(_rpc_lib_handle) != 0) { - QNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s\n", dlerror()); - } else { - QNN_LOG_DEBUG("succeed to close rpcmem lib\n"); - } - - if (_backend_name.find("Htp") != std::variant_npos) { - _qnn_htp_perfinfra->destroyPowerConfigId(_qnn_power_configid); - } - - if (nullptr != _qnn_context_handle) { - error = _qnn_interface.qnn_context_free(_qnn_context_handle, - _qnn_profile_handle); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), - QNN_GET_ERROR_CODE(error)); - } - _qnn_context_handle = nullptr; - } - - if (nullptr != _qnn_profile_handle) { - error = _qnn_interface.qnn_profile_free(_qnn_profile_handle); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), - QNN_GET_ERROR_CODE(error)); - } - _qnn_profile_handle = nullptr; - } - - if (nullptr != _qnn_device_handle) { - error = _qnn_interface.qnn_device_free(_qnn_device_handle); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN device_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), - QNN_GET_ERROR_CODE(error)); - } - _qnn_device_handle = nullptr; - } - - if (nullptr != _qnn_backend_handle) { - error = _qnn_interface.qnn_backend_free(_qnn_backend_handle); - if 
(error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), - QNN_GET_ERROR_CODE(error)); - } - _qnn_backend_handle = nullptr; - } - - if (nullptr != _qnn_log_handle) { - error = _qnn_interface.qnn_log_free(_qnn_log_handle); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN log_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), - QNN_GET_ERROR_CODE(error)); - } - _qnn_log_handle = nullptr; - } - - unload_backend(); - - unload_system(); - - return ret_status; - } - - //TODO:keep it for further usage of offload the entire cgraph to a single QNN DAG directly - // which was used in Qualcomm's dedicated AI technology -#if 0 - int init_qnn_graph(const char * graph_name, bool debug, - uint8_t do_node_validation = true, - const QnnGraph_Config_t ** graph_configs = nullptr) { - int result = 0; - - if (nullptr == graph_name) { - QNN_LOG_WARN("graph name is null\n"); - return 1; - } - - if (!_graph_name.empty()) { - QNN_LOG_WARN("qnn model for graph %s already initialized\n", graph_name); - return 2; - } - - if (!do_node_validation) { - QNN_LOG_WARN("node validation disabled, backend will not perform op " - "validation prior to adding node\n"); - } - - _graph_name = graph_name; - _debug_tensor = debug; - _do_node_validations = do_node_validation; - - result = _qnn_raw_interface.graphCreate(_qnn_context_handle, graph_name, - graph_configs, &_qnn_graph_handle); - if (result != QNN_GRAPH_NO_ERROR || nullptr == _qnn_graph_handle) { - QNN_LOG_WARN("failed to create graph in qnn context\n"); - return 3; - } else { - QNN_LOG_INFO("succeed to create graph %s, %p\n", graph_name, _qnn_graph_handle); - } - - return 0; - } - - int finalize_qnn_graph() { - if (nullptr != _qnn_graph_handle) { - if (_qnn_raw_interface.graphFinalize(_qnn_graph_handle, - _qnn_profile_handle, - nullptr) != QNN_GRAPH_NO_ERROR) { - QNN_LOG_WARN("finalizing graph failure\n"); - } - } else { - QNN_LOG_DEBUG("qnn graph handle is null\n"); - } - - return 0; - } -#endif - - const qnn_interface & get_qnn_interface() { - if (!_qnn_interface.is_loaded()) { - QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); - } - return _qnn_interface; - } - - const QNN_INTERFACE_VER_TYPE & get_qnn_raw_interface() { - if (!_qnn_interface.is_loaded()) { - QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); - } - return _qnn_raw_interface; - } - - const QNN_SYSTEM_INTERFACE_VER_TYPE & get_qnn_raw_system_interface() { - if (!_qnn_interface.is_loaded()) { - QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); - } - return _qnn_raw_system_interface; - } - - const Qnn_LogHandle_t get_qnn_log_handle() { return _qnn_log_handle; } - - const Qnn_ProfileHandle_t get_qnn_profile_handle() { - return _qnn_profile_handle; - } - - const Qnn_DeviceHandle_t get_qnn_device_handle() { - return _qnn_device_handle; - } - - const Qnn_BackendHandle_t get_qnn_backend_handle() { - return _qnn_backend_handle; - } - - const Qnn_ContextHandle_t get_qnn_context_handle() { - return _qnn_context_handle; - } - - const QnnSystemContext_Handle_t get_qnn_system_handle() { - return _qnn_system_handle; - } - - const Qnn_GraphHandle_t get_qnn_graph_handle() { return _qnn_graph_handle; } - - int init_htp_perfinfra() { - QnnDevice_Infrastructure_t device_infra = nullptr; - int error = _qnn_raw_interface.deviceGetInfrastructure(&device_infra); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to get qnn device infra\n"); - return 1; - } else { - QNN_LOG_INFO("HTP backend 
perf_infrastructure creation ok\n"); - } - - QnnHtpDevice_Infrastructure_t * htp_infra = static_cast(device_infra); - QnnHtpDevice_PerfInfrastructure_t * htp_perfinfra = &htp_infra->perfInfra; - uint32_t power_configid = 1; - uint32_t device_id = 0; - uint32_t core_id = 0; - htp_perfinfra->createPowerConfigId(device_id, core_id, &power_configid); - if (htp_infra->infraType != QNN_HTP_DEVICE_INFRASTRUCTURE_TYPE_PERF) { - QNN_LOG_INFO("HTP infra type = %d, which is not perf infra type", htp_infra->infraType); - } else { - QNN_LOG_INFO("HTP infra type = %d, which is perf infra type\n", htp_infra->infraType); - } - _qnn_htp_perfinfra = htp_perfinfra; - _qnn_power_configid = power_configid; - - return 0; - } - - int set_rpc_polling() { - if (_qnn_htp_perfinfra) { - QnnHtpPerfInfrastructure_PowerConfig_t rpc_polling_time; - memset(&rpc_polling_time, 0, sizeof(rpc_polling_time)); - rpc_polling_time.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; - //use rpc polling time recommended 0-10000 us - rpc_polling_time.rpcPollingTimeConfig = 9999; - - QnnHtpPerfInfrastructure_PowerConfig_t rpc_control_latency; - memset(&rpc_control_latency, 0, sizeof(rpc_control_latency)); - rpc_control_latency.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_CONTROL_LATENCY; - //use rpc control latency recommended 100 us, refer hexagon sdk - rpc_control_latency.rpcControlLatencyConfig = 100; - - const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = { - &rpc_polling_time, - &rpc_control_latency, - nullptr}; - Qnn_ErrorHandle_t qnn_status = _qnn_htp_perfinfra->setPowerConfig( - _qnn_power_configid, - power_configs); - if (qnn_status != QNN_SUCCESS) { - QNN_LOG_WARN("set htp perf failed\n"); - } else { - QNN_LOG_INFO("set htp perf ok\n"); - } - } else { - QNN_LOG_WARN("can't set htp perf\n"); - } - - return 0; - } - - int set_high_performance_mode() { - if (nullptr == _qnn_htp_perfinfra) { - QNN_LOG_WARN("perf intra is null\n"); - return 1; - } - - QnnHtpPerfInfrastructure_PowerConfig_t power_config; - memset(&power_config, 0, sizeof(power_config)); - power_config.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3; - - power_config.dcvsV3Config.setDcvsEnable = 1; - power_config.dcvsV3Config.dcvsEnable = 0; - power_config.dcvsV3Config.contextId = _qnn_power_configid; - power_config.dcvsV3Config.powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE; - power_config.dcvsV3Config.setSleepLatency = - 1; // true to consider Latency parameter otherwise false - power_config.dcvsV3Config.sleepLatency = 40; - power_config.dcvsV3Config.setBusParams = - 1; // true to consider Bus parameter otherwise false - power_config.dcvsV3Config.setCoreParams = - 1; // true to consider Core parameter otherwise false - power_config.dcvsV3Config.sleepDisable = - 1; // true to consider sleep/LPM modes, false to enable - power_config.dcvsV3Config.setSleepDisable = - 1; // true to consider sleep disable/enable parameter otherwise false set sleep latency parameter - // set Bus Clock Parameters - power_config.dcvsV3Config.busVoltageCornerMin = - DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - power_config.dcvsV3Config.busVoltageCornerTarget = - DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - power_config.dcvsV3Config.busVoltageCornerMax = - DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - // set Core Clock Parameters - power_config.dcvsV3Config.coreVoltageCornerMin = - DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - power_config.dcvsV3Config.coreVoltageCornerTarget = - 
DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - power_config.dcvsV3Config.coreVoltageCornerMax = - DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - - // set power config with different performance parameters - const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = { - &power_config, nullptr}; - Qnn_ErrorHandle_t qnn_status = QNN_SUCCESS; - qnn_status = _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs); - if (qnn_status != QNN_SUCCESS) { - QNN_LOG_WARN("set htp high performance mode failed\n"); - } else { - QNN_LOG_INFO("set htp high performance mode ok\n"); - } - - return 0; - } - - std::string & get_qnn_graph_name() { return _graph_name; } - - bool is_rpcmem_initialized() { return _rpcmem_initialized; } - - void set_rpcmem_initialized(bool initialized) { - _rpcmem_initialized = initialized; - } - - size_t get_rpcmem_capacity() { return _rpcmem_capacity; } - - bool is_rpcmem_registered(Qnn_MemHandle_t handle) { - return _qnn_mem_set.count(handle) != 0U; - } - - void * alloc_rpcmem(size_t bytes, size_t alignment) { - if (!_rpcmem_initialized) { - QNN_LOG_WARN("rpc memory not initialized\n"); - return nullptr; - } - - auto allocate_bytes = static_cast(bytes + alignment); - void * buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, - allocate_bytes); - if (buf == nullptr) { - QNN_LOG_WARN("failed to allocate rpc memory\n"); - return nullptr; - } - - auto aligned_buf = reinterpret_cast( - align_to(alignment, reinterpret_cast(buf))); - bool status = - _rpcmem_store_map.insert(std::pair(aligned_buf, buf)).second; - if (!status) { - QNN_LOG_WARN("failed to allocate rpc memory\n"); - _pfn_rpc_mem_free(buf); - } - - return aligned_buf; - } - - void free_rpcmem(void * buf) { - if (!_rpcmem_initialized) { - QNN_LOG_WARN("rpc memory not initialized\n"); - } else if (0 == _rpcmem_store_map.count(buf)) { - QNN_LOG_WARN("no allocated tensor\n"); - } else { - _pfn_rpc_mem_free(_rpcmem_store_map[buf]); - _rpcmem_store_map.erase(buf); - } - } - - int32_t rpcmem_to_fd(void * buf) { - int32_t mem_fd = -1; - if (!is_rpcmem_initialized()) { - QNN_LOG_WARN("rpc memory not initialized\n"); - } else { - mem_fd = _pfn_rpc_mem_to_fd(buf); - } - - return mem_fd; - } - - int register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor) { - if (nullptr == p_data || (nullptr == p_tensor)) { - QNN_LOG_WARN("invalid param\n"); - return 1; - } - - if (!is_rpcmem_initialized()) { - QNN_LOG_WARN("rpc memory not initialized\n"); - return 2; - } - - if (is_rpcmem_allocated(p_data)) { - QNN_LOG_WARN("rpc memory already allocated\n"); - return 3; - } - - if (is_rpcmem_registered((QNN_VER_PTR(*p_tensor)->memHandle))) { - QNN_LOG_WARN("tensor %s has been registered shared memory\n", - (QNN_VER_PTR(*p_tensor)->name)); - return 4; - } - - int32_t mem_fd = rpcmem_to_fd(p_data); - if (-1 == mem_fd) { - QNN_LOG_WARN("failed to get file descriptor\n"); - return 5; - } - QNN_LOG_INFO("mem_fd %d\n", mem_fd); - Qnn_MemDescriptor_t descriptor = {{QNN_VER_PTR(*p_tensor)->rank, - QNN_VER_PTR(*p_tensor)->dimensions, - nullptr}, - QNN_VER_PTR(*p_tensor)->dataType, - QNN_MEM_TYPE_ION, - {{mem_fd}}}; - Qnn_MemHandle_t handle = nullptr; - int error = QNN_SUCCESS; - error = _qnn_interface.qnn_mem_register(_qnn_context_handle, &descriptor, - /*numDescriptors=*/1, &handle); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to register shared memory, error %d, %s\n", - QNN_GET_ERROR_CODE(error), strerror(error)); - return 6; - } else { - QNN_LOG_INFO("tensor %s successfully register shared memory\n", - 
(QNN_VER_PTR(*p_tensor)->name)); - } - QNN_VER_PTR(*p_tensor)->memHandle = handle; - _qnn_mem_set.insert((std::pair(p_data, handle))); - - return 0; - } - - void * get_rpcmem_from_memhandle(Qnn_MemHandle_t mem_handle) { - for (std::unordered_map::iterator it = _qnn_mem_set.begin(); - it != _qnn_mem_set.end(); - it++) { - Qnn_MemHandle_t mem_handle = it->second; - if (it->second == mem_handle) { - return it->first; - } - } - QNN_LOG_WARN("can't find rpcmem from qnn mem handle %p", mem_handle); - return nullptr; - } - - void unregister_rpcmem() { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - - if (_qnn_mem_set.empty()) { - QNN_LOG_WARN("no rpcmem registered\n"); - } - - for (std::unordered_map::iterator it = _qnn_mem_set.begin(); - it != _qnn_mem_set.end(); - it++) { - Qnn_MemHandle_t mem_handle = it->second; - error = _qnn_interface.qnn_mem_de_register(&mem_handle, 1); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to unregister shared memory, error %d\n", - QNN_GET_ERROR_CODE(error)); - } - } - _qnn_mem_set.clear(); - } - - bool is_rpcmem_allocated(void * buf) { - return _qnn_mem_set.count(buf) != 0U; - } - - - public: - std::map> - _qnn_graph_map; - - private: - int load_system() { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - - std::string system_lib_path = _lib_path + "libQnnSystem.so"; - QNN_LOG_DEBUG("system_lib_path:%s\n", system_lib_path.c_str()); - - _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); - if (nullptr == _system_lib_handle) { - QNN_LOG_WARN("can not open QNN library %s, error: %s\n", - system_lib_path.c_str(), dlerror()); - return 1; - } - - auto * get_providers = - reinterpret_cast( - dlsym(_system_lib_handle, "QnnSystemInterface_getProviders")); - if (nullptr == get_providers) { - QNN_LOG_WARN( - "can not load QNN symbol QnnSystemInterface_getProviders: %s\n", - dlerror()); - return 2; - } - - uint32_t num_providers = 0; - const QnnSystemInterface_t ** provider_list = nullptr; - error = get_providers(&provider_list, &num_providers); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to get providers, error %d\n", - QNN_GET_ERROR_CODE(error)); - return 3; - } - - if (num_providers != _required_num_providers) { - QNN_LOG_WARN("providers is %d instead of required %d\n", num_providers, - _required_num_providers); - return 4; - } - - if (nullptr == provider_list) { - QNN_LOG_WARN("can not get providers\n"); - return 5; - } - - QNN_SYSTEM_INTERFACE_VER_TYPE qnn_system_interface; - bool found_valid_system_interface = false; - for (size_t idx = 0; idx < num_providers; idx++) { - if (QNN_SYSTEM_API_VERSION_MAJOR == - provider_list[idx]->systemApiVersion.major && - QNN_SYSTEM_API_VERSION_MINOR <= - provider_list[idx]->systemApiVersion.minor) { - found_valid_system_interface = true; - qnn_system_interface = - provider_list[idx]->QNN_SYSTEM_INTERFACE_VER_NAME; - break; - } - } - if (!found_valid_system_interface) { - QNN_LOG_WARN("unable to find a valid qnn system interface\n"); - return 6; - } else { - QNN_LOG_INFO("find a valid qnn system interface\n"); - } - set_qnn_raw_system_interface(qnn_system_interface); - - _qnn_interface.set_qnn_system_interface(provider_list[0]); - - _qnn_interface.qnn_system_context_create(&_qnn_system_handle); - if (nullptr == _qnn_system_handle) { - QNN_LOG_WARN("can not create QNN system contenxt\n"); - } else { - QNN_LOG_INFO("initialize qnn system successfully\n"); - } - - return 0; - } - - int unload_system() { - int result = 0; - - if (nullptr == _system_lib_handle) { - QNN_LOG_WARN("system lib handle is null\n"); - 
return 1; - } - - if (nullptr != _qnn_system_handle) { - result = _qnn_interface.qnn_system_context_free(_qnn_system_handle); - if (result != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN system context\n"); - } - _qnn_system_handle = nullptr; - } - - int dlclose_error = dlclose(_system_lib_handle); - if (dlclose_error != 0) { - QNN_LOG_WARN("failed to close QnnSystem library, error %s\n", dlerror()); - return 2; - } - - _system_lib_handle = nullptr; - - return result; - } - - int load_backend(std::string & lib_path, const QnnSaver_Config_t ** saver_config) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - QNN_LOG_DEBUG("lib_path:%s\n", lib_path.c_str()); - - void * lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL); - if (nullptr == lib_handle) { - QNN_LOG_WARN("can not open QNN library %s, with error: %s", lib_path.c_str(), dlerror()); - return 1; - } - - auto get_providers = - load_qnn_functionpointers( - lib_handle, "QnnInterface_getProviders"); - if (nullptr == get_providers) { - QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", dlerror()); - return 2; - } - - std::uint32_t num_providers = 0; - const QnnInterface_t ** provider_list = nullptr; - error = get_providers(&provider_list, &num_providers); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error)); - return 3; - } - QNN_LOG_DEBUG("num_providers=%d\n", num_providers); - if (num_providers != _required_num_providers) { - QNN_LOG_WARN("providers is %d instead of required %d", num_providers, _required_num_providers); - return 4; - } - - if (nullptr == provider_list) { - QNN_LOG_WARN("failed to get qnn interface providers\n"); - return 5; - } - bool found_valid_interface = false; - QNN_INTERFACE_VER_TYPE qnn_interface; - for (size_t idx = 0; idx < num_providers; idx++) { - if (QNN_API_VERSION_MAJOR == - provider_list[idx]->apiVersion.coreApiVersion.major && - QNN_API_VERSION_MINOR <= - provider_list[idx]->apiVersion.coreApiVersion.minor) { - found_valid_interface = true; - qnn_interface = provider_list[idx]->QNN_INTERFACE_VER_NAME; - break; - } - } - - if (!found_valid_interface) { - QNN_LOG_WARN("unable to find a valid qnn interface\n"); - return 6; - } else { - QNN_LOG_INFO("find a valid qnn interface\n"); - } - set_qnn_raw_interface(qnn_interface); - - BackendIdType backend_id = provider_list[0]->backendId; - _lib_path_to_backend_id[lib_path] = backend_id; - if (_loaded_backend.count(backend_id) > 0) { - QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists\n", lib_path.c_str(), backend_id); - } - _loaded_backend[backend_id] = provider_list[0]; - if (_loaded_lib_handle.count(backend_id) > 0) { - QNN_LOG_WARN("closing %p\n", _loaded_lib_handle[backend_id]); - int dlclose_error = dlclose(_loaded_lib_handle[backend_id]); - if (dlclose_error != 0) { - QNN_LOG_WARN("fail to close %p with error %s\n", _loaded_lib_handle[backend_id], dlerror()); - } - } - _loaded_lib_handle[backend_id] = lib_handle; - _backend_id = backend_id; - - return 0; - } - - int unload_backend() { - int dlclose_error = 0; - for (auto & it : _loaded_lib_handle) { - dlclose_error = dlclose(it.second); - if (dlclose_error != 0) { - QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, dlerror()); - } - } - - _loaded_lib_handle.clear(); - _lib_path_to_backend_id.clear(); - _loaded_backend.clear(); - - return 0; - } - - void set_qnn_raw_interface(QNN_INTERFACE_VER_TYPE & raw_interface) { - _qnn_raw_interface = raw_interface; - } - - void 
set_qnn_raw_system_interface(QNN_SYSTEM_INTERFACE_VER_TYPE & raw_interface) { - _qnn_raw_system_interface = raw_interface; - } - - private: - static constexpr const int _required_num_providers = 1; - - private: - std::string _lib_path; - std::string _backend_name; - std::string _model_name; // Qualcomm's dedicated prebuilt model name, keep it for further usage - BackendIdType _backend_id; - - bool _debug_tensor = false; - bool _do_node_validations = true; - - QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; - - qnn_sdk_profile_level _profile_level = qnn_sdk_profile_level::profile_detail; - - qnn_interface _qnn_interface; - - void * _system_lib_handle = nullptr; - - Qnn_GraphHandle_t _qnn_graph_handle = nullptr; - - Qnn_LogHandle_t _qnn_log_handle = nullptr; - - Qnn_ProfileHandle_t _qnn_profile_handle = nullptr; - - Qnn_DeviceHandle_t _qnn_device_handle = nullptr; - - Qnn_BackendHandle_t _qnn_backend_handle = nullptr; - - Qnn_ContextHandle_t _qnn_context_handle = nullptr; - - QnnSystemContext_Handle_t _qnn_system_handle = nullptr; - - QnnHtpDevice_PerfInfrastructure_t * _qnn_htp_perfinfra = nullptr; - uint32_t _qnn_power_configid = 1; - - QNN_INTERFACE_VER_TYPE _qnn_raw_interface; - QNN_SYSTEM_INTERFACE_VER_TYPE _qnn_raw_system_interface; - - std::unordered_map _qnn_mem_set; - - std::mutex _init_mutex; - std::unordered_map _loaded_lib_handle; - std::unordered_map _lib_path_to_backend_id; - std::unordered_map _loaded_backend; - - void * _rpc_lib_handle = nullptr; - std::atomic_bool _rpcmem_initialized{false}; - pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; - pfn_rpc_mem_free _pfn_rpc_mem_free; - pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd; - pfn_rpc_mem_init _pfn_rpc_mem_init; - pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; - std::unordered_map _rpcmem_store_map; - size_t _rpcmem_capacity = 512; - - std::string _graph_name; -}; - // ================================================================================================= // // implementation of QNN backend for GGML @@ -1959,126 +618,6 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx, return true; } -template -class ggml_qnn_tensor_readwrite -{ -public: - ggml_qnn_tensor_readwrite(const ggml_tensor *tensor, Qnn_GraphHandle_t graph_handle, ggml_backend_qnn_context * ctx) - : _tensor(tensor) - , _qnn_tensor(reinterpret_cast(tensor->extra)) - , _context(ctx) { - _old_dimensions = QNN_VER_PTR(*_qnn_tensor)->dimensions; - const auto qnn_data_type = qnn_datatype_from_ggml_datatype(tensor->type); - const bool is_npu = ctx->device == QNN_BACKEND_NPU; - QNN_VER_PTR(*_qnn_tensor)->type = _tensorType; - if (is_npu) { - QNN_VER_PTR(*_qnn_tensor)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; - QNN_VER_PTR(*_qnn_tensor)->clientBuf= {.data=nullptr, .dataSize=0}; - } - - auto err = ctx->raw_interface.tensorCreateGraphTensor(graph_handle, _qnn_tensor); - if (err != QNN_SUCCESS) { - QNN_LOG_INFO("error = %d\n", err); - QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, QNN_TENSOR_GET_NAME(*_qnn_tensor)); - _context = nullptr; - return; - } - - _dimensions[0] = (uint32_t)tensor->ne[0]; - _dimensions[1] = (uint32_t)tensor->ne[1]; - _dimensions[2] = (uint32_t)tensor->ne[2]; - _dimensions[3] = (uint32_t)tensor->ne[3]; - QNN_VER_PTR(*_qnn_tensor)->dimensions = _dimensions; - QNN_VER_PTR(*_qnn_tensor)->rank = qnn_get_ggml_tensor_rank(tensor); - QNN_VER_PTR(*_qnn_tensor)->dataType = qnn_data_type; - - if (is_npu) { - qnn_instance * instance = ctx->instance; - uint8_t *qnn_buffer = static_cast(instance->alloc_rpcmem( - ggml_nbytes(tensor), alignof(void*))); - if 
(!qnn_buffer) { - QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); - QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, QNN_TENSOR_GET_NAME(*_qnn_tensor)); - _context = nullptr; - // No free for _qnn_tensor, because it's not registered. - return; - } else { - QNN_LOG_INFO("alloc rpcmem successfully\n"); - } - - instance->register_rpcmem(qnn_buffer, _qnn_tensor); - if (_tensorType == QNN_TENSOR_TYPE_APP_WRITE || _tensorType == QNN_TENSOR_TYPE_APP_READWRITE) { - memcpy(qnn_buffer, tensor->data, ggml_nbytes(tensor)); - } - } else { - QNN_VER_PTR(*_qnn_tensor)->clientBuf = {tensor->data, - qnn_get_ggml_tensor_data_size(tensor)}; - } - } - - ggml_qnn_tensor_readwrite(const ggml_tensor *tensor, Qnn_Tensor_t *qnn_tensor, ggml_backend_qnn_context * ctx) - : _tensor(tensor) - , _qnn_tensor(qnn_tensor) - , _context(ctx) { - _old_dimensions = QNN_VER_PTR(*_qnn_tensor)->dimensions; - const auto qnn_data_type = qnn_datatype_from_ggml_datatype(tensor->type); - const bool is_npu = ctx->device == QNN_BACKEND_NPU; - - _dimensions[0] = (uint32_t)tensor->ne[0]; - _dimensions[1] = (uint32_t)tensor->ne[1]; - _dimensions[2] = (uint32_t)tensor->ne[2]; - _dimensions[3] = (uint32_t)tensor->ne[3]; - QNN_VER_PTR(*_qnn_tensor)->dimensions = _dimensions; - QNN_VER_PTR(*_qnn_tensor)->rank = qnn_get_ggml_tensor_rank(tensor); - QNN_VER_PTR(*_qnn_tensor)->dataType = qnn_data_type; - - - if (is_npu) { - uint8_t * qnn_buffer = static_cast(ctx->instance->get_rpcmem_from_memhandle( - QNN_VER_PTR(*_qnn_tensor)->memHandle)); - if (qnn_buffer) { - memcpy(qnn_buffer, tensor->data, ggml_nbytes(tensor)); - } else { - QNN_LOG_WARN("can't find rpcmem from qnn mem handle\n"); - QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, QNN_TENSOR_GET_NAME(*_qnn_tensor)); - _context = nullptr; - return; - } - } else { - QNN_VER_PTR(*_qnn_tensor)->clientBuf = {tensor->data, - qnn_get_ggml_tensor_data_size(tensor)}; - } - } - - ~ggml_qnn_tensor_readwrite() { - if ((_tensorType == QNN_TENSOR_TYPE_APP_READWRITE || _tensorType == QNN_TENSOR_TYPE_APP_READ) && - _context && _context->device == QNN_BACKEND_NPU) { - uint8_t * qnn_buffer = static_cast(_context->instance->get_rpcmem_from_memhandle( - QNN_VER_PTR(*_qnn_tensor)->memHandle)); - memcpy(_tensor->data, qnn_buffer, ggml_nbytes(_tensor)); - } - - QNN_VER_PTR(*_qnn_tensor)->dimensions = _old_dimensions; - } - - bool is_valid() const { return _context; } - Qnn_Tensor_t * get_qnn_tensor() const { return _qnn_tensor; } - -private: - const ggml_tensor *_tensor; - Qnn_Tensor_t *_qnn_tensor; - ggml_backend_qnn_context *_context; - uint32_t *_old_dimensions; - uint32_t _dimensions[4] = {}; - - ggml_qnn_tensor_readwrite(const ggml_qnn_tensor_readwrite&) = delete; - void operator=(const ggml_qnn_tensor_readwrite&) = delete; - ggml_qnn_tensor_readwrite(ggml_qnn_tensor_readwrite&&) = delete; - void operator=(ggml_qnn_tensor_readwrite&&) = delete; -}; - -using ggml_qnn_tensor_output = ggml_qnn_tensor_readwrite; -using ggml_qnn_tensor_input = ggml_qnn_tensor_readwrite; //TODO: this function can be removed later because there are duplicated codes with ggml_qnn_mul_mat // keep it for illustrate how to implement a specified GGMPL OP using QNN API + QNN RPC @@ -2164,16 +703,16 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src QNN_LOG_INFO("create qnn graph handle with graph name %s ok\n", graph_name.c_str()); } - ggml_qnn_tensor_input tensor_input0(src0, graph_handle, ctx); + qnn::ggml_qnn_tensor_input tensor_input0(src0, graph_handle, ctx); if (!tensor_input0.is_valid()) { 
goto failure; } - ggml_qnn_tensor_input tensor_input1(src1, graph_handle, ctx); + qnn::ggml_qnn_tensor_input tensor_input1(src1, graph_handle, ctx); if (!tensor_input1.is_valid()) { QNN_LOG_INFO("error = %d\n", error); goto failure; } - ggml_qnn_tensor_output tensor_output(dst, graph_handle, ctx); + qnn::ggml_qnn_tensor_output tensor_output(dst, graph_handle, ctx); if (!tensor_output.is_valid()) { goto failure; } @@ -2221,9 +760,9 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src instance->_qnn_graph_map[map_entry] = graph_item; } else { auto & graph_item = instance->_qnn_graph_map[map_entry]; - ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx); - ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx); - ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx); + qnn::ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx); + qnn::ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx); + qnn::ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx); Qnn_Tensor_t tensor_inputs[] = {*tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor()}; Qnn_Tensor_t tensor_outputs[] = {*tensor_output.get_qnn_tensor()}; @@ -2360,15 +899,15 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, goto failure; } - ggml_qnn_tensor_input tensor_input0(src0, graph_handle, ctx); + qnn::ggml_qnn_tensor_input tensor_input0(src0, graph_handle, ctx); if (!tensor_input0.is_valid()) { goto failure; } - ggml_qnn_tensor_input tensor_input1(src1, graph_handle, ctx); + qnn::ggml_qnn_tensor_input tensor_input1(src1, graph_handle, ctx); if (!tensor_input1.is_valid()) { goto failure; } - ggml_qnn_tensor_output tensor_output(dst, graph_handle, ctx); + qnn::ggml_qnn_tensor_output tensor_output(dst, graph_handle, ctx); if (!tensor_output.is_valid()) { goto failure; } @@ -2416,9 +955,9 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, instance->_qnn_graph_map[map_entry] = graph_item; } else { auto & graph_item= instance->_qnn_graph_map[map_entry]; - ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx); - ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx); - ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx); + qnn::ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx); + qnn::ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx); + qnn::ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx); Qnn_Tensor_t tensor_inputs[] = {*tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor()}; Qnn_Tensor_t tensor_outputs[] = {*tensor_output.get_qnn_tensor()}; @@ -2785,7 +1324,7 @@ GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t (uint32_t) tensor->ne[2], (uint32_t) tensor->ne[3]}; Qnn_DataType_t qnn_data_type = - qnn_datatype_from_ggml_datatype(tensor->type); + qnn::datatype_from_ggml_datatype(tensor->type); Qnn_TensorType_t qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; if (tensor->flags & GGML_TENSOR_FLAG_INPUT) { @@ -2812,7 +1351,7 @@ GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t QNN_QUANTIZATION_ENCODING_UNDEFINED, {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0}}}, - .rank = qnn_get_ggml_tensor_rank(tensor), + .rank = qnn::get_ggml_tensor_rank(tensor), .dimensions = dimensions, .memType = qnn_mem_type, {.clientBuf = {.data = nullptr, .dataSize = 0}}}}}; @@ -3070,7 +1609,7 
@@ bool ggml_backend_is_qnn(ggml_backend_t backend) { void ggml_backend_qnn_set_n_threads(ggml_backend_t backend, int n_threads) { GGML_ASSERT(ggml_backend_is_qnn(backend)); - struct ggml_backend_qnn_context * ctx = (struct ggml_backend_qnn_context *) backend->context; + auto * ctx = (ggml_backend_qnn_context *) backend->context; ctx->threads = n_threads; } @@ -3175,10 +1714,10 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { } else { if (0 == setenv("LD_LIBRARY_PATH", path.c_str(), 1)) { QNN_LOG_INFO("%s backend setenv successfully\n", - qnn_get_backend_name(device)); + qnn::get_backend_name(device)); } else { QNN_LOG_ERROR("%s backend setenv failure\n", - qnn_get_backend_name(device)); + qnn::get_backend_name(device)); } } @@ -3188,18 +1727,18 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { if (0 != result) { QNN_LOG_WARN( "init qnn subsystem failed with qnn backend %s, pls check why\n", - qnn_get_backend_name(device)); + qnn::get_backend_name(device)); delete instance; return nullptr; } - qnn_interface qnn_interface = instance->get_qnn_interface(); + auto qnn_interface = instance->get_qnn_interface(); if (!qnn_interface.is_loaded()) { QNN_LOG_WARN("qnn subsystem failure\n"); delete instance; return nullptr; } - std::string device_name = qnn_get_backend_name(device); + std::string device_name = qnn::get_backend_name(device); QNN_LOG_INFO("qnn device name %s", device_name.c_str()); g_qnn_mgr[device].instance = instance; g_qnn_mgr[device].raw_interface = instance->get_qnn_raw_interface(); diff --git a/ggml-qnn/backend.hpp b/ggml-qnn/backend.hpp new file mode 100644 index 0000000000000..3a624eab050ac --- /dev/null +++ b/ggml-qnn/backend.hpp @@ -0,0 +1,24 @@ + +#pragma once + +#include "QnnTypes.h" +#include "QnnCommon.h" +#include "QnnContext.h" +#include "QnnBackend.h" + +#include "ggml.h" +#include "ggml-backend.h" + +#include "qnn.hpp" + +struct ggml_backend_qnn_context { + int device; + int threads; + char name[GGML_MAX_NAME]; + char lib[GGML_MAX_NAME]; + qnn_internal::qnn_instance* instance; + struct ggml_backend* backend; + QNN_INTERFACE_VER_TYPE raw_interface; + QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; + struct qcom_socinfo socinfo; +}; diff --git a/ggml-qnn/logger.cpp b/ggml-qnn/logger.cpp new file mode 100644 index 0000000000000..43856c9f48a9f --- /dev/null +++ b/ggml-qnn/logger.cpp @@ -0,0 +1,78 @@ + +#include "logger.hpp" + +#include +#include + +#if (defined __ANDROID__) || (defined ANDROID) +#include +#endif + +#define QNN_LOGBUF_LEN 4096 + +void qnn::internal_log(ggml_log_level level, const char* file, + const char* func, int line, + const char* format, ...) 
{
+    static std::mutex qnn_internal_log_mutex;
+    static char s_qnn_internal_log_buf[QNN_LOGBUF_LEN];
+
+    {
+        std::lock_guard<std::mutex> lock(qnn_internal_log_mutex);
+        va_list args;
+
+        va_start(args, format);
+        int len_prefix =
+            snprintf(s_qnn_internal_log_buf, QNN_LOGBUF_LEN,
+                     "[%s, %d]: ", func, line);
+        int len = vsnprintf(s_qnn_internal_log_buf + len_prefix,
+                            QNN_LOGBUF_LEN - len_prefix, format, args);
+        if (len < (QNN_LOGBUF_LEN - len_prefix)) {
+#if (defined __ANDROID__) || (defined ANDROID)
+            // for Android APK
+            __android_log_print(level, "ggml-qnn", "%s\n", s_qnn_internal_log_buf);
+#endif
+            // for Android command line application or WoA(Windows on ARM)
+            printf("%s\n", s_qnn_internal_log_buf);
+        }
+        va_end(args);
+    }
+}
+
+void qnn::sdk_logcallback(const char* fmt, QnnLog_Level_t level,
+                          uint64_t timestamp, va_list argp) {
+#if ENABLE_QNNSDK_LOG
+    static std::mutex log_mutex;
+    static unsigned char s_ggml_qnn_logbuf[QNN_LOGBUF_LEN];
+
+    const char* log_level_desc = "";
+    switch (level) {
+    case QNN_LOG_LEVEL_ERROR:
+        log_level_desc = "ERROR";
+        break;
+    case QNN_LOG_LEVEL_WARN:
+        log_level_desc = "WARNING";
+        break;
+    case QNN_LOG_LEVEL_INFO:
+        log_level_desc = "INFO";
+        break;
+    case QNN_LOG_LEVEL_DEBUG:
+        log_level_desc = "DEBUG";
+        break;
+    case QNN_LOG_LEVEL_VERBOSE:
+        log_level_desc = "VERBOSE";
+        break;
+    case QNN_LOG_LEVEL_MAX:
+        log_level_desc = "UNKNOWN";
+        break;
+    }
+
+    double ms = (double)timestamp / 1000000.0;
+    {
+        std::lock_guard<std::mutex> lock(log_mutex);
+
+        memset(s_ggml_qnn_logbuf, 0, QNN_LOGBUF_LEN);
+        vsnprintf(reinterpret_cast<char *>(s_ggml_qnn_logbuf), QNN_LOGBUF_LEN, fmt, argp);
+        QNN_LOG_INFO("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggml_qnn_logbuf);
+    }
+#endif
+}
diff --git a/ggml-qnn/logger.hpp b/ggml-qnn/logger.hpp
new file mode 100644
index 0000000000000..003436da10fae
--- /dev/null
+++ b/ggml-qnn/logger.hpp
@@ -0,0 +1,49 @@
+#pragma once
+
+#include
+
+#include "QnnTypes.h"
+#include "QnnCommon.h"
+#include "QnnInterface.h"
+#include "System/QnnSystemInterface.h"
+
+#include "ggml.h"
+
+namespace qnn {
+    void internal_log(ggml_log_level level, const char* file,
+                      const char* func, int line,
+                      const char* format, ...);
+
+
+    void sdk_logcallback(const char* fmt, QnnLog_Level_t level,
+                         uint64_t timestamp, va_list argp);
+}
+
+// =================================================================================================
+//
+//  QNN backend internal log function
+//
+// =================================================================================================
+#define QNN_LOG_ERROR(...) \
+    qnn::internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__)
+
+#define QNN_LOG_WARN(...) \
+    qnn::internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__)
+
+#define QNN_LOG_INFO(...) \
+    qnn::internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__)
+
+#ifdef NDEBUG
+#define ENABLE_QNNBACKEND_DEBUG 0  // for troubleshooting QNN backend
+#define ENABLE_QNNSDK_LOG 0        // enable/disable QNN SDK's internal log
+#else
+#define ENABLE_QNNBACKEND_DEBUG 1  // for troubleshooting QNN backend
+#define ENABLE_QNNSDK_LOG 1        // enable/disable QNN SDK's internal log
+#endif
+
+#if ENABLE_QNNBACKEND_DEBUG
+#define QNN_LOG_DEBUG(...) \
+    qnn::internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__)
+#else
+#define QNN_LOG_DEBUG(...)
+#endif
diff --git a/ggml-qnn/qnn-types.hpp b/ggml-qnn/qnn-types.hpp
new file mode 100644
index 0000000000000..33f468eb796d1
--- /dev/null
+++ b/ggml-qnn/qnn-types.hpp
@@ -0,0 +1,46 @@
+
+#pragma once
+
+namespace qnn {
+    // =================================================================================================
+    //
+    //  helper data type / data structure / macros / functions of
+    //  Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK
+    //  ref:https://github.com/pytorch/executorch/tree/main/backends/qualcomm
+    // =================================================================================================
+    enum sdk_profile_level {
+        profile_off = 0,
+        profile_basic = 1,
+        profile_detail = 2
+    };
+
+    enum qcom_htp_arch {
+        NONE = 0,
+        V68 = 68,
+        V69 = 69,
+        V73 = 73,
+        V75 = 75,
+    };
+
+    enum qcom_chipset {
+        UNKNOWN_SM = 0,
+        SM8450 = 36,  // v69
+        SM8475 = 42,  // v69
+        SM8550 = 43,  // v73
+        SM8650 = 57,  // v75
+    };
+
+    using pfn_rpc_mem_init = void (*)(void);
+    using pfn_rpc_mem_deinit = void (*)(void);
+    using pfn_rpc_mem_alloc = void* (*) (int, uint32_t, int);
+    using pfn_rpc_mem_free = void (*)(void*);
+    using pfn_rpc_mem_to_fd = int (*)(void*);
+
+    struct qcom_socinfo {
+        uint32_t soc_model;
+        size_t htp_arch;
+        size_t vtcm_size_in_mb;
+    };
+}
+
+#define QNN_VER_PTR(x) (&((x).v1)) // TODO: remove this macro after we have a separate header for QNN
diff --git a/ggml-qnn/qnn.hpp b/ggml-qnn/qnn.hpp
new file mode 100644
index 0000000000000..bd83a9f05e946
--- /dev/null
+++ b/ggml-qnn/qnn.hpp
@@ -0,0 +1,1139 @@
+#pragma once
+
+#include
+
+// header file of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK
+// https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct
+#include "QnnTypes.h"
+#include "QnnCommon.h"
+#include "QnnContext.h"
+#include "QnnBackend.h"
+#include "QnnGraph.h"
+#include "QnnProperty.h"
+#include "QnnTensor.h"
+#include "QnnInterface.h"
+#include "Saver/QnnSaver.h"
+#include "System/QnnSystemInterface.h"
+#include "HTP/QnnHtpDevice.h"
+#include "HTP/QnnHtpGraph.h"
+
+#include "utils.hpp"
+#include "logger.hpp"
+
+namespace qnn_internal {
+
+    // =================================================================================================
+    //
+    //  wrapper class of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK
+    //  ref:https://github.com/pytorch/executorch/tree/main/backends/qualcomm
+    // =================================================================================================
+    class qnn_interface {
+
+#define DEFINE_SHIM_FUNCTION_INTERFACE(F, pointer_name)                   \
+    template <typename... Args> inline auto qnn_##F(Args... args) const { \
+        return (_qnn_interface->QNN_INTERFACE_VER_NAME.pointer_name)(     \
+            std::forward<Args>(args)...);                                 \
+    }
+
+#define DEFINE_SHIM_FUNCTION_SYS_INTERFACE(F, pointer_name)               \
+    template <typename... Args> inline auto qnn_##F(Args...
args) const { \ + return ( \ + _qnn_sys_interface->QNN_SYSTEM_INTERFACE_VER_NAME.pointer_name)( \ + std::forward(args)...); \ + } + + friend class qnn_instance; + + public: + qnn_interface() = default; + + // QnnBackend + DEFINE_SHIM_FUNCTION_INTERFACE(backend_create, backendCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_free, backendFree); + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_register_op_package, + backendRegisterOpPackage); + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_validate_op_config, + backendValidateOpConfig); + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_get_api_version, + backendGetApiVersion); + + // QnnDevice + DEFINE_SHIM_FUNCTION_INTERFACE(device_create, deviceCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(device_free, deviceFree); + + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_infrastructure, + deviceGetInfrastructure); + + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_platform_info, + deviceGetPlatformInfo); + + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_info, deviceGetInfo); + + // QnnContext + DEFINE_SHIM_FUNCTION_INTERFACE(context_create, contextCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary_size, + contextGetBinarySize); + + DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary, contextGetBinary); + + DEFINE_SHIM_FUNCTION_INTERFACE(context_create_from_binary, + contextCreateFromBinary); + + DEFINE_SHIM_FUNCTION_INTERFACE(context_free, contextFree); + + // QnnGraph + DEFINE_SHIM_FUNCTION_INTERFACE(graph_create, graphCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_add_node, graphAddNode); + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_finalize, graphFinalize); + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_execute, graphExecute); + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_retrieve, graphRetrieve); + + // QnnLog + DEFINE_SHIM_FUNCTION_INTERFACE(log_create, logCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(log_free, logFree); + + DEFINE_SHIM_FUNCTION_INTERFACE(log_set_log_level, logSetLogLevel); + + // QnnProfile + DEFINE_SHIM_FUNCTION_INTERFACE(profile_create, profileCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_events, profileGetEvents); + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_sub_events, profileGetSubEvents); + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_event_data, profileGetEventData); + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_free, profileFree); + + // QnnMem + DEFINE_SHIM_FUNCTION_INTERFACE(mem_register, memRegister); + + DEFINE_SHIM_FUNCTION_INTERFACE(mem_de_register, memDeRegister); + + // QnnProperty + DEFINE_SHIM_FUNCTION_INTERFACE(property_has_capability, + propertyHasCapability); + + // QnnTensor + DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_context_tensor, + tensorCreateContextTensor); + + DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_graph_tensor, + tensorCreateGraphTensor); + + // QnnSystem + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_create, + systemContextCreate); + + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_get_binary_info, + systemContextGetBinaryInfo); + + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_free, systemContextFree); + + void set_qnn_interface(const QnnInterface_t* qnn_interface) { + _qnn_interface = qnn_interface; + } + + void set_qnn_system_interface( + const QnnSystemInterface_t* qnn_sys_interface) { + _qnn_sys_interface = qnn_sys_interface; + } + + uint32_t get_backend_id() const { return _qnn_interface->backendId; } + + bool is_loaded() const { + return ((_qnn_sys_interface != nullptr) && (_qnn_interface != nullptr)); + } + + private: + const QnnInterface_t* _qnn_interface = nullptr; + + const 
QnnSystemInterface_t* _qnn_sys_interface = nullptr; + }; + + + class qnn_instance { + public: + using BackendIdType = decltype(QnnInterface_t{}.backendId); + + explicit qnn_instance(const std::string& lib_path, + const std::string& backend_name, + const std::string& model_name) + : _lib_path(std::move(lib_path)) + , _backend_name(std::move(backend_name)) + , _model_name(std::move(model_name)) {}; + + ~qnn_instance() {} + + int qnn_init(const QnnSaver_Config_t** saver_config) { + BackendIdType backend_id = QNN_BACKEND_ID_NULL; + QNN_LOG_DEBUG("enter qni_init\n"); + + std::lock_guard lock(_init_mutex); + + if (0 != load_system()) { + QNN_LOG_WARN("can not load QNN system lib, pls check why?\n"); + return 1; + } + else { + QNN_LOG_DEBUG("load QNN system lib successfully\n"); + } + + std::string backend_lib_path = _lib_path + _backend_name; + if (0 == _lib_path_to_backend_id.count(backend_lib_path)) { + int is_load_ok = load_backend(backend_lib_path, saver_config); + if (0 != is_load_ok) { + QNN_LOG_WARN("failed to load QNN backend\n"); + return 2; + } + } + + backend_id = _lib_path_to_backend_id[backend_lib_path]; + if (0 == _loaded_backend.count(backend_id) || + 0 == _loaded_lib_handle.count(backend_id)) { + QNN_LOG_WARN("library %s is loaded but loaded backend count=%zu, " + "loaded lib_handle count=%zu\n", + backend_lib_path.c_str(), _loaded_backend.count(backend_id), + _loaded_lib_handle.count(backend_id)); + return 3; + } + + _qnn_interface.set_qnn_interface(_loaded_backend[backend_id]); + + _qnn_interface.qnn_log_create(qnn::sdk_logcallback, _qnn_log_level, &_qnn_log_handle); + if (nullptr == _qnn_log_handle) { + // NPU backend not work on Qualcomm SoC equipped low-end phone + QNN_LOG_WARN("why failed to initialize qnn log\n"); + return 4; + } + else { + QNN_LOG_DEBUG("initialize qnn log successfully\n"); + } + + std::vector temp_backend_config; + _qnn_interface.qnn_backend_create( + _qnn_log_handle, + temp_backend_config.empty() ? nullptr : temp_backend_config.data(), + &_qnn_backend_handle); + if (nullptr == _qnn_backend_handle) { + QNN_LOG_WARN("why failed to initialize qnn backend\n"); + return 5; + } + else { + QNN_LOG_DEBUG("initialize qnn backend successfully\n"); + } + + if (nullptr != _qnn_raw_interface.propertyHasCapability) { + Qnn_ErrorHandle_t qnn_status = + _qnn_raw_interface.propertyHasCapability(QNN_PROPERTY_GROUP_DEVICE); + if (QNN_PROPERTY_NOT_SUPPORTED == qnn_status) { + QNN_LOG_WARN("device property is not supported\n"); + } + if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnn_status) { + QNN_LOG_WARN("device property is not known to backend\n"); + } + } + + Qnn_ErrorHandle_t qnn_status = QNN_SUCCESS; + if (_backend_name.find("Htp") != std::variant_npos) { + const QnnDevice_PlatformInfo_t* p_info = nullptr; + _qnn_raw_interface.deviceGetPlatformInfo(nullptr, &p_info); + QNN_LOG_INFO("device counts %d", p_info->v1.numHwDevices); + QnnDevice_HardwareDeviceInfo_t* infos = p_info->v1.hwDevices; + QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = { }; + for (int i = 0; i < p_info->v1.numHwDevices; i++) { + QNN_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d", infos[i].v1.deviceId, + infos[i].v1.deviceType, infos[i].v1.numCores); + QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; + chipinfo = devinfo->onChipDevice; + QnnHtpDevice_Arch_t htp_arch = chipinfo.arch; + QNN_LOG_INFO("htp_type:%d(%s)", devinfo->devType, (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? 
"ON_CHIP" : ""); + QNN_LOG_INFO("qualcomm soc_model:%d(%s), htp_arch:%d(%s), vtcm_size:%d MB", + chipinfo.socModel, qnn::get_chipset_desc(chipinfo.socModel), + htp_arch, qnn::get_htparch_desc(htp_arch), chipinfo.vtcmSize); + g_qnn_mgr[QNN_BACKEND_NPU].socinfo = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize }; + } + _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info); + + QnnHtpDevice_CustomConfig_t soc_customconfig; + soc_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_SOC; + soc_customconfig.socModel = chipinfo.socModel; + QnnDevice_Config_t soc_devconfig; + soc_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; + soc_devconfig.customConfig = &soc_customconfig; + + QnnHtpDevice_CustomConfig_t arch_customconfig; + arch_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_ARCH; + arch_customconfig.arch.arch = chipinfo.arch; + arch_customconfig.arch.deviceId = 0; // Id of device to be used. If single device is used by default 0. + QnnDevice_Config_t arch_devconfig; + arch_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; + arch_devconfig.customConfig = &arch_customconfig; + + const QnnDevice_Config_t* p_deviceconfig[] = { &soc_devconfig, &arch_devconfig, nullptr }; + qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, p_deviceconfig, &_qnn_device_handle); + } + else { + qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, nullptr, &_qnn_device_handle); + } + if (QNN_SUCCESS != qnn_status && + QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnn_status) { + QNN_LOG_WARN("failed to create QNN device\n"); + } + else { + QNN_LOG_INFO("create QNN device successfully\n"); + } + + if (qnn::sdk_profile_level::profile_off != _profile_level) { + QNN_LOG_INFO("profiling turned on; level = %d", _profile_level); + if (qnn::sdk_profile_level::profile_basic == _profile_level) { + QNN_LOG_INFO("basic profiling requested. creating Qnn Profile object\n"); + if (QNN_PROFILE_NO_ERROR != + _qnn_raw_interface.profileCreate(_qnn_backend_handle, + QNN_PROFILE_LEVEL_BASIC, + &_qnn_profile_handle)) { + QNN_LOG_WARN("unable to create profile handle in the backend\n"); + return 6; + } + else { + QNN_LOG_DEBUG("initialize qnn profile successfully\n"); + } + } + else if (qnn::sdk_profile_level::profile_detail == _profile_level) { + QNN_LOG_INFO("detailed profiling requested. Creating Qnn Profile object\n"); + if (QNN_PROFILE_NO_ERROR != + _qnn_raw_interface.profileCreate(_qnn_backend_handle, + QNN_PROFILE_LEVEL_DETAILED, + &_qnn_profile_handle)) { + QNN_LOG_WARN("unable to create profile handle in the backend\n"); + return 7; + } + else { + QNN_LOG_DEBUG("initialize qnn profile successfully\n"); + } + } + } + + _rpc_lib_handle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL); + if (nullptr == _rpc_lib_handle) { + QNN_LOG_WARN("failed to load qualcomm's rpc lib, error:%s\n", dlerror()); + return 8; + } + else { + QNN_LOG_DEBUG("load rpcmem lib successfully\n"); + set_rpcmem_initialized(true); + } + _pfn_rpc_mem_init = reinterpret_cast( + dlsym(_rpc_lib_handle, "rpcmem_init")); + _pfn_rpc_mem_deinit = reinterpret_cast( + dlsym(_rpc_lib_handle, "rpcmem_deinit")); + _pfn_rpc_mem_alloc = reinterpret_cast( + dlsym(_rpc_lib_handle, "rpcmem_alloc")); + _pfn_rpc_mem_free = reinterpret_cast( + dlsym(_rpc_lib_handle, "rpcmem_free")); + _pfn_rpc_mem_to_fd = reinterpret_cast( + dlsym(_rpc_lib_handle, "rpcmem_to_fd")); + if (nullptr == _pfn_rpc_mem_alloc || nullptr == _pfn_rpc_mem_free || + nullptr == _pfn_rpc_mem_to_fd) { + QNN_LOG_WARN("unable to access symbols in QNN RPC lib. 
dlerror(): %s", dlerror()); + dlclose(_rpc_lib_handle); + return 9; + } + + if (nullptr != _pfn_rpc_mem_init) { // make Qualcomm's SoC equipped low-end phone happy + _pfn_rpc_mem_init(); + } + + /* TODO: not used, keep it for further usage + QnnContext_Config_t qnn_context_config = QNN_CONTEXT_CONFIG_INIT; + qnn_context_config.priority = QNN_PRIORITY_DEFAULT; + const QnnContext_Config_t * context_configs[] = {&qnn_context_config, nullptr}; + */ + _qnn_interface.qnn_context_create( + _qnn_backend_handle, _qnn_device_handle, + nullptr, + &_qnn_context_handle); + if (nullptr == _qnn_context_handle) { + QNN_LOG_WARN("why failed to initialize qnn context\n"); + return 10; + } + else { + QNN_LOG_DEBUG("initialize qnn context successfully\n"); + } + + if (_backend_name.find("Htp") != std::variant_npos) { + //TODO: faster approach to probe the accurate capacity of rpc ion memory + size_t candidate_size = 0; + uint8_t* rpc_buffer = nullptr; + const int size_in_mb = (1 << 20); + size_t probe_slots[] = { 1024, 1536, 2048 - 48, 2048 }; + size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); + for (size_t idx = 0; idx < probe_counts; idx++) { + rpc_buffer = static_cast(alloc_rpcmem( + probe_slots[idx] * size_in_mb, 4)); + if (nullptr == rpc_buffer) { + QNN_LOG_INFO("alloc rpcmem %d (MB) failure, %s\n", + probe_slots[idx], strerror(errno)); + break; + } + else { + candidate_size = probe_slots[idx]; + free_rpcmem(rpc_buffer); + rpc_buffer = nullptr; + } + } + if (candidate_size > _rpcmem_capacity) + _rpcmem_capacity = candidate_size; + QNN_LOG_INFO("capacity of QNN rpc ion memory is about %d MB\n", _rpcmem_capacity); + + if (0 != init_htp_perfinfra()) { + QNN_LOG_WARN("initialize HTP performance failure"); + } + if (0 != set_rpc_polling()) { + QNN_LOG_WARN("set RPC polling failure"); + } + if (0 != set_high_performance_mode()) { + QNN_LOG_WARN("set HTP high performance mode failure"); + } + } + + QNN_LOG_DEBUG("leave qni_init\n"); + + return 0; + } + + int qnn_finalize() { + int ret_status = 0; + Qnn_ErrorHandle_t error = QNN_SUCCESS; + + if (nullptr != _pfn_rpc_mem_deinit) // make Qualcomm's SoC equipped low-end phone happy + _pfn_rpc_mem_deinit(); + + if (dlclose(_rpc_lib_handle) != 0) { + QNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s\n", dlerror()); + } + else { + QNN_LOG_DEBUG("succeed to close rpcmem lib\n"); + } + + if (_backend_name.find("Htp") != std::variant_npos) { + _qnn_htp_perfinfra->destroyPowerConfigId(_qnn_power_configid); + } + + if (nullptr != _qnn_context_handle) { + error = _qnn_interface.qnn_context_free(_qnn_context_handle, + _qnn_profile_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), + QNN_GET_ERROR_CODE(error)); + } + _qnn_context_handle = nullptr; + } + + if (nullptr != _qnn_profile_handle) { + error = _qnn_interface.qnn_profile_free(_qnn_profile_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), + QNN_GET_ERROR_CODE(error)); + } + _qnn_profile_handle = nullptr; + } + + if (nullptr != _qnn_device_handle) { + error = _qnn_interface.qnn_device_free(_qnn_device_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN device_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), + QNN_GET_ERROR_CODE(error)); + } + _qnn_device_handle = nullptr; + } + + if (nullptr != _qnn_backend_handle) { + error = _qnn_interface.qnn_backend_free(_qnn_backend_handle); 
+ if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), + QNN_GET_ERROR_CODE(error)); + } + _qnn_backend_handle = nullptr; + } + + if (nullptr != _qnn_log_handle) { + error = _qnn_interface.qnn_log_free(_qnn_log_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN log_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), + QNN_GET_ERROR_CODE(error)); + } + _qnn_log_handle = nullptr; + } + + unload_backend(); + + unload_system(); + + return ret_status; + } + + //TODO:keep it for further usage of offload the entire cgraph to a single QNN DAG directly + // which was used in Qualcomm's dedicated AI technology +#if 0 + int init_qnn_graph(const char* graph_name, bool debug, + uint8_t do_node_validation = true, + const QnnGraph_Config_t** graph_configs = nullptr) { + int result = 0; + + if (nullptr == graph_name) { + QNN_LOG_WARN("graph name is null\n"); + return 1; + } + + if (!_graph_name.empty()) { + QNN_LOG_WARN("qnn model for graph %s already initialized\n", graph_name); + return 2; + } + + if (!do_node_validation) { + QNN_LOG_WARN("node validation disabled, backend will not perform op " + "validation prior to adding node\n"); + } + + _graph_name = graph_name; + _debug_tensor = debug; + _do_node_validations = do_node_validation; + + result = _qnn_raw_interface.graphCreate(_qnn_context_handle, graph_name, + graph_configs, &_qnn_graph_handle); + if (result != QNN_GRAPH_NO_ERROR || nullptr == _qnn_graph_handle) { + QNN_LOG_WARN("failed to create graph in qnn context\n"); + return 3; + } + else { + QNN_LOG_INFO("succeed to create graph %s, %p\n", graph_name, _qnn_graph_handle); + } + + return 0; + } + + int finalize_qnn_graph() { + if (nullptr != _qnn_graph_handle) { + if (_qnn_raw_interface.graphFinalize(_qnn_graph_handle, + _qnn_profile_handle, + nullptr) != QNN_GRAPH_NO_ERROR) { + QNN_LOG_WARN("finalizing graph failure\n"); + } + } + else { + QNN_LOG_DEBUG("qnn graph handle is null\n"); + } + + return 0; + } +#endif + + const qnn_interface& get_qnn_interface() { + if (!_qnn_interface.is_loaded()) { + QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_interface; + } + + const QNN_INTERFACE_VER_TYPE& get_qnn_raw_interface() { + if (!_qnn_interface.is_loaded()) { + QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_raw_interface; + } + + const QNN_SYSTEM_INTERFACE_VER_TYPE& get_qnn_raw_system_interface() { + if (!_qnn_interface.is_loaded()) { + QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_raw_system_interface; + } + + const Qnn_LogHandle_t get_qnn_log_handle() { return _qnn_log_handle; } + + const Qnn_ProfileHandle_t get_qnn_profile_handle() { + return _qnn_profile_handle; + } + + const Qnn_DeviceHandle_t get_qnn_device_handle() { + return _qnn_device_handle; + } + + const Qnn_BackendHandle_t get_qnn_backend_handle() { + return _qnn_backend_handle; + } + + const Qnn_ContextHandle_t get_qnn_context_handle() { + return _qnn_context_handle; + } + + const QnnSystemContext_Handle_t get_qnn_system_handle() { + return _qnn_system_handle; + } + + const Qnn_GraphHandle_t get_qnn_graph_handle() { return _qnn_graph_handle; } + + int init_htp_perfinfra() { + QnnDevice_Infrastructure_t device_infra = nullptr; + int error = _qnn_raw_interface.deviceGetInfrastructure(&device_infra); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to get qnn device infra\n"); + return 1; + } + else { + QNN_LOG_INFO("HTP 
backend perf_infrastructure creation ok\n");
+        }
+
+        QnnHtpDevice_Infrastructure_t* htp_infra = static_cast<QnnHtpDevice_Infrastructure_t*>(device_infra);
+        QnnHtpDevice_PerfInfrastructure_t* htp_perfinfra = &htp_infra->perfInfra;
+        uint32_t power_configid = 1;
+        uint32_t device_id = 0;
+        uint32_t core_id = 0;
+        htp_perfinfra->createPowerConfigId(device_id, core_id, &power_configid);
+        if (htp_infra->infraType != QNN_HTP_DEVICE_INFRASTRUCTURE_TYPE_PERF) {
+            QNN_LOG_INFO("HTP infra type = %d, which is not perf infra type", htp_infra->infraType);
+        }
+        else {
+            QNN_LOG_INFO("HTP infra type = %d, which is perf infra type\n", htp_infra->infraType);
+        }
+        _qnn_htp_perfinfra = htp_perfinfra;
+        _qnn_power_configid = power_configid;
+
+        return 0;
+    }
+
+    int set_rpc_polling() {
+        if (_qnn_htp_perfinfra) {
+            QnnHtpPerfInfrastructure_PowerConfig_t rpc_polling_time;
+            memset(&rpc_polling_time, 0, sizeof(rpc_polling_time));
+            rpc_polling_time.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME;
+            // use the recommended RPC polling time (0-10000 us)
+            rpc_polling_time.rpcPollingTimeConfig = 9999;
+
+            QnnHtpPerfInfrastructure_PowerConfig_t rpc_control_latency;
+            memset(&rpc_control_latency, 0, sizeof(rpc_control_latency));
+            rpc_control_latency.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_CONTROL_LATENCY;
+            // use the recommended RPC control latency of 100 us, refer to the Hexagon SDK
+            rpc_control_latency.rpcControlLatencyConfig = 100;
+
+            const QnnHtpPerfInfrastructure_PowerConfig_t* power_configs[] = {
+                &rpc_polling_time,
+                &rpc_control_latency,
+                nullptr };
+            Qnn_ErrorHandle_t qnn_status = _qnn_htp_perfinfra->setPowerConfig(
+                _qnn_power_configid,
+                power_configs);
+            if (qnn_status != QNN_SUCCESS) {
+                QNN_LOG_WARN("set htp perf failed\n");
+            }
+            else {
+                QNN_LOG_INFO("set htp perf ok\n");
+            }
+        }
+        else {
+            QNN_LOG_WARN("can't set htp perf\n");
+        }
+
+        return 0;
+    }
+
+    int set_high_performance_mode() {
+        if (nullptr == _qnn_htp_perfinfra) {
+            QNN_LOG_WARN("perf infra is null\n");
+            return 1;
+        }
+
+        QnnHtpPerfInfrastructure_PowerConfig_t power_config;
+        memset(&power_config, 0, sizeof(power_config));
+        power_config.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3;
+
+        power_config.dcvsV3Config.setDcvsEnable = 1;
+        power_config.dcvsV3Config.dcvsEnable = 0;
+        power_config.dcvsV3Config.contextId = _qnn_power_configid;
+        power_config.dcvsV3Config.powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE;
+        power_config.dcvsV3Config.setSleepLatency =
+            1; // true to consider the latency parameter, otherwise false
+        power_config.dcvsV3Config.sleepLatency = 40;
+        power_config.dcvsV3Config.setBusParams =
+            1; // true to consider the bus parameters, otherwise false
+        power_config.dcvsV3Config.setCoreParams =
+            1; // true to consider the core parameters, otherwise false
+        power_config.dcvsV3Config.sleepDisable =
+            1; // true to disable sleep/LPM modes, false to keep them enabled
+        power_config.dcvsV3Config.setSleepDisable =
+            1; // true to consider the sleep disable/enable parameter, otherwise false
+        // set bus clock parameters
+        power_config.dcvsV3Config.busVoltageCornerMin =
+            DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER;
+        power_config.dcvsV3Config.busVoltageCornerTarget =
+            DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER;
+        power_config.dcvsV3Config.busVoltageCornerMax =
+            DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER;
+        // set core clock parameters
+        power_config.dcvsV3Config.coreVoltageCornerMin =
+            DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER;
+        power_config.dcvsV3Config.coreVoltageCornerTarget =
+
DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + power_config.dcvsV3Config.coreVoltageCornerMax = + DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + + // set power config with different performance parameters + const QnnHtpPerfInfrastructure_PowerConfig_t* power_configs[] = { + &power_config, nullptr }; + Qnn_ErrorHandle_t qnn_status = QNN_SUCCESS; + qnn_status = _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs); + if (qnn_status != QNN_SUCCESS) { + QNN_LOG_WARN("set htp high performance mode failed\n"); + } + else { + QNN_LOG_INFO("set htp high performance mode ok\n"); + } + + return 0; + } + + std::string& get_qnn_graph_name() { return _graph_name; } + + bool is_rpcmem_initialized() { return _rpcmem_initialized; } + + void set_rpcmem_initialized(bool initialized) { + _rpcmem_initialized = initialized; + } + + size_t get_rpcmem_capacity() { return _rpcmem_capacity; } + + bool is_rpcmem_registered(Qnn_MemHandle_t handle) { + return _qnn_mem_set.count(handle) != 0U; + } + + void* alloc_rpcmem(size_t bytes, size_t alignment) { + if (!_rpcmem_initialized) { + QNN_LOG_WARN("rpc memory not initialized\n"); + return nullptr; + } + + auto allocate_bytes = static_cast(bytes + alignment); + void* buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, + allocate_bytes); + if (buf == nullptr) { + QNN_LOG_WARN("failed to allocate rpc memory\n"); + return nullptr; + } + + auto aligned_buf = reinterpret_cast( + qnn::align_to(alignment, reinterpret_cast(buf))); + bool status = + _rpcmem_store_map.insert(std::pair(aligned_buf, buf)).second; + if (!status) { + QNN_LOG_WARN("failed to allocate rpc memory\n"); + _pfn_rpc_mem_free(buf); + } + + return aligned_buf; + } + + void free_rpcmem(void* buf) { + if (!_rpcmem_initialized) { + QNN_LOG_WARN("rpc memory not initialized\n"); + } + else if (0 == _rpcmem_store_map.count(buf)) { + QNN_LOG_WARN("no allocated tensor\n"); + } + else { + _pfn_rpc_mem_free(_rpcmem_store_map[buf]); + _rpcmem_store_map.erase(buf); + } + } + + int32_t rpcmem_to_fd(void* buf) { + int32_t mem_fd = -1; + if (!is_rpcmem_initialized()) { + QNN_LOG_WARN("rpc memory not initialized\n"); + } + else { + mem_fd = _pfn_rpc_mem_to_fd(buf); + } + + return mem_fd; + } + + int register_rpcmem(void* p_data, Qnn_Tensor_t* p_tensor) { + if (nullptr == p_data || (nullptr == p_tensor)) { + QNN_LOG_WARN("invalid param\n"); + return 1; + } + + if (!is_rpcmem_initialized()) { + QNN_LOG_WARN("rpc memory not initialized\n"); + return 2; + } + + if (is_rpcmem_allocated(p_data)) { + QNN_LOG_WARN("rpc memory already allocated\n"); + return 3; + } + + if (is_rpcmem_registered((QNN_VER_PTR(*p_tensor)->memHandle))) { + QNN_LOG_WARN("tensor %s has been registered shared memory\n", + (QNN_VER_PTR(*p_tensor)->name)); + return 4; + } + + int32_t mem_fd = rpcmem_to_fd(p_data); + if (-1 == mem_fd) { + QNN_LOG_WARN("failed to get file descriptor\n"); + return 5; + } + QNN_LOG_INFO("mem_fd %d\n", mem_fd); + Qnn_MemDescriptor_t descriptor = { {QNN_VER_PTR(*p_tensor)->rank, + QNN_VER_PTR(*p_tensor)->dimensions, + nullptr}, + QNN_VER_PTR(*p_tensor)->dataType, + QNN_MEM_TYPE_ION, + {{mem_fd}} }; + Qnn_MemHandle_t handle = nullptr; + int error = QNN_SUCCESS; + error = _qnn_interface.qnn_mem_register(_qnn_context_handle, &descriptor, + /*numDescriptors=*/1, &handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to register shared memory, error %d, %s\n", + QNN_GET_ERROR_CODE(error), strerror(error)); + return 6; + } + else { + QNN_LOG_INFO("tensor %s successfully register shared memory\n", + 
(QNN_VER_PTR(*p_tensor)->name));
+        }
+        QNN_VER_PTR(*p_tensor)->memHandle = handle;
+        _qnn_mem_set.insert(std::pair<void*, Qnn_MemHandle_t>(p_data, handle));
+
+        return 0;
+    }
+
+    void* get_rpcmem_from_memhandle(Qnn_MemHandle_t mem_handle) {
+        for (std::unordered_map<void*, Qnn_MemHandle_t>::iterator it = _qnn_mem_set.begin();
+             it != _qnn_mem_set.end();
+             it++) {
+            if (it->second == mem_handle) {
+                return it->first;
+            }
+        }
+        QNN_LOG_WARN("can't find rpcmem from qnn mem handle %p", mem_handle);
+        return nullptr;
+    }
+
+    void unregister_rpcmem() {
+        Qnn_ErrorHandle_t error = QNN_SUCCESS;
+
+        if (_qnn_mem_set.empty()) {
+            QNN_LOG_WARN("no rpcmem registered\n");
+        }
+
+        for (std::unordered_map<void*, Qnn_MemHandle_t>::iterator it = _qnn_mem_set.begin();
+             it != _qnn_mem_set.end();
+             it++) {
+            Qnn_MemHandle_t mem_handle = it->second;
+            error = _qnn_interface.qnn_mem_de_register(&mem_handle, 1);
+            if (error != QNN_SUCCESS) {
+                QNN_LOG_WARN("failed to unregister shared memory, error %d\n",
+                             QNN_GET_ERROR_CODE(error));
+            }
+        }
+        _qnn_mem_set.clear();
+    }
+
+    bool is_rpcmem_allocated(void* buf) {
+        return _qnn_mem_set.count(buf) != 0U;
+    }
+
+
+public:
+    std::map>
+        _qnn_graph_map;
+
+private:
+    int load_system() {
+        Qnn_ErrorHandle_t error = QNN_SUCCESS;
+
+        std::string system_lib_path = _lib_path + "libQnnSystem.so";
+        QNN_LOG_DEBUG("system_lib_path:%s\n", system_lib_path.c_str());
+
+        _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL);
+        if (nullptr == _system_lib_handle) {
+            QNN_LOG_WARN("can not open QNN library %s, error: %s\n",
+                         system_lib_path.c_str(), dlerror());
+            return 1;
+        }
+
+        auto* get_providers =
+            reinterpret_cast(
+                dlsym(_system_lib_handle, "QnnSystemInterface_getProviders"));
+        if (nullptr == get_providers) {
+            QNN_LOG_WARN(
+                "can not load QNN symbol QnnSystemInterface_getProviders: %s\n",
+                dlerror());
+            return 2;
+        }
+
+        uint32_t num_providers = 0;
+        const QnnSystemInterface_t** provider_list = nullptr;
+        error = get_providers(&provider_list, &num_providers);
+        if (error != QNN_SUCCESS) {
+            QNN_LOG_WARN("failed to get providers, error %d\n",
+                         QNN_GET_ERROR_CODE(error));
+            return 3;
+        }
+
+        if (num_providers != _required_num_providers) {
+            QNN_LOG_WARN("providers is %d instead of required %d\n", num_providers,
+                         _required_num_providers);
+            return 4;
+        }
+
+        if (nullptr == provider_list) {
+            QNN_LOG_WARN("can not get providers\n");
+            return 5;
+        }
+
+        QNN_SYSTEM_INTERFACE_VER_TYPE qnn_system_interface;
+        bool found_valid_system_interface = false;
+        for (size_t idx = 0; idx < num_providers; idx++) {
+            if (QNN_SYSTEM_API_VERSION_MAJOR ==
+                    provider_list[idx]->systemApiVersion.major &&
+                QNN_SYSTEM_API_VERSION_MINOR <=
+                    provider_list[idx]->systemApiVersion.minor) {
+                found_valid_system_interface = true;
+                qnn_system_interface =
+                    provider_list[idx]->QNN_SYSTEM_INTERFACE_VER_NAME;
+                break;
+            }
+        }
+        if (!found_valid_system_interface) {
+            QNN_LOG_WARN("unable to find a valid qnn system interface\n");
+            return 6;
+        }
+        else {
+            QNN_LOG_INFO("found a valid qnn system interface\n");
+        }
+        set_qnn_raw_system_interface(qnn_system_interface);
+
+        _qnn_interface.set_qnn_system_interface(provider_list[0]);
+
+        _qnn_interface.qnn_system_context_create(&_qnn_system_handle);
+        if (nullptr == _qnn_system_handle) {
+            QNN_LOG_WARN("can not create QNN system context\n");
+        }
+        else {
+            QNN_LOG_INFO("initialize qnn system successfully\n");
+        }
+
+        return 0;
+    }
+
+    int unload_system() {
+        int result = 0;
+
+        if (nullptr == _system_lib_handle) {
+            QNN_LOG_WARN("system lib handle is null\n");
+
return 1; + } + + if (nullptr != _qnn_system_handle) { + result = _qnn_interface.qnn_system_context_free(_qnn_system_handle); + if (result != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN system context\n"); + } + _qnn_system_handle = nullptr; + } + + int dlclose_error = dlclose(_system_lib_handle); + if (dlclose_error != 0) { + QNN_LOG_WARN("failed to close QnnSystem library, error %s\n", dlerror()); + return 2; + } + + _system_lib_handle = nullptr; + + return result; + } + + int load_backend(std::string& lib_path, const QnnSaver_Config_t** saver_config) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + QNN_LOG_DEBUG("lib_path:%s\n", lib_path.c_str()); + + void* lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL); + if (nullptr == lib_handle) { + QNN_LOG_WARN("can not open QNN library %s, with error: %s", lib_path.c_str(), dlerror()); + return 1; + } + + auto get_providers = + load_qnn_functionpointers( + lib_handle, "QnnInterface_getProviders"); + if (nullptr == get_providers) { + QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", dlerror()); + return 2; + } + + std::uint32_t num_providers = 0; + const QnnInterface_t** provider_list = nullptr; + error = get_providers(&provider_list, &num_providers); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error)); + return 3; + } + QNN_LOG_DEBUG("num_providers=%d\n", num_providers); + if (num_providers != _required_num_providers) { + QNN_LOG_WARN("providers is %d instead of required %d", num_providers, _required_num_providers); + return 4; + } + + if (nullptr == provider_list) { + QNN_LOG_WARN("failed to get qnn interface providers\n"); + return 5; + } + bool found_valid_interface = false; + QNN_INTERFACE_VER_TYPE qnn_interface; + for (size_t idx = 0; idx < num_providers; idx++) { + if (QNN_API_VERSION_MAJOR == + provider_list[idx]->apiVersion.coreApiVersion.major && + QNN_API_VERSION_MINOR <= + provider_list[idx]->apiVersion.coreApiVersion.minor) { + found_valid_interface = true; + qnn_interface = provider_list[idx]->QNN_INTERFACE_VER_NAME; + break; + } + } + + if (!found_valid_interface) { + QNN_LOG_WARN("unable to find a valid qnn interface\n"); + return 6; + } + else { + QNN_LOG_INFO("find a valid qnn interface\n"); + } + set_qnn_raw_interface(qnn_interface); + + BackendIdType backend_id = provider_list[0]->backendId; + _lib_path_to_backend_id[lib_path] = backend_id; + if (_loaded_backend.count(backend_id) > 0) { + QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists\n", lib_path.c_str(), backend_id); + } + _loaded_backend[backend_id] = provider_list[0]; + if (_loaded_lib_handle.count(backend_id) > 0) { + QNN_LOG_WARN("closing %p\n", _loaded_lib_handle[backend_id]); + int dlclose_error = dlclose(_loaded_lib_handle[backend_id]); + if (dlclose_error != 0) { + QNN_LOG_WARN("fail to close %p with error %s\n", _loaded_lib_handle[backend_id], dlerror()); + } + } + _loaded_lib_handle[backend_id] = lib_handle; + _backend_id = backend_id; + + return 0; + } + + int unload_backend() { + int dlclose_error = 0; + for (auto& it : _loaded_lib_handle) { + dlclose_error = dlclose(it.second); + if (dlclose_error != 0) { + QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, dlerror()); + } + } + + _loaded_lib_handle.clear(); + _lib_path_to_backend_id.clear(); + _loaded_backend.clear(); + + return 0; + } + + void set_qnn_raw_interface(QNN_INTERFACE_VER_TYPE& raw_interface) { + _qnn_raw_interface = raw_interface; + } + + void 
set_qnn_raw_system_interface(QNN_SYSTEM_INTERFACE_VER_TYPE& raw_interface) { + _qnn_raw_system_interface = raw_interface; + } + + private: + static constexpr const int _required_num_providers = 1; + + private: + std::string _lib_path; + std::string _backend_name; + std::string _model_name; // Qualcomm's dedicated prebuilt model name, keep it for further usage + BackendIdType _backend_id; + + bool _debug_tensor = false; + bool _do_node_validations = true; + + QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; + + qnn::sdk_profile_level _profile_level = qnn::sdk_profile_level::profile_detail; + + qnn_interface _qnn_interface; + + void* _system_lib_handle = nullptr; + + Qnn_GraphHandle_t _qnn_graph_handle = nullptr; + + Qnn_LogHandle_t _qnn_log_handle = nullptr; + + Qnn_ProfileHandle_t _qnn_profile_handle = nullptr; + + Qnn_DeviceHandle_t _qnn_device_handle = nullptr; + + Qnn_BackendHandle_t _qnn_backend_handle = nullptr; + + Qnn_ContextHandle_t _qnn_context_handle = nullptr; + + QnnSystemContext_Handle_t _qnn_system_handle = nullptr; + + QnnHtpDevice_PerfInfrastructure_t* _qnn_htp_perfinfra = nullptr; + uint32_t _qnn_power_configid = 1; + + QNN_INTERFACE_VER_TYPE _qnn_raw_interface; + QNN_SYSTEM_INTERFACE_VER_TYPE _qnn_raw_system_interface; + + std::unordered_map _qnn_mem_set; + + std::mutex _init_mutex; + std::unordered_map _loaded_lib_handle; + std::unordered_map _lib_path_to_backend_id; + std::unordered_map _loaded_backend; + + void* _rpc_lib_handle = nullptr; + std::atomic_bool _rpcmem_initialized{ false }; + qnn::pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; + qnn::pfn_rpc_mem_free _pfn_rpc_mem_free; + qnn::pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd; + qnn::pfn_rpc_mem_init _pfn_rpc_mem_init; + qnn::pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; + std::unordered_map _rpcmem_store_map; + size_t _rpcmem_capacity = 512; + + std::string _graph_name; + }; + +} diff --git a/ggml-qnn/tensor.hpp b/ggml-qnn/tensor.hpp new file mode 100644 index 0000000000000..514061146e840 --- /dev/null +++ b/ggml-qnn/tensor.hpp @@ -0,0 +1,145 @@ + +#pragma once + +#include "QnnTensor.h" +#include "System/QnnSystemInterface.h" + +#include "backend.hpp" +#include "qnn.hpp" + +namespace qnn { + + template class ggml_qnn_tensor_readwrite { + public: + ggml_qnn_tensor_readwrite(const ggml_tensor* tensor, + Qnn_GraphHandle_t graph_handle, + ggml_backend_qnn_context* ctx) + : _tensor(tensor), + _qnn_tensor(reinterpret_cast(tensor->extra)), + _context(ctx) { + _old_dimensions = QNN_VER_PTR(*_qnn_tensor)->dimensions; + const auto qnn_data_type = datatype_from_ggml_datatype(tensor->type); + const bool is_npu = ctx->device == QNN_BACKEND_NPU; + QNN_VER_PTR(*_qnn_tensor)->type = _tensorType; + if (is_npu) { + QNN_VER_PTR(*_qnn_tensor)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*_qnn_tensor)->clientBuf = { .data = nullptr, .dataSize = 0 }; + } + + auto err = + ctx->raw_interface.tensorCreateGraphTensor(graph_handle, _qnn_tensor); + if (err != QNN_SUCCESS) { + QNN_LOG_INFO("error = %d\n", err); + QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, + QNN_TENSOR_GET_NAME(*_qnn_tensor)); + _context = nullptr; + return; + } + + _dimensions[0] = (uint32_t)tensor->ne[0]; + _dimensions[1] = (uint32_t)tensor->ne[1]; + _dimensions[2] = (uint32_t)tensor->ne[2]; + _dimensions[3] = (uint32_t)tensor->ne[3]; + QNN_VER_PTR(*_qnn_tensor)->dimensions = _dimensions; + QNN_VER_PTR(*_qnn_tensor)->rank = qnn::get_ggml_tensor_rank(tensor); + QNN_VER_PTR(*_qnn_tensor)->dataType = qnn_data_type; + + if (is_npu) { + qnn_instance* instance = ctx->instance; + 
uint8_t* qnn_buffer = static_cast( + instance->alloc_rpcmem(ggml_nbytes(tensor), alignof(void*))); + if (!qnn_buffer) { + QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, + QNN_TENSOR_GET_NAME(*_qnn_tensor)); + _context = nullptr; + // No free for _qnn_tensor, because it's not registered. + return; + } + else { + QNN_LOG_INFO("alloc rpcmem successfully\n"); + } + + instance->register_rpcmem(qnn_buffer, _qnn_tensor); + if (_tensorType == QNN_TENSOR_TYPE_APP_WRITE || + _tensorType == QNN_TENSOR_TYPE_APP_READWRITE) { + memcpy(qnn_buffer, tensor->data, ggml_nbytes(tensor)); + } + } + else { + QNN_VER_PTR(*_qnn_tensor)->clientBuf = { + tensor->data, qnn_get_ggml_tensor_data_size(tensor) }; + } + } + + ggml_qnn_tensor_readwrite(const ggml_tensor* tensor, Qnn_Tensor_t* qnn_tensor, + ggml_backend_qnn_context* ctx) + : _tensor(tensor), _qnn_tensor(qnn_tensor), _context(ctx) { + _old_dimensions = QNN_VER_PTR(*_qnn_tensor)->dimensions; + const auto qnn_data_type = qnn_datatype_from_ggml_datatype(tensor->type); + const bool is_npu = ctx->device == QNN_BACKEND_NPU; + + _dimensions[0] = (uint32_t)tensor->ne[0]; + _dimensions[1] = (uint32_t)tensor->ne[1]; + _dimensions[2] = (uint32_t)tensor->ne[2]; + _dimensions[3] = (uint32_t)tensor->ne[3]; + QNN_VER_PTR(*_qnn_tensor)->dimensions = _dimensions; + QNN_VER_PTR(*_qnn_tensor)->rank = qnn_get_ggml_tensor_rank(tensor); + QNN_VER_PTR(*_qnn_tensor)->dataType = qnn_data_type; + + if (is_npu) { + uint8_t* qnn_buffer = + static_cast(ctx->instance->get_rpcmem_from_memhandle( + QNN_VER_PTR(*_qnn_tensor)->memHandle)); + if (qnn_buffer) { + memcpy(qnn_buffer, tensor->data, ggml_nbytes(tensor)); + } + else { + QNN_LOG_WARN("can't find rpcmem from qnn mem handle\n"); + QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, + QNN_TENSOR_GET_NAME(*_qnn_tensor)); + _context = nullptr; + return; + } + } + else { + QNN_VER_PTR(*_qnn_tensor)->clientBuf = { + tensor->data, qnn_get_ggml_tensor_data_size(tensor) }; + } + } + + ~ggml_qnn_tensor_readwrite() { + if ((_tensorType == QNN_TENSOR_TYPE_APP_READWRITE || + _tensorType == QNN_TENSOR_TYPE_APP_READ) && + _context && _context->device == QNN_BACKEND_NPU) { + uint8_t* qnn_buffer = + static_cast(_context->instance->get_rpcmem_from_memhandle( + QNN_VER_PTR(*_qnn_tensor)->memHandle)); + memcpy(_tensor->data, qnn_buffer, ggml_nbytes(_tensor)); + } + + QNN_VER_PTR(*_qnn_tensor)->dimensions = _old_dimensions; + } + + bool is_valid() const { return _context; } + Qnn_Tensor_t* get_qnn_tensor() const { return _qnn_tensor; } + + private: + const ggml_tensor* _tensor; + Qnn_Tensor_t* _qnn_tensor; + ggml_backend_qnn_context* _context; + uint32_t* _old_dimensions; + uint32_t _dimensions[4] = {}; + + ggml_qnn_tensor_readwrite(const ggml_qnn_tensor_readwrite&) = delete; + void operator=(const ggml_qnn_tensor_readwrite&) = delete; + ggml_qnn_tensor_readwrite(ggml_qnn_tensor_readwrite&&) = delete; + void operator=(ggml_qnn_tensor_readwrite&&) = delete; + }; + + using ggml_qnn_tensor_output = + ggml_qnn_tensor_readwrite; + using ggml_qnn_tensor_input = + ggml_qnn_tensor_readwrite; + +} // namespace qnn diff --git a/ggml-qnn/utils.hpp b/ggml-qnn/utils.hpp new file mode 100644 index 0000000000000..4141c4e33c79d --- /dev/null +++ b/ggml-qnn/utils.hpp @@ -0,0 +1,99 @@ +#pragma once + +#include "QnnTypes.h" + +#include "ggml.h" + +#include "qnn-types.hpp" + +namespace qnn { + + // TODO: mapping more ggml data type to QNN data type + // ref:explanation of k-quants, 
https://github.com/ggerganov/llama.cpp/pull/1684 + Qnn_DataType_t datatype_from_ggml_datatype(enum ggml_type ggmltype) { + switch (ggmltype) { + case GGML_TYPE_F16: + return QNN_DATATYPE_FLOAT_16; + case GGML_TYPE_F32: + return QNN_DATATYPE_FLOAT_32; + case GGML_TYPE_I8: + return QNN_DATATYPE_INT_8; + case GGML_TYPE_Q8_0: + return QNN_DATATYPE_SFIXED_POINT_8; + case GGML_TYPE_Q4_0: + return QNN_DATATYPE_SFIXED_POINT_4; + default: + break; + } + return QNN_DATATYPE_UNDEFINED; + } + + + uint32_t get_ggml_tensor_rank(const ggml_tensor* tensor) { + uint32_t rank = 0; + for (int i = 0; i < GGML_MAX_DIMS; i++) { + if ((0 != tensor->ne[i]) && (1 != tensor->ne[i])) { + rank++; + } + } + return rank; + } + + + const char* get_backend_name(int n_backend_type) { + switch (n_backend_type) { + case QNN_BACKEND_CPU: + return "QNN-CPU"; + case QNN_BACKEND_GPU: + return "QNN-GPU"; + case QNN_BACKEND_NPU: + return "QNN-NPU"; + case QNN_BACKEND_GGML: + return "ggml"; //"fake" QNN backend, used for compare performance between QNN backend and original GGML + default: + return "unknown"; + } + } + + const char* get_chipset_desc(uint32_t chipset_id) { + switch (chipset_id) { + case SM8450: + return "SM8450"; + case SM8475: + return "SM8475"; + case SM8550: + return "SM8550"; + case SM8650: + return "SM8650"; + default: + return "unknown"; + } + } + + const char* get_htparch_desc(size_t htp_arch) { + switch (htp_arch) { + case V68: + return "QCOM_HTP_V68"; + case V69: + return "QCOM_HTP_V69"; + case V73: + return "QCOM_HTP_V73"; + case V75: + return "QCOM_HTP_V75"; + default: + return "unknown"; + } + } + + template Fn load_qnn_functionpointers(void* handle, const char* function_name) { + return reinterpret_cast(dlsym(handle, function_name)); + } + + intptr_t align_to(size_t alignment, intptr_t offset) { + return offset % alignment == 0 + ? 
offset + : offset + (static_cast(alignment) - + offset % static_cast(alignment)); + } + +} diff --git a/tests/ggml-qnn/CMakeLists.txt b/tests/ggml-qnn/CMakeLists.txt index bf061e6c7c3a1..77a2059ed0f0c 100644 --- a/tests/ggml-qnn/CMakeLists.txt +++ b/tests/ggml-qnn/CMakeLists.txt @@ -20,6 +20,7 @@ set(SOURCE_FILES ../../ggml-alloc.c ../../ggml-backend.c ../../ggml-quants.c + ../../ggml-qnn/logger.cpp ../../ggml-qnn.cpp ggml-qnn-ut.cpp ) From 3c491a32634cafa011fed756783a8fe655d988cc Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 19 Jun 2024 14:43:22 +0800 Subject: [PATCH 028/166] remove reference of g_qnn_mgr in qnn_instance --- ggml-qnn.cpp | 1 + ggml-qnn/qnn.hpp | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index a552fd5ec935e..b59126067595c 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -1743,6 +1743,7 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { g_qnn_mgr[device].instance = instance; g_qnn_mgr[device].raw_interface = instance->get_qnn_raw_interface(); g_qnn_mgr[device].raw_system_interface = instance->get_qnn_raw_system_interface(); + g_qnn_mgr[device].socinfo = instance->get_soc_info(); ggml_backend_t qnn_backend = new ggml_backend{/* .guid = */ ggml_backend_qnn_guid(), diff --git a/ggml-qnn/qnn.hpp b/ggml-qnn/qnn.hpp index bd83a9f05e946..15df7dcbbe300 100644 --- a/ggml-qnn/qnn.hpp +++ b/ggml-qnn/qnn.hpp @@ -262,7 +262,7 @@ namespace qnn_internal { QNN_LOG_INFO("qualcomm soc_model:%d(%s), htp_arch:%d(%s), vtcm_size:%d MB", chipinfo.socModel, qnn::get_chipset_desc(chipinfo.socModel), htp_arch, qnn::get_htparch_desc(htp_arch), chipinfo.vtcmSize); - g_qnn_mgr[QNN_BACKEND_NPU].socinfo = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize }; + _soc_info = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize }; } _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info); @@ -864,6 +864,7 @@ namespace qnn_internal { return _qnn_mem_set.count(buf) != 0U; } + const qnn::qcom_socinfo &get_soc_info() { return _soc_info; } public: std::map Date: Wed, 19 Jun 2024 14:47:41 +0800 Subject: [PATCH 029/166] fix compiling error --- ggml-qnn.cpp | 84 ++++++++++++++++-------------------------- ggml-qnn/backend.hpp | 2 +- ggml-qnn/qnn-types.hpp | 23 +++++++++--- ggml-qnn/qnn.hpp | 9 ++--- ggml-qnn/tensor.hpp | 10 ++--- ggml-qnn/utils.hpp | 13 +++++++ 6 files changed, 72 insertions(+), 69 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index b59126067595c..fdbcbdafb6641 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -70,27 +70,27 @@ typedef void (*ggml_qnn_func_t)(ggml_backend_qnn_context * ctx, static struct qnn::qcom_socinfo g_qnn_soc_info_table[] = { /* Qualcomm SnapDragon 8 Gen 1 */ - [SM8450] = { - .soc_model = SM8450, - .htp_arch = V69, + [qnn::SM8450] = { + .soc_model = qnn::SM8450, + .htp_arch = qnn::V69, .vtcm_size_in_mb = 8}, /* Qualcomm SnapDragon 8 Gen 1+ */ - [SM8475] = { - .soc_model = SM8475, - .htp_arch = V69, + [qnn::SM8475] = { + .soc_model = qnn::SM8475, + .htp_arch = qnn::V69, .vtcm_size_in_mb = 8}, /* Qualcomm SnapDragon 8 Gen 2 */ - [SM8550] = { - .soc_model = SM8550, - .htp_arch = V73, + [qnn::SM8550] = { + .soc_model = qnn::SM8550, + .htp_arch = qnn::V73, .vtcm_size_in_mb = 8}, /* Qualcomm SnapDragon 8 Gen 3 */ - [SM8650] = { - .soc_model = SM8650, - .htp_arch = V75, + [qnn::SM8650] = { + .soc_model = qnn::SM8650, + .htp_arch = qnn::V75, .vtcm_size_in_mb = 8}, }; @@ -198,19 +198,6 @@ static const char * qnn_opname_from_ggmlop(enum ggml_op ggmlop) { return nullptr; } -static uint32_t 
qnn_get_ggml_tensor_data_size(const ggml_tensor * tensor) { - /* - size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); - size_t n_dims = qnn_get_ggml_tensor_rank(tensor); - for (int i = 1; i < n_dims; i++) { - data_size *= tensor->ne[i]; - } - - return data_size; - */ - return ggml_nbytes(tensor); -} - static bool qnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { if ((nullptr == ctx) || (nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { @@ -218,10 +205,10 @@ static bool qnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tenso return false; } - qnn_instance * instance = nullptr; - Qnn_Tensor_t * tensor_0 = nullptr; - Qnn_Tensor_t * tensor_1 = nullptr; - Qnn_Tensor_t * tensor_2 = nullptr; + qnn_internal::qnn_instance *instance = nullptr; + Qnn_Tensor_t *tensor_0 = nullptr; + Qnn_Tensor_t *tensor_1 = nullptr; + Qnn_Tensor_t *tensor_2 = nullptr; tensor_0 = (Qnn_Tensor_t *) src0->extra; tensor_1 = (Qnn_Tensor_t *) src1->extra; tensor_2 = (Qnn_Tensor_t *) dst->extra; @@ -283,13 +270,6 @@ class qnn_perf { }; #endif -using pfn_qnnsaver_initialize = decltype(QnnSaver_initialize); -using pfn_qnninterface_getproviders = decltype(QnnInterface_getProviders); -using pfn_qnnsysteminterface_getproviders = decltype(QnnSystemInterface_getProviders); - -#define RPCMEM_DEFAULT_FLAGS 1 -#define RPCMEM_HEAP_ID_SYSTEM 25 - #define VALIDATE(value, status) \ do { \ status = value; \ @@ -625,11 +605,11 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src const ggml_tensor * src1, ggml_tensor * dst) { Qnn_ErrorHandle_t error = QNN_SUCCESS; bool graph_initialized = false; - qnn_instance * instance = nullptr; - std::string graph_name = "ggml_op_qnn_add"; - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Param_t qnn_params[] = {}; - enum ggml_op ggmlop = GGML_OP_ADD; + qnn_internal::qnn_instance *instance = nullptr; + std::string graph_name = "ggml_op_qnn_add"; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Param_t qnn_params[] = {}; + enum ggml_op ggmlop = GGML_OP_ADD; CHECK_PARAMS(ctx, src0, src1, dst); instance = ctx->instance; @@ -817,13 +797,13 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - bool graph_initialized = false; - qnn_instance * instance = nullptr; - std::string graph_name = "ggml_op_qnn_mul_mat"; - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Param_t qnn_params[] = {}; - enum ggml_op ggmlop = GGML_OP_MUL_MAT; + Qnn_ErrorHandle_t error = QNN_SUCCESS; + bool graph_initialized = false; + qnn_internal::qnn_instance *instance = nullptr; + std::string graph_name = "ggml_op_qnn_mul_mat"; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Param_t qnn_params[] = {}; + enum ggml_op ggmlop = GGML_OP_MUL_MAT; CHECK_PARAMS(ctx, src0, src1, dst); instance = ctx->instance; @@ -1492,8 +1472,9 @@ GGML_CALL static void ggml_backend_qnn_free(ggml_backend_t backend) { ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; QNN_LOG_INFO("idx %d, name:%s", ctx->device, g_qnn_mgr[ctx->device].name); - qnn_instance * instance = (qnn_instance *)g_qnn_mgr[ctx->device].instance; + auto *instance = g_qnn_mgr[ctx->device].instance; if (instance != nullptr) { + // TODO: this should be done inside the destructor std::map>::iterator graph_it; @@ -1721,9 
+1702,8 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { } } - qnn_instance * instance = nullptr; - instance = new qnn_instance(qnn_lib_path, g_qnn_mgr[device].lib, ""); - result = instance->qnn_init(nullptr); + auto *instance = new qnn_internal::qnn_instance(qnn_lib_path, g_qnn_mgr[device].lib, ""); + result = instance->qnn_init(nullptr); if (0 != result) { QNN_LOG_WARN( "init qnn subsystem failed with qnn backend %s, pls check why\n", diff --git a/ggml-qnn/backend.hpp b/ggml-qnn/backend.hpp index 3a624eab050ac..fd40d8ad24066 100644 --- a/ggml-qnn/backend.hpp +++ b/ggml-qnn/backend.hpp @@ -20,5 +20,5 @@ struct ggml_backend_qnn_context { struct ggml_backend* backend; QNN_INTERFACE_VER_TYPE raw_interface; QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; - struct qcom_socinfo socinfo; + qnn::qcom_socinfo socinfo; }; diff --git a/ggml-qnn/qnn-types.hpp b/ggml-qnn/qnn-types.hpp index 33f468eb796d1..db1d592f08a20 100644 --- a/ggml-qnn/qnn-types.hpp +++ b/ggml-qnn/qnn-types.hpp @@ -1,6 +1,12 @@ #pragma once +#include "QnnTypes.h" +#include "QnnCommon.h" +#include "QnnInterface.h" +#include "Saver/QnnSaver.h" +#include "System/QnnSystemInterface.h" + namespace qnn { // ================================================================================================= // @@ -30,17 +36,24 @@ namespace qnn { SM8650 = 57, // v75 }; + struct qcom_socinfo { + uint32_t soc_model; + size_t htp_arch; + size_t vtcm_size_in_mb; + }; + using pfn_rpc_mem_init = void (*)(void); using pfn_rpc_mem_deinit = void (*)(void); using pfn_rpc_mem_alloc = void* (*) (int, uint32_t, int); using pfn_rpc_mem_free = void (*)(void*); using pfn_rpc_mem_to_fd = int (*)(void*); - struct qcom_socinfo { - uint32_t soc_model; - size_t htp_arch; - size_t vtcm_size_in_mb; - }; + using pfn_qnnsaver_initialize = decltype(QnnSaver_initialize); + using pfn_qnninterface_getproviders = decltype(QnnInterface_getProviders); + using pfn_qnnsysteminterface_getproviders = decltype(QnnSystemInterface_getProviders); } #define QNN_VER_PTR(x) (&((x).v1)) // TODO: remove this macro after we have a separate header for QNN + +#define RPCMEM_DEFAULT_FLAGS 1 +#define RPCMEM_HEAP_ID_SYSTEM 25 diff --git a/ggml-qnn/qnn.hpp b/ggml-qnn/qnn.hpp index 15df7dcbbe300..8d8ab72b446e3 100644 --- a/ggml-qnn/qnn.hpp +++ b/ggml-qnn/qnn.hpp @@ -11,9 +11,6 @@ #include "QnnGraph.h" #include "QnnProperty.h" #include "QnnTensor.h" -#include "QnnInterface.h" -#include "Saver/QnnSaver.h" -#include "System/QnnSystemInterface.h" #include "HTP/QnnHtpDevice.h" #include "HTP/QnnHtpGraph.h" @@ -864,7 +861,7 @@ namespace qnn_internal { return _qnn_mem_set.count(buf) != 0U; } - const qnn::qcom_socinfo &get_soc_info() { return _soc_info; } + const qnn::qcom_socinfo& get_soc_info() { return _soc_info; } public: std::map( + reinterpret_cast( dlsym(_system_lib_handle, "QnnSystemInterface_getProviders")); if (nullptr == get_providers) { QNN_LOG_WARN( @@ -988,7 +985,7 @@ namespace qnn_internal { } auto get_providers = - load_qnn_functionpointers( + qnn::load_qnn_functionpointers( lib_handle, "QnnInterface_getProviders"); if (nullptr == get_providers) { QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", dlerror()); diff --git a/ggml-qnn/tensor.hpp b/ggml-qnn/tensor.hpp index 514061146e840..687ebd8905ef4 100644 --- a/ggml-qnn/tensor.hpp +++ b/ggml-qnn/tensor.hpp @@ -45,7 +45,7 @@ namespace qnn { QNN_VER_PTR(*_qnn_tensor)->dataType = qnn_data_type; if (is_npu) { - qnn_instance* instance = ctx->instance; + auto* instance = ctx->instance; 
uint8_t* qnn_buffer = static_cast( instance->alloc_rpcmem(ggml_nbytes(tensor), alignof(void*))); if (!qnn_buffer) { @@ -68,7 +68,7 @@ namespace qnn { } else { QNN_VER_PTR(*_qnn_tensor)->clientBuf = { - tensor->data, qnn_get_ggml_tensor_data_size(tensor) }; + tensor->data, get_ggml_tensor_data_size(tensor) }; } } @@ -76,7 +76,7 @@ namespace qnn { ggml_backend_qnn_context* ctx) : _tensor(tensor), _qnn_tensor(qnn_tensor), _context(ctx) { _old_dimensions = QNN_VER_PTR(*_qnn_tensor)->dimensions; - const auto qnn_data_type = qnn_datatype_from_ggml_datatype(tensor->type); + const auto qnn_data_type = qnn::datatype_from_ggml_datatype(tensor->type); const bool is_npu = ctx->device == QNN_BACKEND_NPU; _dimensions[0] = (uint32_t)tensor->ne[0]; @@ -84,7 +84,7 @@ namespace qnn { _dimensions[2] = (uint32_t)tensor->ne[2]; _dimensions[3] = (uint32_t)tensor->ne[3]; QNN_VER_PTR(*_qnn_tensor)->dimensions = _dimensions; - QNN_VER_PTR(*_qnn_tensor)->rank = qnn_get_ggml_tensor_rank(tensor); + QNN_VER_PTR(*_qnn_tensor)->rank = get_ggml_tensor_rank(tensor); QNN_VER_PTR(*_qnn_tensor)->dataType = qnn_data_type; if (is_npu) { @@ -104,7 +104,7 @@ namespace qnn { } else { QNN_VER_PTR(*_qnn_tensor)->clientBuf = { - tensor->data, qnn_get_ggml_tensor_data_size(tensor) }; + tensor->data, get_ggml_tensor_data_size(tensor) }; } } diff --git a/ggml-qnn/utils.hpp b/ggml-qnn/utils.hpp index 4141c4e33c79d..c952e8bc298c6 100644 --- a/ggml-qnn/utils.hpp +++ b/ggml-qnn/utils.hpp @@ -96,4 +96,17 @@ namespace qnn { offset % static_cast(alignment)); } + uint32_t get_ggml_tensor_data_size(const ggml_tensor* tensor) { + /* + size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); + size_t n_dims = qnn_get_ggml_tensor_rank(tensor); + for (int i = 1; i < n_dims; i++) { + data_size *= tensor->ne[i]; + } + + return data_size; + */ + return ggml_nbytes(tensor); + } + } From 37a1585eade7b45b4a0ce01f50b5640ec22f0418 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 19 Jun 2024 17:36:50 +0800 Subject: [PATCH 030/166] rename --- ggml-qnn.cpp | 38 +++++++++++++++++++------------------- ggml-qnn/backend.hpp | 2 +- ggml-qnn/qnn.hpp | 2 +- 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index fdbcbdafb6641..b9599293ba177 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -205,10 +205,10 @@ static bool qnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tenso return false; } - qnn_internal::qnn_instance *instance = nullptr; - Qnn_Tensor_t *tensor_0 = nullptr; - Qnn_Tensor_t *tensor_1 = nullptr; - Qnn_Tensor_t *tensor_2 = nullptr; + qnn::qnn_instance *instance = nullptr; + Qnn_Tensor_t *tensor_0 = nullptr; + Qnn_Tensor_t *tensor_1 = nullptr; + Qnn_Tensor_t *tensor_2 = nullptr; tensor_0 = (Qnn_Tensor_t *) src0->extra; tensor_1 = (Qnn_Tensor_t *) src1->extra; tensor_2 = (Qnn_Tensor_t *) dst->extra; @@ -603,13 +603,13 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx, // keep it for illustrate how to implement a specified GGMPL OP using QNN API + QNN RPC static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - bool graph_initialized = false; - qnn_internal::qnn_instance *instance = nullptr; - std::string graph_name = "ggml_op_qnn_add"; - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Param_t qnn_params[] = {}; - enum ggml_op ggmlop = GGML_OP_ADD; + Qnn_ErrorHandle_t error = QNN_SUCCESS; + bool graph_initialized = false; + qnn::qnn_instance *instance = 
nullptr; + std::string graph_name = "ggml_op_qnn_add"; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Param_t qnn_params[] = {}; + enum ggml_op ggmlop = GGML_OP_ADD; CHECK_PARAMS(ctx, src0, src1, dst); instance = ctx->instance; @@ -797,13 +797,13 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - bool graph_initialized = false; - qnn_internal::qnn_instance *instance = nullptr; - std::string graph_name = "ggml_op_qnn_mul_mat"; - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Param_t qnn_params[] = {}; - enum ggml_op ggmlop = GGML_OP_MUL_MAT; + Qnn_ErrorHandle_t error = QNN_SUCCESS; + bool graph_initialized = false; + qnn::qnn_instance *instance = nullptr; + std::string graph_name = "ggml_op_qnn_mul_mat"; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Param_t qnn_params[] = {}; + enum ggml_op ggmlop = GGML_OP_MUL_MAT; CHECK_PARAMS(ctx, src0, src1, dst); instance = ctx->instance; @@ -1702,7 +1702,7 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { } } - auto *instance = new qnn_internal::qnn_instance(qnn_lib_path, g_qnn_mgr[device].lib, ""); + auto *instance = new qnn::qnn_instance(qnn_lib_path, g_qnn_mgr[device].lib, ""); result = instance->qnn_init(nullptr); if (0 != result) { QNN_LOG_WARN( diff --git a/ggml-qnn/backend.hpp b/ggml-qnn/backend.hpp index fd40d8ad24066..1f674103d29ac 100644 --- a/ggml-qnn/backend.hpp +++ b/ggml-qnn/backend.hpp @@ -16,7 +16,7 @@ struct ggml_backend_qnn_context { int threads; char name[GGML_MAX_NAME]; char lib[GGML_MAX_NAME]; - qnn_internal::qnn_instance* instance; + qnn::qnn_instance* instance; struct ggml_backend* backend; QNN_INTERFACE_VER_TYPE raw_interface; QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; diff --git a/ggml-qnn/qnn.hpp b/ggml-qnn/qnn.hpp index 8d8ab72b446e3..212b6f8521745 100644 --- a/ggml-qnn/qnn.hpp +++ b/ggml-qnn/qnn.hpp @@ -17,7 +17,7 @@ #include "utils.hpp" #include "logger.hpp" -namespace qnn_internal { +namespace qnn { // ================================================================================================= // From ff0359d6f4190c97668680f7ac27477c6d0a21af Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 19 Jun 2024 18:16:11 +0800 Subject: [PATCH 031/166] move qnn helper function into utility files --- ggml-qnn.cpp | 194 +----------------------------------------- ggml-qnn/backend.hpp | 14 ++-- ggml-qnn/utils.hpp | 196 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 205 insertions(+), 199 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index b9599293ba177..3a667a1970aba 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -183,21 +183,6 @@ struct ggml_backend_qnn_buffer_type_context { // QNN backend internal helper functions // // ================================================================================================= -// TODO: only support GGML_OP_ADD/GGML_OP_MUL/GGML_OP_MUL_MAT -static const char * qnn_opname_from_ggmlop(enum ggml_op ggmlop) { - switch (ggmlop) { - case GGML_OP_ADD: - return QNN_OP_ELEMENT_WISE_ADD; - case GGML_OP_MUL: - return QNN_OP_ELEMENT_WISE_MULTIPLY; - case GGML_OP_MUL_MAT: - return QNN_OP_MAT_MUL; - default: - break; - } - return nullptr; -} - static bool qnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { if ((nullptr == ctx) || (nullptr == src0) || 
(nullptr == src1) || (nullptr == dst)) { @@ -270,181 +255,6 @@ class qnn_perf { }; #endif -#define VALIDATE(value, status) \ - do { \ - status = value; \ - if (status != QNN_SUCCESS) { \ - QNN_LOG_WARN("%s expected QNN_SUCCESS\n", #value); \ - return status; \ - } \ - } while (0) - -#define QNN_TENSOR_GET_ID(tensor) get_qnn_tensorid(tensor) -#define QNN_TENSOR_GET_NAME(tensor) get_qnn_tensorname(tensor) -#define QNN_TENSOR_GET_TYPE(tensor) get_qnn_tensortype(tensor) -#define QNN_TENSOR_GET_DATA_FORMAT(tensor) get_qnn_tensor_dataformat(tensor) -#define QNN_TENSOR_GET_DATA_TYPE(tensor) get_qnn_tensor_datatype(tensor) -#define QNN_TENSOR_GET_QUANT_PARAMS(tensor) get_qnn_tensor_quantparams(tensor) -#define QNN_TENSOR_GET_RANK(tensor) get_qnn_tensor_rank(tensor) -#define QNN_TENSOR_GET_DIMENSIONS(tensor) get_qnn_tensor_dimensions(tensor) -#define QNN_TENSOR_GET_MEM_TYPE(tensor) get_qnn_tensor_memtype(tensor) - -#define QNN_TENSOR_SET_ID(tensor, value) set_qnn_tensor_id(tensor, value) -#define QNN_TENSOR_SET_NAME(tensor, value) set_qnn_tensor_name(tensor, value) -#define QNN_TENSOR_SET_TYPE(tensor, value) set_qnn_tensor_type(tensor, value) -#define QNN_TENSOR_SET_DATA_FORMAT(tensor, value) set_qnn_tensor_dataformat(tensor, value) -#define QNN_TENSOR_SET_DATA_TYPE(tensor, value) set_qnn_tensor_datatype(tensor, value) -#define QNN_TENSOR_SET_QUANT_PARAMS(tensor, value) set_qnn_tensor_quantparams(tensor, value) -#define QNN_TENSOR_SET_RANK(tensor, value) set_qnn_tensor_rank(tensor, value) -#define QNN_TENSOR_SET_DIMENSIONS(tensor, value) set_qnn_tensor_dimensions(tensor, value) -#define QNN_TENSOR_SET_MEM_TYPE(tensor, value) set_qnn_tensor_memtype(tensor, value) -#define QNN_TENSOR_SET_CLIENT_BUF(tensor, value) set_qnn_tensor_clientbuf(tensor, value) -#define QNN_TENSOR_SET_MEM_HANDLE(tensor, value) set_qnn_tensor_memhandle(tensor, value) -#define VALIDATE_TENSOR_VERSION(tensor, err) VALIDATE(validate_tensor_version(tensor), err) - -static inline int validate_tensor_version(Qnn_Tensor_t tensor) { - if (tensor.version != QNN_TENSOR_VERSION_1) { - QNN_LOG_WARN( - "validate_tensor_version() tensor %s, got unsupported version %d\n", - tensor.v1.name, tensor.version); - return 1; - } - return 0; -} - -static inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.id; - } - - return 0u; -} - -static inline const char * get_qnn_tensorname(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.name; - } - return nullptr; -} - -static inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.type; - } - return QNN_TENSOR_TYPE_UNDEFINED; -} - -static inline Qnn_TensorDataFormat_t - get_qnn_tensor_dataformat(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.dataFormat; - } - return QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER; -} - -static inline Qnn_DataType_t - get_qnn_tensor_datatype(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.dataType; - } - return QNN_DATATYPE_UNDEFINED; -} - -static inline Qnn_QuantizeParams_t - get_qnn_tensor_quantparams(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.quantizeParams; - } - return QNN_QUANTIZE_PARAMS_INIT; -} - -static inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - 
return tensor.v1.rank; - } - return 0u; -} - -static inline uint32_t * get_qnn_tensor_dimensions(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.dimensions; - } - return nullptr; -} - -static inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.memType; - } - return QNN_TENSORMEMTYPE_UNDEFINED; -} - -static inline void set_qnn_tensor_id(Qnn_Tensor_t & tensor, uint32_t id) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.id = id; - } -} - -static inline void set_qnn_tensor_name(Qnn_Tensor_t & tensor, const char * name) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.name = name; - } -} - -static inline void set_qnn_tensor_type(Qnn_Tensor_t & tensor, Qnn_TensorType_t type) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.type = type; - } -} - -static inline void set_qnn_tensor_dataformat(Qnn_Tensor_t & tensor, Qnn_TensorDataFormat_t format) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.dataFormat = format; - } -} - -static inline void set_qnn_tensor_datatype(Qnn_Tensor_t & tensor, Qnn_DataType_t dataType) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.dataType = dataType; - } -} - -static inline void set_qnn_tensor_quantparams(Qnn_Tensor_t & tensor, Qnn_QuantizeParams_t params) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.quantizeParams = params; - } -} - -static inline void set_qnn_tensor_rank(Qnn_Tensor_t & tensor, uint32_t rank) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.rank = rank; - } -} - -static inline void set_qnn_tensor_dimensions(Qnn_Tensor_t & tensor, uint32_t * dims) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.dimensions = dims; - } -} - -static inline void set_qnn_tensor_memtype(Qnn_Tensor_t & tensor, Qnn_TensorMemType_t mem_type) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.memType = mem_type; - } -} - -static inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t & tensor, Qnn_ClientBuffer_t client_buf) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.clientBuf = client_buf; - } -} - -static inline void set_qnn_tensor_memhandle(Qnn_Tensor_t & tensor, Qnn_MemHandle_t handle) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.memHandle = handle; - } -} - static size_t memscpy(void * dst, size_t dst_size, const void * src, size_t copy_size) { if (!dst || !src || !dst_size || !copy_size) return 0; @@ -613,7 +423,7 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src CHECK_PARAMS(ctx, src0, src1, dst); instance = ctx->instance; - QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + auto qnn_raw_interface = ctx->raw_interface; qnn_perf perf("ggml_qnn_add"); perf.start(); @@ -807,7 +617,7 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, CHECK_PARAMS(ctx, src0, src1, dst); instance = ctx->instance; - QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + auto qnn_raw_interface = ctx->raw_interface; qnn_perf perf("ggml_qnn_mul_mat"); perf.start(); diff --git a/ggml-qnn/backend.hpp b/ggml-qnn/backend.hpp index 1f674103d29ac..b5aacf57c1aa0 100644 --- a/ggml-qnn/backend.hpp +++ b/ggml-qnn/backend.hpp @@ -12,13 +12,13 @@ #include "qnn.hpp" struct ggml_backend_qnn_context { - int device; - int threads; - char name[GGML_MAX_NAME]; - char lib[GGML_MAX_NAME]; + int device; + int threads; + char name[GGML_MAX_NAME]; + char 
lib[GGML_MAX_NAME]; qnn::qnn_instance* instance; - struct ggml_backend* backend; - QNN_INTERFACE_VER_TYPE raw_interface; + ggml_backend* backend; + QNN_INTERFACE_VER_TYPE raw_interface; QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; - qnn::qcom_socinfo socinfo; + qnn::qcom_socinfo socinfo; }; diff --git a/ggml-qnn/utils.hpp b/ggml-qnn/utils.hpp index c952e8bc298c6..2ec7c0f13f0ce 100644 --- a/ggml-qnn/utils.hpp +++ b/ggml-qnn/utils.hpp @@ -109,4 +109,200 @@ namespace qnn { return ggml_nbytes(tensor); } + + // ================================================================================================= + // + // QNN backend internal helper functions + // + // ================================================================================================= + // TODO: only support GGML_OP_ADD/GGML_OP_MUL/GGML_OP_MUL_MAT + const char* opname_from_ggmlop(enum ggml_op ggmlop) { + switch (ggmlop) { + case GGML_OP_ADD: + return QNN_OP_ELEMENT_WISE_ADD; + case GGML_OP_MUL: + return QNN_OP_ELEMENT_WISE_MULTIPLY; + case GGML_OP_MUL_MAT: + return QNN_OP_MAT_MUL; + default: + break; + } + return nullptr; + } + + inline int validate_tensor_version(Qnn_Tensor_t tensor) { + if (tensor.version != QNN_TENSOR_VERSION_1) { + QNN_LOG_WARN( + "validate_tensor_version() tensor %s, got unsupported version %d\n", + tensor.v1.name, tensor.version); + return 1; + } + return 0; + } + + inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t& tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.id; + } + + return 0u; + } + + inline const char* get_qnn_tensorname(const Qnn_Tensor_t& tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.name; + } + return nullptr; + } + + inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t& tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.type; + } + return QNN_TENSOR_TYPE_UNDEFINED; + } + + inline Qnn_TensorDataFormat_t + get_qnn_tensor_dataformat(const Qnn_Tensor_t& tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dataFormat; + } + return QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER; + } + + inline Qnn_DataType_t + get_qnn_tensor_datatype(const Qnn_Tensor_t& tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dataType; + } + return QNN_DATATYPE_UNDEFINED; + } + + inline Qnn_QuantizeParams_t + get_qnn_tensor_quantparams(const Qnn_Tensor_t& tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.quantizeParams; + } + return QNN_QUANTIZE_PARAMS_INIT; + } + + inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t& tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.rank; + } + return 0u; + } + + inline uint32_t* get_qnn_tensor_dimensions(const Qnn_Tensor_t& tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dimensions; + } + return nullptr; + } + + inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t& tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.memType; + } + return QNN_TENSORMEMTYPE_UNDEFINED; + } + + inline void set_qnn_tensor_id(Qnn_Tensor_t& tensor, uint32_t id) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.id = id; + } + } + + inline void set_qnn_tensor_name(Qnn_Tensor_t& tensor, const char* name) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.name = name; + } + } + + inline void set_qnn_tensor_type(Qnn_Tensor_t& tensor, Qnn_TensorType_t type) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + 
tensor.v1.type = type; + } + } + + inline void set_qnn_tensor_dataformat(Qnn_Tensor_t& tensor, Qnn_TensorDataFormat_t format) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.dataFormat = format; + } + } + + inline void set_qnn_tensor_datatype(Qnn_Tensor_t& tensor, Qnn_DataType_t dataType) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.dataType = dataType; + } + } + + inline void set_qnn_tensor_quantparams(Qnn_Tensor_t& tensor, Qnn_QuantizeParams_t params) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.quantizeParams = params; + } + } + + inline void set_qnn_tensor_rank(Qnn_Tensor_t& tensor, uint32_t rank) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.rank = rank; + } + } + + inline void set_qnn_tensor_dimensions(Qnn_Tensor_t& tensor, uint32_t* dims) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.dimensions = dims; + } + } + + inline void set_qnn_tensor_memtype(Qnn_Tensor_t& tensor, Qnn_TensorMemType_t mem_type) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.memType = mem_type; + } + } + + inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t& tensor, Qnn_ClientBuffer_t client_buf) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.clientBuf = client_buf; + } + } + + inline void set_qnn_tensor_memhandle(Qnn_Tensor_t& tensor, Qnn_MemHandle_t handle) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.memHandle = handle; + } + } } + + +#define VALIDATE(value, status) \ + do { \ + status = value; \ + if (status != QNN_SUCCESS) { \ + QNN_LOG_WARN("%s expected QNN_SUCCESS\n", #value); \ + return status; \ + } \ + } while (0) + +#define QNN_TENSOR_GET_ID(tensor) qnn::get_qnn_tensorid(tensor) +#define QNN_TENSOR_GET_NAME(tensor) qnn::get_qnn_tensorname(tensor) +#define QNN_TENSOR_GET_TYPE(tensor) qnn::get_qnn_tensortype(tensor) +#define QNN_TENSOR_GET_DATA_FORMAT(tensor) qnn::get_qnn_tensor_dataformat(tensor) +#define QNN_TENSOR_GET_DATA_TYPE(tensor) qnn::get_qnn_tensor_datatype(tensor) +#define QNN_TENSOR_GET_QUANT_PARAMS(tensor) qnn::get_qnn_tensor_quantparams(tensor) +#define QNN_TENSOR_GET_RANK(tensor) qnn::get_qnn_tensor_rank(tensor) +#define QNN_TENSOR_GET_DIMENSIONS(tensor) qnn::get_qnn_tensor_dimensions(tensor) +#define QNN_TENSOR_GET_MEM_TYPE(tensor) qnn::get_qnn_tensor_memtype(tensor) + +#define QNN_TENSOR_SET_ID(tensor, value) qnn::set_qnn_tensor_id(tensor, value) +#define QNN_TENSOR_SET_NAME(tensor, value) qnn::set_qnn_tensor_name(tensor, value) +#define QNN_TENSOR_SET_TYPE(tensor, value) qnn::set_qnn_tensor_type(tensor, value) +#define QNN_TENSOR_SET_DATA_FORMAT(tensor, value) qnn::set_qnn_tensor_dataformat(tensor, value) +#define QNN_TENSOR_SET_DATA_TYPE(tensor, value) qnn::set_qnn_tensor_datatype(tensor, value) +#define QNN_TENSOR_SET_QUANT_PARAMS(tensor, value) qnn::set_qnn_tensor_quantparams(tensor, value) +#define QNN_TENSOR_SET_RANK(tensor, value) qnn::set_qnn_tensor_rank(tensor, value) +#define QNN_TENSOR_SET_DIMENSIONS(tensor, value) qnn::set_qnn_tensor_dimensions(tensor, value) +#define QNN_TENSOR_SET_MEM_TYPE(tensor, value) qnn::set_qnn_tensor_memtype(tensor, value) +#define QNN_TENSOR_SET_CLIENT_BUF(tensor, value) qnn::set_qnn_tensor_clientbuf(tensor, value) +#define QNN_TENSOR_SET_MEM_HANDLE(tensor, value) qnn::set_qnn_tensor_memhandle(tensor, value) +#define VALIDATE_TENSOR_VERSION(tensor, err) VALIDATE(qnn::validate_tensor_version(tensor), err) From e1056da1c083ecba7b10d4833963a7c429cee054 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Mon, 24 Jun 2024 12:06:42 
+0800 Subject: [PATCH 032/166] fix op handle checker --- ggml-qnn.cpp | 231 ++++++++++++++++++++++----------------------------- 1 file changed, 100 insertions(+), 131 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index 3a667a1970aba..ffa43718410ab 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -354,12 +354,100 @@ static int free_qnn_tensor(Qnn_Tensor_t & tensor) { // implementation of QNN backend for GGML // // ================================================================================================= +static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst); +static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); + +static ggml_qnn_func_t s_op_table[GGML_OP_COUNT] = { + nullptr, // GGML_OP_NONE + nullptr, // GGML_OP_DUP + ggml_qnn_add, // GGML_OP_ADD + nullptr, // GGML_OP_ADD1 + nullptr, // GGML_OP_ACC + nullptr, // GGML_OP_SUB + nullptr, // GGML_OP_MUL + nullptr, // GGML_OP_DIV + nullptr, // GGML_OP_SQR + nullptr, // GGML_OP_SQRT + nullptr, // GGML_OP_LOG + nullptr, // GGML_OP_SUM + nullptr, // GGML_OP_SUM_ROWS + nullptr, // GGML_OP_MEAN + nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_REPEAT + nullptr, // GGML_OP_REPEAT_BACK + nullptr, // GGML_OP_CONCAT + nullptr, // GGML_OP_SILU_BACK + nullptr, // GGML_OP_NORM + nullptr, // GGML_OP_RMS_NORM + nullptr, // GGML_OP_RMS_NORM_BACK + nullptr, // GGML_OP_GROUP_NORM + + ggml_qnn_mul_mat, // GGML_OP_MUL_MAT + nullptr, // GGML_OP_MUL_MAT_ID + nullptr, // GGML_OP_OUT_PROD + + nullptr, // GGML_OP_SCALE + nullptr, // GGML_OP_SET + nullptr, // GGML_OP_CPY + nullptr, // GGML_OP_CONT + nullptr, // GGML_OP_RESHAPE + nullptr, // GGML_OP_VIEW + nullptr, // GGML_OP_PERMUTE + nullptr, // GGML_OP_TRANSPOSE + nullptr, // GGML_OP_GET_ROWS + nullptr, // GGML_OP_GET_ROWS_BACK + nullptr, // GGML_OP_DIAG + nullptr, // GGML_OP_DIAG_MASK_INF + nullptr, // GGML_OP_DIAG_MASK_ZERO + nullptr, // GGML_OP_SOFT_MAX + nullptr, // GGML_OP_SOFT_MAX_BACK + nullptr, // GGML_OP_ROPE + nullptr, // GGML_OP_ROPE_BACK + nullptr, // GGML_OP_CLAMP + nullptr, // GGML_OP_CONV_TRANSPOSE_1D + nullptr, // GGML_OP_IM2COL + nullptr, // GGML_OP_CONV_TRANSPOSE_2D + nullptr, // GGML_OP_POOL_1D + nullptr, // GGML_OP_POOL_2D + nullptr, // GGML_OP_UPSCALE + nullptr, // GGML_OP_PAD + nullptr, // GGML_OP_ARANGE + nullptr, // GGML_OP_TIMESTEP_EMBEDDING + nullptr, // GGML_OP_ARGSORT + nullptr, // GGML_OP_LEAKY_RELU + + nullptr, // GGML_OP_FLASH_ATTN_EXT + nullptr, // GGML_OP_FLASH_ATTN_BACK + nullptr, // GGML_OP_SSM_CONV + nullptr, // GGML_OP_SSM_SCAN + nullptr, // GGML_OP_WIN_PART + nullptr, // GGML_OP_WIN_UNPART + nullptr, // GGML_OP_GET_REL_POS + nullptr, // GGML_OP_ADD_REL_POS + + nullptr, // GGML_OP_UNARY + + nullptr, // GGML_OP_MAP_UNARY + nullptr, // GGML_OP_MAP_BINARY + + nullptr, // GGML_OP_MAP_CUSTOM1_F32 + nullptr, // GGML_OP_MAP_CUSTOM2_F32 + nullptr, // GGML_OP_MAP_CUSTOM3_F32 + + nullptr, // GGML_OP_MAP_CUSTOM1 + nullptr, // GGML_OP_MAP_CUSTOM2 + nullptr, // GGML_OP_MAP_CUSTOM3 + + nullptr, // GGML_OP_CROSS_ENTROPY_LOSS + nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK +}; + static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx, const struct ggml_tensor * tensor, bool b_dump_tensor_info) { - if (ggml_is_empty(tensor) || tensor->op == GGML_OP_RESHAPE || - tensor->op == GGML_OP_TRANSPOSE || tensor->op == GGML_OP_VIEW || - tensor->op == GGML_OP_PERMUTE || tensor->op == GGML_OP_NONE) { + if (ggml_is_empty(tensor) || 
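        // empty tensors and ops without an entry in s_op_table are rejected up front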
!s_op_table[tensor->op]) { return false; } @@ -369,10 +457,10 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx, return false; } - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; + const auto ne00 = src0->ne[0]; + const auto ne01 = src0->ne[1]; + const auto ne10 = src1->ne[0]; + const auto ne11 = src1->ne[1]; // make qnn_get_ggml_tensor_rank and QNN SDK happy if (ne00 <= 1 || ne01 <= 1 || ne10 <= 1 || ne11 <= 1) { return false; @@ -951,132 +1039,13 @@ static void ggml_qnn_nop(ggml_backend_qnn_context * ctx, const ggml_tensor * src bool ggml_qnn_compute_forward(ggml_backend_qnn_context * ctx, struct ggml_compute_params * params, struct ggml_tensor * tensor) { - ggml_qnn_func_t func = nullptr; - - switch (tensor->op) { - case GGML_OP_ADD: - func = ggml_qnn_add; - break; - case GGML_OP_MUL_MAT: - func = ggml_qnn_mul_mat; - break; - case GGML_OP_REPEAT: - func = ggml_qnn_repeat; - break; - case GGML_OP_GET_ROWS: - func = ggml_qnn_get_rows; - break; - case GGML_OP_DUP: - func = ggml_qnn_dup; - break; - case GGML_OP_ACC: - func = ggml_qnn_acc; - break; - case GGML_OP_DIV: - func = ggml_qnn_div; - break; - case GGML_OP_UNARY: - switch (ggml_get_unary_op(tensor)) { - case GGML_UNARY_OP_GELU: - func = ggml_qnn_gelu; - break; - case GGML_UNARY_OP_SILU: - func = ggml_qnn_silu; - break; - case GGML_UNARY_OP_GELU_QUICK: - func = ggml_qnn_gelu_quick; - break; - case GGML_UNARY_OP_TANH: - func = ggml_qnn_tanh; - break; - case GGML_UNARY_OP_RELU: - func = ggml_qnn_relu; - break; - case GGML_UNARY_OP_HARDSIGMOID: - func = ggml_qnn_hardsigmoid; - break; - case GGML_UNARY_OP_HARDSWISH: - func = ggml_qnn_hardswish; - break; - default: - return false; - } - break; - case GGML_OP_NORM: - func = ggml_qnn_norm; - break; - case GGML_OP_GROUP_NORM: - func = ggml_qnn_group_norm; - break; - case GGML_OP_CONCAT: - func = ggml_qnn_concat; - break; - case GGML_OP_UPSCALE: - func = ggml_qnn_upscale; - break; - case GGML_OP_PAD: - func = ggml_qnn_pad; - break; - case GGML_OP_LEAKY_RELU: - func = ggml_qnn_leaky_relu; - break; - case GGML_OP_RMS_NORM: - func = ggml_qnn_rms_norm; - break; - case GGML_OP_MUL_MAT_ID: - func = ggml_qnn_mul_mat_id; - break; - case GGML_OP_SCALE: - func = ggml_qnn_scale; - break; - case GGML_OP_SQR: - func = ggml_qnn_sqr; - break; - case GGML_OP_CLAMP: - func = ggml_qnn_clamp; - break; - case GGML_OP_CPY: - func = ggml_qnn_cpy; - break; - case GGML_OP_CONT: - func = ggml_qnn_dup; - break; - case GGML_OP_NONE: - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: - case GGML_OP_PERMUTE: - case GGML_OP_TRANSPOSE: - func = ggml_qnn_nop; - break; - case GGML_OP_DIAG_MASK_INF: - func = ggml_qnn_diag_mask_inf; - break; - case GGML_OP_SOFT_MAX: - func = ggml_qnn_soft_max; - break; - case GGML_OP_ROPE: - func = ggml_qnn_rope; - break; - case GGML_OP_IM2COL: - func = ggml_qnn_im2col; - break; - case GGML_OP_POOL_2D: - func = ggml_qnn_pool2d; - break; - case GGML_OP_SUM_ROWS: - func = ggml_qnn_sum_rows; - break; - case GGML_OP_ARGSORT: - func = ggml_qnn_argsort; - break; - default: + ggml_qnn_func_t func = s_op_table[tensor->op]; + if (!func) { + QNN_LOG_WARN("unsupported op %d", tensor->op); return false; } - if (nullptr != func) { - func(ctx, tensor->src[0], tensor->src[1], tensor); - } - + func(ctx, tensor->src[0], tensor->src[1], tensor); return true; } @@ -1349,7 +1318,7 @@ GGML_CALL static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, GGML_CALL static bool 
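// note: the offload check below now just asks ggml_qnn_can_handle_op() whether the op is supported instead of eagerly executing it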
ggml_backend_qnn_offload_op(ggml_backend_t backend,const ggml_tensor * tensor) { ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; - return ggml_qnn_compute_forward(ctx, nullptr, (ggml_tensor *) tensor); + return ggml_qnn_can_handle_op(ctx, tensor, false); } static ggml_backend_i ggml_backend_qnn_interface = { From c9e99bd603ea358eaa1c54505fd2d26faa3d9d4e Mon Sep 17 00:00:00 2001 From: hongruichen Date: Mon, 24 Jun 2024 22:11:28 +0800 Subject: [PATCH 033/166] split qnn ops into file --- ggml-qnn.cpp | 723 +--------------------------------- ggml-qnn/backend-ops.cpp | 675 +++++++++++++++++++++++++++++++ ggml-qnn/backend-ops.hpp | 17 + ggml-qnn/backend.hpp | 5 - ggml-qnn/qnn.hpp | 13 +- ggml-qnn/tensor.hpp | 1 + ggml-qnn/utils.cpp | 126 ++++++ ggml-qnn/utils.hpp | 172 +++----- tests/ggml-qnn/CMakeLists.txt | 2 + 9 files changed, 889 insertions(+), 845 deletions(-) create mode 100644 ggml-qnn/backend-ops.cpp create mode 100644 ggml-qnn/backend-ops.hpp create mode 100644 ggml-qnn/utils.cpp diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index ffa43718410ab..750d5ff91c3d3 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -1,22 +1,14 @@ #include #include -#include #include #include -#include -#include -#include #include #include -#include -#include #include -#include #include #include #include -#include #include #include #include @@ -28,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -40,8 +31,9 @@ #include "ggml-qnn/logger.hpp" #include "ggml-qnn/utils.hpp" -#include "ggml-qnn/backend.hpp" #include "ggml-qnn/tensor.hpp" +#include "ggml-qnn/backend.hpp" +#include "ggml-qnn/backend-ops.hpp" // ================================================================================================= // @@ -63,11 +55,6 @@ static int free_qnn_tensor(Qnn_Tensor_t & tensor); #define QNN_BACKEND_NAME "qnn" -typedef void (*ggml_qnn_func_t)(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, - const ggml_tensor * src1, - ggml_tensor * dst); - static struct qnn::qcom_socinfo g_qnn_soc_info_table[] = { /* Qualcomm SnapDragon 8 Gen 1 */ [qnn::SM8450] = { @@ -183,78 +170,6 @@ struct ggml_backend_qnn_buffer_type_context { // QNN backend internal helper functions // // ================================================================================================= -static bool qnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst) { - if ((nullptr == ctx) || (nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { - QNN_LOG_WARN("invalid params\n"); - return false; - } - - qnn::qnn_instance *instance = nullptr; - Qnn_Tensor_t *tensor_0 = nullptr; - Qnn_Tensor_t *tensor_1 = nullptr; - Qnn_Tensor_t *tensor_2 = nullptr; - tensor_0 = (Qnn_Tensor_t *) src0->extra; - tensor_1 = (Qnn_Tensor_t *) src1->extra; - tensor_2 = (Qnn_Tensor_t *) dst->extra; - instance = ctx->instance; - if ((nullptr == instance) || (nullptr == tensor_0) || (nullptr == tensor_1) || (nullptr == tensor_2)) { - QNN_LOG_WARN("invalid params\n"); - return false; - } - - return true; -} - -#ifndef NDEBUG -#define CHECK_PARAMS(ctx, src0, src1, dst) \ - do { \ - if (!qnn_is_valid_params((ctx), (src0), (src1), (dst))) { \ - return; \ - } \ - } while (0) - -#else -#define CHECK_PARAMS(ctx, src0, src1, dst) -#endif - -#if ENABLE_QNNBACKEND_PERF -class qnn_perf { -public: - qnn_perf(const std::string & perf_name) : _perf_name(std::move(perf_name)) {}; - qnn_perf() = delete; - qnn_perf(const qnn_perf & ) = delete; - qnn_perf & 
operator= (const qnn_perf & ) = delete; - - void start() { - _begin_time = ggml_time_us(); - } - - void info() { - _end_time = ggml_time_us(); - _duration = (_end_time - _begin_time); - QNN_LOG_INFO("duration of %s : %lld microseconds\n", _perf_name.c_str(), _duration); - } - -private: - int64_t _begin_time = 0LL; - int64_t _end_time = 0LL; - int64_t _duration = 0LL; - std::string _perf_name; -}; -#else -class qnn_perf { -public: - qnn_perf(const std::string & perf_name) {} - qnn_perf() = delete; - qnn_perf(const qnn_perf & ) = delete; - qnn_perf & operator= (const qnn_perf & ) = delete; - - void start() {} - void info() {} -}; -#endif - static size_t memscpy(void * dst, size_t dst_size, const void * src, size_t copy_size) { if (!dst || !src || !dst_size || !copy_size) return 0; @@ -354,100 +269,10 @@ static int free_qnn_tensor(Qnn_Tensor_t & tensor) { // implementation of QNN backend for GGML // // ================================================================================================= -static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst); -static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); - -static ggml_qnn_func_t s_op_table[GGML_OP_COUNT] = { - nullptr, // GGML_OP_NONE - nullptr, // GGML_OP_DUP - ggml_qnn_add, // GGML_OP_ADD - nullptr, // GGML_OP_ADD1 - nullptr, // GGML_OP_ACC - nullptr, // GGML_OP_SUB - nullptr, // GGML_OP_MUL - nullptr, // GGML_OP_DIV - nullptr, // GGML_OP_SQR - nullptr, // GGML_OP_SQRT - nullptr, // GGML_OP_LOG - nullptr, // GGML_OP_SUM - nullptr, // GGML_OP_SUM_ROWS - nullptr, // GGML_OP_MEAN - nullptr, // GGML_OP_ARGMAX - nullptr, // GGML_OP_REPEAT - nullptr, // GGML_OP_REPEAT_BACK - nullptr, // GGML_OP_CONCAT - nullptr, // GGML_OP_SILU_BACK - nullptr, // GGML_OP_NORM - nullptr, // GGML_OP_RMS_NORM - nullptr, // GGML_OP_RMS_NORM_BACK - nullptr, // GGML_OP_GROUP_NORM - - ggml_qnn_mul_mat, // GGML_OP_MUL_MAT - nullptr, // GGML_OP_MUL_MAT_ID - nullptr, // GGML_OP_OUT_PROD - - nullptr, // GGML_OP_SCALE - nullptr, // GGML_OP_SET - nullptr, // GGML_OP_CPY - nullptr, // GGML_OP_CONT - nullptr, // GGML_OP_RESHAPE - nullptr, // GGML_OP_VIEW - nullptr, // GGML_OP_PERMUTE - nullptr, // GGML_OP_TRANSPOSE - nullptr, // GGML_OP_GET_ROWS - nullptr, // GGML_OP_GET_ROWS_BACK - nullptr, // GGML_OP_DIAG - nullptr, // GGML_OP_DIAG_MASK_INF - nullptr, // GGML_OP_DIAG_MASK_ZERO - nullptr, // GGML_OP_SOFT_MAX - nullptr, // GGML_OP_SOFT_MAX_BACK - nullptr, // GGML_OP_ROPE - nullptr, // GGML_OP_ROPE_BACK - nullptr, // GGML_OP_CLAMP - nullptr, // GGML_OP_CONV_TRANSPOSE_1D - nullptr, // GGML_OP_IM2COL - nullptr, // GGML_OP_CONV_TRANSPOSE_2D - nullptr, // GGML_OP_POOL_1D - nullptr, // GGML_OP_POOL_2D - nullptr, // GGML_OP_UPSCALE - nullptr, // GGML_OP_PAD - nullptr, // GGML_OP_ARANGE - nullptr, // GGML_OP_TIMESTEP_EMBEDDING - nullptr, // GGML_OP_ARGSORT - nullptr, // GGML_OP_LEAKY_RELU - - nullptr, // GGML_OP_FLASH_ATTN_EXT - nullptr, // GGML_OP_FLASH_ATTN_BACK - nullptr, // GGML_OP_SSM_CONV - nullptr, // GGML_OP_SSM_SCAN - nullptr, // GGML_OP_WIN_PART - nullptr, // GGML_OP_WIN_UNPART - nullptr, // GGML_OP_GET_REL_POS - nullptr, // GGML_OP_ADD_REL_POS - - nullptr, // GGML_OP_UNARY - - nullptr, // GGML_OP_MAP_UNARY - nullptr, // GGML_OP_MAP_BINARY - - nullptr, // GGML_OP_MAP_CUSTOM1_F32 - nullptr, // GGML_OP_MAP_CUSTOM2_F32 - nullptr, // GGML_OP_MAP_CUSTOM3_F32 - - nullptr, // GGML_OP_MAP_CUSTOM1 - nullptr, // 
GGML_OP_MAP_CUSTOM2 - nullptr, // GGML_OP_MAP_CUSTOM3 - - nullptr, // GGML_OP_CROSS_ENTROPY_LOSS - nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK -}; - static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx, const struct ggml_tensor * tensor, bool b_dump_tensor_info) { - if (ggml_is_empty(tensor) || !s_op_table[tensor->op]) { + if (ggml_is_empty(tensor) || !qnn::ggml_qnn_op_array()[tensor->op]) { return false; } @@ -496,550 +321,10 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx, return true; } - -//TODO: this function can be removed later because there are duplicated codes with ggml_qnn_mul_mat -// keep it for illustrate how to implement a specified GGMPL OP using QNN API + QNN RPC -static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - bool graph_initialized = false; - qnn::qnn_instance *instance = nullptr; - std::string graph_name = "ggml_op_qnn_add"; - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Param_t qnn_params[] = {}; - enum ggml_op ggmlop = GGML_OP_ADD; - - CHECK_PARAMS(ctx, src0, src1, dst); - instance = ctx->instance; - auto qnn_raw_interface = ctx->raw_interface; - - qnn_perf perf("ggml_qnn_add"); - perf.start(); - - std::string map_entry = std::string(ggml_op_name(ggmlop)); - if (instance->_qnn_graph_map.find(map_entry) != - instance->_qnn_graph_map.end()) { - graph_initialized = true; - auto & graph_item = instance->_qnn_graph_map[map_entry]; - graph_handle = std::get<0>(graph_item); - } - - if (!graph_initialized) { - graph_name = graph_name + "_" + std::to_string(ctx->threads) + - "_" + src0->name + "_" + src1->name; - QNN_LOG_INFO("graph name %s", graph_name.c_str()); - if (ctx->device == QNN_BACKEND_NPU) { - QnnHtpGraph_CustomConfig_t hvx_config; - hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; - hvx_config.numHvxThreads = 8; - QnnGraph_Config_t graph_hvx_config; - graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_hvx_config.customConfig = &hvx_config; - - QnnHtpGraph_CustomConfig_t dlbc_config; - dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; - dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; - dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC - QnnGraph_Config_t graph_dlbc_config; - graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_dlbc_config.customConfig = &dlbc_config; - - QnnHtpGraph_CustomConfig_t opt_config; - opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; - opt_config.optimizationOption.floatValue = 1; // 1 / 3 - QnnGraph_Config_t graph_opt_config; - graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_opt_config.customConfig = &opt_config; - - QnnHtpGraph_CustomConfig_t vtcm_config; - vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; - vtcm_config.vtcmSizeInMB = ctx->socinfo.vtcm_size_in_mb; - QnnGraph_Config_t graph_vtcm_config; - graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_vtcm_config.customConfig = &vtcm_config; - - const QnnGraph_Config_t * p_graphconfig[] = {&graph_hvx_config, - &graph_dlbc_config, - &graph_vtcm_config, - &graph_opt_config, - NULL}; - error = qnn_raw_interface.graphCreate( - instance->get_qnn_context_handle(), graph_name.c_str(), p_graphconfig, - &graph_handle); - } else { - error = qnn_raw_interface.graphCreate( - instance->get_qnn_context_handle(), 
graph_name.c_str(), nullptr, - &graph_handle); - } - - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("can't create qnn graph handle with graph name %s, " - "error = %d\n", - graph_name.c_str(), error); - goto failure; - } else { - QNN_LOG_INFO("create qnn graph handle with graph name %s ok\n", graph_name.c_str()); - } - - qnn::ggml_qnn_tensor_input tensor_input0(src0, graph_handle, ctx); - if (!tensor_input0.is_valid()) { - goto failure; - } - qnn::ggml_qnn_tensor_input tensor_input1(src1, graph_handle, ctx); - if (!tensor_input1.is_valid()) { - QNN_LOG_INFO("error = %d\n", error); - goto failure; - } - qnn::ggml_qnn_tensor_output tensor_output(dst, graph_handle, ctx); - if (!tensor_output.is_valid()) { - goto failure; - } - - Qnn_Tensor_t tensor_inputs[] = {*tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor()}; - Qnn_Tensor_t tensor_outputs[] = {*tensor_output.get_qnn_tensor()}; - Qnn_OpConfig_t op_config = { - (Qnn_OpConfigVersion_t) 1, - .v1 = {"ggml_op_add", - QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_ELEMENT_WISE_ADD, - 0, qnn_params, - 2, tensor_inputs, - 1,tensor_outputs} - }; - error = qnn_raw_interface.graphAddNode(graph_handle, op_config); - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); - goto failure; - } - error = qnn_raw_interface.graphFinalize(graph_handle, - nullptr, nullptr); - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); - goto failure; - } - error = qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, - tensor_outputs, 1, - nullptr, nullptr); - if (ctx->device == QNN_BACKEND_NPU) { - if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) { - QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n"); - } - } - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); - goto failure; - } - - auto graph_item = std::make_tuple(graph_handle, - tensor_input0.get_qnn_tensor(), - tensor_input1.get_qnn_tensor(), - tensor_output.get_qnn_tensor()); - instance->_qnn_graph_map[map_entry] = graph_item; - } else { - auto & graph_item = instance->_qnn_graph_map[map_entry]; - qnn::ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx); - qnn::ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx); - qnn::ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx); - - Qnn_Tensor_t tensor_inputs[] = {*tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor()}; - Qnn_Tensor_t tensor_outputs[] = {*tensor_output.get_qnn_tensor()}; - error = qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs,2, - tensor_outputs,1, - nullptr, nullptr); - if (ctx->device == QNN_BACKEND_NPU) { - if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) { - QNN_LOG_WARN("NPU crashed. SSR detected. 
Caused QNN graph execute error\n"); - } - } - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); - goto failure; - } - } - -failure: - if (QNN_SUCCESS != error) { - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, src0->type, ggml_type_name(src0->type), - src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], - src0->nb[1], src0->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, src1->type, ggml_type_name(src1->type), - src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], - src1->nb[1], src1->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - dst->name, dst->type, ggml_type_name(dst->type), - dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], - dst->nb[1], dst->nb[2]); - } - - perf.info(); -} - -/* - * ggml_qnn_mul_mat was re-added as a standalone function because - * the following comments came from https://github.com/ggerganov/llama.cpp/pull/1632 - * MUL_MAT take most of the compute time (about 95%). - * So to speed up llama, we have to focus on MUL_MAT. - * - * We have three kinds of MUL_MAT to compute: - * mul_mat_f32: both src0 and src1 are F32. - * mul_mat_f16_f32: src0 is F16 and src1 is F32. - * mul_mat_q_f32: src0 is quantized (Q4_0, Q4_1, ...), and src1 is F32. - */ -static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - bool graph_initialized = false; - qnn::qnn_instance *instance = nullptr; - std::string graph_name = "ggml_op_qnn_mul_mat"; - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Param_t qnn_params[] = {}; - enum ggml_op ggmlop = GGML_OP_MUL_MAT; - - CHECK_PARAMS(ctx, src0, src1, dst); - instance = ctx->instance; - auto qnn_raw_interface = ctx->raw_interface; - - qnn_perf perf("ggml_qnn_mul_mat"); - perf.start(); - - std::string map_entry = std::string(ggml_op_name(ggmlop)); - if (instance->_qnn_graph_map.find(map_entry) != - instance->_qnn_graph_map.end()) { - graph_initialized = true; - auto & graph_item = instance->_qnn_graph_map[map_entry]; - graph_handle = std::get<0>(graph_item); - } - - //TODO: for scenarios of quantized data in src0 - // pass-1: dequantize src0 to FP32 - // pass-2: dq-src0 * src1 - // the performance gains is worth although there is performance loss in pass-1 - - if (!graph_initialized) { - graph_name = graph_name + "_" + std::to_string(ctx->threads) + - "_" + src0->name + "_" + src1->name; - QNN_LOG_INFO("graph name %s", graph_name.c_str()); - if (ctx->device == QNN_BACKEND_NPU) { - QnnHtpGraph_CustomConfig_t hvx_config; - hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; - hvx_config.numHvxThreads = 8; - QnnGraph_Config_t graph_hvx_config; - graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_hvx_config.customConfig = &hvx_config; - - QnnHtpGraph_CustomConfig_t dlbc_config; - dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; - dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; - dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC - QnnGraph_Config_t graph_dlbc_config; - graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_dlbc_config.customConfig = &dlbc_config; - - QnnHtpGraph_CustomConfig_t opt_config; - opt_config.optimizationOption.type = 
QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; - opt_config.optimizationOption.floatValue = 1; //1 / 3 - QnnGraph_Config_t graph_opt_config; - graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_opt_config.customConfig = &opt_config; - - QnnHtpGraph_CustomConfig_t vtcm_config; - vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; - vtcm_config.vtcmSizeInMB = ctx->socinfo.vtcm_size_in_mb; - QnnGraph_Config_t graph_vtcm_config; - graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_vtcm_config.customConfig = &vtcm_config; - - const QnnGraph_Config_t * p_graphconfig[] = {&graph_hvx_config, - &graph_dlbc_config, - &graph_vtcm_config, - &graph_opt_config, - NULL}; - error = qnn_raw_interface.graphCreate( - instance->get_qnn_context_handle(), graph_name.c_str(), p_graphconfig, - &graph_handle); - } else { - error = qnn_raw_interface.graphCreate( - instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, - &graph_handle); - } - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("can't create qnn graph handle with graph name %s, " - "error = %d\n", - graph_name.c_str(), error); - goto failure; - } - - qnn::ggml_qnn_tensor_input tensor_input0(src0, graph_handle, ctx); - if (!tensor_input0.is_valid()) { - goto failure; - } - qnn::ggml_qnn_tensor_input tensor_input1(src1, graph_handle, ctx); - if (!tensor_input1.is_valid()) { - goto failure; - } - qnn::ggml_qnn_tensor_output tensor_output(dst, graph_handle, ctx); - if (!tensor_output.is_valid()) { - goto failure; - } - - Qnn_Tensor_t tensor_inputs[] = {*tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor()}; - Qnn_Tensor_t tensor_outputs[] = {*tensor_output.get_qnn_tensor()}; - Qnn_OpConfig_t op_config = { - (Qnn_OpConfigVersion_t) 1, - .v1 = {"ggml_op_mul_mat", - QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_MAT_MUL, - 0, qnn_params, - 2, tensor_inputs, - 1, tensor_outputs} - }; - error = qnn_raw_interface.graphAddNode(graph_handle, op_config); - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); - goto failure; - } - error = qnn_raw_interface.graphFinalize(graph_handle, - nullptr, nullptr); - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); - goto failure; - } - error = qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, - tensor_outputs, 1, - nullptr, nullptr); - if (ctx->device == QNN_BACKEND_NPU) { - if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) { - QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n"); - } - } - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); - goto failure; - } - - auto graph_item = std::make_tuple(graph_handle, - tensor_input0.get_qnn_tensor(), - tensor_input1.get_qnn_tensor(), - tensor_output.get_qnn_tensor()); - instance->_qnn_graph_map[map_entry] = graph_item; - } else { - auto & graph_item= instance->_qnn_graph_map[map_entry]; - qnn::ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx); - qnn::ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx); - qnn::ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx); - - Qnn_Tensor_t tensor_inputs[] = {*tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor()}; - Qnn_Tensor_t tensor_outputs[] = {*tensor_output.get_qnn_tensor()}; - error = qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, - tensor_outputs, 1, - nullptr, nullptr); - if (ctx->device == QNN_BACKEND_NPU) { - if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) { - QNN_LOG_WARN("NPU crashed. 
SSR detected. Caused QNN graph execute error\n"); - } - } - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); - goto failure; - } - } - -failure: - if (QNN_SUCCESS != error) { - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, src0->type, ggml_type_name(src0->type), - src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], - src0->nb[1], src0->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, src1->type, ggml_type_name(src1->type), - src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], - src1->nb[1], src1->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0], - dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); - } - - perf.info(); -} - -static void ggml_qnn_repeat(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { -} - -static void ggml_qnn_get_rows(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { -} - -static void ggml_qnn_acc(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst) { -} - -static void ggml_qnn_div(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst) { -} - -static void ggml_qnn_gelu(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { -} - -static void ggml_qnn_silu(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { -} - -static void ggml_qnn_gelu_quick(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst) { -} - -static void ggml_qnn_tanh(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { -} - -static void ggml_qnn_relu(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { -} - -static void ggml_qnn_hardsigmoid(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst) { -} - -static void ggml_qnn_hardswish(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { -} - -static void ggml_qnn_leaky_relu(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst) { -} - -static void ggml_qnn_sqr(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst) { -} - -static void ggml_qnn_norm(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { -} - -static void ggml_qnn_group_norm(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst) { -} - -static void ggml_qnn_concat(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { -} - -static void ggml_qnn_upscale(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { -} - -static void ggml_qnn_pad(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst) { -} - -static void 
ggml_qnn_rms_norm(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { -} - -static void ggml_qnn_cpy(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst) { -} - -static void ggml_qnn_dup(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst) { - ggml_qnn_cpy(ctx, src0, dst, nullptr); - (void) src1; -} - -static void ggml_qnn_mul_mat_id(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst) { -} - -static void ggml_qnn_scale(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { -} - -static void ggml_qnn_clamp(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { -} - -static void ggml_qnn_diag_mask_inf(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst) { -} - -static void ggml_qnn_soft_max(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { -} - -static void ggml_qnn_rope(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { - GGML_ASSERT(ggml_is_contiguous(src0)); -} - -static void ggml_qnn_pool2d(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { -} - -static void ggml_qnn_im2col(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { -} - -static void ggml_qnn_sum_rows(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { - GGML_ASSERT(ggml_is_contiguous(src0)); -} - -static void ggml_qnn_argsort(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { - GGML_ASSERT(ggml_is_contiguous(src0)); -} - -static void ggml_qnn_nop(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst) { - (void)src0; - (void)src1; - (void)dst; -} - bool ggml_qnn_compute_forward(ggml_backend_qnn_context * ctx, struct ggml_compute_params * params, struct ggml_tensor * tensor) { - ggml_qnn_func_t func = s_op_table[tensor->op]; + auto func = qnn::ggml_qnn_op_array()[tensor->op]; if (!func) { QNN_LOG_WARN("unsupported op %d", tensor->op); return false; diff --git a/ggml-qnn/backend-ops.cpp b/ggml-qnn/backend-ops.cpp new file mode 100644 index 0000000000000..a9c94a6df3102 --- /dev/null +++ b/ggml-qnn/backend-ops.cpp @@ -0,0 +1,675 @@ + +#include "backend-ops.hpp" + +#include "utils.hpp" +#include "logger.hpp" +#include "tensor.hpp" + + +static bool qnn_is_valid_params(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst) { + if ((nullptr == ctx) || (nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { + QNN_LOG_WARN("invalid params\n"); + return false; + } + + qnn::qnn_instance* instance = nullptr; + Qnn_Tensor_t* tensor_0 = nullptr; + Qnn_Tensor_t* tensor_1 = nullptr; + Qnn_Tensor_t* tensor_2 = nullptr; + tensor_0 = (Qnn_Tensor_t*)src0->extra; + tensor_1 = (Qnn_Tensor_t*)src1->extra; + tensor_2 = (Qnn_Tensor_t*)dst->extra; + instance = ctx->instance; + if ((nullptr == instance) || (nullptr == tensor_0) || (nullptr == tensor_1) || (nullptr == tensor_2)) { + QNN_LOG_WARN("invalid params\n"); + return false; + } + + return 
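    // ctx, instance, and all three tensor->extra handles were verified non-null above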
true; +} + +#ifndef NDEBUG +#define CHECK_PARAMS(ctx, src0, src1, dst) \ + do { \ + if (!qnn_is_valid_params((ctx), (src0), (src1), (dst))) { \ + return; \ + } \ + } while (0) + +#else +#define CHECK_PARAMS(ctx, src0, src1, dst) +#endif + +//TODO: this function can be removed later because it duplicates code in ggml_qnn_mul_mat +// keep it to illustrate how to implement a specific GGML op using the QNN API + QNN RPC +static void ggml_qnn_add(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + bool graph_initialized = false; + qnn::qnn_instance* instance = nullptr; + std::string graph_name = "ggml_op_qnn_add"; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Param_t qnn_params[] = {}; + enum ggml_op ggmlop = GGML_OP_ADD; + + CHECK_PARAMS(ctx, src0, src1, dst); + instance = ctx->instance; + auto qnn_raw_interface = ctx->raw_interface; + + qnn::qnn_perf perf("ggml_qnn_add"); + perf.start(); + + std::string map_entry = std::string(ggml_op_name(ggmlop)); + if (instance->_qnn_graph_map.find(map_entry) != + instance->_qnn_graph_map.end()) { + graph_initialized = true; + auto& graph_item = instance->_qnn_graph_map[map_entry]; + graph_handle = std::get<0>(graph_item); + } + + if (!graph_initialized) { + graph_name = graph_name + "_" + std::to_string(ctx->threads) + + "_" + src0->name + "_" + src1->name; + QNN_LOG_INFO("graph name %s", graph_name.c_str()); + if (ctx->device == QNN_BACKEND_NPU) { + QnnHtpGraph_CustomConfig_t hvx_config; + hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; + hvx_config.numHvxThreads = 8; + QnnGraph_Config_t graph_hvx_config; + graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_hvx_config.customConfig = &hvx_config; + + QnnHtpGraph_CustomConfig_t dlbc_config; + dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; + dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; + dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC + QnnGraph_Config_t graph_dlbc_config; + graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_dlbc_config.customConfig = &dlbc_config; + + QnnHtpGraph_CustomConfig_t opt_config; + opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; + opt_config.optimizationOption.floatValue = 1; // 1 / 3 + QnnGraph_Config_t graph_opt_config; + graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_opt_config.customConfig = &opt_config; + + QnnHtpGraph_CustomConfig_t vtcm_config; + vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; + vtcm_config.vtcmSizeInMB = ctx->socinfo.vtcm_size_in_mb; + QnnGraph_Config_t graph_vtcm_config; + graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_vtcm_config.customConfig = &vtcm_config; + + const QnnGraph_Config_t* p_graphconfig[] = { &graph_hvx_config, + &graph_dlbc_config, + &graph_vtcm_config, + &graph_opt_config, + NULL }; + error = qnn_raw_interface.graphCreate( + instance->get_qnn_context_handle(), graph_name.c_str(), p_graphconfig, + &graph_handle); + } + else { + error = qnn_raw_interface.graphCreate( + instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, + &graph_handle); + } + + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("can't create qnn graph handle with graph name %s, " + "error = %d\n", + graph_name.c_str(), error); + goto failure; + } + else { + QNN_LOG_INFO("create qnn graph handle with graph name %s ok\n", 
graph_name.c_str()); + } + + qnn::ggml_qnn_tensor_input tensor_input0(src0, graph_handle, ctx); + if (!tensor_input0.is_valid()) { + goto failure; + } + qnn::ggml_qnn_tensor_input tensor_input1(src1, graph_handle, ctx); + if (!tensor_input1.is_valid()) { + QNN_LOG_INFO("error = %d\n", error); + goto failure; + } + qnn::ggml_qnn_tensor_output tensor_output(dst, graph_handle, ctx); + if (!tensor_output.is_valid()) { + goto failure; + } + + Qnn_Tensor_t tensor_inputs[] = { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() }; + Qnn_Tensor_t tensor_outputs[] = { *tensor_output.get_qnn_tensor() }; + Qnn_OpConfig_t op_config = { + (Qnn_OpConfigVersion_t)1, + .v1 = {"ggml_op_add", + QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_ELEMENT_WISE_ADD, + 0, qnn_params, + 2, tensor_inputs, + 1,tensor_outputs} + }; + error = qnn_raw_interface.graphAddNode(graph_handle, op_config); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + goto failure; + } + error = qnn_raw_interface.graphFinalize(graph_handle, + nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + goto failure; + } + error = qnn_raw_interface.graphExecute(graph_handle, + tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr); + if (ctx->device == QNN_BACKEND_NPU) { + if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) { + QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n"); + } + } + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + goto failure; + } + + auto graph_item = std::make_tuple(graph_handle, + tensor_input0.get_qnn_tensor(), + tensor_input1.get_qnn_tensor(), + tensor_output.get_qnn_tensor()); + instance->_qnn_graph_map[map_entry] = graph_item; + } + else { + auto& graph_item = instance->_qnn_graph_map[map_entry]; + qnn::ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx); + qnn::ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx); + qnn::ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx); + + Qnn_Tensor_t tensor_inputs[] = { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() }; + Qnn_Tensor_t tensor_outputs[] = { *tensor_output.get_qnn_tensor() }; + error = qnn_raw_interface.graphExecute(graph_handle, + tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr); + if (ctx->device == QNN_BACKEND_NPU) { + if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) { + QNN_LOG_WARN("NPU crashed. SSR detected. 
Caused QNN graph execute error\n"); + } + } + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + goto failure; + } + } + +failure: + if (QNN_SUCCESS != error) { + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, src0->type, ggml_type_name(src0->type), + src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], + src0->nb[1], src0->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, src1->type, ggml_type_name(src1->type), + src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], + src1->nb[1], src1->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, dst->type, ggml_type_name(dst->type), + dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], + dst->nb[1], dst->nb[2]); + } + + perf.info(); +} + +/* + * ggml_qnn_mul_mat was re-added as a standalone function because + * the following comments came from https://github.com/ggerganov/llama.cpp/pull/1632 + * MUL_MAT take most of the compute time (about 95%). + * So to speed up llama, we have to focus on MUL_MAT. + * + * We have three kinds of MUL_MAT to compute: + * mul_mat_f32: both src0 and src1 are F32. + * mul_mat_f16_f32: src0 is F16 and src1 is F32. + * mul_mat_q_f32: src0 is quantized (Q4_0, Q4_1, ...), and src1 is F32. + */ +static void ggml_qnn_mul_mat(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + bool graph_initialized = false; + qnn::qnn_instance* instance = nullptr; + std::string graph_name = "ggml_op_qnn_mul_mat"; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Param_t qnn_params[] = {}; + enum ggml_op ggmlop = GGML_OP_MUL_MAT; + + CHECK_PARAMS(ctx, src0, src1, dst); + instance = ctx->instance; + auto qnn_raw_interface = ctx->raw_interface; + + qnn::qnn_perf perf("ggml_qnn_mul_mat"); + perf.start(); + + std::string map_entry = std::string(ggml_op_name(ggmlop)); + if (instance->_qnn_graph_map.find(map_entry) != + instance->_qnn_graph_map.end()) { + graph_initialized = true; + auto& graph_item = instance->_qnn_graph_map[map_entry]; + graph_handle = std::get<0>(graph_item); + } + + //TODO: for scenarios of quantized data in src0 + // pass-1: dequantize src0 to FP32 + // pass-2: dq-src0 * src1 + // the performance gains is worth although there is performance loss in pass-1 + + if (!graph_initialized) { + graph_name = graph_name + "_" + std::to_string(ctx->threads) + + "_" + src0->name + "_" + src1->name; + QNN_LOG_INFO("graph name %s", graph_name.c_str()); + if (ctx->device == QNN_BACKEND_NPU) { + QnnHtpGraph_CustomConfig_t hvx_config; + hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; + hvx_config.numHvxThreads = 8; + QnnGraph_Config_t graph_hvx_config; + graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_hvx_config.customConfig = &hvx_config; + + QnnHtpGraph_CustomConfig_t dlbc_config; + dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; + dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; + dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC + QnnGraph_Config_t graph_dlbc_config; + graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_dlbc_config.customConfig = &dlbc_config; + + QnnHtpGraph_CustomConfig_t opt_config; + opt_config.optimizationOption.type = 
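            // finalize-time optimization; the inline "//1 / 3" note below suggests floatValue = 1 selects level 1 of a 1..3 range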
QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; + opt_config.optimizationOption.floatValue = 1; //1 / 3 + QnnGraph_Config_t graph_opt_config; + graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_opt_config.customConfig = &opt_config; + + QnnHtpGraph_CustomConfig_t vtcm_config; + vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; + vtcm_config.vtcmSizeInMB = ctx->socinfo.vtcm_size_in_mb; + QnnGraph_Config_t graph_vtcm_config; + graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_vtcm_config.customConfig = &vtcm_config; + + const QnnGraph_Config_t* p_graphconfig[] = { &graph_hvx_config, + &graph_dlbc_config, + &graph_vtcm_config, + &graph_opt_config, + NULL }; + error = qnn_raw_interface.graphCreate( + instance->get_qnn_context_handle(), graph_name.c_str(), p_graphconfig, + &graph_handle); + } + else { + error = qnn_raw_interface.graphCreate( + instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, + &graph_handle); + } + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("can't create qnn graph handle with graph name %s, " + "error = %d\n", + graph_name.c_str(), error); + goto failure; + } + + qnn::ggml_qnn_tensor_input tensor_input0(src0, graph_handle, ctx); + if (!tensor_input0.is_valid()) { + goto failure; + } + qnn::ggml_qnn_tensor_input tensor_input1(src1, graph_handle, ctx); + if (!tensor_input1.is_valid()) { + goto failure; + } + qnn::ggml_qnn_tensor_output tensor_output(dst, graph_handle, ctx); + if (!tensor_output.is_valid()) { + goto failure; + } + + Qnn_Tensor_t tensor_inputs[] = { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() }; + Qnn_Tensor_t tensor_outputs[] = { *tensor_output.get_qnn_tensor() }; + Qnn_OpConfig_t op_config = { + (Qnn_OpConfigVersion_t)1, + .v1 = {"ggml_op_mul_mat", + QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_MAT_MUL, + 0, qnn_params, + 2, tensor_inputs, + 1, tensor_outputs} + }; + error = qnn_raw_interface.graphAddNode(graph_handle, op_config); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + goto failure; + } + error = qnn_raw_interface.graphFinalize(graph_handle, + nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + goto failure; + } + error = qnn_raw_interface.graphExecute(graph_handle, + tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr); + if (ctx->device == QNN_BACKEND_NPU) { + if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) { + QNN_LOG_WARN("NPU crashed. SSR detected. 
Caused QNN graph execute error\n"); + } + } + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + goto failure; + } + + auto graph_item = std::make_tuple(graph_handle, + tensor_input0.get_qnn_tensor(), + tensor_input1.get_qnn_tensor(), + tensor_output.get_qnn_tensor()); + instance->_qnn_graph_map[map_entry] = graph_item; + } + else { + auto& graph_item = instance->_qnn_graph_map[map_entry]; + qnn::ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx); + qnn::ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx); + qnn::ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx); + + Qnn_Tensor_t tensor_inputs[] = { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() }; + Qnn_Tensor_t tensor_outputs[] = { *tensor_output.get_qnn_tensor() }; + error = qnn_raw_interface.graphExecute(graph_handle, + tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr); + if (ctx->device == QNN_BACKEND_NPU) { + if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) { + QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n"); + } + } + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + goto failure; + } + } + +failure: + if (QNN_SUCCESS != error) { + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, src0->type, ggml_type_name(src0->type), + src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], + src0->nb[1], src0->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, src1->type, ggml_type_name(src1->type), + src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], + src1->nb[1], src1->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0], + dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); + } + + perf.info(); +} + +static void ggml_qnn_repeat(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { +} + +static void ggml_qnn_get_rows(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { +} + +static void ggml_qnn_acc(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst) { +} + +static void ggml_qnn_div(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst) { +} + +static void ggml_qnn_gelu(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { +} + +static void ggml_qnn_silu(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { +} + +static void ggml_qnn_gelu_quick(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst) { +} + +static void ggml_qnn_tanh(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { +} + +static void ggml_qnn_relu(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { +} + +static void ggml_qnn_hardsigmoid(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst) { +} + +static void ggml_qnn_hardswish(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* 
dst) { +} + +static void ggml_qnn_leaky_relu(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst) { +} + +static void ggml_qnn_sqr(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst) { +} + +static void ggml_qnn_norm(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { +} + +static void ggml_qnn_group_norm(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst) { +} + +static void ggml_qnn_concat(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { +} + +static void ggml_qnn_upscale(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { +} + +static void ggml_qnn_pad(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst) { +} + +static void ggml_qnn_rms_norm(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { +} + +static void ggml_qnn_cpy(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst) { +} + +static void ggml_qnn_dup(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst) { + ggml_qnn_cpy(ctx, src0, dst, nullptr); + (void)src1; +} + +static void ggml_qnn_mul_mat_id(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst) { +} + +static void ggml_qnn_scale(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { +} + +static void ggml_qnn_clamp(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { +} + +static void ggml_qnn_diag_mask_inf(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst) { +} + +static void ggml_qnn_soft_max(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { +} + +static void ggml_qnn_rope(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); +} + +static void ggml_qnn_pool2d(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { +} + +static void ggml_qnn_im2col(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { +} + +static void ggml_qnn_sum_rows(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); +} + +static void ggml_qnn_argsort(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); +} + +static void ggml_qnn_nop(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst) { + (void)src0; + (void)src1; + (void)dst; +} + +qnn::ggml_qnn_op_array_t qnn::ggml_qnn_op_array() { + static constexpr const qnn::ggml_qnn_op_t kQnnOpsTable[GGML_OP_COUNT] = { + nullptr, // GGML_OP_NONE + nullptr, // GGML_OP_DUP + ggml_qnn_add, // GGML_OP_ADD + nullptr, // GGML_OP_ADD1 + nullptr, // GGML_OP_ACC + nullptr, // GGML_OP_SUB + nullptr, // GGML_OP_MUL + nullptr, // GGML_OP_DIV + nullptr, // GGML_OP_SQR + nullptr, // GGML_OP_SQRT + nullptr, 
// GGML_OP_LOG + nullptr, // GGML_OP_SUM + nullptr, // GGML_OP_SUM_ROWS + nullptr, // GGML_OP_MEAN + nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_REPEAT + nullptr, // GGML_OP_REPEAT_BACK + nullptr, // GGML_OP_CONCAT + nullptr, // GGML_OP_SILU_BACK + nullptr, // GGML_OP_NORM + nullptr, // GGML_OP_RMS_NORM + nullptr, // GGML_OP_RMS_NORM_BACK + nullptr, // GGML_OP_GROUP_NORM + + ggml_qnn_mul_mat, // GGML_OP_MUL_MAT + nullptr, // GGML_OP_MUL_MAT_ID + nullptr, // GGML_OP_OUT_PROD + + nullptr, // GGML_OP_SCALE + nullptr, // GGML_OP_SET + nullptr, // GGML_OP_CPY + nullptr, // GGML_OP_CONT + nullptr, // GGML_OP_RESHAPE + nullptr, // GGML_OP_VIEW + nullptr, // GGML_OP_PERMUTE + nullptr, // GGML_OP_TRANSPOSE + nullptr, // GGML_OP_GET_ROWS + nullptr, // GGML_OP_GET_ROWS_BACK + nullptr, // GGML_OP_DIAG + nullptr, // GGML_OP_DIAG_MASK_INF + nullptr, // GGML_OP_DIAG_MASK_ZERO + nullptr, // GGML_OP_SOFT_MAX + nullptr, // GGML_OP_SOFT_MAX_BACK + nullptr, // GGML_OP_ROPE + nullptr, // GGML_OP_ROPE_BACK + nullptr, // GGML_OP_CLAMP + nullptr, // GGML_OP_CONV_TRANSPOSE_1D + nullptr, // GGML_OP_IM2COL + nullptr, // GGML_OP_CONV_TRANSPOSE_2D + nullptr, // GGML_OP_POOL_1D + nullptr, // GGML_OP_POOL_2D + nullptr, // GGML_OP_UPSCALE + nullptr, // GGML_OP_PAD + nullptr, // GGML_OP_ARANGE + nullptr, // GGML_OP_TIMESTEP_EMBEDDING + nullptr, // GGML_OP_ARGSORT + nullptr, // GGML_OP_LEAKY_RELU + + nullptr, // GGML_OP_FLASH_ATTN_EXT + nullptr, // GGML_OP_FLASH_ATTN_BACK + nullptr, // GGML_OP_SSM_CONV + nullptr, // GGML_OP_SSM_SCAN + nullptr, // GGML_OP_WIN_PART + nullptr, // GGML_OP_WIN_UNPART + nullptr, // GGML_OP_GET_REL_POS + nullptr, // GGML_OP_ADD_REL_POS + + nullptr, // GGML_OP_UNARY + + nullptr, // GGML_OP_MAP_UNARY + nullptr, // GGML_OP_MAP_BINARY + + nullptr, // GGML_OP_MAP_CUSTOM1_F32 + nullptr, // GGML_OP_MAP_CUSTOM2_F32 + nullptr, // GGML_OP_MAP_CUSTOM3_F32 + + nullptr, // GGML_OP_MAP_CUSTOM1 + nullptr, // GGML_OP_MAP_CUSTOM2 + nullptr, // GGML_OP_MAP_CUSTOM3 + + nullptr, // GGML_OP_CROSS_ENTROPY_LOSS + nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK + }; + + return kQnnOpsTable; +} diff --git a/ggml-qnn/backend-ops.hpp b/ggml-qnn/backend-ops.hpp new file mode 100644 index 0000000000000..c3dd5de302289 --- /dev/null +++ b/ggml-qnn/backend-ops.hpp @@ -0,0 +1,17 @@ +#pragma once + +#include "ggml.h" +#include "backend.hpp" + +namespace qnn { + + typedef void (*ggml_qnn_op_t)(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, + const ggml_tensor* src1, + ggml_tensor* dst); + + typedef const ggml_qnn_op_t(&ggml_qnn_op_array_t)[GGML_OP_COUNT]; + + ggml_qnn_op_array_t ggml_qnn_op_array(); + +} diff --git a/ggml-qnn/backend.hpp b/ggml-qnn/backend.hpp index b5aacf57c1aa0..dc40090ee6114 100644 --- a/ggml-qnn/backend.hpp +++ b/ggml-qnn/backend.hpp @@ -1,11 +1,6 @@ #pragma once -#include "QnnTypes.h" -#include "QnnCommon.h" -#include "QnnContext.h" -#include "QnnBackend.h" - #include "ggml.h" #include "ggml-backend.h" diff --git a/ggml-qnn/qnn.hpp b/ggml-qnn/qnn.hpp index 212b6f8521745..6caefb75644f7 100644 --- a/ggml-qnn/qnn.hpp +++ b/ggml-qnn/qnn.hpp @@ -1,21 +1,27 @@ #pragma once +#include #include +#include +#include +#include // header file of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK // https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct #include "QnnTypes.h" #include "QnnCommon.h" +#include "QnnInterface.h" #include "QnnContext.h" #include "QnnBackend.h" #include "QnnGraph.h" #include "QnnProperty.h" #include "QnnTensor.h" +#include 
"System/QnnSystemInterface.h" #include "HTP/QnnHtpDevice.h" #include "HTP/QnnHtpGraph.h" +#include "qnn-types.hpp" #include "utils.hpp" -#include "logger.hpp" namespace qnn { @@ -864,9 +870,8 @@ namespace qnn { const qnn::qcom_socinfo& get_soc_info() { return _soc_info; } public: - std::map> - _qnn_graph_map; + std::map> _qnn_graph_map; private: int load_system() { diff --git a/ggml-qnn/tensor.hpp b/ggml-qnn/tensor.hpp index 687ebd8905ef4..de0d1dc2dbbef 100644 --- a/ggml-qnn/tensor.hpp +++ b/ggml-qnn/tensor.hpp @@ -4,6 +4,7 @@ #include "QnnTensor.h" #include "System/QnnSystemInterface.h" +#include "ggml-qnn.h" #include "backend.hpp" #include "qnn.hpp" diff --git a/ggml-qnn/utils.cpp b/ggml-qnn/utils.cpp new file mode 100644 index 0000000000000..798445c02fd76 --- /dev/null +++ b/ggml-qnn/utils.cpp @@ -0,0 +1,126 @@ + +#include "utils.hpp" + +#include "ggml-qnn.h" +#include "qnn-types.hpp" + +namespace qnn { + + // TODO: mapping more ggml data type to QNN data type + // ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684 + Qnn_DataType_t datatype_from_ggml_datatype(enum ggml_type ggmltype) { + switch (ggmltype) { + case GGML_TYPE_F16: + return QNN_DATATYPE_FLOAT_16; + case GGML_TYPE_F32: + return QNN_DATATYPE_FLOAT_32; + case GGML_TYPE_I8: + return QNN_DATATYPE_INT_8; + case GGML_TYPE_Q8_0: + return QNN_DATATYPE_SFIXED_POINT_8; + case GGML_TYPE_Q4_0: + return QNN_DATATYPE_SFIXED_POINT_4; + default: + break; + } + return QNN_DATATYPE_UNDEFINED; + } + + + uint32_t get_ggml_tensor_rank(const ggml_tensor* tensor) { + uint32_t rank = 0; + for (int i = 0; i < GGML_MAX_DIMS; i++) { + if ((0 != tensor->ne[i]) && (1 != tensor->ne[i])) { + rank++; + } + } + return rank; + } + + + const char* get_backend_name(int n_backend_type) { + switch (n_backend_type) { + case QNN_BACKEND_CPU: + return "QNN-CPU"; + case QNN_BACKEND_GPU: + return "QNN-GPU"; + case QNN_BACKEND_NPU: + return "QNN-NPU"; + case QNN_BACKEND_GGML: + return "ggml"; //"fake" QNN backend, used for compare performance between QNN backend and original GGML + default: + return "unknown"; + } + } + + const char* get_chipset_desc(uint32_t chipset_id) { + switch (chipset_id) { + case SM8450: + return "SM8450"; + case SM8475: + return "SM8475"; + case SM8550: + return "SM8550"; + case SM8650: + return "SM8650"; + default: + return "unknown"; + } + } + + const char* get_htparch_desc(size_t htp_arch) { + switch (htp_arch) { + case V68: + return "QCOM_HTP_V68"; + case V69: + return "QCOM_HTP_V69"; + case V73: + return "QCOM_HTP_V73"; + case V75: + return "QCOM_HTP_V75"; + default: + return "unknown"; + } + } + + intptr_t align_to(size_t alignment, intptr_t offset) { + return offset % alignment == 0 + ? 
offset + : offset + (static_cast(alignment) - + offset % static_cast(alignment)); + } + + uint32_t get_ggml_tensor_data_size(const ggml_tensor* tensor) { + /* + size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); + size_t n_dims = qnn_get_ggml_tensor_rank(tensor); + for (int i = 1; i < n_dims; i++) { + data_size *= tensor->ne[i]; + } + + return data_size; + */ + return ggml_nbytes(tensor); + } + + // ================================================================================================= + // + // QNN backend internal helper functions + // + // ================================================================================================= + // TODO: only support GGML_OP_ADD/GGML_OP_MUL/GGML_OP_MUL_MAT + const char* opname_from_ggmlop(enum ggml_op ggmlop) { + switch (ggmlop) { + case GGML_OP_ADD: + return QNN_OP_ELEMENT_WISE_ADD; + case GGML_OP_MUL: + return QNN_OP_ELEMENT_WISE_MULTIPLY; + case GGML_OP_MUL_MAT: + return QNN_OP_MAT_MUL; + default: + break; + } + return nullptr; + } + +} diff --git a/ggml-qnn/utils.hpp b/ggml-qnn/utils.hpp index 2ec7c0f13f0ce..4889c6dc8601c 100644 --- a/ggml-qnn/utils.hpp +++ b/ggml-qnn/utils.hpp @@ -1,135 +1,34 @@ #pragma once +#include +#include +#include +#include +#include +#include + #include "QnnTypes.h" #include "ggml.h" -#include "qnn-types.hpp" +#include "logger.hpp" namespace qnn { - // TODO: mapping more ggml data type to QNN data type - // ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684 - Qnn_DataType_t datatype_from_ggml_datatype(enum ggml_type ggmltype) { - switch (ggmltype) { - case GGML_TYPE_F16: - return QNN_DATATYPE_FLOAT_16; - case GGML_TYPE_F32: - return QNN_DATATYPE_FLOAT_32; - case GGML_TYPE_I8: - return QNN_DATATYPE_INT_8; - case GGML_TYPE_Q8_0: - return QNN_DATATYPE_SFIXED_POINT_8; - case GGML_TYPE_Q4_0: - return QNN_DATATYPE_SFIXED_POINT_4; - default: - break; - } - return QNN_DATATYPE_UNDEFINED; - } - - - uint32_t get_ggml_tensor_rank(const ggml_tensor* tensor) { - uint32_t rank = 0; - for (int i = 0; i < GGML_MAX_DIMS; i++) { - if ((0 != tensor->ne[i]) && (1 != tensor->ne[i])) { - rank++; - } - } - return rank; - } + Qnn_DataType_t datatype_from_ggml_datatype(enum ggml_type ggmltype); + uint32_t get_ggml_tensor_rank(const ggml_tensor* tensor); + const char* get_backend_name(int n_backend_type); + const char* get_chipset_desc(uint32_t chipset_id); + const char* get_htparch_desc(size_t htp_arch); + intptr_t align_to(size_t alignment, intptr_t offset); + uint32_t get_ggml_tensor_data_size(const ggml_tensor* tensor); - - const char* get_backend_name(int n_backend_type) { - switch (n_backend_type) { - case QNN_BACKEND_CPU: - return "QNN-CPU"; - case QNN_BACKEND_GPU: - return "QNN-GPU"; - case QNN_BACKEND_NPU: - return "QNN-NPU"; - case QNN_BACKEND_GGML: - return "ggml"; //"fake" QNN backend, used for compare performance between QNN backend and original GGML - default: - return "unknown"; - } - } - - const char* get_chipset_desc(uint32_t chipset_id) { - switch (chipset_id) { - case SM8450: - return "SM8450"; - case SM8475: - return "SM8475"; - case SM8550: - return "SM8550"; - case SM8650: - return "SM8650"; - default: - return "unknown"; - } - } - - const char* get_htparch_desc(size_t htp_arch) { - switch (htp_arch) { - case V68: - return "QCOM_HTP_V68"; - case V69: - return "QCOM_HTP_V69"; - case V73: - return "QCOM_HTP_V73"; - case V75: - return "QCOM_HTP_V75"; - default: - return "unknown"; - } - } + const char* opname_from_ggmlop(enum ggml_op ggmlop); template Fn 
load_qnn_functionpointers(void* handle, const char* function_name) { return reinterpret_cast(dlsym(handle, function_name)); } - intptr_t align_to(size_t alignment, intptr_t offset) { - return offset % alignment == 0 - ? offset - : offset + (static_cast(alignment) - - offset % static_cast(alignment)); - } - - uint32_t get_ggml_tensor_data_size(const ggml_tensor* tensor) { - /* - size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); - size_t n_dims = qnn_get_ggml_tensor_rank(tensor); - for (int i = 1; i < n_dims; i++) { - data_size *= tensor->ne[i]; - } - - return data_size; - */ - return ggml_nbytes(tensor); - } - - - // ================================================================================================= - // - // QNN backend internal helper functions - // - // ================================================================================================= - // TODO: only support GGML_OP_ADD/GGML_OP_MUL/GGML_OP_MUL_MAT - const char* opname_from_ggmlop(enum ggml_op ggmlop) { - switch (ggmlop) { - case GGML_OP_ADD: - return QNN_OP_ELEMENT_WISE_ADD; - case GGML_OP_MUL: - return QNN_OP_ELEMENT_WISE_MULTIPLY; - case GGML_OP_MUL_MAT: - return QNN_OP_MAT_MUL; - default: - break; - } - return nullptr; - } - inline int validate_tensor_version(Qnn_Tensor_t tensor) { if (tensor.version != QNN_TENSOR_VERSION_1) { QNN_LOG_WARN( @@ -272,6 +171,45 @@ namespace qnn { tensor.v1.memHandle = handle; } } + + +#if ENABLE_QNNBACKEND_PERF + class qnn_perf { + public: + qnn_perf(const std::string& perf_name) : _perf_name(std::move(perf_name)) {}; + qnn_perf() = delete; + qnn_perf(const qnn_perf&) = delete; + qnn_perf& operator= (const qnn_perf&) = delete; + + void start() { + _begin_time = ggml_time_us(); + } + + void info() { + _end_time = ggml_time_us(); + _duration = (_end_time - _begin_time); + QNN_LOG_INFO("duration of %s : %lld microseconds\n", _perf_name.c_str(), _duration); + } + + private: + int64_t _begin_time = 0LL; + int64_t _end_time = 0LL; + int64_t _duration = 0LL; + std::string _perf_name; + }; +#else + class qnn_perf { + public: + qnn_perf(const std::string& perf_name) {} + qnn_perf() = delete; + qnn_perf(const qnn_perf&) = delete; + qnn_perf& operator= (const qnn_perf&) = delete; + + void start() {} + void info() {} + }; +#endif + } diff --git a/tests/ggml-qnn/CMakeLists.txt b/tests/ggml-qnn/CMakeLists.txt index 77a2059ed0f0c..66e8c077a1d3a 100644 --- a/tests/ggml-qnn/CMakeLists.txt +++ b/tests/ggml-qnn/CMakeLists.txt @@ -21,6 +21,8 @@ set(SOURCE_FILES ../../ggml-backend.c ../../ggml-quants.c ../../ggml-qnn/logger.cpp + ../../ggml-qnn/utils.cpp + ../../ggml-qnn/backend-ops.cpp ../../ggml-qnn.cpp ggml-qnn-ut.cpp ) From 8b677d1b2facc248409b3356a18198437add807d Mon Sep 17 00:00:00 2001 From: hongruichen Date: Tue, 2 Jul 2024 10:40:07 +0800 Subject: [PATCH 034/166] move qnn backend into sub folder --- ggml-qnn.h => ggml/include/ggml-qnn.h | 0 ggml-qnn.cpp => ggml/src/ggml-qnn.cpp | 0 .../src/ggml-qnn}/backend-ops.cpp | 0 .../src/ggml-qnn}/backend-ops.hpp | 0 {ggml-qnn => ggml/src/ggml-qnn}/backend.hpp | 0 {ggml-qnn => ggml/src/ggml-qnn}/logger.cpp | 0 {ggml-qnn => ggml/src/ggml-qnn}/logger.hpp | 0 {ggml-qnn => ggml/src/ggml-qnn}/qnn-types.hpp | 0 {ggml-qnn => ggml/src/ggml-qnn}/qnn.hpp | 0 {ggml-qnn => ggml/src/ggml-qnn}/tensor.hpp | 0 {ggml-qnn => ggml/src/ggml-qnn}/utils.cpp | 0 {ggml-qnn => ggml/src/ggml-qnn}/utils.hpp | 0 tests/ggml-qnn/CMakeLists.txt | 40 +++++++++---------- 13 files changed, 19 insertions(+), 21 deletions(-) rename ggml-qnn.h => ggml/include/ggml-qnn.h 
(100%) rename ggml-qnn.cpp => ggml/src/ggml-qnn.cpp (100%) rename {ggml-qnn => ggml/src/ggml-qnn}/backend-ops.cpp (100%) rename {ggml-qnn => ggml/src/ggml-qnn}/backend-ops.hpp (100%) rename {ggml-qnn => ggml/src/ggml-qnn}/backend.hpp (100%) rename {ggml-qnn => ggml/src/ggml-qnn}/logger.cpp (100%) rename {ggml-qnn => ggml/src/ggml-qnn}/logger.hpp (100%) rename {ggml-qnn => ggml/src/ggml-qnn}/qnn-types.hpp (100%) rename {ggml-qnn => ggml/src/ggml-qnn}/qnn.hpp (100%) rename {ggml-qnn => ggml/src/ggml-qnn}/tensor.hpp (100%) rename {ggml-qnn => ggml/src/ggml-qnn}/utils.cpp (100%) rename {ggml-qnn => ggml/src/ggml-qnn}/utils.hpp (100%) diff --git a/ggml-qnn.h b/ggml/include/ggml-qnn.h similarity index 100% rename from ggml-qnn.h rename to ggml/include/ggml-qnn.h diff --git a/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp similarity index 100% rename from ggml-qnn.cpp rename to ggml/src/ggml-qnn.cpp diff --git a/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp similarity index 100% rename from ggml-qnn/backend-ops.cpp rename to ggml/src/ggml-qnn/backend-ops.cpp diff --git a/ggml-qnn/backend-ops.hpp b/ggml/src/ggml-qnn/backend-ops.hpp similarity index 100% rename from ggml-qnn/backend-ops.hpp rename to ggml/src/ggml-qnn/backend-ops.hpp diff --git a/ggml-qnn/backend.hpp b/ggml/src/ggml-qnn/backend.hpp similarity index 100% rename from ggml-qnn/backend.hpp rename to ggml/src/ggml-qnn/backend.hpp diff --git a/ggml-qnn/logger.cpp b/ggml/src/ggml-qnn/logger.cpp similarity index 100% rename from ggml-qnn/logger.cpp rename to ggml/src/ggml-qnn/logger.cpp diff --git a/ggml-qnn/logger.hpp b/ggml/src/ggml-qnn/logger.hpp similarity index 100% rename from ggml-qnn/logger.hpp rename to ggml/src/ggml-qnn/logger.hpp diff --git a/ggml-qnn/qnn-types.hpp b/ggml/src/ggml-qnn/qnn-types.hpp similarity index 100% rename from ggml-qnn/qnn-types.hpp rename to ggml/src/ggml-qnn/qnn-types.hpp diff --git a/ggml-qnn/qnn.hpp b/ggml/src/ggml-qnn/qnn.hpp similarity index 100% rename from ggml-qnn/qnn.hpp rename to ggml/src/ggml-qnn/qnn.hpp diff --git a/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp similarity index 100% rename from ggml-qnn/tensor.hpp rename to ggml/src/ggml-qnn/tensor.hpp diff --git a/ggml-qnn/utils.cpp b/ggml/src/ggml-qnn/utils.cpp similarity index 100% rename from ggml-qnn/utils.cpp rename to ggml/src/ggml-qnn/utils.cpp diff --git a/ggml-qnn/utils.hpp b/ggml/src/ggml-qnn/utils.hpp similarity index 100% rename from ggml-qnn/utils.hpp rename to ggml/src/ggml-qnn/utils.hpp diff --git a/tests/ggml-qnn/CMakeLists.txt b/tests/ggml-qnn/CMakeLists.txt index 66e8c077a1d3a..b4f1bd6c07482 100644 --- a/tests/ggml-qnn/CMakeLists.txt +++ b/tests/ggml-qnn/CMakeLists.txt @@ -13,18 +13,18 @@ set(QNN_INC_PATH ${QNN_SDK_PATH}/include/QNN) set(QNN_LIB_PATH ${QNN_SDK_PATH}/lib/aarch64-android) include_directories(${QNN_INC_PATH}) -include_directories(../../) # ggml.h +include_directories(../../ggml/include) # ggml.h, ggml-qnn.h set(SOURCE_FILES - ../../ggml.c - ../../ggml-alloc.c - ../../ggml-backend.c - ../../ggml-quants.c - ../../ggml-qnn/logger.cpp - ../../ggml-qnn/utils.cpp - ../../ggml-qnn/backend-ops.cpp - ../../ggml-qnn.cpp - ggml-qnn-ut.cpp + ../../ggml/src/ggml.c + ../../ggml/src/ggml-alloc.c + ../../ggml/src/ggml-backend.c + ../../ggml/src/ggml-quants.c + ../../ggml/src/ggml-qnn/logger.cpp + ../../ggml/src/ggml-qnn/utils.cpp + ../../ggml/src/ggml-qnn/backend-ops.cpp + ../../ggml/src/ggml-qnn.cpp + ggml-qnn-ut.cpp ) @@ -36,22 +36,20 @@ add_definitions(-D__ARM_NEON) add_definitions(-DGGML_USE_QNN) 
if(CMAKE_BUILD_TYPE STREQUAL "Release") -add_definitions(-DNDEBUG) -add_definitions(-O3) + add_definitions(-DNDEBUG) + add_definitions(-O3) else() -add_definitions(-O3) + add_definitions(-O3) endif() if (TARGET_SNAPDRAGON_8_GEN3) -# the below build optimization only verified and works well on Qualcomm SM8650-AB Snapdragon 8 Gen 3 -add_definitions(-march=armv8.7-a) -add_definitions(-mcpu=cortex-x1) -add_definitions(-mtune=cortex-x1) - + # the below build optimization only verified and works well on Qualcomm SM8650-AB Snapdragon 8 Gen 3 + add_definitions(-march=armv8.7-a) + add_definitions(-mcpu=cortex-x1) + add_definitions(-mtune=cortex-x1) else() -# the below build optimization might be works well on ALL Android phone equipped with Qualcomm mainstream mobile SoC -add_definitions(-mcpu=cortex-a72) - + # the below build optimization might be works well on ALL Android phone equipped with Qualcomm mainstream mobile SoC + add_definitions(-mcpu=cortex-a72) endif() add_compile_options("-Wall" "-Wno-sign-compare") From 38f88d5fb15eed11265bba11ddbd85e36ebffaa1 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Tue, 2 Jul 2024 19:46:17 +0800 Subject: [PATCH 035/166] fix compiling error after merge latest master --- ggml/src/ggml-qnn.cpp | 21 +++++---------------- ggml/src/ggml-qnn/backend-ops.cpp | 24 ++++++++++-------------- 2 files changed, 15 insertions(+), 30 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index 750d5ff91c3d3..e5fc00045beb3 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -321,9 +321,7 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx, return true; } -bool ggml_qnn_compute_forward(ggml_backend_qnn_context * ctx, - struct ggml_compute_params * params, - struct ggml_tensor * tensor) { +bool ggml_qnn_compute_forward(ggml_backend_qnn_context * ctx, struct ggml_tensor * tensor) { auto func = qnn::ggml_qnn_op_array()[tensor->op]; if (!func) { QNN_LOG_WARN("unsupported op %d", tensor->op); @@ -515,13 +513,6 @@ GGML_CALL static size_t ggml_backend_qnn_buffer_type_get_max_size(ggml_backend_b return (96 * 1024 * 1024); } -GGML_CALL static bool ggml_backend_qnn_buffer_type_supports_backend( - ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - GGML_UNUSED(buft); - - return ggml_backend_is_qnn(backend) || ggml_backend_is_cpu(backend); -} - GGML_CALL static bool ggml_backend_qnn_buffer_is_host(ggml_backend_buffer_type_t buft) { GGML_UNUSED(buft); return true; @@ -574,9 +565,6 @@ GGML_CALL static ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backe ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; GGML_UNUSED(ctx); - ggml_compute_params params = {}; - params.type = GGML_TASK_TYPE_COMPUTE; - params.ith = 0; for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || @@ -584,7 +572,7 @@ GGML_CALL static ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backe node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { continue; } - bool ok = ggml_qnn_compute_forward(ctx, ¶ms, node); + bool ok = ggml_qnn_compute_forward(ctx, node); if (!ok) { QNN_LOG_DEBUG("error: op not supported %s (%s)\n", node->name, ggml_op_name(node->op)); } @@ -616,9 +604,11 @@ static ggml_backend_i ggml_backend_qnn_interface = { /* .synchronize = */ nullptr, /* .graph_plan_create = */ nullptr, /* .graph_plan_free = */ nullptr, + /* .graph_plan_update = */ nullptr, /* .graph_plan_compute = */ nullptr, /* .graph_compute = */ 
ggml_backend_qnn_graph_compute, /* .supports_op = */ ggml_backend_qnn_supports_op, + /* .supports_buft = */ nullptr, /* .offload_op = */ ggml_backend_qnn_offload_op, /* .event_new = */ nullptr, /* .event_free = */ nullptr, @@ -702,10 +692,9 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device) { /* .get_alignment = */ ggml_backend_qnn_buffer_type_get_alignment, /* .get_max_size = */ ggml_backend_qnn_buffer_type_get_max_size, /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes - /* .supports_backend = */ ggml_backend_qnn_buffer_type_supports_backend, /* .is_host = */ ggml_backend_qnn_buffer_is_host }, - /* .context = */ & context, + /* .context = */ &context, }; } ggml_backend_qnn_buffer_type_initialized = true; diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index a9c94a6df3102..f1fe699ab653d 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -8,21 +8,17 @@ static bool qnn_is_valid_params(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, const ggml_tensor* src1, ggml_tensor* dst) { - if ((nullptr == ctx) || (nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { + if (!ctx || !src0 || !src1 || !dst) { QNN_LOG_WARN("invalid params\n"); return false; } - qnn::qnn_instance* instance = nullptr; - Qnn_Tensor_t* tensor_0 = nullptr; - Qnn_Tensor_t* tensor_1 = nullptr; - Qnn_Tensor_t* tensor_2 = nullptr; - tensor_0 = (Qnn_Tensor_t*)src0->extra; - tensor_1 = (Qnn_Tensor_t*)src1->extra; - tensor_2 = (Qnn_Tensor_t*)dst->extra; - instance = ctx->instance; - if ((nullptr == instance) || (nullptr == tensor_0) || (nullptr == tensor_1) || (nullptr == tensor_2)) { - QNN_LOG_WARN("invalid params\n"); + auto* instance = ctx->instance; + auto* tensor0 = src0->extra; + auto* tensor1 = src1->extra; + auto* tensor2 = dst->extra; + if (!instance || !tensor0 || !tensor1 || !tensor2) { + QNN_LOG_WARN("invalid tensors\n"); return false; } @@ -60,7 +56,7 @@ static void ggml_qnn_add(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, qnn::qnn_perf perf("ggml_qnn_add"); perf.start(); - std::string map_entry = std::string(ggml_op_name(ggmlop)); + std::string map_entry(ggml_op_name(ggmlop)); if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { graph_initialized = true; @@ -141,8 +137,8 @@ static void ggml_qnn_add(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, goto failure; } - Qnn_Tensor_t tensor_inputs[] = { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() }; - Qnn_Tensor_t tensor_outputs[] = { *tensor_output.get_qnn_tensor() }; + Qnn_Tensor_t tensor_inputs[] = { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() }; + Qnn_Tensor_t tensor_outputs[] = { *tensor_output.get_qnn_tensor() }; Qnn_OpConfig_t op_config = { (Qnn_OpConfigVersion_t)1, .v1 = {"ggml_op_add", From 000240cf6273d02e91ba38ba7873d6151368ec6c Mon Sep 17 00:00:00 2001 From: hongruichen Date: Thu, 4 Jul 2024 22:18:45 +0800 Subject: [PATCH 036/166] add clang format file and reformating --- ggml/include/ggml-qnn.h | 39 +- ggml/src/ggml-qnn/.clang-format | 31 + ggml/src/ggml-qnn/backend-ops.cpp | 500 ++++----- ggml/src/ggml-qnn/backend-ops.hpp | 13 +- ggml/src/ggml-qnn/backend.hpp | 5 +- ggml/src/ggml-qnn/logger.cpp | 63 +- ggml/src/ggml-qnn/logger.hpp | 36 +- ggml/src/ggml-qnn/qnn-types.hpp | 96 +- ggml/src/ggml-qnn/qnn.hpp | 1666 +++++++++++++---------------- ggml/src/ggml-qnn/tensor.hpp | 225 ++-- ggml/src/ggml-qnn/utils.cpp | 102 +- ggml/src/ggml-qnn/utils.hpp | 357 +++---- 
12 files changed, 1419 insertions(+), 1714 deletions(-) create mode 100644 ggml/src/ggml-qnn/.clang-format diff --git a/ggml/include/ggml-qnn.h b/ggml/include/ggml-qnn.h index 9ea3dcda62c64..60aaf22179647 100644 --- a/ggml/include/ggml-qnn.h +++ b/ggml/include/ggml-qnn.h @@ -1,41 +1,48 @@ #pragma once -#include "ggml.h" #include "ggml-backend.h" +#include "ggml.h" #ifdef __cplusplus extern "C" { #endif - -#define GGML_QNN_MAX_DEVICES 3 +#define GGML_QNN_MAX_DEVICES 3 enum QNNBackend { - QNN_BACKEND_CPU, - QNN_BACKEND_GPU, - QNN_BACKEND_NPU, - QNN_BACKEND_GGML, //"fake" QNN backend, used for compare performance between QNN and original GGML + QNN_BACKEND_CPU, + QNN_BACKEND_GPU, + QNN_BACKEND_NPU, + QNN_BACKEND_GGML, //"fake" QNN backend, used for compare performance between + // QNN and original GGML }; -GGML_API int ggml_backend_qnn_reg_devices(void); +GGML_API int ggml_backend_qnn_reg_devices(void); /** * - * @param device 0: QNN_BACKEND_CPU 1: QNN_BACKEND_GPU 2: QNN_BACKEND_NPU - * @param qnn_lib_path qnn library path, such as "/data/local/tmp/" on Android or specified in JNI layer + * @param device 0: QNN_BACKEND_CPU 1: QNN_BACKEND_GPU 2: + * QNN_BACKEND_NPU + * @param qnn_lib_path qnn library path, such as "/data/local/tmp/" on + * Android or specified in JNI layer * @return */ -GGML_API ggml_backend_t ggml_backend_qnn_init(size_t dev_num, const char * qnn_lib_path); +GGML_API ggml_backend_t ggml_backend_qnn_init(size_t dev_num, + const char* qnn_lib_path); -GGML_API bool ggml_backend_is_qnn(ggml_backend_t backend); +GGML_API bool ggml_backend_is_qnn(ggml_backend_t backend); -GGML_API void ggml_backend_qnn_set_n_threads(ggml_backend_t backend, int thread_counts); +GGML_API void ggml_backend_qnn_set_n_threads(ggml_backend_t backend, + int thread_counts); -GGML_API int ggml_backend_qnn_get_device_count(void); +GGML_API int ggml_backend_qnn_get_device_count(void); -GGML_API void ggml_backend_qnn_get_device_description(size_t dev_num, char * description, size_t description_size); +GGML_API void ggml_backend_qnn_get_device_description(size_t dev_num, + char* description, + size_t description_size); -GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t dev_num); +GGML_API GGML_CALL ggml_backend_buffer_type_t +ggml_backend_qnn_buffer_type(size_t dev_num); #ifdef __cplusplus } diff --git a/ggml/src/ggml-qnn/.clang-format b/ggml/src/ggml-qnn/.clang-format new file mode 100644 index 0000000000000..3b933ff10db42 --- /dev/null +++ b/ggml/src/ggml-qnn/.clang-format @@ -0,0 +1,31 @@ +--- +BasedOnStyle: Google +IndentWidth: 4 +AccessModifierOffset: -4 +AlignAfterOpenBracket: Align +AlignOperands: true +AlignTrailingComments: true +BinPackArguments: true +BinPackParameters: true +BreakBeforeBraces: Custom +BreakConstructorInitializers: AfterColon +ColumnLimit: 120 +Cpp11BracedListStyle: false +DerivePointerAlignment: false +IncludeCategories: + - Regex: '^<.*\.h>' + Priority: 1 + - Regex: '^<.*' + Priority: 2 + - Regex: '^"ggml\.h"' + Priority: 3 + - Regex: '^"ggml-.+\.h"' + Priority: 4 + - Regex: '.*' + Priority: 5 +KeepEmptyLinesAtTheStartOfBlocks: true +MaxEmptyLinesToKeep: 1 +PointerAlignment: Right +SortIncludes: true +SpacesBeforeTrailingComments: 1 +UseTab: Never \ No newline at end of file diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index f1fe699ab653d..cde1bd248cc29 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -1,22 +1,21 @@ #include "backend-ops.hpp" -#include "utils.hpp" #include 
"logger.hpp" #include "tensor.hpp" +#include "utils.hpp" - -static bool qnn_is_valid_params(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, - const ggml_tensor* src1, ggml_tensor* dst) { +static bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) { if (!ctx || !src0 || !src1 || !dst) { QNN_LOG_WARN("invalid params\n"); return false; } - auto* instance = ctx->instance; - auto* tensor0 = src0->extra; - auto* tensor1 = src1->extra; - auto* tensor2 = dst->extra; + auto *instance = ctx->instance; + auto *tensor0 = src0->extra; + auto *tensor1 = src1->extra; + auto *tensor2 = dst->extra; if (!instance || !tensor0 || !tensor1 || !tensor2) { QNN_LOG_WARN("invalid tensors\n"); return false; @@ -26,28 +25,28 @@ static bool qnn_is_valid_params(ggml_backend_qnn_context* ctx, const ggml_tensor } #ifndef NDEBUG -#define CHECK_PARAMS(ctx, src0, src1, dst) \ - do { \ - if (!qnn_is_valid_params((ctx), (src0), (src1), (dst))) { \ - return; \ - } \ +#define CHECK_PARAMS(ctx, src0, src1, dst) \ + do { \ + if (!qnn_is_valid_params((ctx), (src0), (src1), (dst))) { \ + return; \ + } \ } while (0) #else #define CHECK_PARAMS(ctx, src0, src1, dst) #endif -//TODO: this function can be removed later because there are duplicated codes with ggml_qnn_mul_mat -// keep it for illustrate how to implement a specified GGMPL OP using QNN API + QNN RPC -static void ggml_qnn_add(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, - const ggml_tensor* src1, ggml_tensor* dst) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - bool graph_initialized = false; - qnn::qnn_instance* instance = nullptr; - std::string graph_name = "ggml_op_qnn_add"; - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Param_t qnn_params[] = {}; - enum ggml_op ggmlop = GGML_OP_ADD; +// TODO: this function can be removed later because there are duplicated codes with ggml_qnn_mul_mat +// keep it for illustrate how to implement a specified GGMPL OP using QNN API + QNN RPC +static void ggml_qnn_add(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + bool graph_initialized = false; + qnn::qnn_instance *instance = nullptr; + std::string graph_name = "ggml_op_qnn_add"; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Param_t qnn_params[] = {}; + enum ggml_op ggmlop = GGML_OP_ADD; CHECK_PARAMS(ctx, src0, src1, dst); instance = ctx->instance; @@ -57,16 +56,14 @@ static void ggml_qnn_add(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, perf.start(); std::string map_entry(ggml_op_name(ggmlop)); - if (instance->_qnn_graph_map.find(map_entry) != - instance->_qnn_graph_map.end()) { + if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { graph_initialized = true; - auto& graph_item = instance->_qnn_graph_map[map_entry]; + auto &graph_item = instance->_qnn_graph_map[map_entry]; graph_handle = std::get<0>(graph_item); } if (!graph_initialized) { - graph_name = graph_name + "_" + std::to_string(ctx->threads) + - "_" + src0->name + "_" + src1->name; + graph_name = graph_name + "_" + std::to_string(ctx->threads) + "_" + src0->name + "_" + src1->name; QNN_LOG_INFO("graph name %s", graph_name.c_str()); if (ctx->device == QNN_BACKEND_NPU) { QnnHtpGraph_CustomConfig_t hvx_config; @@ -86,7 +83,7 @@ static void ggml_qnn_add(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, QnnHtpGraph_CustomConfig_t opt_config; opt_config.optimizationOption.type = 
QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; - opt_config.optimizationOption.floatValue = 1; // 1 / 3 + opt_config.optimizationOption.floatValue = 1; // 1 / 3 QnnGraph_Config_t graph_opt_config; graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; graph_opt_config.customConfig = &opt_config; @@ -98,28 +95,22 @@ static void ggml_qnn_add(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; graph_vtcm_config.customConfig = &vtcm_config; - const QnnGraph_Config_t* p_graphconfig[] = { &graph_hvx_config, - &graph_dlbc_config, - &graph_vtcm_config, - &graph_opt_config, - NULL }; - error = qnn_raw_interface.graphCreate( - instance->get_qnn_context_handle(), graph_name.c_str(), p_graphconfig, - &graph_handle); - } - else { - error = qnn_raw_interface.graphCreate( - instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, - &graph_handle); + const QnnGraph_Config_t *p_graphconfig[] = { &graph_hvx_config, &graph_dlbc_config, &graph_vtcm_config, + &graph_opt_config, NULL }; + error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), p_graphconfig, + &graph_handle); + } else { + error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, + &graph_handle); } if (QNN_SUCCESS != error) { - QNN_LOG_INFO("can't create qnn graph handle with graph name %s, " + QNN_LOG_INFO( + "can't create qnn graph handle with graph name %s, " "error = %d\n", graph_name.c_str(), error); goto failure; - } - else { + } else { QNN_LOG_INFO("create qnn graph handle with graph name %s ok\n", graph_name.c_str()); } @@ -139,30 +130,20 @@ static void ggml_qnn_add(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, Qnn_Tensor_t tensor_inputs[] = { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() }; Qnn_Tensor_t tensor_outputs[] = { *tensor_output.get_qnn_tensor() }; - Qnn_OpConfig_t op_config = { - (Qnn_OpConfigVersion_t)1, - .v1 = {"ggml_op_add", - QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_ELEMENT_WISE_ADD, - 0, qnn_params, - 2, tensor_inputs, - 1,tensor_outputs} - }; + Qnn_OpConfig_t op_config = { (Qnn_OpConfigVersion_t)1, + .v1 = { "ggml_op_add", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_ELEMENT_WISE_ADD, 0, + qnn_params, 2, tensor_inputs, 1, tensor_outputs } }; error = qnn_raw_interface.graphAddNode(graph_handle, op_config); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); goto failure; } - error = qnn_raw_interface.graphFinalize(graph_handle, - nullptr, nullptr); + error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); goto failure; } - error = qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, - tensor_outputs, 1, - nullptr, nullptr); + error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); if (ctx->device == QNN_BACKEND_NPU) { if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) { QNN_LOG_WARN("NPU crashed. SSR detected. 
Caused QNN graph execute error\n"); @@ -173,24 +154,18 @@ static void ggml_qnn_add(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, goto failure; } - auto graph_item = std::make_tuple(graph_handle, - tensor_input0.get_qnn_tensor(), - tensor_input1.get_qnn_tensor(), - tensor_output.get_qnn_tensor()); + auto graph_item = std::make_tuple(graph_handle, tensor_input0.get_qnn_tensor(), tensor_input1.get_qnn_tensor(), + tensor_output.get_qnn_tensor()); instance->_qnn_graph_map[map_entry] = graph_item; - } - else { - auto& graph_item = instance->_qnn_graph_map[map_entry]; + } else { + auto &graph_item = instance->_qnn_graph_map[map_entry]; qnn::ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx); qnn::ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx); qnn::ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx); Qnn_Tensor_t tensor_inputs[] = { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() }; Qnn_Tensor_t tensor_outputs[] = { *tensor_output.get_qnn_tensor() }; - error = qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, - tensor_outputs, 1, - nullptr, nullptr); + error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); if (ctx->device == QNN_BACKEND_NPU) { if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) { QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n"); @@ -204,21 +179,18 @@ static void ggml_qnn_add(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, failure: if (QNN_SUCCESS != error) { - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, src0->type, ggml_type_name(src0->type), - src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], - src0->nb[1], src0->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, src1->type, ggml_type_name(src1->type), - src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], - src1->nb[1], src1->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - dst->name, dst->type, ggml_type_name(dst->type), - dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], - dst->nb[1], dst->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 + ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], + src0->nb[0], src0->nb[1], src0->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 + ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], + src1->nb[0], src1->nb[1], src1->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 + ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], + dst->nb[1], dst->nb[2]); } perf.info(); @@ -235,16 +207,15 @@ static void ggml_qnn_add(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, * mul_mat_f16_f32: src0 is F16 and src1 is F32. * mul_mat_q_f32: src0 is quantized (Q4_0, Q4_1, ...), and src1 is F32. 
*/ -static void ggml_qnn_mul_mat(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, const ggml_tensor* src1, - ggml_tensor* dst) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - bool graph_initialized = false; - qnn::qnn_instance* instance = nullptr; - std::string graph_name = "ggml_op_qnn_mul_mat"; - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Param_t qnn_params[] = {}; - enum ggml_op ggmlop = GGML_OP_MUL_MAT; +static void ggml_qnn_mul_mat(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + bool graph_initialized = false; + qnn::qnn_instance *instance = nullptr; + std::string graph_name = "ggml_op_qnn_mul_mat"; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Param_t qnn_params[] = {}; + enum ggml_op ggmlop = GGML_OP_MUL_MAT; CHECK_PARAMS(ctx, src0, src1, dst); instance = ctx->instance; @@ -254,21 +225,19 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context* ctx, perf.start(); std::string map_entry = std::string(ggml_op_name(ggmlop)); - if (instance->_qnn_graph_map.find(map_entry) != - instance->_qnn_graph_map.end()) { + if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { graph_initialized = true; - auto& graph_item = instance->_qnn_graph_map[map_entry]; + auto &graph_item = instance->_qnn_graph_map[map_entry]; graph_handle = std::get<0>(graph_item); } - //TODO: for scenarios of quantized data in src0 - // pass-1: dequantize src0 to FP32 - // pass-2: dq-src0 * src1 - // the performance gains is worth although there is performance loss in pass-1 + // TODO: for scenarios of quantized data in src0 + // pass-1: dequantize src0 to FP32 + // pass-2: dq-src0 * src1 + // the performance gains is worth although there is performance loss in pass-1 if (!graph_initialized) { - graph_name = graph_name + "_" + std::to_string(ctx->threads) + - "_" + src0->name + "_" + src1->name; + graph_name = graph_name + "_" + std::to_string(ctx->threads) + "_" + src0->name + "_" + src1->name; QNN_LOG_INFO("graph name %s", graph_name.c_str()); if (ctx->device == QNN_BACKEND_NPU) { QnnHtpGraph_CustomConfig_t hvx_config; @@ -288,7 +257,7 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context* ctx, QnnHtpGraph_CustomConfig_t opt_config; opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; - opt_config.optimizationOption.floatValue = 1; //1 / 3 + opt_config.optimizationOption.floatValue = 1; // 1 / 3 QnnGraph_Config_t graph_opt_config; graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; graph_opt_config.customConfig = &opt_config; @@ -300,22 +269,17 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context* ctx, graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; graph_vtcm_config.customConfig = &vtcm_config; - const QnnGraph_Config_t* p_graphconfig[] = { &graph_hvx_config, - &graph_dlbc_config, - &graph_vtcm_config, - &graph_opt_config, - NULL }; - error = qnn_raw_interface.graphCreate( - instance->get_qnn_context_handle(), graph_name.c_str(), p_graphconfig, - &graph_handle); - } - else { - error = qnn_raw_interface.graphCreate( - instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, - &graph_handle); + const QnnGraph_Config_t *p_graphconfig[] = { &graph_hvx_config, &graph_dlbc_config, &graph_vtcm_config, + &graph_opt_config, NULL }; + error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), p_graphconfig, + &graph_handle); + } else { + error = 
qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, + &graph_handle); } if (QNN_SUCCESS != error) { - QNN_LOG_INFO("can't create qnn graph handle with graph name %s, " + QNN_LOG_INFO( + "can't create qnn graph handle with graph name %s, " "error = %d\n", graph_name.c_str(), error); goto failure; @@ -334,32 +298,22 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context* ctx, goto failure; } - Qnn_Tensor_t tensor_inputs[] = { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() }; - Qnn_Tensor_t tensor_outputs[] = { *tensor_output.get_qnn_tensor() }; - Qnn_OpConfig_t op_config = { - (Qnn_OpConfigVersion_t)1, - .v1 = {"ggml_op_mul_mat", - QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_MAT_MUL, - 0, qnn_params, - 2, tensor_inputs, - 1, tensor_outputs} - }; + Qnn_Tensor_t tensor_inputs[] = { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() }; + Qnn_Tensor_t tensor_outputs[] = { *tensor_output.get_qnn_tensor() }; + Qnn_OpConfig_t op_config = { (Qnn_OpConfigVersion_t)1, + .v1 = { "ggml_op_mul_mat", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL, 0, + qnn_params, 2, tensor_inputs, 1, tensor_outputs } }; error = qnn_raw_interface.graphAddNode(graph_handle, op_config); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); goto failure; } - error = qnn_raw_interface.graphFinalize(graph_handle, - nullptr, nullptr); + error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); goto failure; } - error = qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, - tensor_outputs, 1, - nullptr, nullptr); + error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); if (ctx->device == QNN_BACKEND_NPU) { if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) { QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n"); @@ -370,24 +324,18 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context* ctx, goto failure; } - auto graph_item = std::make_tuple(graph_handle, - tensor_input0.get_qnn_tensor(), - tensor_input1.get_qnn_tensor(), - tensor_output.get_qnn_tensor()); + auto graph_item = std::make_tuple(graph_handle, tensor_input0.get_qnn_tensor(), tensor_input1.get_qnn_tensor(), + tensor_output.get_qnn_tensor()); instance->_qnn_graph_map[map_entry] = graph_item; - } - else { - auto& graph_item = instance->_qnn_graph_map[map_entry]; + } else { + auto &graph_item = instance->_qnn_graph_map[map_entry]; qnn::ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx); qnn::ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx); qnn::ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx); Qnn_Tensor_t tensor_inputs[] = { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() }; Qnn_Tensor_t tensor_outputs[] = { *tensor_output.get_qnn_tensor() }; - error = qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, - tensor_outputs, 1, - nullptr, nullptr); + error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); if (ctx->device == QNN_BACKEND_NPU) { if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) { QNN_LOG_WARN("NPU crashed. SSR detected. 
Caused QNN graph execute error\n"); @@ -401,181 +349,127 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context* ctx, failure: if (QNN_SUCCESS != error) { - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, src0->type, ggml_type_name(src0->type), - src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], - src0->nb[1], src0->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, src1->type, ggml_type_name(src1->type), - src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], - src1->nb[1], src1->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0], - dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 + ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], + src0->nb[0], src0->nb[1], src0->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 + ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], + src1->nb[0], src1->nb[1], src1->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 + ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], + dst->nb[1], dst->nb[2]); } perf.info(); } -static void ggml_qnn_repeat(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, const ggml_tensor* src1, - ggml_tensor* dst) { -} +static void ggml_qnn_repeat(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) {} -static void ggml_qnn_get_rows(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, const ggml_tensor* src1, - ggml_tensor* dst) { -} +static void ggml_qnn_get_rows(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) {} -static void ggml_qnn_acc(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, - const ggml_tensor* src1, ggml_tensor* dst) { -} +static void ggml_qnn_acc(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) {} -static void ggml_qnn_div(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, - const ggml_tensor* src1, ggml_tensor* dst) { -} +static void ggml_qnn_div(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) {} -static void ggml_qnn_gelu(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, const ggml_tensor* src1, - ggml_tensor* dst) { -} +static void ggml_qnn_gelu(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) {} -static void ggml_qnn_silu(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, const ggml_tensor* src1, - ggml_tensor* dst) { -} +static void ggml_qnn_silu(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) {} -static void ggml_qnn_gelu_quick(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, - const ggml_tensor* src1, ggml_tensor* dst) { -} +static void ggml_qnn_gelu_quick(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) 
{} -static void ggml_qnn_tanh(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, const ggml_tensor* src1, - ggml_tensor* dst) { -} +static void ggml_qnn_tanh(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) {} -static void ggml_qnn_relu(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, const ggml_tensor* src1, - ggml_tensor* dst) { -} +static void ggml_qnn_relu(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) {} -static void ggml_qnn_hardsigmoid(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, - const ggml_tensor* src1, ggml_tensor* dst) { -} +static void ggml_qnn_hardsigmoid(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) {} -static void ggml_qnn_hardswish(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, const ggml_tensor* src1, - ggml_tensor* dst) { -} +static void ggml_qnn_hardswish(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) {} -static void ggml_qnn_leaky_relu(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, - const ggml_tensor* src1, ggml_tensor* dst) { -} +static void ggml_qnn_leaky_relu(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) {} -static void ggml_qnn_sqr(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, - const ggml_tensor* src1, ggml_tensor* dst) { -} +static void ggml_qnn_sqr(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) {} -static void ggml_qnn_norm(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, const ggml_tensor* src1, - ggml_tensor* dst) { -} +static void ggml_qnn_norm(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) {} -static void ggml_qnn_group_norm(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, - const ggml_tensor* src1, ggml_tensor* dst) { -} +static void ggml_qnn_group_norm(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) {} -static void ggml_qnn_concat(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, const ggml_tensor* src1, - ggml_tensor* dst) { -} +static void ggml_qnn_concat(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) {} -static void ggml_qnn_upscale(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, const ggml_tensor* src1, - ggml_tensor* dst) { -} +static void ggml_qnn_upscale(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) {} -static void ggml_qnn_pad(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, - const ggml_tensor* src1, ggml_tensor* dst) { -} +static void ggml_qnn_pad(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) {} -static void ggml_qnn_rms_norm(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, const ggml_tensor* src1, - ggml_tensor* dst) { -} +static void ggml_qnn_rms_norm(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) {} -static void ggml_qnn_cpy(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, - const ggml_tensor* src1, ggml_tensor* dst) { -} +static void ggml_qnn_cpy(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) {} -static void 
ggml_qnn_dup(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, - const ggml_tensor* src1, ggml_tensor* dst) { +static void ggml_qnn_dup(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) { ggml_qnn_cpy(ctx, src0, dst, nullptr); (void)src1; } -static void ggml_qnn_mul_mat_id(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, - const ggml_tensor* src1, ggml_tensor* dst) { -} +static void ggml_qnn_mul_mat_id(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) {} -static void ggml_qnn_scale(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, const ggml_tensor* src1, - ggml_tensor* dst) { -} +static void ggml_qnn_scale(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) {} -static void ggml_qnn_clamp(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, const ggml_tensor* src1, - ggml_tensor* dst) { -} +static void ggml_qnn_clamp(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) {} -static void ggml_qnn_diag_mask_inf(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, - const ggml_tensor* src1, ggml_tensor* dst) { -} +static void ggml_qnn_diag_mask_inf(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) {} -static void ggml_qnn_soft_max(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, const ggml_tensor* src1, - ggml_tensor* dst) { -} +static void ggml_qnn_soft_max(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) {} -static void ggml_qnn_rope(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, const ggml_tensor* src1, - ggml_tensor* dst) { +static void ggml_qnn_rope(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) { GGML_ASSERT(ggml_is_contiguous(src0)); } -static void ggml_qnn_pool2d(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, const ggml_tensor* src1, - ggml_tensor* dst) { -} +static void ggml_qnn_pool2d(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) {} -static void ggml_qnn_im2col(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, const ggml_tensor* src1, - ggml_tensor* dst) { -} +static void ggml_qnn_im2col(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) {} -static void ggml_qnn_sum_rows(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, const ggml_tensor* src1, - ggml_tensor* dst) { +static void ggml_qnn_sum_rows(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) { GGML_ASSERT(ggml_is_contiguous(src0)); } -static void ggml_qnn_argsort(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, const ggml_tensor* src1, - ggml_tensor* dst) { +static void ggml_qnn_argsort(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) { GGML_ASSERT(ggml_is_contiguous(src0)); } -static void ggml_qnn_nop(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, - const ggml_tensor* src1, ggml_tensor* dst) { +static void ggml_qnn_nop(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) { (void)src0; (void)src1; (void)dst; @@ -583,33 +477,33 @@ static void ggml_qnn_nop(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, qnn::ggml_qnn_op_array_t 
qnn::ggml_qnn_op_array() { static constexpr const qnn::ggml_qnn_op_t kQnnOpsTable[GGML_OP_COUNT] = { - nullptr, // GGML_OP_NONE - nullptr, // GGML_OP_DUP + nullptr, // GGML_OP_NONE + nullptr, // GGML_OP_DUP ggml_qnn_add, // GGML_OP_ADD - nullptr, // GGML_OP_ADD1 - nullptr, // GGML_OP_ACC - nullptr, // GGML_OP_SUB - nullptr, // GGML_OP_MUL - nullptr, // GGML_OP_DIV - nullptr, // GGML_OP_SQR - nullptr, // GGML_OP_SQRT - nullptr, // GGML_OP_LOG - nullptr, // GGML_OP_SUM - nullptr, // GGML_OP_SUM_ROWS - nullptr, // GGML_OP_MEAN - nullptr, // GGML_OP_ARGMAX - nullptr, // GGML_OP_REPEAT - nullptr, // GGML_OP_REPEAT_BACK - nullptr, // GGML_OP_CONCAT - nullptr, // GGML_OP_SILU_BACK - nullptr, // GGML_OP_NORM - nullptr, // GGML_OP_RMS_NORM - nullptr, // GGML_OP_RMS_NORM_BACK - nullptr, // GGML_OP_GROUP_NORM + nullptr, // GGML_OP_ADD1 + nullptr, // GGML_OP_ACC + nullptr, // GGML_OP_SUB + nullptr, // GGML_OP_MUL + nullptr, // GGML_OP_DIV + nullptr, // GGML_OP_SQR + nullptr, // GGML_OP_SQRT + nullptr, // GGML_OP_LOG + nullptr, // GGML_OP_SUM + nullptr, // GGML_OP_SUM_ROWS + nullptr, // GGML_OP_MEAN + nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_REPEAT + nullptr, // GGML_OP_REPEAT_BACK + nullptr, // GGML_OP_CONCAT + nullptr, // GGML_OP_SILU_BACK + nullptr, // GGML_OP_NORM + nullptr, // GGML_OP_RMS_NORM + nullptr, // GGML_OP_RMS_NORM_BACK + nullptr, // GGML_OP_GROUP_NORM ggml_qnn_mul_mat, // GGML_OP_MUL_MAT - nullptr, // GGML_OP_MUL_MAT_ID - nullptr, // GGML_OP_OUT_PROD + nullptr, // GGML_OP_MUL_MAT_ID + nullptr, // GGML_OP_OUT_PROD nullptr, // GGML_OP_SCALE nullptr, // GGML_OP_SET diff --git a/ggml/src/ggml-qnn/backend-ops.hpp b/ggml/src/ggml-qnn/backend-ops.hpp index c3dd5de302289..01c23ecff9b16 100644 --- a/ggml/src/ggml-qnn/backend-ops.hpp +++ b/ggml/src/ggml-qnn/backend-ops.hpp @@ -1,17 +1,16 @@ #pragma once #include "ggml.h" + #include "backend.hpp" namespace qnn { - typedef void (*ggml_qnn_op_t)(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, - const ggml_tensor* src1, - ggml_tensor* dst); +typedef void (*ggml_qnn_op_t)(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst); - typedef const ggml_qnn_op_t(&ggml_qnn_op_array_t)[GGML_OP_COUNT]; +typedef const ggml_qnn_op_t (&ggml_qnn_op_array_t)[GGML_OP_COUNT]; - ggml_qnn_op_array_t ggml_qnn_op_array(); +ggml_qnn_op_array_t ggml_qnn_op_array(); -} +} // namespace qnn diff --git a/ggml/src/ggml-qnn/backend.hpp b/ggml/src/ggml-qnn/backend.hpp index dc40090ee6114..74bce38b7111c 100644 --- a/ggml/src/ggml-qnn/backend.hpp +++ b/ggml/src/ggml-qnn/backend.hpp @@ -2,6 +2,7 @@ #pragma once #include "ggml.h" + #include "ggml-backend.h" #include "qnn.hpp" @@ -11,8 +12,8 @@ struct ggml_backend_qnn_context { int threads; char name[GGML_MAX_NAME]; char lib[GGML_MAX_NAME]; - qnn::qnn_instance* instance; - ggml_backend* backend; + qnn::qnn_instance *instance; + ggml_backend *backend; QNN_INTERFACE_VER_TYPE raw_interface; QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; qnn::qcom_socinfo socinfo; diff --git a/ggml/src/ggml-qnn/logger.cpp b/ggml/src/ggml-qnn/logger.cpp index 43856c9f48a9f..8b29979224866 100644 --- a/ggml/src/ggml-qnn/logger.cpp +++ b/ggml/src/ggml-qnn/logger.cpp @@ -2,30 +2,26 @@ #include "logger.hpp" #include + #include #if (defined __ANDROID__) || (defined ANDROID) #include #endif -#define QNN_LOGBUF_LEN 4096 +#define QNN_LOGBUF_LEN 4096 -void qnn::internal_log(ggml_log_level level, const char* file, - const char* func, int line, - const char* format, ...) 
{ +void qnn::internal_log(ggml_log_level level, const char *file, const char *func, int line, const char *format, ...) { static std::mutex qnn_internal_log_mutex; - static char s_qnn_internal_log_buf[QNN_LOGBUF_LEN]; + static char s_qnn_internal_log_buf[QNN_LOGBUF_LEN]; { std::lock_guard lock(qnn_internal_log_mutex); - va_list args; + va_list args; va_start(args, format); - int len_prefix = - snprintf(s_qnn_internal_log_buf, QNN_LOGBUF_LEN, - "[%s, %d]: ", func, line); - int len = vsnprintf(s_qnn_internal_log_buf + len_prefix, - QNN_LOGBUF_LEN - len_prefix, format, args); + int len_prefix = snprintf(s_qnn_internal_log_buf, QNN_LOGBUF_LEN, "[%s, %d]: ", func, line); + int len = vsnprintf(s_qnn_internal_log_buf + len_prefix, QNN_LOGBUF_LEN - len_prefix, format, args); if (len < (QNN_LOGBUF_LEN - len_prefix)) { #if (defined __ANDROID__) || (defined ANDROID) // for Android APK @@ -38,32 +34,31 @@ void qnn::internal_log(ggml_log_level level, const char* file, } } -void qnn::sdk_logcallback(const char* fmt, QnnLog_Level_t level, - uint64_t timestamp, va_list argp) { +void qnn::sdk_logcallback(const char *fmt, QnnLog_Level_t level, uint64_t timestamp, va_list argp) { #if ENABLE_QNNSDK_LOG - static std::mutex log_mutex; + static std::mutex log_mutex; static unsigned char s_ggml_qnn_logbuf[QNN_LOGBUF_LEN]; - const char* log_level_desc = ""; + const char *log_level_desc = ""; switch (level) { - case QNN_LOG_LEVEL_ERROR: - log_level_desc = "ERROR"; - break; - case QNN_LOG_LEVEL_WARN: - log_level_desc = "WARNING"; - break; - case QNN_LOG_LEVEL_INFO: - log_level_desc = "INFO"; - break; - case QNN_LOG_LEVEL_DEBUG: - log_level_desc = "DEBUG"; - break; - case QNN_LOG_LEVEL_VERBOSE: - log_level_desc = "VERBOSE"; - break; - case QNN_LOG_LEVEL_MAX: - log_level_desc = "UNKNOWN"; - break; + case QNN_LOG_LEVEL_ERROR: + log_level_desc = "ERROR"; + break; + case QNN_LOG_LEVEL_WARN: + log_level_desc = "WARNING"; + break; + case QNN_LOG_LEVEL_INFO: + log_level_desc = "INFO"; + break; + case QNN_LOG_LEVEL_DEBUG: + log_level_desc = "DEBUG"; + break; + case QNN_LOG_LEVEL_VERBOSE: + log_level_desc = "VERBOSE"; + break; + case QNN_LOG_LEVEL_MAX: + log_level_desc = "UNKNOWN"; + break; } double ms = (double)timestamp / 1000000.0; @@ -71,7 +66,7 @@ void qnn::sdk_logcallback(const char* fmt, QnnLog_Level_t level, std::lock_guard lock(log_mutex); memset(s_ggml_qnn_logbuf, 0, QNN_LOGBUF_LEN); - vsnprintf(reinterpret_cast(s_ggml_qnn_logbuf), QNN_LOGBUF_LEN, fmt, argp); + vsnprintf(reinterpret_cast(s_ggml_qnn_logbuf), QNN_LOGBUF_LEN, fmt, argp); QNN_LOG_INFO("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggml_qnn_logbuf); } #endif diff --git a/ggml/src/ggml-qnn/logger.hpp b/ggml/src/ggml-qnn/logger.hpp index 003436da10fae..f81a1814e9756 100644 --- a/ggml/src/ggml-qnn/logger.hpp +++ b/ggml/src/ggml-qnn/logger.hpp @@ -2,48 +2,40 @@ #include -#include "QnnTypes.h" +#include "ggml.h" + #include "QnnCommon.h" #include "QnnInterface.h" +#include "QnnTypes.h" #include "System/QnnSystemInterface.h" -#include "ggml.h" - namespace qnn { - void internal_log(ggml_log_level level, const char* file, - const char* func, int line, - const char* format, ...); - +void internal_log(ggml_log_level level, const char *file, const char *func, int line, const char *format, ...); - void sdk_logcallback(const char* fmt, QnnLog_Level_t level, - uint64_t timestamp, va_list argp); -} +void sdk_logcallback(const char *fmt, QnnLog_Level_t level, uint64_t timestamp, va_list argp); +} // namespace qnn // 
================================================================================================= // // QNN backend internal log function // // ================================================================================================= -#define QNN_LOG_ERROR(...) \ - qnn::internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define QNN_LOG_ERROR(...) qnn::internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#define QNN_LOG_WARN(...) \ - qnn::internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define QNN_LOG_WARN(...) qnn::internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#define QNN_LOG_INFO(...) \ - qnn::internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define QNN_LOG_INFO(...) qnn::internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) #ifdef NDEBUG -#define ENABLE_QNNBACKEND_DEBUG 0 // for troubleshooting QNN backend -#define ENABLE_QNNSDK_LOG 0 // enable/disable QNN SDK's internal log +#define ENABLE_QNNBACKEND_DEBUG 0 // for troubleshooting QNN backend +#define ENABLE_QNNSDK_LOG 0 // enable/disable QNN SDK's internal log #else -#define ENABLE_QNNBACKEND_DEBUG 1 // for troubleshooting QNN backend -#define ENABLE_QNNSDK_LOG 1 // enable/disable QNN SDK's internal log +#define ENABLE_QNNBACKEND_DEBUG 1 // for troubleshooting QNN backend +#define ENABLE_QNNSDK_LOG 1 // enable/disable QNN SDK's internal log #endif #if ENABLE_QNNBACKEND_DEBUG -#define QNN_LOG_DEBUG(...) \ - qnn::internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define QNN_LOG_DEBUG(...) qnn::internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) #else #define QNN_LOG_DEBUG(...) 
#endif diff --git a/ggml/src/ggml-qnn/qnn-types.hpp b/ggml/src/ggml-qnn/qnn-types.hpp index db1d592f08a20..7c245651032c0 100644 --- a/ggml/src/ggml-qnn/qnn-types.hpp +++ b/ggml/src/ggml-qnn/qnn-types.hpp @@ -1,59 +1,55 @@ #pragma once -#include "QnnTypes.h" #include "QnnCommon.h" #include "QnnInterface.h" +#include "QnnTypes.h" #include "Saver/QnnSaver.h" #include "System/QnnSystemInterface.h" namespace qnn { - // ================================================================================================= - // - // helper data type / data structure / macros / functions of - // Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK - // ref:https://github.com/pytorch/executorch/tree/main/backends/qualcomm - // ================================================================================================= - enum sdk_profile_level { - profile_off = 0, - profile_basic = 1, - profile_detail = 2 - }; - - enum qcom_htp_arch { - NONE = 0, - V68 = 68, - V69 = 69, - V73 = 73, - V75 = 75, - }; - - enum qcom_chipset { - UNKNOWN_SM = 0, - SM8450 = 36, // v69 - SM8475 = 42, // v69 - SM8550 = 43, // v73 - SM8650 = 57, // v75 - }; - - struct qcom_socinfo { - uint32_t soc_model; - size_t htp_arch; - size_t vtcm_size_in_mb; - }; - - using pfn_rpc_mem_init = void (*)(void); - using pfn_rpc_mem_deinit = void (*)(void); - using pfn_rpc_mem_alloc = void* (*) (int, uint32_t, int); - using pfn_rpc_mem_free = void (*)(void*); - using pfn_rpc_mem_to_fd = int (*)(void*); - - using pfn_qnnsaver_initialize = decltype(QnnSaver_initialize); - using pfn_qnninterface_getproviders = decltype(QnnInterface_getProviders); - using pfn_qnnsysteminterface_getproviders = decltype(QnnSystemInterface_getProviders); -} - -#define QNN_VER_PTR(x) (&((x).v1)) // TODO: remove this macro after we have a separate header for QNN - -#define RPCMEM_DEFAULT_FLAGS 1 -#define RPCMEM_HEAP_ID_SYSTEM 25 +// ================================================================================================= +// +// helper data type / data structure / macros / functions of +// Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK +// ref:https://github.com/pytorch/executorch/tree/main/backends/qualcomm +// ================================================================================================= +enum sdk_profile_level { profile_off = 0, profile_basic = 1, profile_detail = 2 }; + +enum qcom_htp_arch { + NONE = 0, + V68 = 68, + V69 = 69, + V73 = 73, + V75 = 75, +}; + +enum qcom_chipset { + UNKNOWN_SM = 0, + SM8450 = 36, // v69 + SM8475 = 42, // v69 + SM8550 = 43, // v73 + SM8650 = 57, // v75 +}; + +struct qcom_socinfo { + uint32_t soc_model; + size_t htp_arch; + size_t vtcm_size_in_mb; +}; + +using pfn_rpc_mem_init = void (*)(void); +using pfn_rpc_mem_deinit = void (*)(void); +using pfn_rpc_mem_alloc = void *(*)(int, uint32_t, int); +using pfn_rpc_mem_free = void (*)(void *); +using pfn_rpc_mem_to_fd = int (*)(void *); + +using pfn_qnnsaver_initialize = decltype(QnnSaver_initialize); +using pfn_qnninterface_getproviders = decltype(QnnInterface_getProviders); +using pfn_qnnsysteminterface_getproviders = decltype(QnnSystemInterface_getProviders); +} // namespace qnn + +#define QNN_VER_PTR(x) (&((x).v1)) // TODO: remove this macro after we have a separate header for QNN + +#define RPCMEM_DEFAULT_FLAGS 1 +#define RPCMEM_HEAP_ID_SYSTEM 25 diff --git a/ggml/src/ggml-qnn/qnn.hpp b/ggml/src/ggml-qnn/qnn.hpp index 6caefb75644f7..bccc3a4ba32ac 100644 --- a/ggml/src/ggml-qnn/qnn.hpp +++ b/ggml/src/ggml-qnn/qnn.hpp 
@@ -1,1143 +1,961 @@ #pragma once #include + +#include #include #include #include -#include // header file of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK // https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct -#include "QnnTypes.h" -#include "QnnCommon.h" -#include "QnnInterface.h" -#include "QnnContext.h" -#include "QnnBackend.h" -#include "QnnGraph.h" -#include "QnnProperty.h" -#include "QnnTensor.h" -#include "System/QnnSystemInterface.h" -#include "HTP/QnnHtpDevice.h" -#include "HTP/QnnHtpGraph.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include "qnn-types.hpp" #include "utils.hpp" namespace qnn { - // ================================================================================================= - // - // wrapper class of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK - // ref:https://github.com/pytorch/executorch/tree/main/backends/qualcomm - // ================================================================================================= - class qnn_interface { - -#define DEFINE_SHIM_FUNCTION_INTERFACE(F, pointer_name) \ - template inline auto qnn_##F(Args... args) const { \ - return (_qnn_interface->QNN_INTERFACE_VER_NAME.pointer_name)( \ - std::forward(args)...); \ +// ================================================================================================= +// +// wrapper class of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK +// ref:https://github.com/pytorch/executorch/tree/main/backends/qualcomm +// ================================================================================================= +class qnn_interface { + +#define DEFINE_SHIM_FUNCTION_INTERFACE(F, pointer_name) \ + template \ + inline auto qnn_##F(Args... args) const { \ + return (_qnn_interface->QNN_INTERFACE_VER_NAME.pointer_name)(std::forward(args)...); \ } -#define DEFINE_SHIM_FUNCTION_SYS_INTERFACE(F, pointer_name) \ - template inline auto qnn_##F(Args... args) const { \ - return ( \ - _qnn_sys_interface->QNN_SYSTEM_INTERFACE_VER_NAME.pointer_name)( \ - std::forward(args)...); \ +#define DEFINE_SHIM_FUNCTION_SYS_INTERFACE(F, pointer_name) \ + template \ + inline auto qnn_##F(Args... 
args) const { \ + return (_qnn_sys_interface->QNN_SYSTEM_INTERFACE_VER_NAME.pointer_name)(std::forward(args)...); \ } - friend class qnn_instance; - - public: - qnn_interface() = default; + friend class qnn_instance; - // QnnBackend - DEFINE_SHIM_FUNCTION_INTERFACE(backend_create, backendCreate); +public: + qnn_interface() = default; - DEFINE_SHIM_FUNCTION_INTERFACE(backend_free, backendFree); + // QnnBackend + DEFINE_SHIM_FUNCTION_INTERFACE(backend_create, backendCreate); - DEFINE_SHIM_FUNCTION_INTERFACE(backend_register_op_package, - backendRegisterOpPackage); + DEFINE_SHIM_FUNCTION_INTERFACE(backend_free, backendFree); - DEFINE_SHIM_FUNCTION_INTERFACE(backend_validate_op_config, - backendValidateOpConfig); + DEFINE_SHIM_FUNCTION_INTERFACE(backend_register_op_package, backendRegisterOpPackage); - DEFINE_SHIM_FUNCTION_INTERFACE(backend_get_api_version, - backendGetApiVersion); + DEFINE_SHIM_FUNCTION_INTERFACE(backend_validate_op_config, backendValidateOpConfig); - // QnnDevice - DEFINE_SHIM_FUNCTION_INTERFACE(device_create, deviceCreate); + DEFINE_SHIM_FUNCTION_INTERFACE(backend_get_api_version, backendGetApiVersion); - DEFINE_SHIM_FUNCTION_INTERFACE(device_free, deviceFree); + // QnnDevice + DEFINE_SHIM_FUNCTION_INTERFACE(device_create, deviceCreate); - DEFINE_SHIM_FUNCTION_INTERFACE(device_get_infrastructure, - deviceGetInfrastructure); + DEFINE_SHIM_FUNCTION_INTERFACE(device_free, deviceFree); - DEFINE_SHIM_FUNCTION_INTERFACE(device_get_platform_info, - deviceGetPlatformInfo); + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_infrastructure, deviceGetInfrastructure); - DEFINE_SHIM_FUNCTION_INTERFACE(device_get_info, deviceGetInfo); + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_platform_info, deviceGetPlatformInfo); - // QnnContext - DEFINE_SHIM_FUNCTION_INTERFACE(context_create, contextCreate); + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_info, deviceGetInfo); - DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary_size, - contextGetBinarySize); + // QnnContext + DEFINE_SHIM_FUNCTION_INTERFACE(context_create, contextCreate); - DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary, contextGetBinary); + DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary_size, contextGetBinarySize); - DEFINE_SHIM_FUNCTION_INTERFACE(context_create_from_binary, - contextCreateFromBinary); + DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary, contextGetBinary); - DEFINE_SHIM_FUNCTION_INTERFACE(context_free, contextFree); + DEFINE_SHIM_FUNCTION_INTERFACE(context_create_from_binary, contextCreateFromBinary); - // QnnGraph - DEFINE_SHIM_FUNCTION_INTERFACE(graph_create, graphCreate); + DEFINE_SHIM_FUNCTION_INTERFACE(context_free, contextFree); - DEFINE_SHIM_FUNCTION_INTERFACE(graph_add_node, graphAddNode); + // QnnGraph + DEFINE_SHIM_FUNCTION_INTERFACE(graph_create, graphCreate); - DEFINE_SHIM_FUNCTION_INTERFACE(graph_finalize, graphFinalize); + DEFINE_SHIM_FUNCTION_INTERFACE(graph_add_node, graphAddNode); - DEFINE_SHIM_FUNCTION_INTERFACE(graph_execute, graphExecute); + DEFINE_SHIM_FUNCTION_INTERFACE(graph_finalize, graphFinalize); - DEFINE_SHIM_FUNCTION_INTERFACE(graph_retrieve, graphRetrieve); + DEFINE_SHIM_FUNCTION_INTERFACE(graph_execute, graphExecute); - // QnnLog - DEFINE_SHIM_FUNCTION_INTERFACE(log_create, logCreate); + DEFINE_SHIM_FUNCTION_INTERFACE(graph_retrieve, graphRetrieve); - DEFINE_SHIM_FUNCTION_INTERFACE(log_free, logFree); + // QnnLog + DEFINE_SHIM_FUNCTION_INTERFACE(log_create, logCreate); - DEFINE_SHIM_FUNCTION_INTERFACE(log_set_log_level, logSetLogLevel); + DEFINE_SHIM_FUNCTION_INTERFACE(log_free, logFree); - 
// QnnProfile - DEFINE_SHIM_FUNCTION_INTERFACE(profile_create, profileCreate); + DEFINE_SHIM_FUNCTION_INTERFACE(log_set_log_level, logSetLogLevel); - DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_events, profileGetEvents); + // QnnProfile + DEFINE_SHIM_FUNCTION_INTERFACE(profile_create, profileCreate); - DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_sub_events, profileGetSubEvents); + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_events, profileGetEvents); - DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_event_data, profileGetEventData); + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_sub_events, profileGetSubEvents); - DEFINE_SHIM_FUNCTION_INTERFACE(profile_free, profileFree); + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_event_data, profileGetEventData); - // QnnMem - DEFINE_SHIM_FUNCTION_INTERFACE(mem_register, memRegister); + DEFINE_SHIM_FUNCTION_INTERFACE(profile_free, profileFree); - DEFINE_SHIM_FUNCTION_INTERFACE(mem_de_register, memDeRegister); + // QnnMem + DEFINE_SHIM_FUNCTION_INTERFACE(mem_register, memRegister); - // QnnProperty - DEFINE_SHIM_FUNCTION_INTERFACE(property_has_capability, - propertyHasCapability); + DEFINE_SHIM_FUNCTION_INTERFACE(mem_de_register, memDeRegister); - // QnnTensor - DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_context_tensor, - tensorCreateContextTensor); + // QnnProperty + DEFINE_SHIM_FUNCTION_INTERFACE(property_has_capability, propertyHasCapability); - DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_graph_tensor, - tensorCreateGraphTensor); + // QnnTensor + DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_context_tensor, tensorCreateContextTensor); - // QnnSystem - DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_create, - systemContextCreate); + DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_graph_tensor, tensorCreateGraphTensor); - DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_get_binary_info, - systemContextGetBinaryInfo); - - DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_free, systemContextFree); - - void set_qnn_interface(const QnnInterface_t* qnn_interface) { - _qnn_interface = qnn_interface; - } + // QnnSystem + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_create, systemContextCreate); - void set_qnn_system_interface( - const QnnSystemInterface_t* qnn_sys_interface) { - _qnn_sys_interface = qnn_sys_interface; - } + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_get_binary_info, systemContextGetBinaryInfo); - uint32_t get_backend_id() const { return _qnn_interface->backendId; } + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_free, systemContextFree); - bool is_loaded() const { - return ((_qnn_sys_interface != nullptr) && (_qnn_interface != nullptr)); - } + void set_qnn_interface(const QnnInterface_t *qnn_interface) { _qnn_interface = qnn_interface; } - private: - const QnnInterface_t* _qnn_interface = nullptr; + void set_qnn_system_interface(const QnnSystemInterface_t *qnn_sys_interface) { + _qnn_sys_interface = qnn_sys_interface; + } - const QnnSystemInterface_t* _qnn_sys_interface = nullptr; - }; + uint32_t get_backend_id() const { return _qnn_interface->backendId; } + bool is_loaded() const { return ((_qnn_sys_interface != nullptr) && (_qnn_interface != nullptr)); } - class qnn_instance { - public: - using BackendIdType = decltype(QnnInterface_t{}.backendId); +private: + const QnnInterface_t *_qnn_interface = nullptr; - explicit qnn_instance(const std::string& lib_path, - const std::string& backend_name, - const std::string& model_name) - : _lib_path(std::move(lib_path)) - , _backend_name(std::move(backend_name)) - , 
_model_name(std::move(model_name)) {}; + const QnnSystemInterface_t *_qnn_sys_interface = nullptr; +}; - ~qnn_instance() {} +class qnn_instance { +public: + using BackendIdType = decltype(QnnInterface_t{}.backendId); - int qnn_init(const QnnSaver_Config_t** saver_config) { - BackendIdType backend_id = QNN_BACKEND_ID_NULL; - QNN_LOG_DEBUG("enter qni_init\n"); + explicit qnn_instance(const std::string &lib_path, const std::string &backend_name, const std::string &model_name) : + _lib_path(std::move(lib_path)), _backend_name(std::move(backend_name)), _model_name(std::move(model_name)) {}; - std::lock_guard lock(_init_mutex); + ~qnn_instance() {} - if (0 != load_system()) { - QNN_LOG_WARN("can not load QNN system lib, pls check why?\n"); - return 1; - } - else { - QNN_LOG_DEBUG("load QNN system lib successfully\n"); - } + int qnn_init(const QnnSaver_Config_t **saver_config) { + BackendIdType backend_id = QNN_BACKEND_ID_NULL; + QNN_LOG_DEBUG("enter qni_init\n"); - std::string backend_lib_path = _lib_path + _backend_name; - if (0 == _lib_path_to_backend_id.count(backend_lib_path)) { - int is_load_ok = load_backend(backend_lib_path, saver_config); - if (0 != is_load_ok) { - QNN_LOG_WARN("failed to load QNN backend\n"); - return 2; - } - } + std::lock_guard lock(_init_mutex); - backend_id = _lib_path_to_backend_id[backend_lib_path]; - if (0 == _loaded_backend.count(backend_id) || - 0 == _loaded_lib_handle.count(backend_id)) { - QNN_LOG_WARN("library %s is loaded but loaded backend count=%zu, " - "loaded lib_handle count=%zu\n", - backend_lib_path.c_str(), _loaded_backend.count(backend_id), - _loaded_lib_handle.count(backend_id)); - return 3; - } - - _qnn_interface.set_qnn_interface(_loaded_backend[backend_id]); - - _qnn_interface.qnn_log_create(qnn::sdk_logcallback, _qnn_log_level, &_qnn_log_handle); - if (nullptr == _qnn_log_handle) { - // NPU backend not work on Qualcomm SoC equipped low-end phone - QNN_LOG_WARN("why failed to initialize qnn log\n"); - return 4; - } - else { - QNN_LOG_DEBUG("initialize qnn log successfully\n"); - } + if (0 != load_system()) { + QNN_LOG_WARN("can not load QNN system lib, pls check why?\n"); + return 1; + } else { + QNN_LOG_DEBUG("load QNN system lib successfully\n"); + } - std::vector temp_backend_config; - _qnn_interface.qnn_backend_create( - _qnn_log_handle, - temp_backend_config.empty() ? 
nullptr : temp_backend_config.data(), - &_qnn_backend_handle); - if (nullptr == _qnn_backend_handle) { - QNN_LOG_WARN("why failed to initialize qnn backend\n"); - return 5; - } - else { - QNN_LOG_DEBUG("initialize qnn backend successfully\n"); + std::string backend_lib_path = _lib_path + _backend_name; + if (0 == _lib_path_to_backend_id.count(backend_lib_path)) { + int is_load_ok = load_backend(backend_lib_path, saver_config); + if (0 != is_load_ok) { + QNN_LOG_WARN("failed to load QNN backend\n"); + return 2; } + } - if (nullptr != _qnn_raw_interface.propertyHasCapability) { - Qnn_ErrorHandle_t qnn_status = - _qnn_raw_interface.propertyHasCapability(QNN_PROPERTY_GROUP_DEVICE); - if (QNN_PROPERTY_NOT_SUPPORTED == qnn_status) { - QNN_LOG_WARN("device property is not supported\n"); - } - if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnn_status) { - QNN_LOG_WARN("device property is not known to backend\n"); + backend_id = _lib_path_to_backend_id[backend_lib_path]; + if (0 == _loaded_backend.count(backend_id) || 0 == _loaded_lib_handle.count(backend_id)) { + QNN_LOG_WARN( + "library %s is loaded but loaded backend count=%zu, " + "loaded lib_handle count=%zu\n", + backend_lib_path.c_str(), _loaded_backend.count(backend_id), _loaded_lib_handle.count(backend_id)); + return 3; + } + + _qnn_interface.set_qnn_interface(_loaded_backend[backend_id]); + + _qnn_interface.qnn_log_create(qnn::sdk_logcallback, _qnn_log_level, &_qnn_log_handle); + if (nullptr == _qnn_log_handle) { + // NPU backend not work on Qualcomm SoC equipped low-end phone + QNN_LOG_WARN("why failed to initialize qnn log\n"); + return 4; + } else { + QNN_LOG_DEBUG("initialize qnn log successfully\n"); + } + + std::vector temp_backend_config; + _qnn_interface.qnn_backend_create( + _qnn_log_handle, temp_backend_config.empty() ? nullptr : temp_backend_config.data(), &_qnn_backend_handle); + if (nullptr == _qnn_backend_handle) { + QNN_LOG_WARN("why failed to initialize qnn backend\n"); + return 5; + } else { + QNN_LOG_DEBUG("initialize qnn backend successfully\n"); + } + + if (nullptr != _qnn_raw_interface.propertyHasCapability) { + Qnn_ErrorHandle_t qnn_status = _qnn_raw_interface.propertyHasCapability(QNN_PROPERTY_GROUP_DEVICE); + if (QNN_PROPERTY_NOT_SUPPORTED == qnn_status) { + QNN_LOG_WARN("device property is not supported\n"); + } + if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnn_status) { + QNN_LOG_WARN("device property is not known to backend\n"); + } + } + + Qnn_ErrorHandle_t qnn_status = QNN_SUCCESS; + if (_backend_name.find("Htp") != std::variant_npos) { + const QnnDevice_PlatformInfo_t *p_info = nullptr; + _qnn_raw_interface.deviceGetPlatformInfo(nullptr, &p_info); + QNN_LOG_INFO("device counts %d", p_info->v1.numHwDevices); + QnnDevice_HardwareDeviceInfo_t *infos = p_info->v1.hwDevices; + QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = {}; + for (int i = 0; i < p_info->v1.numHwDevices; i++) { + QNN_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d", infos[i].v1.deviceId, infos[i].v1.deviceType, + infos[i].v1.numCores); + QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; + chipinfo = devinfo->onChipDevice; + QnnHtpDevice_Arch_t htp_arch = chipinfo.arch; + QNN_LOG_INFO("htp_type:%d(%s)", devinfo->devType, + (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? 
"ON_CHIP" : ""); + QNN_LOG_INFO("qualcomm soc_model:%d(%s), htp_arch:%d(%s), vtcm_size:%d MB", chipinfo.socModel, + qnn::get_chipset_desc(chipinfo.socModel), htp_arch, qnn::get_htparch_desc(htp_arch), + chipinfo.vtcmSize); + _soc_info = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize }; + } + _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info); + + QnnHtpDevice_CustomConfig_t soc_customconfig; + soc_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_SOC; + soc_customconfig.socModel = chipinfo.socModel; + QnnDevice_Config_t soc_devconfig; + soc_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; + soc_devconfig.customConfig = &soc_customconfig; + + QnnHtpDevice_CustomConfig_t arch_customconfig; + arch_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_ARCH; + arch_customconfig.arch.arch = chipinfo.arch; + arch_customconfig.arch.deviceId = 0; // Id of device to be used. If single device is used by default 0. + QnnDevice_Config_t arch_devconfig; + arch_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; + arch_devconfig.customConfig = &arch_customconfig; + + const QnnDevice_Config_t *p_deviceconfig[] = { &soc_devconfig, &arch_devconfig, nullptr }; + qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, p_deviceconfig, &_qnn_device_handle); + } else { + qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, nullptr, &_qnn_device_handle); + } + if (QNN_SUCCESS != qnn_status && QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnn_status) { + QNN_LOG_WARN("failed to create QNN device\n"); + } else { + QNN_LOG_INFO("create QNN device successfully\n"); + } + + if (qnn::sdk_profile_level::profile_off != _profile_level) { + QNN_LOG_INFO("profiling turned on; level = %d", _profile_level); + if (qnn::sdk_profile_level::profile_basic == _profile_level) { + QNN_LOG_INFO("basic profiling requested. creating Qnn Profile object\n"); + if (QNN_PROFILE_NO_ERROR != _qnn_raw_interface.profileCreate( + _qnn_backend_handle, QNN_PROFILE_LEVEL_BASIC, &_qnn_profile_handle)) { + QNN_LOG_WARN("unable to create profile handle in the backend\n"); + return 6; + } else { + QNN_LOG_DEBUG("initialize qnn profile successfully\n"); } - } - - Qnn_ErrorHandle_t qnn_status = QNN_SUCCESS; - if (_backend_name.find("Htp") != std::variant_npos) { - const QnnDevice_PlatformInfo_t* p_info = nullptr; - _qnn_raw_interface.deviceGetPlatformInfo(nullptr, &p_info); - QNN_LOG_INFO("device counts %d", p_info->v1.numHwDevices); - QnnDevice_HardwareDeviceInfo_t* infos = p_info->v1.hwDevices; - QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = { }; - for (int i = 0; i < p_info->v1.numHwDevices; i++) { - QNN_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d", infos[i].v1.deviceId, - infos[i].v1.deviceType, infos[i].v1.numCores); - QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; - chipinfo = devinfo->onChipDevice; - QnnHtpDevice_Arch_t htp_arch = chipinfo.arch; - QNN_LOG_INFO("htp_type:%d(%s)", devinfo->devType, (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? "ON_CHIP" : ""); - QNN_LOG_INFO("qualcomm soc_model:%d(%s), htp_arch:%d(%s), vtcm_size:%d MB", - chipinfo.socModel, qnn::get_chipset_desc(chipinfo.socModel), - htp_arch, qnn::get_htparch_desc(htp_arch), chipinfo.vtcmSize); - _soc_info = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize }; + } else if (qnn::sdk_profile_level::profile_detail == _profile_level) { + QNN_LOG_INFO("detailed profiling requested. 
Creating Qnn Profile object\n"); + if (QNN_PROFILE_NO_ERROR != _qnn_raw_interface.profileCreate(_qnn_backend_handle, + QNN_PROFILE_LEVEL_DETAILED, + &_qnn_profile_handle)) { + QNN_LOG_WARN("unable to create profile handle in the backend\n"); + return 7; + } else { + QNN_LOG_DEBUG("initialize qnn profile successfully\n"); } - _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info); - - QnnHtpDevice_CustomConfig_t soc_customconfig; - soc_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_SOC; - soc_customconfig.socModel = chipinfo.socModel; - QnnDevice_Config_t soc_devconfig; - soc_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; - soc_devconfig.customConfig = &soc_customconfig; - - QnnHtpDevice_CustomConfig_t arch_customconfig; - arch_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_ARCH; - arch_customconfig.arch.arch = chipinfo.arch; - arch_customconfig.arch.deviceId = 0; // Id of device to be used. If single device is used by default 0. - QnnDevice_Config_t arch_devconfig; - arch_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; - arch_devconfig.customConfig = &arch_customconfig; - - const QnnDevice_Config_t* p_deviceconfig[] = { &soc_devconfig, &arch_devconfig, nullptr }; - qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, p_deviceconfig, &_qnn_device_handle); - } - else { - qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, nullptr, &_qnn_device_handle); - } - if (QNN_SUCCESS != qnn_status && - QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnn_status) { - QNN_LOG_WARN("failed to create QNN device\n"); - } - else { - QNN_LOG_INFO("create QNN device successfully\n"); } + } - if (qnn::sdk_profile_level::profile_off != _profile_level) { - QNN_LOG_INFO("profiling turned on; level = %d", _profile_level); - if (qnn::sdk_profile_level::profile_basic == _profile_level) { - QNN_LOG_INFO("basic profiling requested. creating Qnn Profile object\n"); - if (QNN_PROFILE_NO_ERROR != - _qnn_raw_interface.profileCreate(_qnn_backend_handle, - QNN_PROFILE_LEVEL_BASIC, - &_qnn_profile_handle)) { - QNN_LOG_WARN("unable to create profile handle in the backend\n"); - return 6; - } - else { - QNN_LOG_DEBUG("initialize qnn profile successfully\n"); - } - } - else if (qnn::sdk_profile_level::profile_detail == _profile_level) { - QNN_LOG_INFO("detailed profiling requested. Creating Qnn Profile object\n"); - if (QNN_PROFILE_NO_ERROR != - _qnn_raw_interface.profileCreate(_qnn_backend_handle, - QNN_PROFILE_LEVEL_DETAILED, - &_qnn_profile_handle)) { - QNN_LOG_WARN("unable to create profile handle in the backend\n"); - return 7; - } - else { - QNN_LOG_DEBUG("initialize qnn profile successfully\n"); - } + _rpc_lib_handle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL); + if (nullptr == _rpc_lib_handle) { + QNN_LOG_WARN("failed to load qualcomm's rpc lib, error:%s\n", dlerror()); + return 8; + } else { + QNN_LOG_DEBUG("load rpcmem lib successfully\n"); + set_rpcmem_initialized(true); + } + _pfn_rpc_mem_init = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_init")); + _pfn_rpc_mem_deinit = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_deinit")); + _pfn_rpc_mem_alloc = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_alloc")); + _pfn_rpc_mem_free = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_free")); + _pfn_rpc_mem_to_fd = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_to_fd")); + if (nullptr == _pfn_rpc_mem_alloc || nullptr == _pfn_rpc_mem_free || nullptr == _pfn_rpc_mem_to_fd) { + QNN_LOG_WARN("unable to access symbols in QNN RPC lib. 
dlerror(): %s", dlerror()); + dlclose(_rpc_lib_handle); + return 9; + } + + if (nullptr != _pfn_rpc_mem_init) { // make Qualcomm's SoC equipped low-end phone happy + _pfn_rpc_mem_init(); + } + + /* TODO: not used, keep it for further usage + QnnContext_Config_t qnn_context_config = QNN_CONTEXT_CONFIG_INIT; + qnn_context_config.priority = QNN_PRIORITY_DEFAULT; + const QnnContext_Config_t * context_configs[] = {&qnn_context_config, nullptr}; + */ + _qnn_interface.qnn_context_create(_qnn_backend_handle, _qnn_device_handle, nullptr, &_qnn_context_handle); + if (nullptr == _qnn_context_handle) { + QNN_LOG_WARN("why failed to initialize qnn context\n"); + return 10; + } else { + QNN_LOG_DEBUG("initialize qnn context successfully\n"); + } + + if (_backend_name.find("Htp") != std::variant_npos) { + // TODO: faster approach to probe the accurate capacity of rpc ion memory + size_t candidate_size = 0; + uint8_t *rpc_buffer = nullptr; + const int size_in_mb = (1 << 20); + size_t probe_slots[] = { 1024, 1536, 2048 - 48, 2048 }; + size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); + for (size_t idx = 0; idx < probe_counts; idx++) { + rpc_buffer = static_cast(alloc_rpcmem(probe_slots[idx] * size_in_mb, 4)); + if (nullptr == rpc_buffer) { + QNN_LOG_INFO("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], strerror(errno)); + break; + } else { + candidate_size = probe_slots[idx]; + free_rpcmem(rpc_buffer); + rpc_buffer = nullptr; } } + if (candidate_size > _rpcmem_capacity) _rpcmem_capacity = candidate_size; + QNN_LOG_INFO("capacity of QNN rpc ion memory is about %d MB\n", _rpcmem_capacity); - _rpc_lib_handle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL); - if (nullptr == _rpc_lib_handle) { - QNN_LOG_WARN("failed to load qualcomm's rpc lib, error:%s\n", dlerror()); - return 8; - } - else { - QNN_LOG_DEBUG("load rpcmem lib successfully\n"); - set_rpcmem_initialized(true); + if (0 != init_htp_perfinfra()) { + QNN_LOG_WARN("initialize HTP performance failure"); } - _pfn_rpc_mem_init = reinterpret_cast( - dlsym(_rpc_lib_handle, "rpcmem_init")); - _pfn_rpc_mem_deinit = reinterpret_cast( - dlsym(_rpc_lib_handle, "rpcmem_deinit")); - _pfn_rpc_mem_alloc = reinterpret_cast( - dlsym(_rpc_lib_handle, "rpcmem_alloc")); - _pfn_rpc_mem_free = reinterpret_cast( - dlsym(_rpc_lib_handle, "rpcmem_free")); - _pfn_rpc_mem_to_fd = reinterpret_cast( - dlsym(_rpc_lib_handle, "rpcmem_to_fd")); - if (nullptr == _pfn_rpc_mem_alloc || nullptr == _pfn_rpc_mem_free || - nullptr == _pfn_rpc_mem_to_fd) { - QNN_LOG_WARN("unable to access symbols in QNN RPC lib. 
dlerror(): %s", dlerror()); - dlclose(_rpc_lib_handle); - return 9; + if (0 != set_rpc_polling()) { + QNN_LOG_WARN("set RPC polling failure"); } - - if (nullptr != _pfn_rpc_mem_init) { // make Qualcomm's SoC equipped low-end phone happy - _pfn_rpc_mem_init(); + if (0 != set_high_performance_mode()) { + QNN_LOG_WARN("set HTP high performance mode failure"); } + } - /* TODO: not used, keep it for further usage - QnnContext_Config_t qnn_context_config = QNN_CONTEXT_CONFIG_INIT; - qnn_context_config.priority = QNN_PRIORITY_DEFAULT; - const QnnContext_Config_t * context_configs[] = {&qnn_context_config, nullptr}; - */ - _qnn_interface.qnn_context_create( - _qnn_backend_handle, _qnn_device_handle, - nullptr, - &_qnn_context_handle); - if (nullptr == _qnn_context_handle) { - QNN_LOG_WARN("why failed to initialize qnn context\n"); - return 10; - } - else { - QNN_LOG_DEBUG("initialize qnn context successfully\n"); - } + QNN_LOG_DEBUG("leave qni_init\n"); - if (_backend_name.find("Htp") != std::variant_npos) { - //TODO: faster approach to probe the accurate capacity of rpc ion memory - size_t candidate_size = 0; - uint8_t* rpc_buffer = nullptr; - const int size_in_mb = (1 << 20); - size_t probe_slots[] = { 1024, 1536, 2048 - 48, 2048 }; - size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); - for (size_t idx = 0; idx < probe_counts; idx++) { - rpc_buffer = static_cast(alloc_rpcmem( - probe_slots[idx] * size_in_mb, 4)); - if (nullptr == rpc_buffer) { - QNN_LOG_INFO("alloc rpcmem %d (MB) failure, %s\n", - probe_slots[idx], strerror(errno)); - break; - } - else { - candidate_size = probe_slots[idx]; - free_rpcmem(rpc_buffer); - rpc_buffer = nullptr; - } - } - if (candidate_size > _rpcmem_capacity) - _rpcmem_capacity = candidate_size; - QNN_LOG_INFO("capacity of QNN rpc ion memory is about %d MB\n", _rpcmem_capacity); + return 0; + } - if (0 != init_htp_perfinfra()) { - QNN_LOG_WARN("initialize HTP performance failure"); - } - if (0 != set_rpc_polling()) { - QNN_LOG_WARN("set RPC polling failure"); - } - if (0 != set_high_performance_mode()) { - QNN_LOG_WARN("set HTP high performance mode failure"); - } - } + int qnn_finalize() { + int ret_status = 0; + Qnn_ErrorHandle_t error = QNN_SUCCESS; - QNN_LOG_DEBUG("leave qni_init\n"); + if (nullptr != _pfn_rpc_mem_deinit) // make Qualcomm's SoC equipped low-end phone happy + _pfn_rpc_mem_deinit(); - return 0; + if (dlclose(_rpc_lib_handle) != 0) { + QNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s\n", dlerror()); + } else { + QNN_LOG_DEBUG("succeed to close rpcmem lib\n"); } - int qnn_finalize() { - int ret_status = 0; - Qnn_ErrorHandle_t error = QNN_SUCCESS; - - if (nullptr != _pfn_rpc_mem_deinit) // make Qualcomm's SoC equipped low-end phone happy - _pfn_rpc_mem_deinit(); + if (_backend_name.find("Htp") != std::variant_npos) { + _qnn_htp_perfinfra->destroyPowerConfigId(_qnn_power_configid); + } - if (dlclose(_rpc_lib_handle) != 0) { - QNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s\n", dlerror()); - } - else { - QNN_LOG_DEBUG("succeed to close rpcmem lib\n"); + if (nullptr != _qnn_context_handle) { + error = _qnn_interface.qnn_context_free(_qnn_context_handle, _qnn_profile_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d\n", _qnn_interface.get_backend_id(), + QNN_GET_ERROR_CODE(error)); } + _qnn_context_handle = nullptr; + } - if (_backend_name.find("Htp") != std::variant_npos) { - _qnn_htp_perfinfra->destroyPowerConfigId(_qnn_power_configid); + if (nullptr != 
_qnn_profile_handle) { + error = _qnn_interface.qnn_profile_free(_qnn_profile_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d\n", _qnn_interface.get_backend_id(), + QNN_GET_ERROR_CODE(error)); } + _qnn_profile_handle = nullptr; + } - if (nullptr != _qnn_context_handle) { - error = _qnn_interface.qnn_context_free(_qnn_context_handle, - _qnn_profile_handle); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), - QNN_GET_ERROR_CODE(error)); - } - _qnn_context_handle = nullptr; + if (nullptr != _qnn_device_handle) { + error = _qnn_interface.qnn_device_free(_qnn_device_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN device_handle: ID %u, error %d\n", _qnn_interface.get_backend_id(), + QNN_GET_ERROR_CODE(error)); } + _qnn_device_handle = nullptr; + } - if (nullptr != _qnn_profile_handle) { - error = _qnn_interface.qnn_profile_free(_qnn_profile_handle); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), - QNN_GET_ERROR_CODE(error)); - } - _qnn_profile_handle = nullptr; + if (nullptr != _qnn_backend_handle) { + error = _qnn_interface.qnn_backend_free(_qnn_backend_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d\n", _qnn_interface.get_backend_id(), + QNN_GET_ERROR_CODE(error)); } + _qnn_backend_handle = nullptr; + } - if (nullptr != _qnn_device_handle) { - error = _qnn_interface.qnn_device_free(_qnn_device_handle); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN device_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), - QNN_GET_ERROR_CODE(error)); - } - _qnn_device_handle = nullptr; + if (nullptr != _qnn_log_handle) { + error = _qnn_interface.qnn_log_free(_qnn_log_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN log_handle: ID %u, error %d\n", _qnn_interface.get_backend_id(), + QNN_GET_ERROR_CODE(error)); } + _qnn_log_handle = nullptr; + } - if (nullptr != _qnn_backend_handle) { - error = _qnn_interface.qnn_backend_free(_qnn_backend_handle); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), - QNN_GET_ERROR_CODE(error)); - } - _qnn_backend_handle = nullptr; - } + unload_backend(); - if (nullptr != _qnn_log_handle) { - error = _qnn_interface.qnn_log_free(_qnn_log_handle); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN log_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), - QNN_GET_ERROR_CODE(error)); - } - _qnn_log_handle = nullptr; - } + unload_system(); - unload_backend(); + return ret_status; + } - unload_system(); + const qnn_interface &get_qnn_interface() { + if (!_qnn_interface.is_loaded()) { + QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_interface; + } - return ret_status; + const QNN_INTERFACE_VER_TYPE &get_qnn_raw_interface() { + if (!_qnn_interface.is_loaded()) { + QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); } + return _qnn_raw_interface; + } - //TODO:keep it for further usage of offload the entire cgraph to a single QNN DAG directly - // which was used in Qualcomm's dedicated AI technology -#if 0 - int init_qnn_graph(const char* graph_name, bool debug, - uint8_t do_node_validation = true, - const QnnGraph_Config_t** graph_configs = nullptr) { - int result = 0; + 
const QNN_SYSTEM_INTERFACE_VER_TYPE &get_qnn_raw_system_interface() { + if (!_qnn_interface.is_loaded()) { + QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_raw_system_interface; + } - if (nullptr == graph_name) { - QNN_LOG_WARN("graph name is null\n"); - return 1; - } + const Qnn_LogHandle_t get_qnn_log_handle() { return _qnn_log_handle; } - if (!_graph_name.empty()) { - QNN_LOG_WARN("qnn model for graph %s already initialized\n", graph_name); - return 2; - } + const Qnn_ProfileHandle_t get_qnn_profile_handle() { return _qnn_profile_handle; } - if (!do_node_validation) { - QNN_LOG_WARN("node validation disabled, backend will not perform op " - "validation prior to adding node\n"); - } + const Qnn_DeviceHandle_t get_qnn_device_handle() { return _qnn_device_handle; } - _graph_name = graph_name; - _debug_tensor = debug; - _do_node_validations = do_node_validation; + const Qnn_BackendHandle_t get_qnn_backend_handle() { return _qnn_backend_handle; } - result = _qnn_raw_interface.graphCreate(_qnn_context_handle, graph_name, - graph_configs, &_qnn_graph_handle); - if (result != QNN_GRAPH_NO_ERROR || nullptr == _qnn_graph_handle) { - QNN_LOG_WARN("failed to create graph in qnn context\n"); - return 3; - } - else { - QNN_LOG_INFO("succeed to create graph %s, %p\n", graph_name, _qnn_graph_handle); - } + const Qnn_ContextHandle_t get_qnn_context_handle() { return _qnn_context_handle; } - return 0; - } + const QnnSystemContext_Handle_t get_qnn_system_handle() { return _qnn_system_handle; } - int finalize_qnn_graph() { - if (nullptr != _qnn_graph_handle) { - if (_qnn_raw_interface.graphFinalize(_qnn_graph_handle, - _qnn_profile_handle, - nullptr) != QNN_GRAPH_NO_ERROR) { - QNN_LOG_WARN("finalizing graph failure\n"); - } - } - else { - QNN_LOG_DEBUG("qnn graph handle is null\n"); - } + const Qnn_GraphHandle_t get_qnn_graph_handle() { return _qnn_graph_handle; } - return 0; + int init_htp_perfinfra() { + QnnDevice_Infrastructure_t device_infra = nullptr; + int error = _qnn_raw_interface.deviceGetInfrastructure(&device_infra); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to get qnn device infra\n"); + return 1; + } else { + QNN_LOG_INFO("HTP backend perf_infrastructure creation ok\n"); } -#endif - const qnn_interface& get_qnn_interface() { - if (!_qnn_interface.is_loaded()) { - QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); - } - return _qnn_interface; + QnnHtpDevice_Infrastructure_t *htp_infra = static_cast(device_infra); + QnnHtpDevice_PerfInfrastructure_t *htp_perfinfra = &htp_infra->perfInfra; + uint32_t power_configid = 1; + uint32_t device_id = 0; + uint32_t core_id = 0; + htp_perfinfra->createPowerConfigId(device_id, core_id, &power_configid); + if (htp_infra->infraType != QNN_HTP_DEVICE_INFRASTRUCTURE_TYPE_PERF) { + QNN_LOG_INFO("HTP infra type = %d, which is not perf infra type", htp_infra->infraType); + } else { + QNN_LOG_INFO("HTP infra type = %d, which is perf infra type\n", htp_infra->infraType); } + _qnn_htp_perfinfra = htp_perfinfra; + _qnn_power_configid = power_configid; - const QNN_INTERFACE_VER_TYPE& get_qnn_raw_interface() { - if (!_qnn_interface.is_loaded()) { - QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); - } - return _qnn_raw_interface; - } + return 0; + } - const QNN_SYSTEM_INTERFACE_VER_TYPE& get_qnn_raw_system_interface() { - if (!_qnn_interface.is_loaded()) { - QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + int set_rpc_polling() { + if (_qnn_htp_perfinfra) { + 
QnnHtpPerfInfrastructure_PowerConfig_t rpc_polling_time; + memset(&rpc_polling_time, 0, sizeof(rpc_polling_time)); + rpc_polling_time.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; + // use rpc polling time recommended 0-10000 us + rpc_polling_time.rpcPollingTimeConfig = 9999; + + QnnHtpPerfInfrastructure_PowerConfig_t rpc_control_latency; + memset(&rpc_control_latency, 0, sizeof(rpc_control_latency)); + rpc_control_latency.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_CONTROL_LATENCY; + // use rpc control latency recommended 100 us, refer hexagon sdk + rpc_control_latency.rpcControlLatencyConfig = 100; + + const QnnHtpPerfInfrastructure_PowerConfig_t *power_configs[] = { &rpc_polling_time, &rpc_control_latency, + nullptr }; + Qnn_ErrorHandle_t qnn_status = _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs); + if (qnn_status != QNN_SUCCESS) { + QNN_LOG_WARN("set htp perf failed\n"); + } else { + QNN_LOG_INFO("set htp perf ok\n"); } + } else { + QNN_LOG_WARN("can't set htp perf\n"); } - return 0; + } + int set_high_performance_mode() { + if (nullptr == _qnn_htp_perfinfra) { + QNN_LOG_WARN("perf infra is null\n"); + return 1; + } + + QnnHtpPerfInfrastructure_PowerConfig_t power_config; + memset(&power_config, 0, sizeof(power_config)); + power_config.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3; + + power_config.dcvsV3Config.setDcvsEnable = 1; + power_config.dcvsV3Config.dcvsEnable = 0; + power_config.dcvsV3Config.contextId = _qnn_power_configid; + power_config.dcvsV3Config.powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE; + power_config.dcvsV3Config.setSleepLatency = 1; // true to consider Latency parameter otherwise false + power_config.dcvsV3Config.sleepLatency = 40; + power_config.dcvsV3Config.setBusParams = 1; // true to consider Bus parameter otherwise false + power_config.dcvsV3Config.setCoreParams = 1; // true to consider Core parameter otherwise false + power_config.dcvsV3Config.sleepDisable = 1; // true to consider sleep/LPM modes, false to enable + power_config.dcvsV3Config.setSleepDisable = + 1; // true to consider sleep disable/enable parameter otherwise false set sleep latency parameter + // set Bus Clock Parameters + power_config.dcvsV3Config.busVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + power_config.dcvsV3Config.busVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + power_config.dcvsV3Config.busVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + // set Core Clock Parameters + power_config.dcvsV3Config.coreVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + power_config.dcvsV3Config.coreVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + power_config.dcvsV3Config.coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + + // set power config with different performance parameters + const QnnHtpPerfInfrastructure_PowerConfig_t *power_configs[] = { &power_config, nullptr }; + Qnn_ErrorHandle_t qnn_status = QNN_SUCCESS; + qnn_status = _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs); + if (qnn_status != QNN_SUCCESS) { + QNN_LOG_WARN("set htp high performance mode failed\n"); + } else { + QNN_LOG_INFO("set htp high performance mode ok\n"); + } + + return 0; + } - const Qnn_DeviceHandle_t 
get_qnn_device_handle() { - return _qnn_device_handle; - } + std::string &get_qnn_graph_name() { return _graph_name; } - const Qnn_BackendHandle_t get_qnn_backend_handle() { - return _qnn_backend_handle; - } + bool is_rpcmem_initialized() { return _rpcmem_initialized; } - const Qnn_ContextHandle_t get_qnn_context_handle() { - return _qnn_context_handle; - } + void set_rpcmem_initialized(bool initialized) { _rpcmem_initialized = initialized; } - const QnnSystemContext_Handle_t get_qnn_system_handle() { - return _qnn_system_handle; - } + size_t get_rpcmem_capacity() { return _rpcmem_capacity; } - const Qnn_GraphHandle_t get_qnn_graph_handle() { return _qnn_graph_handle; } + bool is_rpcmem_registered(Qnn_MemHandle_t handle) { return _qnn_mem_set.count(handle) != 0U; } - int init_htp_perfinfra() { - QnnDevice_Infrastructure_t device_infra = nullptr; - int error = _qnn_raw_interface.deviceGetInfrastructure(&device_infra); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to get qnn device infra\n"); - return 1; - } - else { - QNN_LOG_INFO("HTP backend perf_infrastructure creation ok\n"); - } + void *alloc_rpcmem(size_t bytes, size_t alignment) { + if (!_rpcmem_initialized) { + QNN_LOG_WARN("rpc memory not initialized\n"); + return nullptr; + } - QnnHtpDevice_Infrastructure_t* htp_infra = static_cast(device_infra); - QnnHtpDevice_PerfInfrastructure_t* htp_perfinfra = &htp_infra->perfInfra; - uint32_t power_configid = 1; - uint32_t device_id = 0; - uint32_t core_id = 0; - htp_perfinfra->createPowerConfigId(device_id, core_id, &power_configid); - if (htp_infra->infraType != QNN_HTP_DEVICE_INFRASTRUCTURE_TYPE_PERF) { - QNN_LOG_INFO("HTP infra type = %d, which is not perf infra type", htp_infra->infraType); - } - else { - QNN_LOG_INFO("HTP infra type = %d, which is perf infra type\n", htp_infra->infraType); - } - _qnn_htp_perfinfra = htp_perfinfra; - _qnn_power_configid = power_configid; - - return 0; - } - - int set_rpc_polling() { - if (_qnn_htp_perfinfra) { - QnnHtpPerfInfrastructure_PowerConfig_t rpc_polling_time; - memset(&rpc_polling_time, 0, sizeof(rpc_polling_time)); - rpc_polling_time.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; - //use rpc polling time recommended 0-10000 us - rpc_polling_time.rpcPollingTimeConfig = 9999; - - QnnHtpPerfInfrastructure_PowerConfig_t rpc_control_latency; - memset(&rpc_control_latency, 0, sizeof(rpc_control_latency)); - rpc_control_latency.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_CONTROL_LATENCY; - //use rpc control latency recommended 100 us, refer hexagon sdk - rpc_control_latency.rpcControlLatencyConfig = 100; - - const QnnHtpPerfInfrastructure_PowerConfig_t* power_configs[] = { - &rpc_polling_time, - &rpc_control_latency, - nullptr }; - Qnn_ErrorHandle_t qnn_status = _qnn_htp_perfinfra->setPowerConfig( - _qnn_power_configid, - power_configs); - if (qnn_status != QNN_SUCCESS) { - QNN_LOG_WARN("set htp perf failed\n"); - } - else { - QNN_LOG_INFO("set htp perf ok\n"); - } - } - else { - QNN_LOG_WARN("can't set htp perf\n"); - } + auto allocate_bytes = static_cast(bytes + alignment); + void *buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, allocate_bytes); + if (buf == nullptr) { + QNN_LOG_WARN("failed to allocate rpc memory\n"); + return nullptr; + } - return 0; + auto aligned_buf = reinterpret_cast(qnn::align_to(alignment, reinterpret_cast(buf))); + bool status = _rpcmem_store_map.insert(std::pair(aligned_buf, buf)).second; + if (!status) { + QNN_LOG_WARN("failed to allocate rpc 
memory\n"); + _pfn_rpc_mem_free(buf); } - int set_high_performance_mode() { - if (nullptr == _qnn_htp_perfinfra) { - QNN_LOG_WARN("perf intra is null\n"); - return 1; - } + return aligned_buf; + } - QnnHtpPerfInfrastructure_PowerConfig_t power_config; - memset(&power_config, 0, sizeof(power_config)); - power_config.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3; - - power_config.dcvsV3Config.setDcvsEnable = 1; - power_config.dcvsV3Config.dcvsEnable = 0; - power_config.dcvsV3Config.contextId = _qnn_power_configid; - power_config.dcvsV3Config.powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE; - power_config.dcvsV3Config.setSleepLatency = - 1; // true to consider Latency parameter otherwise false - power_config.dcvsV3Config.sleepLatency = 40; - power_config.dcvsV3Config.setBusParams = - 1; // true to consider Bus parameter otherwise false - power_config.dcvsV3Config.setCoreParams = - 1; // true to consider Core parameter otherwise false - power_config.dcvsV3Config.sleepDisable = - 1; // true to consider sleep/LPM modes, false to enable - power_config.dcvsV3Config.setSleepDisable = - 1; // true to consider sleep disable/enable parameter otherwise false set sleep latency parameter - // set Bus Clock Parameters - power_config.dcvsV3Config.busVoltageCornerMin = - DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - power_config.dcvsV3Config.busVoltageCornerTarget = - DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - power_config.dcvsV3Config.busVoltageCornerMax = - DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - // set Core Clock Parameters - power_config.dcvsV3Config.coreVoltageCornerMin = - DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - power_config.dcvsV3Config.coreVoltageCornerTarget = - DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - power_config.dcvsV3Config.coreVoltageCornerMax = - DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - - // set power config with different performance parameters - const QnnHtpPerfInfrastructure_PowerConfig_t* power_configs[] = { - &power_config, nullptr }; - Qnn_ErrorHandle_t qnn_status = QNN_SUCCESS; - qnn_status = _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs); - if (qnn_status != QNN_SUCCESS) { - QNN_LOG_WARN("set htp high performance mode failed\n"); - } - else { - QNN_LOG_INFO("set htp high performance mode ok\n"); - } + void free_rpcmem(void *buf) { + if (!_rpcmem_initialized) { + QNN_LOG_WARN("rpc memory not initialized\n"); + } else if (0 == _rpcmem_store_map.count(buf)) { + QNN_LOG_WARN("no allocated tensor\n"); + } else { + _pfn_rpc_mem_free(_rpcmem_store_map[buf]); + _rpcmem_store_map.erase(buf); + } + } - return 0; + int32_t rpcmem_to_fd(void *buf) { + int32_t mem_fd = -1; + if (!is_rpcmem_initialized()) { + QNN_LOG_WARN("rpc memory not initialized\n"); + } else { + mem_fd = _pfn_rpc_mem_to_fd(buf); } - std::string& get_qnn_graph_name() { return _graph_name; } + return mem_fd; + } - bool is_rpcmem_initialized() { return _rpcmem_initialized; } + int register_rpcmem(void *p_data, Qnn_Tensor_t *p_tensor) { + if (nullptr == p_data || (nullptr == p_tensor)) { + QNN_LOG_WARN("invalid param\n"); + return 1; + } - void set_rpcmem_initialized(bool initialized) { - _rpcmem_initialized = initialized; + if (!is_rpcmem_initialized()) { + QNN_LOG_WARN("rpc memory not initialized\n"); + return 2; } - size_t get_rpcmem_capacity() { return _rpcmem_capacity; } + if (is_rpcmem_allocated(p_data)) { + QNN_LOG_WARN("rpc memory already allocated\n"); + return 3; + } - bool is_rpcmem_registered(Qnn_MemHandle_t handle) { - return 
_qnn_mem_set.count(handle) != 0U; + if (is_rpcmem_registered((QNN_VER_PTR(*p_tensor)->memHandle))) { + QNN_LOG_WARN("tensor %s has been registered shared memory\n", (QNN_VER_PTR(*p_tensor)->name)); + return 4; } - void* alloc_rpcmem(size_t bytes, size_t alignment) { - if (!_rpcmem_initialized) { - QNN_LOG_WARN("rpc memory not initialized\n"); - return nullptr; - } + int32_t mem_fd = rpcmem_to_fd(p_data); + if (-1 == mem_fd) { + QNN_LOG_WARN("failed to get file descriptor\n"); + return 5; + } + QNN_LOG_INFO("mem_fd %d\n", mem_fd); + Qnn_MemDescriptor_t descriptor = { { QNN_VER_PTR(*p_tensor)->rank, QNN_VER_PTR(*p_tensor)->dimensions, + nullptr }, + QNN_VER_PTR(*p_tensor)->dataType, + QNN_MEM_TYPE_ION, + { { mem_fd } } }; + Qnn_MemHandle_t handle = nullptr; + int error = QNN_SUCCESS; + error = _qnn_interface.qnn_mem_register(_qnn_context_handle, &descriptor, + /*numDescriptors=*/1, &handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to register shared memory, error %d, %s\n", QNN_GET_ERROR_CODE(error), + strerror(error)); + return 6; + } else { + QNN_LOG_INFO("tensor %s successfully registered shared memory\n", (QNN_VER_PTR(*p_tensor)->name)); + } + QNN_VER_PTR(*p_tensor)->memHandle = handle; + _qnn_mem_set.insert((std::pair(p_data, handle))); - auto allocate_bytes = static_cast(bytes + alignment); - void* buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, - allocate_bytes); - if (buf == nullptr) { - QNN_LOG_WARN("failed to allocate rpc memory\n"); - return nullptr; - } + return 0; + } - auto aligned_buf = reinterpret_cast( - qnn::align_to(alignment, reinterpret_cast(buf))); - bool status = - _rpcmem_store_map.insert(std::pair(aligned_buf, buf)).second; - if (!status) { - QNN_LOG_WARN("failed to allocate rpc memory\n"); - _pfn_rpc_mem_free(buf); + void *get_rpcmem_from_memhandle(Qnn_MemHandle_t mem_handle) { + for (std::unordered_map::iterator it = _qnn_mem_set.begin(); it != _qnn_mem_set.end(); + it++) { + Qnn_MemHandle_t cur_handle = it->second; + if (cur_handle == mem_handle) { + return it->first; } - - return aligned_buf; } + } + QNN_LOG_WARN("can't find rpcmem from qnn mem handle %p", mem_handle); + return nullptr; + } - void free_rpcmem(void* buf) { - if (!_rpcmem_initialized) { - QNN_LOG_WARN("rpc memory not initialized\n"); - } - else if (0 == _rpcmem_store_map.count(buf)) { - QNN_LOG_WARN("no allocated tensor\n"); - } - else { - _pfn_rpc_mem_free(_rpcmem_store_map[buf]); - _rpcmem_store_map.erase(buf); - } + void unregister_rpcmem() { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + + if (_qnn_mem_set.empty()) { + QNN_LOG_WARN("no rpcmem registered\n"); } - int32_t rpcmem_to_fd(void* buf) { - int32_t mem_fd = -1; - if (!is_rpcmem_initialized()) { - QNN_LOG_WARN("rpc memory not initialized\n"); - } - else { - mem_fd = _pfn_rpc_mem_to_fd(buf); + for (std::unordered_map::iterator it = _qnn_mem_set.begin(); it != _qnn_mem_set.end(); + it++) { + Qnn_MemHandle_t mem_handle = it->second; + error = _qnn_interface.qnn_mem_de_register(&mem_handle, 1); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to unregister shared memory, error %d\n", QNN_GET_ERROR_CODE(error)); } - - return mem_fd; } + _qnn_mem_set.clear(); + } - int register_rpcmem(void* p_data, Qnn_Tensor_t* p_tensor) { - if (nullptr == p_data || (nullptr == p_tensor)) { - QNN_LOG_WARN("invalid param\n"); - return 1; - } + bool is_rpcmem_allocated(void *buf) { return _qnn_mem_set.count(buf) != 0U; } - if (!is_rpcmem_initialized()) { - QNN_LOG_WARN("rpc memory not initialized\n"); - return 2; - } + const 
qnn::qcom_socinfo &get_soc_info() { return _soc_info; } - if (is_rpcmem_allocated(p_data)) { - QNN_LOG_WARN("rpc memory already allocated\n"); - return 3; - } +public: + std::map> _qnn_graph_map; - if (is_rpcmem_registered((QNN_VER_PTR(*p_tensor)->memHandle))) { - QNN_LOG_WARN("tensor %s has been registered shared memory\n", - (QNN_VER_PTR(*p_tensor)->name)); - return 4; - } +private: + int load_system() { + Qnn_ErrorHandle_t error = QNN_SUCCESS; - int32_t mem_fd = rpcmem_to_fd(p_data); - if (-1 == mem_fd) { - QNN_LOG_WARN("failed to get file descriptor\n"); - return 5; - } - QNN_LOG_INFO("mem_fd %d\n", mem_fd); - Qnn_MemDescriptor_t descriptor = { {QNN_VER_PTR(*p_tensor)->rank, - QNN_VER_PTR(*p_tensor)->dimensions, - nullptr}, - QNN_VER_PTR(*p_tensor)->dataType, - QNN_MEM_TYPE_ION, - {{mem_fd}} }; - Qnn_MemHandle_t handle = nullptr; - int error = QNN_SUCCESS; - error = _qnn_interface.qnn_mem_register(_qnn_context_handle, &descriptor, - /*numDescriptors=*/1, &handle); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to register shared memory, error %d, %s\n", - QNN_GET_ERROR_CODE(error), strerror(error)); - return 6; - } - else { - QNN_LOG_INFO("tensor %s successfully register shared memory\n", - (QNN_VER_PTR(*p_tensor)->name)); - } - QNN_VER_PTR(*p_tensor)->memHandle = handle; - _qnn_mem_set.insert((std::pair(p_data, handle))); + std::string system_lib_path = _lib_path + "libQnnSystem.so"; + QNN_LOG_DEBUG("system_lib_path:%s\n", system_lib_path.c_str()); - return 0; + _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); + if (nullptr == _system_lib_handle) { + QNN_LOG_WARN("can not open QNN library %s, error: %s\n", system_lib_path.c_str(), dlerror()); + return 1; } - void* get_rpcmem_from_memhandle(Qnn_MemHandle_t mem_handle) { - for (std::unordered_map::iterator it = _qnn_mem_set.begin(); - it != _qnn_mem_set.end(); - it++) { - Qnn_MemHandle_t mem_handle = it->second; - if (it->second == mem_handle) { - return it->first; - } - } - QNN_LOG_WARN("can't find rpcmem from qnn mem handle %p", mem_handle); - return nullptr; + auto *get_providers = reinterpret_cast( + dlsym(_system_lib_handle, "QnnSystemInterface_getProviders")); + if (nullptr == get_providers) { + QNN_LOG_WARN("can not load QNN symbol QnnSystemInterface_getProviders: %s\n", dlerror()); + return 2; } - void unregister_rpcmem() { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - - if (_qnn_mem_set.empty()) { - QNN_LOG_WARN("no rpcmem registered\n"); - } - - for (std::unordered_map::iterator it = _qnn_mem_set.begin(); - it != _qnn_mem_set.end(); - it++) { - Qnn_MemHandle_t mem_handle = it->second; - error = _qnn_interface.qnn_mem_de_register(&mem_handle, 1); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to unregister shared memory, error %d\n", - QNN_GET_ERROR_CODE(error)); - } - } - _qnn_mem_set.clear(); + uint32_t num_providers = 0; + const QnnSystemInterface_t **provider_list = nullptr; + error = get_providers(&provider_list, &num_providers); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to get providers, error %d\n", QNN_GET_ERROR_CODE(error)); + return 3; } - bool is_rpcmem_allocated(void* buf) { - return _qnn_mem_set.count(buf) != 0U; + if (num_providers != _required_num_providers) { + QNN_LOG_WARN("providers is %d instead of required %d\n", num_providers, _required_num_providers); + return 4; } - const qnn::qcom_socinfo& get_soc_info() { return _soc_info; } - - public: - std::map> _qnn_graph_map; - - private: - int load_system() { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - - std::string 
system_lib_path = _lib_path + "libQnnSystem.so"; - QNN_LOG_DEBUG("system_lib_path:%s\n", system_lib_path.c_str()); + if (nullptr == provider_list) { + QNN_LOG_WARN("can not get providers\n"); + return 5; + } - _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); - if (nullptr == _system_lib_handle) { - QNN_LOG_WARN("can not open QNN library %s, error: %s\n", - system_lib_path.c_str(), dlerror()); - return 1; - } - - auto* get_providers = - reinterpret_cast( - dlsym(_system_lib_handle, "QnnSystemInterface_getProviders")); - if (nullptr == get_providers) { - QNN_LOG_WARN( - "can not load QNN symbol QnnSystemInterface_getProviders: %s\n", - dlerror()); - return 2; + QNN_SYSTEM_INTERFACE_VER_TYPE qnn_system_interface; + bool found_valid_system_interface = false; + for (size_t idx = 0; idx < num_providers; idx++) { + if (QNN_SYSTEM_API_VERSION_MAJOR == provider_list[idx]->systemApiVersion.major && + QNN_SYSTEM_API_VERSION_MINOR <= provider_list[idx]->systemApiVersion.minor) { + found_valid_system_interface = true; + qnn_system_interface = provider_list[idx]->QNN_SYSTEM_INTERFACE_VER_NAME; + break; } + } + if (!found_valid_system_interface) { + QNN_LOG_WARN("unable to find a valid qnn system interface\n"); + return 6; + } else { + QNN_LOG_INFO("find a valid qnn system interface\n"); + } + set_qnn_raw_system_interface(qnn_system_interface); - uint32_t num_providers = 0; - const QnnSystemInterface_t** provider_list = nullptr; - error = get_providers(&provider_list, &num_providers); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to get providers, error %d\n", - QNN_GET_ERROR_CODE(error)); - return 3; - } + _qnn_interface.set_qnn_system_interface(provider_list[0]); - if (num_providers != _required_num_providers) { - QNN_LOG_WARN("providers is %d instead of required %d\n", num_providers, - _required_num_providers); - return 4; - } + _qnn_interface.qnn_system_context_create(&_qnn_system_handle); + if (nullptr == _qnn_system_handle) { + QNN_LOG_WARN("can not create QNN system contenxt\n"); + } else { + QNN_LOG_INFO("initialize qnn system successfully\n"); + } - if (nullptr == provider_list) { - QNN_LOG_WARN("can not get providers\n"); - return 5; - } + return 0; + } - QNN_SYSTEM_INTERFACE_VER_TYPE qnn_system_interface; - bool found_valid_system_interface = false; - for (size_t idx = 0; idx < num_providers; idx++) { - if (QNN_SYSTEM_API_VERSION_MAJOR == - provider_list[idx]->systemApiVersion.major && - QNN_SYSTEM_API_VERSION_MINOR <= - provider_list[idx]->systemApiVersion.minor) { - found_valid_system_interface = true; - qnn_system_interface = - provider_list[idx]->QNN_SYSTEM_INTERFACE_VER_NAME; - break; - } - } - if (!found_valid_system_interface) { - QNN_LOG_WARN("unable to find a valid qnn system interface\n"); - return 6; - } - else { - QNN_LOG_INFO("find a valid qnn system interface\n"); - } - set_qnn_raw_system_interface(qnn_system_interface); + int unload_system() { + int result = 0; - _qnn_interface.set_qnn_system_interface(provider_list[0]); + if (nullptr == _system_lib_handle) { + QNN_LOG_WARN("system lib handle is null\n"); + return 1; + } - _qnn_interface.qnn_system_context_create(&_qnn_system_handle); - if (nullptr == _qnn_system_handle) { - QNN_LOG_WARN("can not create QNN system contenxt\n"); - } - else { - QNN_LOG_INFO("initialize qnn system successfully\n"); + if (nullptr != _qnn_system_handle) { + result = _qnn_interface.qnn_system_context_free(_qnn_system_handle); + if (result != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN system context\n"); } - - 
return 0; + _qnn_system_handle = nullptr; } - int unload_system() { - int result = 0; - - if (nullptr == _system_lib_handle) { - QNN_LOG_WARN("system lib handle is null\n"); - return 1; - } + int dlclose_error = dlclose(_system_lib_handle); + if (dlclose_error != 0) { + QNN_LOG_WARN("failed to close QnnSystem library, error %s\n", dlerror()); + return 2; + } - if (nullptr != _qnn_system_handle) { - result = _qnn_interface.qnn_system_context_free(_qnn_system_handle); - if (result != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN system context\n"); - } - _qnn_system_handle = nullptr; - } + _system_lib_handle = nullptr; - int dlclose_error = dlclose(_system_lib_handle); - if (dlclose_error != 0) { - QNN_LOG_WARN("failed to close QnnSystem library, error %s\n", dlerror()); - return 2; - } + return result; + } - _system_lib_handle = nullptr; + int load_backend(std::string &lib_path, const QnnSaver_Config_t **saver_config) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + QNN_LOG_DEBUG("lib_path:%s\n", lib_path.c_str()); - return result; + void *lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL); + if (nullptr == lib_handle) { + QNN_LOG_WARN("can not open QNN library %s, with error: %s", lib_path.c_str(), dlerror()); + return 1; } - int load_backend(std::string& lib_path, const QnnSaver_Config_t** saver_config) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - QNN_LOG_DEBUG("lib_path:%s\n", lib_path.c_str()); + auto get_providers = qnn::load_qnn_functionpointers( + lib_handle, "QnnInterface_getProviders"); + if (nullptr == get_providers) { + QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", dlerror()); + return 2; + } - void* lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL); - if (nullptr == lib_handle) { - QNN_LOG_WARN("can not open QNN library %s, with error: %s", lib_path.c_str(), dlerror()); - return 1; - } + std::uint32_t num_providers = 0; + const QnnInterface_t **provider_list = nullptr; + error = get_providers(&provider_list, &num_providers); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error)); + return 3; + } + QNN_LOG_DEBUG("num_providers=%d\n", num_providers); + if (num_providers != _required_num_providers) { + QNN_LOG_WARN("providers is %d instead of required %d", num_providers, _required_num_providers); + return 4; + } - auto get_providers = - qnn::load_qnn_functionpointers( - lib_handle, "QnnInterface_getProviders"); - if (nullptr == get_providers) { - QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", dlerror()); - return 2; + if (nullptr == provider_list) { + QNN_LOG_WARN("failed to get qnn interface providers\n"); + return 5; + } + bool found_valid_interface = false; + QNN_INTERFACE_VER_TYPE qnn_interface; + for (size_t idx = 0; idx < num_providers; idx++) { + if (QNN_API_VERSION_MAJOR == provider_list[idx]->apiVersion.coreApiVersion.major && + QNN_API_VERSION_MINOR <= provider_list[idx]->apiVersion.coreApiVersion.minor) { + found_valid_interface = true; + qnn_interface = provider_list[idx]->QNN_INTERFACE_VER_NAME; + break; } + } - std::uint32_t num_providers = 0; - const QnnInterface_t** provider_list = nullptr; - error = get_providers(&provider_list, &num_providers); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error)); - return 3; - } - QNN_LOG_DEBUG("num_providers=%d\n", num_providers); - if (num_providers != _required_num_providers) { - QNN_LOG_WARN("providers is %d instead of required %d", num_providers, 
_required_num_providers); - return 4; - } + if (!found_valid_interface) { + QNN_LOG_WARN("unable to find a valid qnn interface\n"); + return 6; + } else { + QNN_LOG_INFO("find a valid qnn interface\n"); + } + set_qnn_raw_interface(qnn_interface); - if (nullptr == provider_list) { - QNN_LOG_WARN("failed to get qnn interface providers\n"); - return 5; - } - bool found_valid_interface = false; - QNN_INTERFACE_VER_TYPE qnn_interface; - for (size_t idx = 0; idx < num_providers; idx++) { - if (QNN_API_VERSION_MAJOR == - provider_list[idx]->apiVersion.coreApiVersion.major && - QNN_API_VERSION_MINOR <= - provider_list[idx]->apiVersion.coreApiVersion.minor) { - found_valid_interface = true; - qnn_interface = provider_list[idx]->QNN_INTERFACE_VER_NAME; - break; - } + BackendIdType backend_id = provider_list[0]->backendId; + _lib_path_to_backend_id[lib_path] = backend_id; + if (_loaded_backend.count(backend_id) > 0) { + QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists\n", lib_path.c_str(), backend_id); + } + _loaded_backend[backend_id] = provider_list[0]; + if (_loaded_lib_handle.count(backend_id) > 0) { + QNN_LOG_WARN("closing %p\n", _loaded_lib_handle[backend_id]); + int dlclose_error = dlclose(_loaded_lib_handle[backend_id]); + if (dlclose_error != 0) { + QNN_LOG_WARN("fail to close %p with error %s\n", _loaded_lib_handle[backend_id], dlerror()); } + } + _loaded_lib_handle[backend_id] = lib_handle; + _backend_id = backend_id; - if (!found_valid_interface) { - QNN_LOG_WARN("unable to find a valid qnn interface\n"); - return 6; - } - else { - QNN_LOG_INFO("find a valid qnn interface\n"); - } - set_qnn_raw_interface(qnn_interface); + return 0; + } - BackendIdType backend_id = provider_list[0]->backendId; - _lib_path_to_backend_id[lib_path] = backend_id; - if (_loaded_backend.count(backend_id) > 0) { - QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists\n", lib_path.c_str(), backend_id); - } - _loaded_backend[backend_id] = provider_list[0]; - if (_loaded_lib_handle.count(backend_id) > 0) { - QNN_LOG_WARN("closing %p\n", _loaded_lib_handle[backend_id]); - int dlclose_error = dlclose(_loaded_lib_handle[backend_id]); - if (dlclose_error != 0) { - QNN_LOG_WARN("fail to close %p with error %s\n", _loaded_lib_handle[backend_id], dlerror()); - } + int unload_backend() { + int dlclose_error = 0; + for (auto &it : _loaded_lib_handle) { + dlclose_error = dlclose(it.second); + if (dlclose_error != 0) { + QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, dlerror()); } - _loaded_lib_handle[backend_id] = lib_handle; - _backend_id = backend_id; - - return 0; } - int unload_backend() { - int dlclose_error = 0; - for (auto& it : _loaded_lib_handle) { - dlclose_error = dlclose(it.second); - if (dlclose_error != 0) { - QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, dlerror()); - } - } + _loaded_lib_handle.clear(); + _lib_path_to_backend_id.clear(); + _loaded_backend.clear(); - _loaded_lib_handle.clear(); - _lib_path_to_backend_id.clear(); - _loaded_backend.clear(); - - return 0; - } + return 0; + } - void set_qnn_raw_interface(QNN_INTERFACE_VER_TYPE& raw_interface) { - _qnn_raw_interface = raw_interface; - } + void set_qnn_raw_interface(QNN_INTERFACE_VER_TYPE &raw_interface) { _qnn_raw_interface = raw_interface; } - void set_qnn_raw_system_interface(QNN_SYSTEM_INTERFACE_VER_TYPE& raw_interface) { - _qnn_raw_system_interface = raw_interface; - } + void set_qnn_raw_system_interface(QNN_SYSTEM_INTERFACE_VER_TYPE &raw_interface) { + 
_qnn_raw_system_interface = raw_interface; + } - private: - static constexpr const int _required_num_providers = 1; +private: + static constexpr const int _required_num_providers = 1; - std::string _lib_path; - std::string _backend_name; - std::string _model_name; // Qualcomm's dedicated prebuilt model name, keep it for further usage - BackendIdType _backend_id; + std::string _lib_path; + std::string _backend_name; + std::string _model_name; // Qualcomm's dedicated prebuilt model name, keep it for further usage + BackendIdType _backend_id; - bool _debug_tensor = false; - bool _do_node_validations = true; + bool _debug_tensor = false; + bool _do_node_validations = true; - QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; + QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; - qnn::sdk_profile_level _profile_level = qnn::sdk_profile_level::profile_detail; + qnn::sdk_profile_level _profile_level = qnn::sdk_profile_level::profile_detail; - qnn_interface _qnn_interface; + qnn_interface _qnn_interface; - void* _system_lib_handle = nullptr; + void *_system_lib_handle = nullptr; - Qnn_GraphHandle_t _qnn_graph_handle = nullptr; + Qnn_GraphHandle_t _qnn_graph_handle = nullptr; - Qnn_LogHandle_t _qnn_log_handle = nullptr; + Qnn_LogHandle_t _qnn_log_handle = nullptr; - Qnn_ProfileHandle_t _qnn_profile_handle = nullptr; + Qnn_ProfileHandle_t _qnn_profile_handle = nullptr; - Qnn_DeviceHandle_t _qnn_device_handle = nullptr; + Qnn_DeviceHandle_t _qnn_device_handle = nullptr; - Qnn_BackendHandle_t _qnn_backend_handle = nullptr; + Qnn_BackendHandle_t _qnn_backend_handle = nullptr; - Qnn_ContextHandle_t _qnn_context_handle = nullptr; + Qnn_ContextHandle_t _qnn_context_handle = nullptr; - QnnSystemContext_Handle_t _qnn_system_handle = nullptr; + QnnSystemContext_Handle_t _qnn_system_handle = nullptr; - QnnHtpDevice_PerfInfrastructure_t* _qnn_htp_perfinfra = nullptr; - uint32_t _qnn_power_configid = 1; + QnnHtpDevice_PerfInfrastructure_t *_qnn_htp_perfinfra = nullptr; + uint32_t _qnn_power_configid = 1; - QNN_INTERFACE_VER_TYPE _qnn_raw_interface; - QNN_SYSTEM_INTERFACE_VER_TYPE _qnn_raw_system_interface; + QNN_INTERFACE_VER_TYPE _qnn_raw_interface; + QNN_SYSTEM_INTERFACE_VER_TYPE _qnn_raw_system_interface; - std::unordered_map _qnn_mem_set; + std::unordered_map _qnn_mem_set; - std::mutex _init_mutex; - std::unordered_map _loaded_lib_handle; - std::unordered_map _lib_path_to_backend_id; - std::unordered_map _loaded_backend; + std::mutex _init_mutex; + std::unordered_map _loaded_lib_handle; + std::unordered_map _lib_path_to_backend_id; + std::unordered_map _loaded_backend; - void* _rpc_lib_handle = nullptr; - std::atomic_bool _rpcmem_initialized{ false }; - qnn::pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; - qnn::pfn_rpc_mem_free _pfn_rpc_mem_free; - qnn::pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd; - qnn::pfn_rpc_mem_init _pfn_rpc_mem_init; - qnn::pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; - std::unordered_map _rpcmem_store_map; - size_t _rpcmem_capacity = 512; + void *_rpc_lib_handle = nullptr; + std::atomic_bool _rpcmem_initialized{ false }; + qnn::pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; + qnn::pfn_rpc_mem_free _pfn_rpc_mem_free; + qnn::pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd; + qnn::pfn_rpc_mem_init _pfn_rpc_mem_init; + qnn::pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; + std::unordered_map _rpcmem_store_map; + size_t _rpcmem_capacity = 512; - std::string _graph_name; + std::string _graph_name; - qnn::qcom_socinfo _soc_info = {}; - }; + qnn::qcom_socinfo _soc_info = {}; +}; -} +} // namespace qnn diff --git 
a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index de0d1dc2dbbef..0ec75c03f0e53 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -1,146 +1,127 @@ #pragma once +#include "ggml-qnn.h" + #include "QnnTensor.h" #include "System/QnnSystemInterface.h" - -#include "ggml-qnn.h" #include "backend.hpp" #include "qnn.hpp" namespace qnn { - template class ggml_qnn_tensor_readwrite { - public: - ggml_qnn_tensor_readwrite(const ggml_tensor* tensor, - Qnn_GraphHandle_t graph_handle, - ggml_backend_qnn_context* ctx) - : _tensor(tensor), - _qnn_tensor(reinterpret_cast(tensor->extra)), - _context(ctx) { - _old_dimensions = QNN_VER_PTR(*_qnn_tensor)->dimensions; - const auto qnn_data_type = datatype_from_ggml_datatype(tensor->type); - const bool is_npu = ctx->device == QNN_BACKEND_NPU; - QNN_VER_PTR(*_qnn_tensor)->type = _tensorType; - if (is_npu) { - QNN_VER_PTR(*_qnn_tensor)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; - QNN_VER_PTR(*_qnn_tensor)->clientBuf = { .data = nullptr, .dataSize = 0 }; - } +template +class ggml_qnn_tensor_readwrite { +public: + explicit ggml_qnn_tensor_readwrite(const ggml_tensor *tensor, Qnn_GraphHandle_t graph_handle, + ggml_backend_qnn_context *ctx) : + _tensor(tensor), _qnn_tensor(reinterpret_cast(tensor->extra)), _context(ctx) { + _old_dimensions = QNN_VER_PTR(*_qnn_tensor)->dimensions; + const auto qnn_data_type = datatype_from_ggml_datatype(tensor->type); + const bool is_npu = ctx->device == QNN_BACKEND_NPU; + QNN_VER_PTR(*_qnn_tensor)->type = _tensorType; + if (is_npu) { + QNN_VER_PTR(*_qnn_tensor)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*_qnn_tensor)->clientBuf = { .data = nullptr, .dataSize = 0 }; + } + + auto err = ctx->raw_interface.tensorCreateGraphTensor(graph_handle, _qnn_tensor); + if (err != QNN_SUCCESS) { + QNN_LOG_INFO("error = %d\n", err); + QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, QNN_TENSOR_GET_NAME(*_qnn_tensor)); + _context = nullptr; + return; + } - auto err = - ctx->raw_interface.tensorCreateGraphTensor(graph_handle, _qnn_tensor); - if (err != QNN_SUCCESS) { - QNN_LOG_INFO("error = %d\n", err); - QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, - QNN_TENSOR_GET_NAME(*_qnn_tensor)); + _dimensions[0] = (uint32_t)tensor->ne[0]; + _dimensions[1] = (uint32_t)tensor->ne[1]; + _dimensions[2] = (uint32_t)tensor->ne[2]; + _dimensions[3] = (uint32_t)tensor->ne[3]; + QNN_VER_PTR(*_qnn_tensor)->dimensions = _dimensions; + QNN_VER_PTR(*_qnn_tensor)->rank = qnn::get_ggml_tensor_rank(tensor); + QNN_VER_PTR(*_qnn_tensor)->dataType = qnn_data_type; + + if (is_npu) { + auto *instance = ctx->instance; + uint8_t *qnn_buffer = static_cast(instance->alloc_rpcmem(ggml_nbytes(tensor), alignof(void *))); + if (!qnn_buffer) { + QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, QNN_TENSOR_GET_NAME(*_qnn_tensor)); _context = nullptr; + // No free for _qnn_tensor, because it's not registered. 
return; + } else { + QNN_LOG_INFO("alloc rpcmem successfully\n"); } - _dimensions[0] = (uint32_t)tensor->ne[0]; - _dimensions[1] = (uint32_t)tensor->ne[1]; - _dimensions[2] = (uint32_t)tensor->ne[2]; - _dimensions[3] = (uint32_t)tensor->ne[3]; - QNN_VER_PTR(*_qnn_tensor)->dimensions = _dimensions; - QNN_VER_PTR(*_qnn_tensor)->rank = qnn::get_ggml_tensor_rank(tensor); - QNN_VER_PTR(*_qnn_tensor)->dataType = qnn_data_type; - - if (is_npu) { - auto* instance = ctx->instance; - uint8_t* qnn_buffer = static_cast( - instance->alloc_rpcmem(ggml_nbytes(tensor), alignof(void*))); - if (!qnn_buffer) { - QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); - QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, - QNN_TENSOR_GET_NAME(*_qnn_tensor)); - _context = nullptr; - // No free for _qnn_tensor, because it's not registered. - return; - } - else { - QNN_LOG_INFO("alloc rpcmem successfully\n"); - } - - instance->register_rpcmem(qnn_buffer, _qnn_tensor); - if (_tensorType == QNN_TENSOR_TYPE_APP_WRITE || - _tensorType == QNN_TENSOR_TYPE_APP_READWRITE) { - memcpy(qnn_buffer, tensor->data, ggml_nbytes(tensor)); - } - } - else { - QNN_VER_PTR(*_qnn_tensor)->clientBuf = { - tensor->data, get_ggml_tensor_data_size(tensor) }; + instance->register_rpcmem(qnn_buffer, _qnn_tensor); + if (_tensorType == QNN_TENSOR_TYPE_APP_WRITE || _tensorType == QNN_TENSOR_TYPE_APP_READWRITE) { + memcpy(qnn_buffer, tensor->data, ggml_nbytes(tensor)); } + } else { + QNN_VER_PTR(*_qnn_tensor)->clientBuf = { tensor->data, get_ggml_tensor_data_size(tensor) }; } - - ggml_qnn_tensor_readwrite(const ggml_tensor* tensor, Qnn_Tensor_t* qnn_tensor, - ggml_backend_qnn_context* ctx) - : _tensor(tensor), _qnn_tensor(qnn_tensor), _context(ctx) { - _old_dimensions = QNN_VER_PTR(*_qnn_tensor)->dimensions; - const auto qnn_data_type = qnn::datatype_from_ggml_datatype(tensor->type); - const bool is_npu = ctx->device == QNN_BACKEND_NPU; - - _dimensions[0] = (uint32_t)tensor->ne[0]; - _dimensions[1] = (uint32_t)tensor->ne[1]; - _dimensions[2] = (uint32_t)tensor->ne[2]; - _dimensions[3] = (uint32_t)tensor->ne[3]; - QNN_VER_PTR(*_qnn_tensor)->dimensions = _dimensions; - QNN_VER_PTR(*_qnn_tensor)->rank = get_ggml_tensor_rank(tensor); - QNN_VER_PTR(*_qnn_tensor)->dataType = qnn_data_type; - - if (is_npu) { - uint8_t* qnn_buffer = - static_cast(ctx->instance->get_rpcmem_from_memhandle( - QNN_VER_PTR(*_qnn_tensor)->memHandle)); - if (qnn_buffer) { - memcpy(qnn_buffer, tensor->data, ggml_nbytes(tensor)); - } - else { - QNN_LOG_WARN("can't find rpcmem from qnn mem handle\n"); - QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, - QNN_TENSOR_GET_NAME(*_qnn_tensor)); - _context = nullptr; - return; - } - } - else { - QNN_VER_PTR(*_qnn_tensor)->clientBuf = { - tensor->data, get_ggml_tensor_data_size(tensor) }; + } + + explicit ggml_qnn_tensor_readwrite(const ggml_tensor *tensor, Qnn_Tensor_t *qnn_tensor, + ggml_backend_qnn_context *ctx) : + _tensor(tensor), _qnn_tensor(qnn_tensor), _context(ctx) { + _old_dimensions = QNN_VER_PTR(*_qnn_tensor)->dimensions; + const auto qnn_data_type = qnn::datatype_from_ggml_datatype(tensor->type); + const bool is_npu = ctx->device == QNN_BACKEND_NPU; + + _dimensions[0] = (uint32_t)tensor->ne[0]; + _dimensions[1] = (uint32_t)tensor->ne[1]; + _dimensions[2] = (uint32_t)tensor->ne[2]; + _dimensions[3] = (uint32_t)tensor->ne[3]; + QNN_VER_PTR(*_qnn_tensor)->dimensions = _dimensions; + QNN_VER_PTR(*_qnn_tensor)->rank = get_ggml_tensor_rank(tensor); + QNN_VER_PTR(*_qnn_tensor)->dataType = qnn_data_type; + + if (is_npu) { + uint8_t 
*qnn_buffer = + static_cast(ctx->instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*_qnn_tensor)->memHandle)); + if (qnn_buffer) { + memcpy(qnn_buffer, tensor->data, ggml_nbytes(tensor)); + } else { + QNN_LOG_WARN("can't find rpcmem from qnn mem handle\n"); + QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, QNN_TENSOR_GET_NAME(*_qnn_tensor)); + _context = nullptr; + return; } + } else { + QNN_VER_PTR(*_qnn_tensor)->clientBuf = { tensor->data, get_ggml_tensor_data_size(tensor) }; + } + } + + ~ggml_qnn_tensor_readwrite() { + if ((_tensorType == QNN_TENSOR_TYPE_APP_READWRITE || _tensorType == QNN_TENSOR_TYPE_APP_READ) && _context && + _context->device == QNN_BACKEND_NPU) { + uint8_t *qnn_buffer = static_cast( + _context->instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*_qnn_tensor)->memHandle)); + memcpy(_tensor->data, qnn_buffer, ggml_nbytes(_tensor)); } - ~ggml_qnn_tensor_readwrite() { - if ((_tensorType == QNN_TENSOR_TYPE_APP_READWRITE || - _tensorType == QNN_TENSOR_TYPE_APP_READ) && - _context && _context->device == QNN_BACKEND_NPU) { - uint8_t* qnn_buffer = - static_cast(_context->instance->get_rpcmem_from_memhandle( - QNN_VER_PTR(*_qnn_tensor)->memHandle)); - memcpy(_tensor->data, qnn_buffer, ggml_nbytes(_tensor)); - } + QNN_VER_PTR(*_qnn_tensor)->dimensions = _old_dimensions; + } - QNN_VER_PTR(*_qnn_tensor)->dimensions = _old_dimensions; - } + bool is_valid() const { return _context; } + Qnn_Tensor_t *get_qnn_tensor() const { return _qnn_tensor; } + +private: + const ggml_tensor *_tensor; + Qnn_Tensor_t *_qnn_tensor; + ggml_backend_qnn_context *_context; + uint32_t *_old_dimensions; + uint32_t _dimensions[4] = {}; + + ggml_qnn_tensor_readwrite(const ggml_qnn_tensor_readwrite &) = delete; + void operator=(const ggml_qnn_tensor_readwrite &) = delete; + ggml_qnn_tensor_readwrite(ggml_qnn_tensor_readwrite &&) = delete; + void operator=(ggml_qnn_tensor_readwrite &&) = delete; +}; - bool is_valid() const { return _context; } - Qnn_Tensor_t* get_qnn_tensor() const { return _qnn_tensor; } - - private: - const ggml_tensor* _tensor; - Qnn_Tensor_t* _qnn_tensor; - ggml_backend_qnn_context* _context; - uint32_t* _old_dimensions; - uint32_t _dimensions[4] = {}; - - ggml_qnn_tensor_readwrite(const ggml_qnn_tensor_readwrite&) = delete; - void operator=(const ggml_qnn_tensor_readwrite&) = delete; - ggml_qnn_tensor_readwrite(ggml_qnn_tensor_readwrite&&) = delete; - void operator=(ggml_qnn_tensor_readwrite&&) = delete; - }; - - using ggml_qnn_tensor_output = - ggml_qnn_tensor_readwrite; - using ggml_qnn_tensor_input = - ggml_qnn_tensor_readwrite; +using ggml_qnn_tensor_output = ggml_qnn_tensor_readwrite; +using ggml_qnn_tensor_input = ggml_qnn_tensor_readwrite; } // namespace qnn diff --git a/ggml/src/ggml-qnn/utils.cpp b/ggml/src/ggml-qnn/utils.cpp index 798445c02fd76..2368b466c8187 100644 --- a/ggml/src/ggml-qnn/utils.cpp +++ b/ggml/src/ggml-qnn/utils.cpp @@ -2,14 +2,15 @@ #include "utils.hpp" #include "ggml-qnn.h" + #include "qnn-types.hpp" namespace qnn { - // TODO: mapping more ggml data type to QNN data type - // ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684 - Qnn_DataType_t datatype_from_ggml_datatype(enum ggml_type ggmltype) { - switch (ggmltype) { +// TODO: mapping more ggml data type to QNN data type +// ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684 +Qnn_DataType_t datatype_from_ggml_datatype(enum ggml_type ggmltype) { + switch (ggmltype) { case GGML_TYPE_F16: return QNN_DATATYPE_FLOAT_16; case GGML_TYPE_F32: @@ -22,24 +23,22 @@ 
namespace qnn { return QNN_DATATYPE_SFIXED_POINT_4; default: break; - } - return QNN_DATATYPE_UNDEFINED; } + return QNN_DATATYPE_UNDEFINED; +} - - uint32_t get_ggml_tensor_rank(const ggml_tensor* tensor) { - uint32_t rank = 0; - for (int i = 0; i < GGML_MAX_DIMS; i++) { - if ((0 != tensor->ne[i]) && (1 != tensor->ne[i])) { - rank++; - } +uint32_t get_ggml_tensor_rank(const ggml_tensor *tensor) { + uint32_t rank = 0; + for (int i = 0; i < GGML_MAX_DIMS; i++) { + if ((0 != tensor->ne[i]) && (1 != tensor->ne[i])) { + rank++; } - return rank; } + return rank; +} - - const char* get_backend_name(int n_backend_type) { - switch (n_backend_type) { +const char *get_backend_name(int n_backend_type) { + switch (n_backend_type) { case QNN_BACKEND_CPU: return "QNN-CPU"; case QNN_BACKEND_GPU: @@ -50,11 +49,11 @@ namespace qnn { return "ggml"; //"fake" QNN backend, used for compare performance between QNN backend and original GGML default: return "unknown"; - } } +} - const char* get_chipset_desc(uint32_t chipset_id) { - switch (chipset_id) { +const char *get_chipset_desc(uint32_t chipset_id) { + switch (chipset_id) { case SM8450: return "SM8450"; case SM8475: @@ -65,11 +64,11 @@ namespace qnn { return "SM8650"; default: return "unknown"; - } } +} - const char* get_htparch_desc(size_t htp_arch) { - switch (htp_arch) { +const char *get_htparch_desc(size_t htp_arch) { + switch (htp_arch) { case V68: return "QCOM_HTP_V68"; case V69: @@ -80,37 +79,36 @@ namespace qnn { return "QCOM_HTP_V75"; default: return "unknown"; - } - } - - intptr_t align_to(size_t alignment, intptr_t offset) { - return offset % alignment == 0 - ? offset - : offset + (static_cast(alignment) - - offset % static_cast(alignment)); } +} - uint32_t get_ggml_tensor_data_size(const ggml_tensor* tensor) { - /* - size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); - size_t n_dims = qnn_get_ggml_tensor_rank(tensor); - for (int i = 1; i < n_dims; i++) { - data_size *= tensor->ne[i]; - } +intptr_t align_to(size_t alignment, intptr_t offset) { + return offset % alignment == 0 + ? 
offset + : offset + (static_cast(alignment) - offset % static_cast(alignment)); +} - return data_size; - */ - return ggml_nbytes(tensor); +uint32_t get_ggml_tensor_data_size(const ggml_tensor *tensor) { + /* + size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); + size_t n_dims = qnn_get_ggml_tensor_rank(tensor); + for (int i = 1; i < n_dims; i++) { + data_size *= tensor->ne[i]; } - // ================================================================================================= - // - // QNN backend internal helper functions - // - // ================================================================================================= - // TODO: only support GGML_OP_ADD/GGML_OP_MUL/GGML_OP_MUL_MAT - const char* opname_from_ggmlop(enum ggml_op ggmlop) { - switch (ggmlop) { + return data_size; + */ + return ggml_nbytes(tensor); +} + +// ================================================================================================= +// +// QNN backend internal helper functions +// +// ================================================================================================= +// TODO: only support GGML_OP_ADD/GGML_OP_MUL/GGML_OP_MUL_MAT +const char *opname_from_ggmlop(enum ggml_op ggmlop) { + switch (ggmlop) { case GGML_OP_ADD: return QNN_OP_ELEMENT_WISE_ADD; case GGML_OP_MUL: @@ -119,8 +117,8 @@ namespace qnn { return QNN_OP_MAT_MUL; default: break; - } - return nullptr; } - + return nullptr; } + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/utils.hpp b/ggml/src/ggml-qnn/utils.hpp index 4889c6dc8601c..673fb90e63de9 100644 --- a/ggml/src/ggml-qnn/utils.hpp +++ b/ggml/src/ggml-qnn/utils.hpp @@ -1,246 +1,239 @@ #pragma once -#include -#include -#include #include #include -#include +#include +#include +#include -#include "QnnTypes.h" +#include #include "ggml.h" +#include "QnnTypes.h" #include "logger.hpp" namespace qnn { - Qnn_DataType_t datatype_from_ggml_datatype(enum ggml_type ggmltype); - uint32_t get_ggml_tensor_rank(const ggml_tensor* tensor); - const char* get_backend_name(int n_backend_type); - const char* get_chipset_desc(uint32_t chipset_id); - const char* get_htparch_desc(size_t htp_arch); - intptr_t align_to(size_t alignment, intptr_t offset); - uint32_t get_ggml_tensor_data_size(const ggml_tensor* tensor); +Qnn_DataType_t datatype_from_ggml_datatype(enum ggml_type ggmltype); +uint32_t get_ggml_tensor_rank(const ggml_tensor *tensor); +const char *get_backend_name(int n_backend_type); +const char *get_chipset_desc(uint32_t chipset_id); +const char *get_htparch_desc(size_t htp_arch); +intptr_t align_to(size_t alignment, intptr_t offset); +uint32_t get_ggml_tensor_data_size(const ggml_tensor *tensor); - const char* opname_from_ggmlop(enum ggml_op ggmlop); +const char *opname_from_ggmlop(enum ggml_op ggmlop); - template Fn load_qnn_functionpointers(void* handle, const char* function_name) { - return reinterpret_cast(dlsym(handle, function_name)); - } +template +Fn load_qnn_functionpointers(void *handle, const char *function_name) { + return reinterpret_cast(dlsym(handle, function_name)); +} - inline int validate_tensor_version(Qnn_Tensor_t tensor) { - if (tensor.version != QNN_TENSOR_VERSION_1) { - QNN_LOG_WARN( - "validate_tensor_version() tensor %s, got unsupported version %d\n", - tensor.v1.name, tensor.version); - return 1; - } - return 0; +inline int validate_tensor_version(Qnn_Tensor_t tensor) { + if (tensor.version != QNN_TENSOR_VERSION_1) { + QNN_LOG_WARN("validate_tensor_version() tensor %s, got unsupported version %d\n", tensor.v1.name, + tensor.version); 
+ return 1; } + return 0; +} - inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t& tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.id; - } - - return 0u; +inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t &tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.id; } - inline const char* get_qnn_tensorname(const Qnn_Tensor_t& tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.name; - } - return nullptr; - } + return 0u; +} - inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t& tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.type; - } - return QNN_TENSOR_TYPE_UNDEFINED; +inline const char *get_qnn_tensorname(const Qnn_Tensor_t &tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.name; } + return nullptr; +} - inline Qnn_TensorDataFormat_t - get_qnn_tensor_dataformat(const Qnn_Tensor_t& tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.dataFormat; - } - return QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER; +inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t &tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.type; } + return QNN_TENSOR_TYPE_UNDEFINED; +} - inline Qnn_DataType_t - get_qnn_tensor_datatype(const Qnn_Tensor_t& tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.dataType; - } - return QNN_DATATYPE_UNDEFINED; +inline Qnn_TensorDataFormat_t get_qnn_tensor_dataformat(const Qnn_Tensor_t &tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dataFormat; } + return QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER; +} - inline Qnn_QuantizeParams_t - get_qnn_tensor_quantparams(const Qnn_Tensor_t& tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.quantizeParams; - } - return QNN_QUANTIZE_PARAMS_INIT; +inline Qnn_DataType_t get_qnn_tensor_datatype(const Qnn_Tensor_t &tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dataType; } + return QNN_DATATYPE_UNDEFINED; +} - inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t& tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.rank; - } - return 0u; +inline Qnn_QuantizeParams_t get_qnn_tensor_quantparams(const Qnn_Tensor_t &tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.quantizeParams; } + return QNN_QUANTIZE_PARAMS_INIT; +} - inline uint32_t* get_qnn_tensor_dimensions(const Qnn_Tensor_t& tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.dimensions; - } - return nullptr; +inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t &tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.rank; } + return 0u; +} - inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t& tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.memType; - } - return QNN_TENSORMEMTYPE_UNDEFINED; +inline uint32_t *get_qnn_tensor_dimensions(const Qnn_Tensor_t &tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dimensions; } + return nullptr; +} - inline void set_qnn_tensor_id(Qnn_Tensor_t& tensor, uint32_t id) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.id = id; - } +inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t &tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.memType; } + return QNN_TENSORMEMTYPE_UNDEFINED; +} - inline void set_qnn_tensor_name(Qnn_Tensor_t& tensor, 
const char* name) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.name = name; - } +inline void set_qnn_tensor_id(Qnn_Tensor_t &tensor, uint32_t id) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.id = id; } +} - inline void set_qnn_tensor_type(Qnn_Tensor_t& tensor, Qnn_TensorType_t type) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.type = type; - } +inline void set_qnn_tensor_name(Qnn_Tensor_t &tensor, const char *name) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.name = name; } +} - inline void set_qnn_tensor_dataformat(Qnn_Tensor_t& tensor, Qnn_TensorDataFormat_t format) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.dataFormat = format; - } +inline void set_qnn_tensor_type(Qnn_Tensor_t &tensor, Qnn_TensorType_t type) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.type = type; } +} - inline void set_qnn_tensor_datatype(Qnn_Tensor_t& tensor, Qnn_DataType_t dataType) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.dataType = dataType; - } +inline void set_qnn_tensor_dataformat(Qnn_Tensor_t &tensor, Qnn_TensorDataFormat_t format) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.dataFormat = format; } +} - inline void set_qnn_tensor_quantparams(Qnn_Tensor_t& tensor, Qnn_QuantizeParams_t params) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.quantizeParams = params; - } +inline void set_qnn_tensor_datatype(Qnn_Tensor_t &tensor, Qnn_DataType_t dataType) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.dataType = dataType; } +} - inline void set_qnn_tensor_rank(Qnn_Tensor_t& tensor, uint32_t rank) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.rank = rank; - } +inline void set_qnn_tensor_quantparams(Qnn_Tensor_t &tensor, Qnn_QuantizeParams_t params) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.quantizeParams = params; } +} - inline void set_qnn_tensor_dimensions(Qnn_Tensor_t& tensor, uint32_t* dims) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.dimensions = dims; - } +inline void set_qnn_tensor_rank(Qnn_Tensor_t &tensor, uint32_t rank) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.rank = rank; } +} - inline void set_qnn_tensor_memtype(Qnn_Tensor_t& tensor, Qnn_TensorMemType_t mem_type) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.memType = mem_type; - } +inline void set_qnn_tensor_dimensions(Qnn_Tensor_t &tensor, uint32_t *dims) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.dimensions = dims; } +} - inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t& tensor, Qnn_ClientBuffer_t client_buf) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.clientBuf = client_buf; - } +inline void set_qnn_tensor_memtype(Qnn_Tensor_t &tensor, Qnn_TensorMemType_t mem_type) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.memType = mem_type; } +} - inline void set_qnn_tensor_memhandle(Qnn_Tensor_t& tensor, Qnn_MemHandle_t handle) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.memHandle = handle; - } +inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t &tensor, Qnn_ClientBuffer_t client_buf) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.clientBuf = client_buf; } +} +inline void set_qnn_tensor_memhandle(Qnn_Tensor_t &tensor, Qnn_MemHandle_t handle) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.memHandle = handle; + } +} #if ENABLE_QNNBACKEND_PERF - class qnn_perf { - public: - qnn_perf(const std::string& 
perf_name) : _perf_name(std::move(perf_name)) {}; - qnn_perf() = delete; - qnn_perf(const qnn_perf&) = delete; - qnn_perf& operator= (const qnn_perf&) = delete; - - void start() { - _begin_time = ggml_time_us(); - } - - void info() { - _end_time = ggml_time_us(); - _duration = (_end_time - _begin_time); - QNN_LOG_INFO("duration of %s : %lld microseconds\n", _perf_name.c_str(), _duration); - } - - private: - int64_t _begin_time = 0LL; - int64_t _end_time = 0LL; - int64_t _duration = 0LL; - std::string _perf_name; - }; +class qnn_perf { +public: + qnn_perf(const std::string &perf_name) : _perf_name(std::move(perf_name)) {}; + qnn_perf() = delete; + qnn_perf(const qnn_perf &) = delete; + qnn_perf &operator=(const qnn_perf &) = delete; + + void start() { _begin_time = ggml_time_us(); } + + void info() { + _end_time = ggml_time_us(); + _duration = (_end_time - _begin_time); + QNN_LOG_INFO("duration of %s : %lld microseconds\n", _perf_name.c_str(), _duration); + } + +private: + int64_t _begin_time = 0LL; + int64_t _end_time = 0LL; + int64_t _duration = 0LL; + std::string _perf_name; +}; #else - class qnn_perf { - public: - qnn_perf(const std::string& perf_name) {} - qnn_perf() = delete; - qnn_perf(const qnn_perf&) = delete; - qnn_perf& operator= (const qnn_perf&) = delete; - - void start() {} - void info() {} - }; +class qnn_perf { +public: + qnn_perf(const std::string &perf_name) {} + qnn_perf() = delete; + qnn_perf(const qnn_perf &) = delete; + qnn_perf &operator=(const qnn_perf &) = delete; + + void start() {} + void info() {} +}; #endif -} - +} // namespace qnn -#define VALIDATE(value, status) \ - do { \ - status = value; \ - if (status != QNN_SUCCESS) { \ - QNN_LOG_WARN("%s expected QNN_SUCCESS\n", #value); \ - return status; \ - } \ +#define VALIDATE(value, status) \ + do { \ + status = value; \ + if (status != QNN_SUCCESS) { \ + QNN_LOG_WARN("%s expected QNN_SUCCESS\n", #value); \ + return status; \ + } \ } while (0) -#define QNN_TENSOR_GET_ID(tensor) qnn::get_qnn_tensorid(tensor) -#define QNN_TENSOR_GET_NAME(tensor) qnn::get_qnn_tensorname(tensor) -#define QNN_TENSOR_GET_TYPE(tensor) qnn::get_qnn_tensortype(tensor) -#define QNN_TENSOR_GET_DATA_FORMAT(tensor) qnn::get_qnn_tensor_dataformat(tensor) -#define QNN_TENSOR_GET_DATA_TYPE(tensor) qnn::get_qnn_tensor_datatype(tensor) -#define QNN_TENSOR_GET_QUANT_PARAMS(tensor) qnn::get_qnn_tensor_quantparams(tensor) -#define QNN_TENSOR_GET_RANK(tensor) qnn::get_qnn_tensor_rank(tensor) -#define QNN_TENSOR_GET_DIMENSIONS(tensor) qnn::get_qnn_tensor_dimensions(tensor) -#define QNN_TENSOR_GET_MEM_TYPE(tensor) qnn::get_qnn_tensor_memtype(tensor) - -#define QNN_TENSOR_SET_ID(tensor, value) qnn::set_qnn_tensor_id(tensor, value) -#define QNN_TENSOR_SET_NAME(tensor, value) qnn::set_qnn_tensor_name(tensor, value) -#define QNN_TENSOR_SET_TYPE(tensor, value) qnn::set_qnn_tensor_type(tensor, value) -#define QNN_TENSOR_SET_DATA_FORMAT(tensor, value) qnn::set_qnn_tensor_dataformat(tensor, value) -#define QNN_TENSOR_SET_DATA_TYPE(tensor, value) qnn::set_qnn_tensor_datatype(tensor, value) -#define QNN_TENSOR_SET_QUANT_PARAMS(tensor, value) qnn::set_qnn_tensor_quantparams(tensor, value) -#define QNN_TENSOR_SET_RANK(tensor, value) qnn::set_qnn_tensor_rank(tensor, value) -#define QNN_TENSOR_SET_DIMENSIONS(tensor, value) qnn::set_qnn_tensor_dimensions(tensor, value) -#define QNN_TENSOR_SET_MEM_TYPE(tensor, value) qnn::set_qnn_tensor_memtype(tensor, value) -#define QNN_TENSOR_SET_CLIENT_BUF(tensor, value) qnn::set_qnn_tensor_clientbuf(tensor, value) -#define 
QNN_TENSOR_SET_MEM_HANDLE(tensor, value) qnn::set_qnn_tensor_memhandle(tensor, value) -#define VALIDATE_TENSOR_VERSION(tensor, err) VALIDATE(qnn::validate_tensor_version(tensor), err) +#define QNN_TENSOR_GET_ID(tensor) qnn::get_qnn_tensorid(tensor) +#define QNN_TENSOR_GET_NAME(tensor) qnn::get_qnn_tensorname(tensor) +#define QNN_TENSOR_GET_TYPE(tensor) qnn::get_qnn_tensortype(tensor) +#define QNN_TENSOR_GET_DATA_FORMAT(tensor) qnn::get_qnn_tensor_dataformat(tensor) +#define QNN_TENSOR_GET_DATA_TYPE(tensor) qnn::get_qnn_tensor_datatype(tensor) +#define QNN_TENSOR_GET_QUANT_PARAMS(tensor) qnn::get_qnn_tensor_quantparams(tensor) +#define QNN_TENSOR_GET_RANK(tensor) qnn::get_qnn_tensor_rank(tensor) +#define QNN_TENSOR_GET_DIMENSIONS(tensor) qnn::get_qnn_tensor_dimensions(tensor) +#define QNN_TENSOR_GET_MEM_TYPE(tensor) qnn::get_qnn_tensor_memtype(tensor) + +#define QNN_TENSOR_SET_ID(tensor, value) qnn::set_qnn_tensor_id(tensor, value) +#define QNN_TENSOR_SET_NAME(tensor, value) qnn::set_qnn_tensor_name(tensor, value) +#define QNN_TENSOR_SET_TYPE(tensor, value) qnn::set_qnn_tensor_type(tensor, value) +#define QNN_TENSOR_SET_DATA_FORMAT(tensor, value) qnn::set_qnn_tensor_dataformat(tensor, value) +#define QNN_TENSOR_SET_DATA_TYPE(tensor, value) qnn::set_qnn_tensor_datatype(tensor, value) +#define QNN_TENSOR_SET_QUANT_PARAMS(tensor, value) qnn::set_qnn_tensor_quantparams(tensor, value) +#define QNN_TENSOR_SET_RANK(tensor, value) qnn::set_qnn_tensor_rank(tensor, value) +#define QNN_TENSOR_SET_DIMENSIONS(tensor, value) qnn::set_qnn_tensor_dimensions(tensor, value) +#define QNN_TENSOR_SET_MEM_TYPE(tensor, value) qnn::set_qnn_tensor_memtype(tensor, value) +#define QNN_TENSOR_SET_CLIENT_BUF(tensor, value) qnn::set_qnn_tensor_clientbuf(tensor, value) +#define QNN_TENSOR_SET_MEM_HANDLE(tensor, value) qnn::set_qnn_tensor_memhandle(tensor, value) +#define VALIDATE_TENSOR_VERSION(tensor, err) VALIDATE(qnn::validate_tensor_version(tensor), err) From ca0d999c2ab97c11174a1f30852a311038792192 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Thu, 4 Jul 2024 23:32:21 +0800 Subject: [PATCH 037/166] add ggml_qnn_graph --- ggml/src/ggml-qnn/backend-ops.cpp | 5 +- ggml/src/ggml-qnn/graph.hpp | 136 ++++++++++++++++++++++++++++++ 2 files changed, 139 insertions(+), 2 deletions(-) create mode 100644 ggml/src/ggml-qnn/graph.hpp diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index cde1bd248cc29..3365e85b846a8 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -1,6 +1,7 @@ #include "backend-ops.hpp" +#include "graph.hpp" #include "logger.hpp" #include "tensor.hpp" #include "utils.hpp" @@ -130,7 +131,7 @@ static void ggml_qnn_add(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, Qnn_Tensor_t tensor_inputs[] = { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() }; Qnn_Tensor_t tensor_outputs[] = { *tensor_output.get_qnn_tensor() }; - Qnn_OpConfig_t op_config = { (Qnn_OpConfigVersion_t)1, + Qnn_OpConfig_t op_config = { QNN_OPCONFIG_VERSION_1, .v1 = { "ggml_op_add", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_ELEMENT_WISE_ADD, 0, qnn_params, 2, tensor_inputs, 1, tensor_outputs } }; error = qnn_raw_interface.graphAddNode(graph_handle, op_config); @@ -300,7 +301,7 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context *ctx, const ggml_tensor *s Qnn_Tensor_t tensor_inputs[] = { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() }; Qnn_Tensor_t tensor_outputs[] = { *tensor_output.get_qnn_tensor() }; - Qnn_OpConfig_t op_config 
= { (Qnn_OpConfigVersion_t)1, + Qnn_OpConfig_t op_config = { QNN_OPCONFIG_VERSION_1, .v1 = { "ggml_op_mul_mat", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL, 0, qnn_params, 2, tensor_inputs, 1, tensor_outputs } }; error = qnn_raw_interface.graphAddNode(graph_handle, op_config); diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp new file mode 100644 index 0000000000000..f2c27aeb3a2be --- /dev/null +++ b/ggml/src/ggml-qnn/graph.hpp @@ -0,0 +1,136 @@ + +#pragma once + +#include + +#include "ggml-qnn.h" + +#include "logger.hpp" +#include "qnn.hpp" + +namespace qnn { + +template +class ggml_qnn_graph { +public: + typedef std::array input_tensor_array_t; + typedef std::array output_tensor_array_t; + + explicit ggml_qnn_graph(const std::string &graph_name, QNNBackend device, Qnn_ContextHandle_t qnn_context, + QNN_INTERFACE_VER_TYPE qnn_interface, size_t vtcm_size_in_mb) : + _device(device), _qnn_interface(qnn_interface) { + QNN_LOG_INFO("graph name %s", graph_name.c_str()); + + Qnn_ErrorHandle_t error = QNN_SUCCESS; + Qnn_GraphHandle_t graph_handle = nullptr; + if (device == QNN_BACKEND_NPU) { + // TODO: fix graph config here for NPU + QnnHtpGraph_CustomConfig_t hvx_config; + hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; + hvx_config.numHvxThreads = 8; + QnnGraph_Config_t graph_hvx_config; + graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_hvx_config.customConfig = &hvx_config; + + QnnHtpGraph_CustomConfig_t dlbc_config; + dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; + dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; + dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC + QnnGraph_Config_t graph_dlbc_config; + graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_dlbc_config.customConfig = &dlbc_config; + + QnnHtpGraph_CustomConfig_t opt_config; + opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; + opt_config.optimizationOption.floatValue = 1; // 1 / 3 + QnnGraph_Config_t graph_opt_config; + graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_opt_config.customConfig = &opt_config; + + QnnHtpGraph_CustomConfig_t vtcm_config; + vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; + vtcm_config.vtcmSizeInMB = vtcm_size_in_mb; + QnnGraph_Config_t graph_vtcm_config; + graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_vtcm_config.customConfig = &vtcm_config; + + const QnnGraph_Config_t *p_graphconfig[] = { &graph_hvx_config, &graph_dlbc_config, &graph_vtcm_config, + &graph_opt_config, nullptr }; + error = qnn_interface.graphCreate(qnn_context, graph_name.c_str(), p_graphconfig, &graph_handle); + } else { + error = qnn_interface.graphCreate(qnn_context, graph_name.c_str(), nullptr, &graph_handle); + } + + if (error != QNN_SUCCESS) { + QNN_LOG_INFO( + "can't create qnn graph handle with graph name %s, " + "error = %d\n", + graph_name.c_str(), error); + return; + } else { + QNN_LOG_INFO("create qnn graph handle with graph name %s ok\n", graph_name.c_str()); + } + + _graph_handle = graph_handle; + } + + bool add_nodes(const input_tensor_array_t &tensor_inputs, const output_tensor_array_t &tensor_outputs) { + if (!is_valid()) { + QNN_LOG_ERROR("Invalid graph\n"); + return false; + } + + Qnn_Param_t qnn_params[] = {}; + Qnn_OpConfig_t op_config = { QNN_OPCONFIG_VERSION_1, + .v1 = { "ggml_op_add", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_ELEMENT_WISE_ADD, 0, + qnn_params, 
_tensor_inputs.size(), _tensor_inputs.data(), + _tensor_outputs.size(), _tensor_outputs.data() } }; + auto error = _qnn_interface.graphAddNode(_graph_handle, op_config); + if (error != QNN_SUCCESS) { + QNN_LOG_ERROR("graphAddNode.error = %d\n", error); + return false; + } + + error = _qnn_interface.graphFinalize(_graph_handle, nullptr, nullptr); + if (error != QNN_SUCCESS) { + QNN_LOG_ERROR("graphFinalize.error = %d\n", error); + return false; + } + + return true; + } + + bool execute() { + auto error = _qnn_interface.graphExecute(_graph_handle, _tensor_inputs.data(), _tensor_inputs.size(), + _tensor_outputs.data(), _tensor_outputs.size(), nullptr, nullptr); + if (_device == QNN_BACKEND_NPU) { + if (error == QNN_COMMON_ERROR_SYSTEM_COMMUNICATION) { + QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n"); + } + } + + if (error != QNN_SUCCESS) { + QNN_LOG_INFO("error = %d\n", error); + return false; + } + + return true; + } + + bool is_valid() const { return _graph_handle != nullptr; } + + Qnn_GraphHandle_t get_graph_handler() const { return _graph_handle; } + +private: + const QNNBackend _device; + const QNN_INTERFACE_VER_TYPE _qnn_interface; + Qnn_GraphHandle_t _graph_handle = nullptr; + std::array _tensor_inputs; + std::array _tensor_outputs; + + ggml_qnn_graph(const ggml_qnn_graph &) = delete; + void operator=(const ggml_qnn_graph &) = delete; + ggml_qnn_graph(ggml_qnn_graph &&) = delete; + void operator=(ggml_qnn_graph &&) = delete; +}; +} // namespace qnn From 4b2ee61f62e2e666ea47fcef4717739cd66fefcc Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 5 Jul 2024 11:56:31 +0800 Subject: [PATCH 038/166] move graph map to backend object --- ggml/src/ggml-qnn.cpp | 15 +++++---------- ggml/src/ggml-qnn/backend-ops.cpp | 16 ++++++++-------- ggml/src/ggml-qnn/backend.hpp | 3 +++ ggml/src/ggml-qnn/qnn.hpp | 4 ---- 4 files changed, 16 insertions(+), 22 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index e5fc00045beb3..9e6404e5c1b53 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -529,18 +529,13 @@ GGML_CALL static void ggml_backend_qnn_free(ggml_backend_t backend) { auto *instance = g_qnn_mgr[ctx->device].instance; if (instance != nullptr) { - // TODO: this should be done inside the destructor - std::map>::iterator graph_it; - for (graph_it = instance->_qnn_graph_map.begin(); - graph_it != instance->_qnn_graph_map.end(); graph_it++) { - auto & graph_item = graph_it->second; - Qnn_GraphHandle_t & graph_handle = std::get<0>(graph_item); + for (const auto &graph_item: ctx->qnn_graph_map) { + Qnn_GraphHandle_t graph_handle = std::get<0>(graph_item.second); GGML_UNUSED(graph_handle); - QNN_LOG_INFO("graph type:%s", graph_it->first.c_str()); + QNN_LOG_INFO("graph type:%s", graph_item.first.c_str()); } - instance->_qnn_graph_map.clear(); + + ctx->qnn_graph_map.clear(); instance->qnn_finalize(); delete instance; diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 3365e85b846a8..d0c132b9bca8f 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -57,9 +57,9 @@ static void ggml_qnn_add(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, perf.start(); std::string map_entry(ggml_op_name(ggmlop)); - if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { + if (ctx->qnn_graph_map.find(map_entry) != ctx->qnn_graph_map.end()) { graph_initialized = true; - auto &graph_item = instance->_qnn_graph_map[map_entry]; + auto &graph_item = 
ctx->qnn_graph_map[map_entry]; graph_handle = std::get<0>(graph_item); } @@ -157,9 +157,9 @@ static void ggml_qnn_add(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, auto graph_item = std::make_tuple(graph_handle, tensor_input0.get_qnn_tensor(), tensor_input1.get_qnn_tensor(), tensor_output.get_qnn_tensor()); - instance->_qnn_graph_map[map_entry] = graph_item; + ctx->qnn_graph_map[map_entry] = graph_item; } else { - auto &graph_item = instance->_qnn_graph_map[map_entry]; + auto &graph_item = ctx->qnn_graph_map[map_entry]; qnn::ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx); qnn::ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx); qnn::ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx); @@ -226,9 +226,9 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context *ctx, const ggml_tensor *s perf.start(); std::string map_entry = std::string(ggml_op_name(ggmlop)); - if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { + if (ctx->qnn_graph_map.find(map_entry) != ctx->qnn_graph_map.end()) { graph_initialized = true; - auto &graph_item = instance->_qnn_graph_map[map_entry]; + auto &graph_item = ctx->qnn_graph_map[map_entry]; graph_handle = std::get<0>(graph_item); } @@ -327,9 +327,9 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context *ctx, const ggml_tensor *s auto graph_item = std::make_tuple(graph_handle, tensor_input0.get_qnn_tensor(), tensor_input1.get_qnn_tensor(), tensor_output.get_qnn_tensor()); - instance->_qnn_graph_map[map_entry] = graph_item; + ctx->qnn_graph_map[map_entry] = graph_item; } else { - auto &graph_item = instance->_qnn_graph_map[map_entry]; + auto &graph_item = ctx->qnn_graph_map[map_entry]; qnn::ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx); qnn::ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx); qnn::ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx); diff --git a/ggml/src/ggml-qnn/backend.hpp b/ggml/src/ggml-qnn/backend.hpp index 74bce38b7111c..dd15b05807641 100644 --- a/ggml/src/ggml-qnn/backend.hpp +++ b/ggml/src/ggml-qnn/backend.hpp @@ -1,6 +1,8 @@ #pragma once +#include + #include "ggml.h" #include "ggml-backend.h" @@ -17,4 +19,5 @@ struct ggml_backend_qnn_context { QNN_INTERFACE_VER_TYPE raw_interface; QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; qnn::qcom_socinfo socinfo; + std::unordered_map> qnn_graph_map; }; diff --git a/ggml/src/ggml-qnn/qnn.hpp b/ggml/src/ggml-qnn/qnn.hpp index bccc3a4ba32ac..26465c96a0793 100644 --- a/ggml/src/ggml-qnn/qnn.hpp +++ b/ggml/src/ggml-qnn/qnn.hpp @@ -2,7 +2,6 @@ #include -#include #include #include #include @@ -705,9 +704,6 @@ class qnn_instance { const qnn::qcom_socinfo &get_soc_info() { return _soc_info; } -public: - std::map> _qnn_graph_map; - private: int load_system() { Qnn_ErrorHandle_t error = QNN_SUCCESS; From a688ed324b339eb2ca455becf1661272d28e1d99 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 5 Jul 2024 13:07:48 +0800 Subject: [PATCH 039/166] add op param to add_nodes --- ggml/src/ggml-qnn/graph.hpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index f2c27aeb3a2be..700114d6f8a26 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -18,7 +18,7 @@ class ggml_qnn_graph { explicit ggml_qnn_graph(const std::string &graph_name, QNNBackend device, Qnn_ContextHandle_t qnn_context, QNN_INTERFACE_VER_TYPE qnn_interface, size_t 
vtcm_size_in_mb) : - _device(device), _qnn_interface(qnn_interface) { + _graph_name(graph_name), _device(device), _qnn_interface(qnn_interface) { QNN_LOG_INFO("graph name %s", graph_name.c_str()); Qnn_ErrorHandle_t error = QNN_SUCCESS; @@ -74,7 +74,8 @@ class ggml_qnn_graph { _graph_handle = graph_handle; } - bool add_nodes(const input_tensor_array_t &tensor_inputs, const output_tensor_array_t &tensor_outputs) { + bool add_nodes(const std::string &op_name, const input_tensor_array_t &tensor_inputs, + const output_tensor_array_t &tensor_outputs) { if (!is_valid()) { QNN_LOG_ERROR("Invalid graph\n"); return false; @@ -82,7 +83,7 @@ class ggml_qnn_graph { Qnn_Param_t qnn_params[] = {}; Qnn_OpConfig_t op_config = { QNN_OPCONFIG_VERSION_1, - .v1 = { "ggml_op_add", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_ELEMENT_WISE_ADD, 0, + .v1 = { _graph_name.c_str(), QNN_OP_PACKAGE_NAME_QTI_AISW, op_name.c_str(), 0, qnn_params, _tensor_inputs.size(), _tensor_inputs.data(), _tensor_outputs.size(), _tensor_outputs.data() } }; auto error = _qnn_interface.graphAddNode(_graph_handle, op_config); @@ -122,6 +123,7 @@ class ggml_qnn_graph { Qnn_GraphHandle_t get_graph_handler() const { return _graph_handle; } private: + const std::string _graph_name; const QNNBackend _device; const QNN_INTERFACE_VER_TYPE _qnn_interface; Qnn_GraphHandle_t _graph_handle = nullptr; @@ -133,4 +135,8 @@ class ggml_qnn_graph { ggml_qnn_graph(ggml_qnn_graph &&) = delete; void operator=(ggml_qnn_graph &&) = delete; }; + +using ggml_qnn_graph_binary = ggml_qnn_graph<2, 1>; +using ggml_qnn_graph_unary = ggml_qnn_graph<1, 1>; + } // namespace qnn From 13dc3a02c371e533ffd379446cbf53b4c3bb5599 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 5 Jul 2024 13:08:14 +0800 Subject: [PATCH 040/166] use qnn graph inside add and mul ops --- ggml/src/ggml-qnn.cpp | 2 - ggml/src/ggml-qnn/backend-ops.cpp | 325 +++++++----------------------- ggml/src/ggml-qnn/backend.hpp | 6 +- ggml/src/ggml-qnn/graph.hpp | 9 +- 4 files changed, 89 insertions(+), 253 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index 9e6404e5c1b53..19c970c5f0fd6 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -530,8 +530,6 @@ GGML_CALL static void ggml_backend_qnn_free(ggml_backend_t backend) { auto *instance = g_qnn_mgr[ctx->device].instance; if (instance != nullptr) { for (const auto &graph_item: ctx->qnn_graph_map) { - Qnn_GraphHandle_t graph_handle = std::get<0>(graph_item.second); - GGML_UNUSED(graph_handle); QNN_LOG_INFO("graph type:%s", graph_item.first.c_str()); } diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index d0c132b9bca8f..79e280fcbe088 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -1,13 +1,23 @@ #include "backend-ops.hpp" +#include + #include "graph.hpp" #include "logger.hpp" #include "tensor.hpp" #include "utils.hpp" -static bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) { +namespace { + +void print_ggml_tensor(const ggml_tensor *tensor) { + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + tensor->name, tensor->type, ggml_type_name(tensor->type), tensor->ne[0], tensor->ne[1], tensor->ne[2], + tensor->nb[0], tensor->nb[1], tensor->nb[2]); +} + +bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) { if (!ctx || !src0 || !src1 || !dst) 
{ QNN_LOG_WARN("invalid params\n"); return false; @@ -25,6 +35,8 @@ static bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor return true; } +} // namespace + #ifndef NDEBUG #define CHECK_PARAMS(ctx, src0, src1, dst) \ do { \ @@ -41,157 +53,65 @@ static bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor // keep it for illustrate how to implement a specified GGMPL OP using QNN API + QNN RPC static void ggml_qnn_add(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - bool graph_initialized = false; - qnn::qnn_instance *instance = nullptr; - std::string graph_name = "ggml_op_qnn_add"; - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Param_t qnn_params[] = {}; - enum ggml_op ggmlop = GGML_OP_ADD; - CHECK_PARAMS(ctx, src0, src1, dst); - instance = ctx->instance; - auto qnn_raw_interface = ctx->raw_interface; - qnn::qnn_perf perf("ggml_qnn_add"); + std::string graph_name = "ggml_op_qnn_add"; + qnn::qnn_perf perf(graph_name); perf.start(); - std::string map_entry(ggml_op_name(ggmlop)); - if (ctx->qnn_graph_map.find(map_entry) != ctx->qnn_graph_map.end()) { - graph_initialized = true; - auto &graph_item = ctx->qnn_graph_map[map_entry]; - graph_handle = std::get<0>(graph_item); - } - - if (!graph_initialized) { + bool succeed = false; + std::string graph_key(ggml_op_name(GGML_OP_ADD)); + auto it = ctx->qnn_graph_map.find(graph_key); + if (it != ctx->qnn_graph_map.end()) { + const auto &graph_item = it->second; + qnn::ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx); + qnn::ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx); + qnn::ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx); + std::get<0>(graph_item)->execute(); + } else { graph_name = graph_name + "_" + std::to_string(ctx->threads) + "_" + src0->name + "_" + src1->name; - QNN_LOG_INFO("graph name %s", graph_name.c_str()); - if (ctx->device == QNN_BACKEND_NPU) { - QnnHtpGraph_CustomConfig_t hvx_config; - hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; - hvx_config.numHvxThreads = 8; - QnnGraph_Config_t graph_hvx_config; - graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_hvx_config.customConfig = &hvx_config; - - QnnHtpGraph_CustomConfig_t dlbc_config; - dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; - dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; - dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC - QnnGraph_Config_t graph_dlbc_config; - graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_dlbc_config.customConfig = &dlbc_config; - - QnnHtpGraph_CustomConfig_t opt_config; - opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; - opt_config.optimizationOption.floatValue = 1; // 1 / 3 - QnnGraph_Config_t graph_opt_config; - graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_opt_config.customConfig = &opt_config; - - QnnHtpGraph_CustomConfig_t vtcm_config; - vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; - vtcm_config.vtcmSizeInMB = ctx->socinfo.vtcm_size_in_mb; - QnnGraph_Config_t graph_vtcm_config; - graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_vtcm_config.customConfig = &vtcm_config; - - const QnnGraph_Config_t *p_graphconfig[] = { &graph_hvx_config, &graph_dlbc_config, &graph_vtcm_config, - &graph_opt_config, NULL }; - 
error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), p_graphconfig, - &graph_handle); - } else { - error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, - &graph_handle); - } + auto graph = std::make_unique(graph_name, (QNNBackend)(ctx->device), + ctx->instance->get_qnn_context_handle(), + ctx->raw_interface, ctx->socinfo.vtcm_size_in_mb); - if (QNN_SUCCESS != error) { - QNN_LOG_INFO( - "can't create qnn graph handle with graph name %s, " - "error = %d\n", - graph_name.c_str(), error); + if (!graph->is_valid()) { goto failure; - } else { - QNN_LOG_INFO("create qnn graph handle with graph name %s ok\n", graph_name.c_str()); } - qnn::ggml_qnn_tensor_input tensor_input0(src0, graph_handle, ctx); + qnn::ggml_qnn_tensor_input tensor_input0(src0, graph->get_graph_handler(), ctx); if (!tensor_input0.is_valid()) { goto failure; } - qnn::ggml_qnn_tensor_input tensor_input1(src1, graph_handle, ctx); + qnn::ggml_qnn_tensor_input tensor_input1(src1, graph->get_graph_handler(), ctx); if (!tensor_input1.is_valid()) { - QNN_LOG_INFO("error = %d\n", error); goto failure; } - qnn::ggml_qnn_tensor_output tensor_output(dst, graph_handle, ctx); + qnn::ggml_qnn_tensor_output tensor_output(dst, graph->get_graph_handler(), ctx); if (!tensor_output.is_valid()) { goto failure; } - Qnn_Tensor_t tensor_inputs[] = { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() }; - Qnn_Tensor_t tensor_outputs[] = { *tensor_output.get_qnn_tensor() }; - Qnn_OpConfig_t op_config = { QNN_OPCONFIG_VERSION_1, - .v1 = { "ggml_op_add", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_ELEMENT_WISE_ADD, 0, - qnn_params, 2, tensor_inputs, 1, tensor_outputs } }; - error = qnn_raw_interface.graphAddNode(graph_handle, op_config); - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); - goto failure; - } - error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr); - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); - goto failure; - } - error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); - if (ctx->device == QNN_BACKEND_NPU) { - if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) { - QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n"); - } - } - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); + if (!graph->add_nodes(QNN_OP_ELEMENT_WISE_ADD, + { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() }, + { *tensor_output.get_qnn_tensor() })) { goto failure; } - auto graph_item = std::make_tuple(graph_handle, tensor_input0.get_qnn_tensor(), tensor_input1.get_qnn_tensor(), - tensor_output.get_qnn_tensor()); - ctx->qnn_graph_map[map_entry] = graph_item; - } else { - auto &graph_item = ctx->qnn_graph_map[map_entry]; - qnn::ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx); - qnn::ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx); - qnn::ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx); - - Qnn_Tensor_t tensor_inputs[] = { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() }; - Qnn_Tensor_t tensor_outputs[] = { *tensor_output.get_qnn_tensor() }; - error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); - if (ctx->device == QNN_BACKEND_NPU) { - if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) { - QNN_LOG_WARN("NPU crashed. SSR detected. 
Caused QNN graph execute error\n"); - } - } - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); + if (!graph->execute()) { goto failure; } + + ctx->qnn_graph_map[graph_key] = std::make_tuple(std::move(graph), tensor_input0.get_qnn_tensor(), + tensor_input1.get_qnn_tensor(), tensor_output.get_qnn_tensor()); } + succeed = true; + failure: - if (QNN_SUCCESS != error) { - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 - ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], - src0->nb[0], src0->nb[1], src0->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 - ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], - src1->nb[0], src1->nb[1], src1->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 - ", nb = (%5zi, %5zi, %5zi)\n", - dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], - dst->nb[1], dst->nb[2]); + if (!succeed) { + print_ggml_tensor(src0); + print_ggml_tensor(src1); + print_ggml_tensor(dst); } perf.info(); @@ -210,158 +130,69 @@ static void ggml_qnn_add(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, */ static void ggml_qnn_mul_mat(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - bool graph_initialized = false; - qnn::qnn_instance *instance = nullptr; - std::string graph_name = "ggml_op_qnn_mul_mat"; - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Param_t qnn_params[] = {}; - enum ggml_op ggmlop = GGML_OP_MUL_MAT; - CHECK_PARAMS(ctx, src0, src1, dst); - instance = ctx->instance; - auto qnn_raw_interface = ctx->raw_interface; - qnn::qnn_perf perf("ggml_qnn_mul_mat"); + std::string graph_name = "ggml_op_qnn_mul_mat"; + qnn::qnn_perf perf(graph_name); perf.start(); - std::string map_entry = std::string(ggml_op_name(ggmlop)); - if (ctx->qnn_graph_map.find(map_entry) != ctx->qnn_graph_map.end()) { - graph_initialized = true; - auto &graph_item = ctx->qnn_graph_map[map_entry]; - graph_handle = std::get<0>(graph_item); - } - // TODO: for scenarios of quantized data in src0 // pass-1: dequantize src0 to FP32 // pass-2: dq-src0 * src1 // the performance gains is worth although there is performance loss in pass-1 - if (!graph_initialized) { + bool succeed = false; + std::string graph_key(ggml_op_name(GGML_OP_MUL_MAT)); + auto it = ctx->qnn_graph_map.find(graph_key); + if (it != ctx->qnn_graph_map.end()) { + const auto &graph_item = it->second; + qnn::ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx); + qnn::ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx); + qnn::ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx); + std::get<0>(graph_item)->execute(); + } else { graph_name = graph_name + "_" + std::to_string(ctx->threads) + "_" + src0->name + "_" + src1->name; - QNN_LOG_INFO("graph name %s", graph_name.c_str()); - if (ctx->device == QNN_BACKEND_NPU) { - QnnHtpGraph_CustomConfig_t hvx_config; - hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; - hvx_config.numHvxThreads = 8; - QnnGraph_Config_t graph_hvx_config; - graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_hvx_config.customConfig = &hvx_config; - - QnnHtpGraph_CustomConfig_t dlbc_config; - dlbc_config.option = 
QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; - dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; - dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC - QnnGraph_Config_t graph_dlbc_config; - graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_dlbc_config.customConfig = &dlbc_config; - - QnnHtpGraph_CustomConfig_t opt_config; - opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; - opt_config.optimizationOption.floatValue = 1; // 1 / 3 - QnnGraph_Config_t graph_opt_config; - graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_opt_config.customConfig = &opt_config; - - QnnHtpGraph_CustomConfig_t vtcm_config; - vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; - vtcm_config.vtcmSizeInMB = ctx->socinfo.vtcm_size_in_mb; - QnnGraph_Config_t graph_vtcm_config; - graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_vtcm_config.customConfig = &vtcm_config; - - const QnnGraph_Config_t *p_graphconfig[] = { &graph_hvx_config, &graph_dlbc_config, &graph_vtcm_config, - &graph_opt_config, NULL }; - error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), p_graphconfig, - &graph_handle); - } else { - error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, - &graph_handle); - } - if (QNN_SUCCESS != error) { - QNN_LOG_INFO( - "can't create qnn graph handle with graph name %s, " - "error = %d\n", - graph_name.c_str(), error); + auto graph = std::make_unique(graph_name, (QNNBackend)(ctx->device), + ctx->instance->get_qnn_context_handle(), + ctx->raw_interface, ctx->socinfo.vtcm_size_in_mb); + + if (!graph->is_valid()) { goto failure; } - qnn::ggml_qnn_tensor_input tensor_input0(src0, graph_handle, ctx); + qnn::ggml_qnn_tensor_input tensor_input0(src0, graph->get_graph_handler(), ctx); if (!tensor_input0.is_valid()) { goto failure; } - qnn::ggml_qnn_tensor_input tensor_input1(src1, graph_handle, ctx); + qnn::ggml_qnn_tensor_input tensor_input1(src1, graph->get_graph_handler(), ctx); if (!tensor_input1.is_valid()) { goto failure; } - qnn::ggml_qnn_tensor_output tensor_output(dst, graph_handle, ctx); + qnn::ggml_qnn_tensor_output tensor_output(dst, graph->get_graph_handler(), ctx); if (!tensor_output.is_valid()) { goto failure; } - Qnn_Tensor_t tensor_inputs[] = { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() }; - Qnn_Tensor_t tensor_outputs[] = { *tensor_output.get_qnn_tensor() }; - Qnn_OpConfig_t op_config = { QNN_OPCONFIG_VERSION_1, - .v1 = { "ggml_op_mul_mat", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL, 0, - qnn_params, 2, tensor_inputs, 1, tensor_outputs } }; - error = qnn_raw_interface.graphAddNode(graph_handle, op_config); - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); - goto failure; - } - error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr); - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); - goto failure; - } - error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); - if (ctx->device == QNN_BACKEND_NPU) { - if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) { - QNN_LOG_WARN("NPU crashed. SSR detected. 
Caused QNN graph execute error\n"); - } - } - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); + if (!graph->add_nodes(QNN_OP_MAT_MUL, { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() }, + { *tensor_output.get_qnn_tensor() })) { goto failure; } - auto graph_item = std::make_tuple(graph_handle, tensor_input0.get_qnn_tensor(), tensor_input1.get_qnn_tensor(), - tensor_output.get_qnn_tensor()); - ctx->qnn_graph_map[map_entry] = graph_item; - } else { - auto &graph_item = ctx->qnn_graph_map[map_entry]; - qnn::ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx); - qnn::ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx); - qnn::ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx); - - Qnn_Tensor_t tensor_inputs[] = { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() }; - Qnn_Tensor_t tensor_outputs[] = { *tensor_output.get_qnn_tensor() }; - error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); - if (ctx->device == QNN_BACKEND_NPU) { - if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) { - QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n"); - } - } - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); + if (!graph->execute()) { goto failure; } + + ctx->qnn_graph_map[graph_key] = std::make_tuple(std::move(graph), tensor_input0.get_qnn_tensor(), + tensor_input1.get_qnn_tensor(), tensor_output.get_qnn_tensor()); } + succeed = true; + failure: - if (QNN_SUCCESS != error) { - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 - ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], - src0->nb[0], src0->nb[1], src0->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 - ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], - src1->nb[0], src1->nb[1], src1->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 - ", nb = (%5zi, %5zi, %5zi)\n", - dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], - dst->nb[1], dst->nb[2]); + if (!succeed) { + print_ggml_tensor(src0); + print_ggml_tensor(src1); + print_ggml_tensor(dst); } perf.info(); diff --git a/ggml/src/ggml-qnn/backend.hpp b/ggml/src/ggml-qnn/backend.hpp index dd15b05807641..d60b334c0b2b5 100644 --- a/ggml/src/ggml-qnn/backend.hpp +++ b/ggml/src/ggml-qnn/backend.hpp @@ -1,12 +1,14 @@ #pragma once +#include #include #include "ggml.h" #include "ggml-backend.h" +#include "graph.hpp" #include "qnn.hpp" struct ggml_backend_qnn_context { @@ -19,5 +21,7 @@ struct ggml_backend_qnn_context { QNN_INTERFACE_VER_TYPE raw_interface; QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; qnn::qcom_socinfo socinfo; - std::unordered_map> qnn_graph_map; + std::unordered_map, Qnn_Tensor_t *, + Qnn_Tensor_t *, Qnn_Tensor_t *>> + qnn_graph_map; }; diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index 700114d6f8a26..1aad145c32896 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -81,11 +81,14 @@ class ggml_qnn_graph { return false; } + _tensor_inputs = tensor_inputs; + _tensor_outputs = tensor_outputs; + Qnn_Param_t qnn_params[] = {}; - Qnn_OpConfig_t op_config = { QNN_OPCONFIG_VERSION_1, + Qnn_OpConfig_t op_config = { .version = 
QNN_OPCONFIG_VERSION_1, .v1 = { _graph_name.c_str(), QNN_OP_PACKAGE_NAME_QTI_AISW, op_name.c_str(), 0, - qnn_params, _tensor_inputs.size(), _tensor_inputs.data(), - _tensor_outputs.size(), _tensor_outputs.data() } }; + qnn_params, (uint32_t)_tensor_inputs.size(), _tensor_inputs.data(), + (uint32_t)_tensor_outputs.size(), _tensor_outputs.data() } }; auto error = _qnn_interface.graphAddNode(_graph_handle, op_config); if (error != QNN_SUCCESS) { QNN_LOG_ERROR("graphAddNode.error = %d\n", error); From 58cec140920985f2e038512001869fdb3cf86ad8 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 5 Jul 2024 17:31:22 +0800 Subject: [PATCH 041/166] reformat --- ggml/src/ggml-qnn.cpp | 457 ++++++++++++++++-------------------- ggml/src/ggml-qnn/utils.hpp | 2 +- 2 files changed, 205 insertions(+), 254 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index 19c970c5f0fd6..a590dd5f56cfb 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -1,46 +1,46 @@ +#include "ggml-qnn.h" + +#include #include #include -#include #include +#include #include #include -#include -#include -#include -#include -#include -#include -#include +#include +#include +#include #include +#include #include -#include -#include #include -#include +#include +#include #include -#include -#include -#include +#include +#include +#include +#include +#include #include #include - -#include "ggml-qnn.h" +#include #include "ggml-backend-impl.h" +#include "ggml-qnn/backend-ops.hpp" +#include "ggml-qnn/backend.hpp" #include "ggml-qnn/logger.hpp" -#include "ggml-qnn/utils.hpp" #include "ggml-qnn/tensor.hpp" -#include "ggml-qnn/backend.hpp" -#include "ggml-qnn/backend-ops.hpp" +#include "ggml-qnn/utils.hpp" // ================================================================================================= // // forward declaration // // ================================================================================================= -static int free_qnn_tensor(Qnn_Tensor_t & tensor); +static int free_qnn_tensor(Qnn_Tensor_t &tensor); // ================================================================================================= // @@ -48,37 +48,25 @@ static int free_qnn_tensor(Qnn_Tensor_t & tensor); // // ================================================================================================= #ifdef NDEBUG -#define ENABLE_QNNBACKEND_PERF 0 // enable/disable op's perf info +#define ENABLE_QNNBACKEND_PERF 0 // enable/disable op's perf info #else -#define ENABLE_QNNBACKEND_PERF 1 // enable/disable op's perf info +#define ENABLE_QNNBACKEND_PERF 1 // enable/disable op's perf info #endif -#define QNN_BACKEND_NAME "qnn" +#define QNN_BACKEND_NAME "qnn" static struct qnn::qcom_socinfo g_qnn_soc_info_table[] = { - /* Qualcomm SnapDragon 8 Gen 1 */ - [qnn::SM8450] = { - .soc_model = qnn::SM8450, - .htp_arch = qnn::V69, - .vtcm_size_in_mb = 8}, - - /* Qualcomm SnapDragon 8 Gen 1+ */ - [qnn::SM8475] = { - .soc_model = qnn::SM8475, - .htp_arch = qnn::V69, - .vtcm_size_in_mb = 8}, - - /* Qualcomm SnapDragon 8 Gen 2 */ - [qnn::SM8550] = { - .soc_model = qnn::SM8550, - .htp_arch = qnn::V73, - .vtcm_size_in_mb = 8}, - - /* Qualcomm SnapDragon 8 Gen 3 */ - [qnn::SM8650] = { - .soc_model = qnn::SM8650, - .htp_arch = qnn::V75, - .vtcm_size_in_mb = 8}, + /* Qualcomm SnapDragon 8 Gen 1 */ + [qnn::SM8450] = { .soc_model = qnn::SM8450, .htp_arch = qnn::V69, .vtcm_size_in_mb = 8 }, + + /* Qualcomm SnapDragon 8 Gen 1+ */ + [qnn::SM8475] = { .soc_model = qnn::SM8475, .htp_arch = qnn::V69, .vtcm_size_in_mb = 8 }, + + /* 
Qualcomm SnapDragon 8 Gen 2 */ + [qnn::SM8550] = { .soc_model = qnn::SM8550, .htp_arch = qnn::V73, .vtcm_size_in_mb = 8 }, + + /* Qualcomm SnapDragon 8 Gen 3 */ + [qnn::SM8650] = { .soc_model = qnn::SM8650, .htp_arch = qnn::V75, .vtcm_size_in_mb = 8 }, }; @@ -96,52 +84,50 @@ static struct qnn::qcom_socinfo g_qnn_soc_info_table[] = { // HMX(Hexagon Matrix eXtensions)/HTA(Hexagon Tensor Accelerator) static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { - [QNN_BACKEND_CPU] = {.device = 0, - .threads = 1, - .name = "qnn-cpu", - .lib = "libQnnCpu.so", - .instance = nullptr, - .backend = nullptr, - .raw_interface = {}, - .raw_system_interface = {}, - .socinfo = {}}, - - [QNN_BACKEND_GPU] = {.device = 1, - .threads = 1, - .name = "qnn-gpu", - .lib = "libQnnGpu.so", - .instance = nullptr, - .backend = nullptr, - .raw_interface = {}, - .raw_system_interface = {}, - .socinfo = {}}, - - [QNN_BACKEND_NPU] = {.device = 2, - .threads = 1, - .name = "qnn-npu", - .lib = "libQnnHtp.so", - .instance = nullptr, - .backend = nullptr, - .raw_interface = {}, - .raw_system_interface = {}, - .socinfo = {}}, + [QNN_BACKEND_CPU] = { .device = 0, + .threads = 1, + .name = "qnn-cpu", + .lib = "libQnnCpu.so", + .instance = nullptr, + .backend = nullptr, + .raw_interface = {}, + .raw_system_interface = {}, + .socinfo = {} }, + + [QNN_BACKEND_GPU] = { .device = 1, + .threads = 1, + .name = "qnn-gpu", + .lib = "libQnnGpu.so", + .instance = nullptr, + .backend = nullptr, + .raw_interface = {}, + .raw_system_interface = {}, + .socinfo = {} }, + + [QNN_BACKEND_NPU] = { .device = 2, + .threads = 1, + .name = "qnn-npu", + .lib = "libQnnHtp.so", + .instance = nullptr, + .backend = nullptr, + .raw_interface = {}, + .raw_system_interface = {}, + .socinfo = {} }, }; struct ggml_backend_qnn_buffer_context { - ggml_backend_qnn_buffer_context(size_t device) - : device(device) - , name(QNN_BACKEND_NAME + std::to_string(device)) {} + ggml_backend_qnn_buffer_context(size_t device) : device(device), name(QNN_BACKEND_NAME + std::to_string(device)) {} ~ggml_backend_qnn_buffer_context() { if (buffer) { free(buffer); } - for (auto * sub_buffer : sub_buffers) { + for (auto *sub_buffer : sub_buffers) { free(sub_buffer); } - for (auto * qnn_tensor : qnn_tensors) { + for (auto *qnn_tensor : qnn_tensors) { free_qnn_tensor(*qnn_tensor); free(qnn_tensor); } @@ -149,19 +135,19 @@ struct ggml_backend_qnn_buffer_context { sub_buffers.clear(); qnn_tensors.clear(); } - void * buffer = nullptr; + void *buffer = nullptr; - struct ggml_backend_qnn_context * backend_ctx = nullptr; + struct ggml_backend_qnn_context *backend_ctx = nullptr; - size_t buffer_size = 0; - std::vector sub_buffers; + size_t buffer_size = 0; + std::vector sub_buffers; std::vector qnn_tensors; - size_t device; - std::string name; + size_t device; + std::string name; }; struct ggml_backend_qnn_buffer_type_context { - size_t device; + size_t device; std::string name; }; @@ -170,7 +156,7 @@ struct ggml_backend_qnn_buffer_type_context { // QNN backend internal helper functions // // ================================================================================================= -static size_t memscpy(void * dst, size_t dst_size, const void * src, size_t copy_size) { +static size_t memscpy(void *dst, size_t dst_size, const void *src, size_t copy_size) { if (!dst || !src || !dst_size || !copy_size) return 0; size_t min_size = dst_size < copy_size ? 
dst_size : copy_size; @@ -180,13 +166,12 @@ static size_t memscpy(void * dst, size_t dst_size, const void * src, size_t copy return min_size; } -static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { +static int deep_copy_qnn_tensors(Qnn_Tensor_t &src, Qnn_Tensor_t &dst) { int err = 0; VALIDATE_TENSOR_VERSION(src, err); dst.version = src.version; - QNN_TENSOR_SET_NAME( - dst, ::strndup(QNN_TENSOR_GET_NAME(src),std::string(QNN_TENSOR_GET_NAME(src)).size())); + QNN_TENSOR_SET_NAME(dst, ::strndup(QNN_TENSOR_GET_NAME(src), std::string(QNN_TENSOR_GET_NAME(src)).size())); if (nullptr == QNN_TENSOR_GET_NAME(dst)) { return 1; } @@ -197,7 +182,7 @@ static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { QNN_TENSOR_SET_MEM_TYPE(dst, QNN_TENSOR_GET_MEM_TYPE(src)); if (QNN_TENSOR_GET_MEM_TYPE(src) == QNN_TENSORMEMTYPE_RAW) { - Qnn_ClientBuffer_t client_buf = {nullptr, 0}; + Qnn_ClientBuffer_t client_buf = { nullptr, 0 }; QNN_TENSOR_SET_CLIENT_BUF(dst, client_buf); } else if (QNN_TENSOR_GET_MEM_TYPE(src) == QNN_TENSORMEMTYPE_MEMHANDLE) { QNN_TENSOR_SET_MEM_HANDLE(dst, nullptr); @@ -205,33 +190,29 @@ static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { return 1; } - Qnn_QuantizeParams_t src_qparam = QNN_TENSOR_GET_QUANT_PARAMS(src); - Qnn_QuantizationEncoding_t encoding = src_qparam.quantizationEncoding; + Qnn_QuantizeParams_t src_qparam = QNN_TENSOR_GET_QUANT_PARAMS(src); + Qnn_QuantizationEncoding_t encoding = src_qparam.quantizationEncoding; if (encoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) { - Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; - Qnn_AxisScaleOffset_t & axis_scale_offset = src_qparam_cpy.axisScaleOffsetEncoding; - Qnn_ScaleOffset_t ** scaleOffset = & axis_scale_offset.scaleOffset; - size_t scaleOffsetSize = axis_scale_offset.numScaleOffsets * sizeof(Qnn_ScaleOffset_t); - *scaleOffset = (Qnn_ScaleOffset_t *) malloc(scaleOffsetSize); - memscpy(*scaleOffset, scaleOffsetSize, - src_qparam.axisScaleOffsetEncoding.scaleOffset, - scaleOffsetSize); + Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; + Qnn_AxisScaleOffset_t &axis_scale_offset = src_qparam_cpy.axisScaleOffsetEncoding; + Qnn_ScaleOffset_t **scaleOffset = &axis_scale_offset.scaleOffset; + size_t scaleOffsetSize = axis_scale_offset.numScaleOffsets * sizeof(Qnn_ScaleOffset_t); + *scaleOffset = (Qnn_ScaleOffset_t *)malloc(scaleOffsetSize); + memscpy(*scaleOffset, scaleOffsetSize, src_qparam.axisScaleOffsetEncoding.scaleOffset, scaleOffsetSize); QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam_cpy); } else if (encoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) { - Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; - Qnn_BwAxisScaleOffset_t & bwaxis_scale_offset = src_qparam_cpy.bwAxisScaleOffsetEncoding; - size_t scaleSize = bwaxis_scale_offset.numElements * sizeof(float); - float ** scales = &bwaxis_scale_offset.scales; - int32_t ** offsets = &bwaxis_scale_offset.offsets; - *scales = (float *) malloc(scaleSize); - memscpy(*scales, scaleSize, src_qparam.bwAxisScaleOffsetEncoding.scales, - scaleSize); + Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; + Qnn_BwAxisScaleOffset_t &bwaxis_scale_offset = src_qparam_cpy.bwAxisScaleOffsetEncoding; + size_t scaleSize = bwaxis_scale_offset.numElements * sizeof(float); + float **scales = &bwaxis_scale_offset.scales; + int32_t **offsets = &bwaxis_scale_offset.offsets; + *scales = (float *)malloc(scaleSize); + memscpy(*scales, scaleSize, src_qparam.bwAxisScaleOffsetEncoding.scales, scaleSize); if 
(bwaxis_scale_offset.offsets != nullptr) { size_t offsetSize = bwaxis_scale_offset.numElements * sizeof(int32_t); - *offsets = (int32_t *) malloc(offsetSize); - memscpy(*offsets, offsetSize, - src_qparam.bwAxisScaleOffsetEncoding.offsets, offsetSize); + *offsets = (int32_t *)malloc(offsetSize); + memscpy(*offsets, offsetSize, src_qparam.bwAxisScaleOffsetEncoding.offsets, offsetSize); } QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam_cpy); } else { @@ -240,12 +221,13 @@ static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { uint32_t rank = QNN_TENSOR_GET_RANK(src); QNN_TENSOR_SET_RANK(dst, rank); - size_t dim_size = rank * sizeof(uint32_t); - uint32_t * dimensions = (uint32_t *) malloc(dim_size); + size_t dim_size = rank * sizeof(uint32_t); + uint32_t *dimensions = (uint32_t *)malloc(dim_size); if (dimensions == nullptr) { - QNN_LOG_WARN("deep_copy_qnn_tensors() allocation error while copying " - "tensor %s\n", - QNN_TENSOR_GET_NAME(src)); + QNN_LOG_WARN( + "deep_copy_qnn_tensors() allocation error while copying " + "tensor %s\n", + QNN_TENSOR_GET_NAME(src)); return 1; } memscpy(dimensions, dim_size, QNN_TENSOR_GET_DIMENSIONS(src), dim_size); @@ -254,11 +236,11 @@ static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { return err; } -static int free_qnn_tensor(Qnn_Tensor_t & tensor) { +static int free_qnn_tensor(Qnn_Tensor_t &tensor) { int err = 0; VALIDATE_TENSOR_VERSION(tensor, err); - free((void *) QNN_TENSOR_GET_NAME(tensor)); + free((void *)QNN_TENSOR_GET_NAME(tensor)); free(QNN_TENSOR_GET_DIMENSIONS(tensor)); return err; @@ -269,15 +251,14 @@ static int free_qnn_tensor(Qnn_Tensor_t & tensor) { // implementation of QNN backend for GGML // // ================================================================================================= -static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx, - const struct ggml_tensor * tensor, +static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context *ctx, const struct ggml_tensor *tensor, bool b_dump_tensor_info) { if (ggml_is_empty(tensor) || !qnn::ggml_qnn_op_array()[tensor->op]) { return false; } - const struct ggml_tensor * src0 = tensor->src[0]; - const struct ggml_tensor * src1 = tensor->src[1]; + const struct ggml_tensor *src0 = tensor->src[0]; + const struct ggml_tensor *src1 = tensor->src[1]; if (nullptr == src0 || nullptr == src1) { return false; } @@ -304,7 +285,7 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx, return false; } - //TODO: support other quantized data type + // TODO: support other quantized data type if (ggml_is_quantized(src0->type)) { if (src0->type != GGML_TYPE_Q8_0 && src0->type != GGML_TYPE_Q4_0) { return false; @@ -313,15 +294,15 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx, if (tensor->op == GGML_OP_MUL_MAT) { if (ne00 <= 32 || ne01 <= 32 || ne10 <= 32 || ne11 <= 32) { - //comment it for make UT of mul_mat with QNN RPC happy - //return false; + // comment it for make UT of mul_mat with QNN RPC happy + // return false; } } return true; } -bool ggml_qnn_compute_forward(ggml_backend_qnn_context * ctx, struct ggml_tensor * tensor) { +bool ggml_qnn_compute_forward(ggml_backend_qnn_context *ctx, struct ggml_tensor *tensor) { auto func = qnn::ggml_qnn_op_array()[tensor->op]; if (!func) { QNN_LOG_WARN("unsupported op %d", tensor->op); @@ -332,7 +313,7 @@ bool ggml_qnn_compute_forward(ggml_backend_qnn_context * ctx, struct ggml_tensor return true; } -static const char * ggml_backend_qnn_buffer_get_name(ggml_backend_buffer_t buffer) 
{ +static const char *ggml_backend_qnn_buffer_get_name(ggml_backend_buffer_t buffer) { GGML_UNUSED(buffer); return "QNN"; } @@ -342,31 +323,28 @@ GGML_CALL static bool ggml_backend_buffer_is_qnn(ggml_backend_buffer_t buffer) { } GGML_CALL static void ggml_backend_qnn_buffer_free_buffer(ggml_backend_buffer_t buffer) { - ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context; + ggml_backend_qnn_buffer_context *ctx = (ggml_backend_qnn_buffer_context *)buffer->context; delete ctx; } -GGML_CALL static void * ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t buffer) { - ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context; +GGML_CALL static void *ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t buffer) { + ggml_backend_qnn_buffer_context *ctx = (ggml_backend_qnn_buffer_context *)buffer->context; return ctx->buffer; } -GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, - ggml_tensor * tensor) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context; +GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + ggml_backend_qnn_buffer_context *ctx = (ggml_backend_qnn_buffer_context *)buffer->context; - static int idx = 0; - char tensor_name[GGML_MAX_NAME] = {0}; + static int idx = 0; + char tensor_name[GGML_MAX_NAME] = { 0 }; snprintf(tensor_name, GGML_MAX_NAME, "tensor_%04d", idx++); - uint32_t dimensions[] = {(uint32_t) tensor->ne[0], (uint32_t) tensor->ne[1], - (uint32_t) tensor->ne[2], - (uint32_t) tensor->ne[3]}; - Qnn_DataType_t qnn_data_type = - qnn::datatype_from_ggml_datatype(tensor->type); + uint32_t dimensions[] = { (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], + (uint32_t)tensor->ne[3] }; + Qnn_DataType_t qnn_data_type = qnn::datatype_from_ggml_datatype(tensor->type); Qnn_TensorType_t qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; if (tensor->flags & GGML_TENSOR_FLAG_INPUT) { @@ -381,25 +359,22 @@ GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t qnn_mem_type = QNN_TENSORMEMTYPE_MEMHANDLE; } - qnn_tensor = { - .version = QNN_TENSOR_VERSION_1, - {.v1 = {.id = 0, - .name = tensor_name, - .type = qnn_tensor_type, - .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, - .dataType = qnn_data_type, - .quantizeParams = - {QNN_DEFINITION_UNDEFINED, - QNN_QUANTIZATION_ENCODING_UNDEFINED, - {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, - .offset = 0}}}, - .rank = qnn::get_ggml_tensor_rank(tensor), - .dimensions = dimensions, - .memType = qnn_mem_type, - {.clientBuf = {.data = nullptr, .dataSize = 0}}}}}; - - Qnn_Tensor_t * p_qnn_tensor = - (Qnn_Tensor_t *)calloc(1, sizeof(Qnn_Tensor_t)); + qnn_tensor = { .version = QNN_TENSOR_VERSION_1, + { .v1 = { + .id = 0, + .name = tensor_name, + .type = qnn_tensor_type, + .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, + .dataType = qnn_data_type, + .quantizeParams = { QNN_DEFINITION_UNDEFINED, + QNN_QUANTIZATION_ENCODING_UNDEFINED, + { .scaleOffsetEncoding = { .scale = 0.0000000000000000f, .offset = 0 } } }, + .rank = qnn::get_ggml_tensor_rank(tensor), + .dimensions = dimensions, + .memType = qnn_mem_type, + { .clientBuf = { .data = nullptr, .dataSize = 0 } } } } }; + + Qnn_Tensor_t *p_qnn_tensor = (Qnn_Tensor_t *)calloc(1, sizeof(Qnn_Tensor_t)); if (nullptr == p_qnn_tensor) { QNN_LOG_WARN("calloc 
failed"); return; @@ -414,24 +389,21 @@ GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t ctx->qnn_tensors.push_back(p_qnn_tensor); } -GGML_CALL static void ggml_backend_qnn_buffer_set_tensor(ggml_backend_buffer_t buffer, - ggml_tensor * tensor, const void * data, - size_t offset, size_t size) { +GGML_CALL static void ggml_backend_qnn_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor, + const void *data, size_t offset, size_t size) { GGML_UNUSED(buffer); - memcpy((char *) tensor->data + offset, data, size); + memcpy((char *)tensor->data + offset, data, size); } -GGML_CALL static void ggml_backend_qnn_buffer_get_tensor(ggml_backend_buffer_t buffer, - const ggml_tensor * tensor, void * data, - size_t offset, size_t size) { +GGML_CALL static void ggml_backend_qnn_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor *tensor, + void *data, size_t offset, size_t size) { GGML_UNUSED(buffer); - memcpy(data, (const char *) tensor->data + offset, size); + memcpy(data, (const char *)tensor->data + offset, size); } -GGML_CALL static bool ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t buffer, - const struct ggml_tensor * src, - struct ggml_tensor * dst) { +GGML_CALL static bool ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor *src, + struct ggml_tensor *dst) { GGML_UNUSED(buffer); if (ggml_backend_buffer_is_host(src->buffer)) { memcpy(dst->data, src->data, ggml_nbytes(src)); @@ -442,7 +414,7 @@ GGML_CALL static bool ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t b } GGML_CALL static void ggml_backend_qnn_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { - ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context; + ggml_backend_qnn_buffer_context *ctx = (ggml_backend_qnn_buffer_context *)buffer->context; memset(ctx->buffer, value, ctx->buffer_size); } @@ -459,13 +431,11 @@ static ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = { /* .reset = */ nullptr, }; -GGML_CALL static const char * ggml_backend_qnn_buffer_type_name(ggml_backend_buffer_type_t buft) { - return "QNN"; -} +GGML_CALL static const char *ggml_backend_qnn_buffer_type_name(ggml_backend_buffer_type_t buft) { return "QNN"; } -static void * ggml_qnn_host_malloc(size_t n) { - void * data = nullptr; - int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n); +static void *ggml_qnn_host_malloc(size_t n) { + void *data = nullptr; + int result = posix_memalign((void **)&data, sysconf(_SC_PAGESIZE), n); if (result != 0) { QNN_LOG_WARN("%s: error: posix_memalign failed\n", __func__); return nullptr; @@ -474,10 +444,10 @@ static void * ggml_qnn_host_malloc(size_t n) { return data; } -GGML_CALL static ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer( - ggml_backend_buffer_type_t buft, size_t size) { - ggml_backend_qnn_buffer_type_context * buft_ctx = (ggml_backend_qnn_buffer_type_context *)buft->context; - ggml_backend_qnn_buffer_context * ctx = new ggml_backend_qnn_buffer_context(buft_ctx->device); +GGML_CALL static ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, + size_t size) { + ggml_backend_qnn_buffer_type_context *buft_ctx = (ggml_backend_qnn_buffer_type_context *)buft->context; + ggml_backend_qnn_buffer_context *ctx = new ggml_backend_qnn_buffer_context(buft_ctx->device); size_t size_page = sysconf(_SC_PAGESIZE); @@ -487,7 +457,7 @@ GGML_CALL static ggml_backend_buffer_t 
ggml_backend_qnn_buffer_type_alloc_buffer } // TODO:use pre-allocated buffer in internal memory pool - ctx->buffer = ggml_qnn_host_malloc(size_aligned); + ctx->buffer = ggml_qnn_host_malloc(size_aligned); ctx->buffer_size = size_aligned; ctx->backend_ctx = &g_qnn_mgr[buft_ctx->device]; @@ -497,11 +467,10 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer return nullptr; } - return ggml_backend_buffer_init(buft, ggml_backend_qnn_buffer_interface,ctx, size); + return ggml_backend_buffer_init(buft, ggml_backend_qnn_buffer_interface, ctx, size); } -GGML_CALL static size_t ggml_backend_qnn_buffer_type_get_alignment( - ggml_backend_buffer_type_t buft) { +GGML_CALL static size_t ggml_backend_qnn_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { GGML_UNUSED(buft); return 32; } @@ -518,18 +487,16 @@ GGML_CALL static bool ggml_backend_qnn_buffer_is_host(ggml_backend_buffer_type_t return true; } -GGML_CALL static const char * ggml_backend_qnn_name(ggml_backend_t backend) { - return "QNN"; -} +GGML_CALL static const char *ggml_backend_qnn_name(ggml_backend_t backend) { return "QNN"; } GGML_CALL static void ggml_backend_qnn_free(ggml_backend_t backend) { QNN_LOG_INFO("enter %s", __func__); - ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; + ggml_backend_qnn_context *ctx = (ggml_backend_qnn_context *)backend->context; QNN_LOG_INFO("idx %d, name:%s", ctx->device, g_qnn_mgr[ctx->device].name); auto *instance = g_qnn_mgr[ctx->device].instance; if (instance != nullptr) { - for (const auto &graph_item: ctx->qnn_graph_map) { + for (const auto &graph_item : ctx->qnn_graph_map) { QNN_LOG_INFO("graph type:%s", graph_item.first.c_str()); } @@ -548,21 +515,20 @@ GGML_CALL static void ggml_backend_qnn_free(ggml_backend_t backend) { } GGML_CALL static ggml_backend_buffer_type_t ggml_backend_qnn_get_default_buffer_type(ggml_backend_t backend) { - ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; + ggml_backend_qnn_context *ctx = (ggml_backend_qnn_context *)backend->context; return ggml_backend_qnn_buffer_type(ctx->device); } -GGML_CALL static ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { - enum ggml_status result = GGML_STATUS_SUCCESS; - ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; +GGML_CALL static ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, ggml_cgraph *cgraph) { + enum ggml_status result = GGML_STATUS_SUCCESS; + ggml_backend_qnn_context *ctx = (ggml_backend_qnn_context *)backend->context; GGML_UNUSED(ctx); for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_tensor * node = cgraph->nodes[i]; - if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || - node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || - node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { + ggml_tensor *node = cgraph->nodes[i]; + if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || + node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { continue; } bool ok = ggml_qnn_compute_forward(ctx, node); @@ -574,15 +540,14 @@ GGML_CALL static ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backe return result; } -GGML_CALL static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, - const ggml_tensor * op) { - ggml_backend_qnn_context *ctx = (ggml_backend_qnn_context *) backend->context; +GGML_CALL static bool 
ggml_backend_qnn_supports_op(ggml_backend_t backend, const ggml_tensor *op) { + ggml_backend_qnn_context *ctx = (ggml_backend_qnn_context *)backend->context; return (ggml_qnn_can_handle_op(ctx, op, false)); } -GGML_CALL static bool ggml_backend_qnn_offload_op(ggml_backend_t backend,const ggml_tensor * tensor) { - ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; +GGML_CALL static bool ggml_backend_qnn_offload_op(ggml_backend_t backend, const ggml_tensor *tensor) { + ggml_backend_qnn_context *ctx = (ggml_backend_qnn_context *)backend->context; return ggml_qnn_can_handle_op(ctx, tensor, false); } @@ -611,21 +576,19 @@ static ggml_backend_i ggml_backend_qnn_interface = { }; static ggml_guid_t ggml_backend_qnn_guid() { - static ggml_guid guid = { - 0x1a, 0x2b, 0x3c, 0x4d, 0x5e, 0x6f, 0x70, 0x81, - 0x92, 0xa3, 0xb4, 0xc5, 0xd6, 0xe7, 0xf8, 0x09 - }; + static ggml_guid guid = { 0x1a, 0x2b, 0x3c, 0x4d, 0x5e, 0x6f, 0x70, 0x81, + 0x92, 0xa3, 0xb4, 0xc5, 0xd6, 0xe7, 0xf8, 0x09 }; return &guid; } -static ggml_backend_t ggml_backend_qnn_reg_init(const char * params, void * user_data) { +static ggml_backend_t ggml_backend_qnn_reg_init(const char *params, void *user_data) { if (nullptr == params) { // QNN library path // can be hardcoded to "/data/local/tmp/" for Android command line application // or specified in JNI layer for Android APK params = "/data/local/tmp/"; } - ggml_backend_t qnn_backend = ggml_backend_qnn_init((int) (intptr_t) user_data, params); + ggml_backend_t qnn_backend = ggml_backend_qnn_init((int)(intptr_t)user_data, params); return qnn_backend; } @@ -637,19 +600,15 @@ bool ggml_backend_is_qnn(ggml_backend_t backend) { void ggml_backend_qnn_set_n_threads(ggml_backend_t backend, int n_threads) { GGML_ASSERT(ggml_backend_is_qnn(backend)); - auto * ctx = (ggml_backend_qnn_context *) backend->context; + auto *ctx = (ggml_backend_qnn_context *)backend->context; ctx->threads = n_threads; } -const char * ggml_backend_qnn_get_name(ggml_backend_t backend) { - return backend->iface.get_name(backend); -} +const char *ggml_backend_qnn_get_name(ggml_backend_t backend) { return backend->iface.get_name(backend); } -int ggml_backend_qnn_get_device_count() { - return GGML_QNN_MAX_DEVICES; -} +int ggml_backend_qnn_get_device_count() { return GGML_QNN_MAX_DEVICES; } -void ggml_backend_qnn_get_device_description(size_t dev_num, char * description, size_t description_size) { +void ggml_backend_qnn_get_device_description(size_t dev_num, char *description, size_t description_size) { if (nullptr == description || 0 == description_size) { QNN_LOG_WARN("invalid param"); return; @@ -665,9 +624,10 @@ void ggml_backend_qnn_get_device_description(size_t dev_num, char * description, ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device) { if (device >= GGML_QNN_MAX_DEVICES) { - QNN_LOG_DEBUG("ggml_backend_qnn_buffer_type error: device_index:%d is " - "out of range [0, %d]\n", - device, GGML_QNN_MAX_DEVICES - 1); + QNN_LOG_DEBUG( + "ggml_backend_qnn_buffer_type error: device_index:%d is " + "out of range [0, %d]\n", + device, GGML_QNN_MAX_DEVICES - 1); return nullptr; } @@ -676,17 +636,15 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device) { static bool ggml_backend_qnn_buffer_type_initialized = false; if (!ggml_backend_qnn_buffer_type_initialized) { for (size_t i = 0; i < GGML_QNN_MAX_DEVICES; i++) { - auto & context = ggml_backend_qnn_buffer_type_contexts[i]; + auto &context = ggml_backend_qnn_buffer_type_contexts[i]; context = { i, 
std::string(QNN_BACKEND_NAME) + std::to_string(i) }; ggml_backend_qnn_buffer_types[i] = { - /* .iface = */ { - /* .get_name = */ ggml_backend_qnn_buffer_type_name, - /* .alloc_buffer = */ ggml_backend_qnn_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_qnn_buffer_type_get_alignment, - /* .get_max_size = */ ggml_backend_qnn_buffer_type_get_max_size, - /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes - /* .is_host = */ ggml_backend_qnn_buffer_is_host - }, + /* .iface = */ { /* .get_name = */ ggml_backend_qnn_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_qnn_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_qnn_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_qnn_buffer_type_get_max_size, + /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes + /* .is_host = */ ggml_backend_qnn_buffer_is_host }, /* .context = */ &context, }; } @@ -702,7 +660,7 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device) { * @param qnn_lib_path qnn library path, such as "/data/local/tmp/" on Android or specified in JNI layer * @return */ -ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { +ggml_backend_t ggml_backend_qnn_init(size_t device, const char *qnn_lib_path) { int result = 0; if (nullptr == qnn_lib_path) { @@ -729,9 +687,8 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { QNN_LOG_ERROR("QNN NPU backend setenv failure"); } if (0 == setenv("ADSP_LIBRARY_PATH", - (path + - ";/vendor/dsp/cdsp;/vendor/lib/rfsa/adsp;/system/lib/" - "rfsa/adsp;/vendor/dsp/dsp;/vendor/dsp/images;/dsp") + (path + ";/vendor/dsp/cdsp;/vendor/lib/rfsa/adsp;/system/lib/" + "rfsa/adsp;/vendor/dsp/dsp;/vendor/dsp/images;/dsp") .c_str(), 1)) { QNN_LOG_INFO("QNN NPU backend setenv successfully"); @@ -740,20 +697,16 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { } } else { if (0 == setenv("LD_LIBRARY_PATH", path.c_str(), 1)) { - QNN_LOG_INFO("%s backend setenv successfully\n", - qnn::get_backend_name(device)); + QNN_LOG_INFO("%s backend setenv successfully\n", qnn::get_backend_name(device)); } else { - QNN_LOG_ERROR("%s backend setenv failure\n", - qnn::get_backend_name(device)); + QNN_LOG_ERROR("%s backend setenv failure\n", qnn::get_backend_name(device)); } } auto *instance = new qnn::qnn_instance(qnn_lib_path, g_qnn_mgr[device].lib, ""); result = instance->qnn_init(nullptr); if (0 != result) { - QNN_LOG_WARN( - "init qnn subsystem failed with qnn backend %s, pls check why\n", - qnn::get_backend_name(device)); + QNN_LOG_WARN("init qnn subsystem failed with qnn backend %s, pls check why\n", qnn::get_backend_name(device)); delete instance; return nullptr; } @@ -766,15 +719,14 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { std::string device_name = qnn::get_backend_name(device); QNN_LOG_INFO("qnn device name %s", device_name.c_str()); - g_qnn_mgr[device].instance = instance; - g_qnn_mgr[device].raw_interface = instance->get_qnn_raw_interface(); + g_qnn_mgr[device].instance = instance; + g_qnn_mgr[device].raw_interface = instance->get_qnn_raw_interface(); g_qnn_mgr[device].raw_system_interface = instance->get_qnn_raw_system_interface(); - g_qnn_mgr[device].socinfo = instance->get_soc_info(); + g_qnn_mgr[device].socinfo = instance->get_soc_info(); - ggml_backend_t qnn_backend = - new ggml_backend{/* .guid = */ ggml_backend_qnn_guid(), - /* .iface = */ ggml_backend_qnn_interface, - /* .context = */ &g_qnn_mgr[device]}; + 
ggml_backend_t qnn_backend = new ggml_backend{ /* .guid = */ ggml_backend_qnn_guid(), + /* .iface = */ ggml_backend_qnn_interface, + /* .context = */ &g_qnn_mgr[device] }; g_qnn_mgr[device].backend = qnn_backend; return qnn_backend; @@ -786,9 +738,8 @@ GGML_CALL int ggml_backend_qnn_reg_devices() { for (size_t idx = 0; idx < GGML_QNN_MAX_DEVICES; idx++) { char name[GGML_MAX_NAME]; ggml_backend_qnn_get_device_description(idx, name, GGML_MAX_NAME); - ggml_backend_register(name, ggml_backend_qnn_reg_init, - ggml_backend_qnn_buffer_type(idx), - (void *) (intptr_t) idx); + ggml_backend_register(name, ggml_backend_qnn_reg_init, ggml_backend_qnn_buffer_type(idx), + (void *)(intptr_t)idx); } return GGML_QNN_MAX_DEVICES; diff --git a/ggml/src/ggml-qnn/utils.hpp b/ggml/src/ggml-qnn/utils.hpp index 673fb90e63de9..2d830f6786b7d 100644 --- a/ggml/src/ggml-qnn/utils.hpp +++ b/ggml/src/ggml-qnn/utils.hpp @@ -30,7 +30,7 @@ Fn load_qnn_functionpointers(void *handle, const char *function_name) { return reinterpret_cast(dlsym(handle, function_name)); } -inline int validate_tensor_version(Qnn_Tensor_t tensor) { +inline int validate_tensor_version(const Qnn_Tensor_t &tensor) { if (tensor.version != QNN_TENSOR_VERSION_1) { QNN_LOG_WARN("validate_tensor_version() tensor %s, got unsupported version %d\n", tensor.v1.name, tensor.version); From 0f2e68713cd9f0d8a8de6412ade139b4fdea82b4 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 5 Jul 2024 18:38:20 +0800 Subject: [PATCH 042/166] move tensor related function to utils --- ggml/src/ggml-qnn.cpp | 131 ++++-------------------------------- ggml/src/ggml-qnn/utils.cpp | 115 +++++++++++++++++++++++++++++++ ggml/src/ggml-qnn/utils.hpp | 17 ++--- 3 files changed, 134 insertions(+), 129 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index a590dd5f56cfb..d6feea0437511 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -35,13 +35,6 @@ #include "ggml-qnn/tensor.hpp" #include "ggml-qnn/utils.hpp" -// ================================================================================================= -// -// forward declaration -// -// ================================================================================================= -static int free_qnn_tensor(Qnn_Tensor_t &tensor); - // ================================================================================================= // // self-defined macro / data structure @@ -128,7 +121,7 @@ struct ggml_backend_qnn_buffer_context { } for (auto *qnn_tensor : qnn_tensors) { - free_qnn_tensor(*qnn_tensor); + qnn::device_tensor_free(*qnn_tensor); free(qnn_tensor); } @@ -156,95 +149,6 @@ struct ggml_backend_qnn_buffer_type_context { // QNN backend internal helper functions // // ================================================================================================= -static size_t memscpy(void *dst, size_t dst_size, const void *src, size_t copy_size) { - if (!dst || !src || !dst_size || !copy_size) return 0; - - size_t min_size = dst_size < copy_size ? 
dst_size : copy_size; - - memcpy(dst, src, min_size); - - return min_size; -} - -static int deep_copy_qnn_tensors(Qnn_Tensor_t &src, Qnn_Tensor_t &dst) { - int err = 0; - VALIDATE_TENSOR_VERSION(src, err); - - dst.version = src.version; - QNN_TENSOR_SET_NAME(dst, ::strndup(QNN_TENSOR_GET_NAME(src), std::string(QNN_TENSOR_GET_NAME(src)).size())); - if (nullptr == QNN_TENSOR_GET_NAME(dst)) { - return 1; - } - QNN_TENSOR_SET_ID(dst, QNN_TENSOR_GET_ID(src)); - QNN_TENSOR_SET_TYPE(dst, QNN_TENSOR_GET_TYPE(src)); - QNN_TENSOR_SET_DATA_FORMAT(dst, QNN_TENSOR_GET_DATA_FORMAT(src)); - QNN_TENSOR_SET_DATA_TYPE(dst, QNN_TENSOR_GET_DATA_TYPE(src)); - QNN_TENSOR_SET_MEM_TYPE(dst, QNN_TENSOR_GET_MEM_TYPE(src)); - - if (QNN_TENSOR_GET_MEM_TYPE(src) == QNN_TENSORMEMTYPE_RAW) { - Qnn_ClientBuffer_t client_buf = { nullptr, 0 }; - QNN_TENSOR_SET_CLIENT_BUF(dst, client_buf); - } else if (QNN_TENSOR_GET_MEM_TYPE(src) == QNN_TENSORMEMTYPE_MEMHANDLE) { - QNN_TENSOR_SET_MEM_HANDLE(dst, nullptr); - } else { - return 1; - } - - Qnn_QuantizeParams_t src_qparam = QNN_TENSOR_GET_QUANT_PARAMS(src); - Qnn_QuantizationEncoding_t encoding = src_qparam.quantizationEncoding; - if (encoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) { - Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; - Qnn_AxisScaleOffset_t &axis_scale_offset = src_qparam_cpy.axisScaleOffsetEncoding; - Qnn_ScaleOffset_t **scaleOffset = &axis_scale_offset.scaleOffset; - size_t scaleOffsetSize = axis_scale_offset.numScaleOffsets * sizeof(Qnn_ScaleOffset_t); - *scaleOffset = (Qnn_ScaleOffset_t *)malloc(scaleOffsetSize); - memscpy(*scaleOffset, scaleOffsetSize, src_qparam.axisScaleOffsetEncoding.scaleOffset, scaleOffsetSize); - QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam_cpy); - } else if (encoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) { - Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; - Qnn_BwAxisScaleOffset_t &bwaxis_scale_offset = src_qparam_cpy.bwAxisScaleOffsetEncoding; - size_t scaleSize = bwaxis_scale_offset.numElements * sizeof(float); - float **scales = &bwaxis_scale_offset.scales; - int32_t **offsets = &bwaxis_scale_offset.offsets; - *scales = (float *)malloc(scaleSize); - memscpy(*scales, scaleSize, src_qparam.bwAxisScaleOffsetEncoding.scales, scaleSize); - - if (bwaxis_scale_offset.offsets != nullptr) { - size_t offsetSize = bwaxis_scale_offset.numElements * sizeof(int32_t); - *offsets = (int32_t *)malloc(offsetSize); - memscpy(*offsets, offsetSize, src_qparam.bwAxisScaleOffsetEncoding.offsets, offsetSize); - } - QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam_cpy); - } else { - QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam); - } - - uint32_t rank = QNN_TENSOR_GET_RANK(src); - QNN_TENSOR_SET_RANK(dst, rank); - size_t dim_size = rank * sizeof(uint32_t); - uint32_t *dimensions = (uint32_t *)malloc(dim_size); - if (dimensions == nullptr) { - QNN_LOG_WARN( - "deep_copy_qnn_tensors() allocation error while copying " - "tensor %s\n", - QNN_TENSOR_GET_NAME(src)); - return 1; - } - memscpy(dimensions, dim_size, QNN_TENSOR_GET_DIMENSIONS(src), dim_size); - QNN_TENSOR_SET_DIMENSIONS(dst, dimensions); - - return err; -} - -static int free_qnn_tensor(Qnn_Tensor_t &tensor) { - int err = 0; - VALIDATE_TENSOR_VERSION(tensor, err); - - free((void *)QNN_TENSOR_GET_NAME(tensor)); - free(QNN_TENSOR_GET_DIMENSIONS(tensor)); - - return err; -} // ================================================================================================= // @@ -335,9 +239,14 @@ GGML_CALL static void *ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t bu } 
GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; ggml_backend_qnn_buffer_context *ctx = (ggml_backend_qnn_buffer_context *)buffer->context; + Qnn_Tensor_t *p_qnn_tensor = (Qnn_Tensor_t *)calloc(1, sizeof(Qnn_Tensor_t)); + if (!p_qnn_tensor) { + QNN_LOG_WARN("calloc failed"); + return; + } + static int idx = 0; char tensor_name[GGML_MAX_NAME] = { 0 }; snprintf(tensor_name, GGML_MAX_NAME, "tensor_%04d", idx++); @@ -352,39 +261,23 @@ GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t } else if (tensor->flags & GGML_TENSOR_FLAG_OUTPUT) { qnn_tensor_type = QNN_TENSOR_TYPE_APP_READ; } - Qnn_Tensor_t qnn_tensor = QNN_TENSOR_INIT; Qnn_TensorMemType_t qnn_mem_type = QNN_TENSORMEMTYPE_RAW; if (ctx->device == QNN_BACKEND_GPU) { qnn_mem_type = QNN_TENSORMEMTYPE_MEMHANDLE; } - qnn_tensor = { .version = QNN_TENSOR_VERSION_1, - { .v1 = { - .id = 0, - .name = tensor_name, - .type = qnn_tensor_type, - .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, - .dataType = qnn_data_type, - .quantizeParams = { QNN_DEFINITION_UNDEFINED, - QNN_QUANTIZATION_ENCODING_UNDEFINED, - { .scaleOffsetEncoding = { .scale = 0.0000000000000000f, .offset = 0 } } }, - .rank = qnn::get_ggml_tensor_rank(tensor), - .dimensions = dimensions, - .memType = qnn_mem_type, - { .clientBuf = { .data = nullptr, .dataSize = 0 } } } } }; + Qnn_Tensor_t qnn_tensor; + qnn::device_tensor_init(qnn_tensor, qnn::get_ggml_tensor_rank(tensor), qnn_mem_type, tensor_name, qnn_tensor_type, + qnn_data_type, dimensions); - Qnn_Tensor_t *p_qnn_tensor = (Qnn_Tensor_t *)calloc(1, sizeof(Qnn_Tensor_t)); - if (nullptr == p_qnn_tensor) { - QNN_LOG_WARN("calloc failed"); - return; - } - error = deep_copy_qnn_tensors(qnn_tensor, *p_qnn_tensor); + Qnn_ErrorHandle_t error = qnn::device_tensor_deep_copy(qnn_tensor, *p_qnn_tensor); if (error != QNN_SUCCESS) { free(p_qnn_tensor); QNN_LOG_WARN("init tensor failed"); return; } + tensor->extra = p_qnn_tensor; ctx->qnn_tensors.push_back(p_qnn_tensor); } diff --git a/ggml/src/ggml-qnn/utils.cpp b/ggml/src/ggml-qnn/utils.cpp index 2368b466c8187..89982449a8eba 100644 --- a/ggml/src/ggml-qnn/utils.cpp +++ b/ggml/src/ggml-qnn/utils.cpp @@ -5,6 +5,20 @@ #include "qnn-types.hpp" +namespace { + +size_t memscpy(void *dst, size_t dst_size, const void *src, size_t copy_size) { + if (!dst || !src || !dst_size || !copy_size) return 0; + + size_t min_size = dst_size < copy_size ? 
dst_size : copy_size; + + memcpy(dst, src, min_size); + + return min_size; +} + +} // namespace + namespace qnn { // TODO: mapping more ggml data type to QNN data type @@ -121,4 +135,105 @@ const char *opname_from_ggmlop(enum ggml_op ggmlop) { return nullptr; } +void device_tensor_init(Qnn_Tensor_t &tensor, uint32_t rank, Qnn_TensorMemType_t mem_type, const char *tensor_name, + Qnn_TensorType_t qnn_tensor_type, Qnn_DataType_t qnn_data_type, uint32_t *dimensions) { + tensor = QNN_TENSOR_INIT; + tensor = { .version = QNN_TENSOR_VERSION_1, + { .v1 = { .id = 0, + .name = tensor_name, + .type = qnn_tensor_type, + .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, + .dataType = qnn_data_type, + .quantizeParams = { QNN_DEFINITION_UNDEFINED, + QNN_QUANTIZATION_ENCODING_UNDEFINED, + { .scaleOffsetEncoding = { .scale = 0.0000000000000000f, .offset = 0 } } }, + .rank = rank, + .dimensions = dimensions, + .memType = mem_type, + { .clientBuf = {} } } } }; +} + +Qnn_ErrorHandle_t device_tensor_deep_copy(const Qnn_Tensor_t &src, Qnn_Tensor_t &dst) { + Qnn_ErrorHandle_t err = validate_tensor_version(src); + if (err != QNN_SUCCESS) { + QNN_LOG_WARN("validate_tensor_version expected QNN_SUCCESS\n"); + return err; + } + + dst.version = src.version; + QNN_TENSOR_SET_NAME(dst, ::strndup(QNN_TENSOR_GET_NAME(src), std::string(QNN_TENSOR_GET_NAME(src)).size())); + if (nullptr == QNN_TENSOR_GET_NAME(dst)) { + return (Qnn_ErrorHandle_t)1; + } + QNN_TENSOR_SET_ID(dst, QNN_TENSOR_GET_ID(src)); + QNN_TENSOR_SET_TYPE(dst, QNN_TENSOR_GET_TYPE(src)); + QNN_TENSOR_SET_DATA_FORMAT(dst, QNN_TENSOR_GET_DATA_FORMAT(src)); + QNN_TENSOR_SET_DATA_TYPE(dst, QNN_TENSOR_GET_DATA_TYPE(src)); + QNN_TENSOR_SET_MEM_TYPE(dst, QNN_TENSOR_GET_MEM_TYPE(src)); + + if (QNN_TENSOR_GET_MEM_TYPE(src) == QNN_TENSORMEMTYPE_RAW) { + Qnn_ClientBuffer_t client_buf = { nullptr, 0 }; + QNN_TENSOR_SET_CLIENT_BUF(dst, client_buf); + } else if (QNN_TENSOR_GET_MEM_TYPE(src) == QNN_TENSORMEMTYPE_MEMHANDLE) { + QNN_TENSOR_SET_MEM_HANDLE(dst, nullptr); + } else { + return (Qnn_ErrorHandle_t)1; + } + + Qnn_QuantizeParams_t src_qparam = QNN_TENSOR_GET_QUANT_PARAMS(src); + Qnn_QuantizationEncoding_t encoding = src_qparam.quantizationEncoding; + if (encoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) { + Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; + Qnn_AxisScaleOffset_t &axis_scale_offset = src_qparam_cpy.axisScaleOffsetEncoding; + Qnn_ScaleOffset_t **scaleOffset = &axis_scale_offset.scaleOffset; + size_t scaleOffsetSize = axis_scale_offset.numScaleOffsets * sizeof(Qnn_ScaleOffset_t); + *scaleOffset = (Qnn_ScaleOffset_t *)malloc(scaleOffsetSize); + memscpy(*scaleOffset, scaleOffsetSize, src_qparam.axisScaleOffsetEncoding.scaleOffset, scaleOffsetSize); + QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam_cpy); + } else if (encoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) { + Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; + Qnn_BwAxisScaleOffset_t &bwaxis_scale_offset = src_qparam_cpy.bwAxisScaleOffsetEncoding; + size_t scaleSize = bwaxis_scale_offset.numElements * sizeof(float); + float **scales = &bwaxis_scale_offset.scales; + int32_t **offsets = &bwaxis_scale_offset.offsets; + *scales = (float *)malloc(scaleSize); + memscpy(*scales, scaleSize, src_qparam.bwAxisScaleOffsetEncoding.scales, scaleSize); + + if (bwaxis_scale_offset.offsets != nullptr) { + size_t offsetSize = bwaxis_scale_offset.numElements * sizeof(int32_t); + *offsets = (int32_t *)malloc(offsetSize); + memscpy(*offsets, offsetSize, src_qparam.bwAxisScaleOffsetEncoding.offsets, 
offsetSize); + } + QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam_cpy); + } else { + QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam); + } + + uint32_t rank = QNN_TENSOR_GET_RANK(src); + QNN_TENSOR_SET_RANK(dst, rank); + size_t dim_size = rank * sizeof(uint32_t); + uint32_t *dimensions = (uint32_t *)malloc(dim_size); + if (dimensions == nullptr) { + QNN_LOG_WARN( + "deep_copy_qnn_tensors() allocation error while copying " + "tensor %s\n", + QNN_TENSOR_GET_NAME(src)); + return (Qnn_ErrorHandle_t)1; + } + memscpy(dimensions, dim_size, QNN_TENSOR_GET_DIMENSIONS(src), dim_size); + QNN_TENSOR_SET_DIMENSIONS(dst, dimensions); + + return err; +} + +void device_tensor_free(Qnn_Tensor_t &tensor) { + if (validate_tensor_version(tensor) != QNN_SUCCESS) { + QNN_LOG_WARN("validate_tensor_version expected QNN_SUCCESS\n"); + return; + } + + free((void *)QNN_TENSOR_GET_NAME(tensor)); + free(QNN_TENSOR_GET_DIMENSIONS(tensor)); +} + } // namespace qnn diff --git a/ggml/src/ggml-qnn/utils.hpp b/ggml/src/ggml-qnn/utils.hpp index 2d830f6786b7d..aa824379a8d9b 100644 --- a/ggml/src/ggml-qnn/utils.hpp +++ b/ggml/src/ggml-qnn/utils.hpp @@ -169,6 +169,13 @@ inline void set_qnn_tensor_memhandle(Qnn_Tensor_t &tensor, Qnn_MemHandle_t handl } } +void device_tensor_init(Qnn_Tensor_t &tensor, uint32_t rank, Qnn_TensorMemType_t mem_type, const char *tensor_name, + Qnn_TensorType_t qnn_tensor_type, Qnn_DataType_t qnn_data_type, uint32_t *dimensions); + +Qnn_ErrorHandle_t device_tensor_deep_copy(const Qnn_Tensor_t &src, Qnn_Tensor_t &dst); + +void device_tensor_free(Qnn_Tensor_t &tensor); + #if ENABLE_QNNBACKEND_PERF class qnn_perf { public: @@ -206,15 +213,6 @@ class qnn_perf { } // namespace qnn -#define VALIDATE(value, status) \ - do { \ - status = value; \ - if (status != QNN_SUCCESS) { \ - QNN_LOG_WARN("%s expected QNN_SUCCESS\n", #value); \ - return status; \ - } \ - } while (0) - #define QNN_TENSOR_GET_ID(tensor) qnn::get_qnn_tensorid(tensor) #define QNN_TENSOR_GET_NAME(tensor) qnn::get_qnn_tensorname(tensor) #define QNN_TENSOR_GET_TYPE(tensor) qnn::get_qnn_tensortype(tensor) @@ -236,4 +234,3 @@ class qnn_perf { #define QNN_TENSOR_SET_MEM_TYPE(tensor, value) qnn::set_qnn_tensor_memtype(tensor, value) #define QNN_TENSOR_SET_CLIENT_BUF(tensor, value) qnn::set_qnn_tensor_clientbuf(tensor, value) #define QNN_TENSOR_SET_MEM_HANDLE(tensor, value) qnn::set_qnn_tensor_memhandle(tensor, value) -#define VALIDATE_TENSOR_VERSION(tensor, err) VALIDATE(qnn::validate_tensor_version(tensor), err) From 4b0f6b0cd6f24b16a2fc8022345161811c01bcc2 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 5 Jul 2024 19:34:56 +0800 Subject: [PATCH 043/166] add helper function to get Qnn_TensorType_t from ggml_tensor --- ggml/src/ggml-qnn.cpp | 16 ++++------------ ggml/src/ggml-qnn/tensor.hpp | 4 ++-- ggml/src/ggml-qnn/utils.cpp | 16 ++++++++++++++-- ggml/src/ggml-qnn/utils.hpp | 4 +++- 4 files changed, 23 insertions(+), 17 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index d6feea0437511..632ce8ee5c19e 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -250,23 +250,15 @@ GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t static int idx = 0; char tensor_name[GGML_MAX_NAME] = { 0 }; snprintf(tensor_name, GGML_MAX_NAME, "tensor_%04d", idx++); - - uint32_t dimensions[] = { (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], - (uint32_t)tensor->ne[3] }; - Qnn_DataType_t qnn_data_type = qnn::datatype_from_ggml_datatype(tensor->type); - Qnn_TensorType_t 
qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; - - if (tensor->flags & GGML_TENSOR_FLAG_INPUT) { - qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; - } else if (tensor->flags & GGML_TENSOR_FLAG_OUTPUT) { - qnn_tensor_type = QNN_TENSOR_TYPE_APP_READ; - } - + Qnn_DataType_t qnn_data_type = qnn::device_datatype_from_ggml_datatype(tensor->type); + Qnn_TensorType_t qnn_tensor_type = qnn::device_tensortype_from_ggml_tensor(tensor); Qnn_TensorMemType_t qnn_mem_type = QNN_TENSORMEMTYPE_RAW; if (ctx->device == QNN_BACKEND_GPU) { qnn_mem_type = QNN_TENSORMEMTYPE_MEMHANDLE; } + uint32_t dimensions[] = { (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], + (uint32_t)tensor->ne[3] }; Qnn_Tensor_t qnn_tensor; qnn::device_tensor_init(qnn_tensor, qnn::get_ggml_tensor_rank(tensor), qnn_mem_type, tensor_name, qnn_tensor_type, qnn_data_type, dimensions); diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index 0ec75c03f0e53..8a9196616fcae 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -17,7 +17,7 @@ class ggml_qnn_tensor_readwrite { ggml_backend_qnn_context *ctx) : _tensor(tensor), _qnn_tensor(reinterpret_cast(tensor->extra)), _context(ctx) { _old_dimensions = QNN_VER_PTR(*_qnn_tensor)->dimensions; - const auto qnn_data_type = datatype_from_ggml_datatype(tensor->type); + const auto qnn_data_type = device_datatype_from_ggml_datatype(tensor->type); const bool is_npu = ctx->device == QNN_BACKEND_NPU; QNN_VER_PTR(*_qnn_tensor)->type = _tensorType; if (is_npu) { @@ -67,7 +67,7 @@ class ggml_qnn_tensor_readwrite { ggml_backend_qnn_context *ctx) : _tensor(tensor), _qnn_tensor(qnn_tensor), _context(ctx) { _old_dimensions = QNN_VER_PTR(*_qnn_tensor)->dimensions; - const auto qnn_data_type = qnn::datatype_from_ggml_datatype(tensor->type); + const auto qnn_data_type = device_datatype_from_ggml_datatype(tensor->type); const bool is_npu = ctx->device == QNN_BACKEND_NPU; _dimensions[0] = (uint32_t)tensor->ne[0]; diff --git a/ggml/src/ggml-qnn/utils.cpp b/ggml/src/ggml-qnn/utils.cpp index 89982449a8eba..7c25314f731f0 100644 --- a/ggml/src/ggml-qnn/utils.cpp +++ b/ggml/src/ggml-qnn/utils.cpp @@ -23,8 +23,8 @@ namespace qnn { // TODO: mapping more ggml data type to QNN data type // ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684 -Qnn_DataType_t datatype_from_ggml_datatype(enum ggml_type ggmltype) { - switch (ggmltype) { +Qnn_DataType_t device_datatype_from_ggml_datatype(ggml_type ggml_type) { + switch (ggml_type) { case GGML_TYPE_F16: return QNN_DATATYPE_FLOAT_16; case GGML_TYPE_F32: @@ -41,6 +41,18 @@ Qnn_DataType_t datatype_from_ggml_datatype(enum ggml_type ggmltype) { return QNN_DATATYPE_UNDEFINED; } +Qnn_TensorType_t device_tensortype_from_ggml_tensor(ggml_tensor *ggml_tensor) { + Qnn_TensorType_t qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; + + if (ggml_tensor->flags & GGML_TENSOR_FLAG_INPUT) { + qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; + } else if (ggml_tensor->flags & GGML_TENSOR_FLAG_OUTPUT) { + qnn_tensor_type = QNN_TENSOR_TYPE_APP_READ; + } + + return qnn_tensor_type; +} + uint32_t get_ggml_tensor_rank(const ggml_tensor *tensor) { uint32_t rank = 0; for (int i = 0; i < GGML_MAX_DIMS; i++) { diff --git a/ggml/src/ggml-qnn/utils.hpp b/ggml/src/ggml-qnn/utils.hpp index aa824379a8d9b..87d908f1e15fb 100644 --- a/ggml/src/ggml-qnn/utils.hpp +++ b/ggml/src/ggml-qnn/utils.hpp @@ -15,7 +15,6 @@ namespace qnn { -Qnn_DataType_t datatype_from_ggml_datatype(enum ggml_type ggmltype); uint32_t get_ggml_tensor_rank(const 
ggml_tensor *tensor); const char *get_backend_name(int n_backend_type); const char *get_chipset_desc(uint32_t chipset_id); @@ -169,6 +168,9 @@ inline void set_qnn_tensor_memhandle(Qnn_Tensor_t &tensor, Qnn_MemHandle_t handl } } +Qnn_DataType_t device_datatype_from_ggml_datatype(ggml_type ggml_type); +Qnn_TensorType_t device_tensortype_from_ggml_tensor(ggml_tensor *ggml_tensor); + void device_tensor_init(Qnn_Tensor_t &tensor, uint32_t rank, Qnn_TensorMemType_t mem_type, const char *tensor_name, Qnn_TensorType_t qnn_tensor_type, Qnn_DataType_t qnn_data_type, uint32_t *dimensions); From 263ffa962ea3ebfceaa1f9f52c24c67930fbface Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 5 Jul 2024 23:07:27 +0800 Subject: [PATCH 044/166] small opt of the qnn graph config init --- ggml/src/ggml-qnn/graph.hpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index 1aad145c32896..651fc1c5301ec 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -54,9 +54,9 @@ class ggml_qnn_graph { graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; graph_vtcm_config.customConfig = &vtcm_config; - const QnnGraph_Config_t *p_graphconfig[] = { &graph_hvx_config, &graph_dlbc_config, &graph_vtcm_config, + const QnnGraph_Config_t *graph_configs[] = { &graph_hvx_config, &graph_dlbc_config, &graph_vtcm_config, &graph_opt_config, nullptr }; - error = qnn_interface.graphCreate(qnn_context, graph_name.c_str(), p_graphconfig, &graph_handle); + error = qnn_interface.graphCreate(qnn_context, graph_name.c_str(), graph_configs, &graph_handle); } else { error = qnn_interface.graphCreate(qnn_context, graph_name.c_str(), nullptr, &graph_handle); } @@ -67,10 +67,9 @@ class ggml_qnn_graph { "error = %d\n", graph_name.c_str(), error); return; - } else { - QNN_LOG_INFO("create qnn graph handle with graph name %s ok\n", graph_name.c_str()); } + QNN_LOG_INFO("create qnn graph handle with graph name %s ok\n", graph_name.c_str()); _graph_handle = graph_handle; } From 874216b9c887ebbde5eba476d51b3c4db1e1f3a5 Mon Sep 17 00:00:00 2001 From: Hongrui Chen Date: Sun, 7 Jul 2024 22:32:43 +0800 Subject: [PATCH 045/166] remove unused members --- ggml/src/ggml-qnn.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index 632ce8ee5c19e..d4d9e2cd5d202 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -116,16 +116,11 @@ struct ggml_backend_qnn_buffer_context { free(buffer); } - for (auto *sub_buffer : sub_buffers) { - free(sub_buffer); - } - for (auto *qnn_tensor : qnn_tensors) { qnn::device_tensor_free(*qnn_tensor); free(qnn_tensor); } - sub_buffers.clear(); qnn_tensors.clear(); } void *buffer = nullptr; @@ -133,7 +128,6 @@ struct ggml_backend_qnn_buffer_context { struct ggml_backend_qnn_context *backend_ctx = nullptr; size_t buffer_size = 0; - std::vector sub_buffers; std::vector qnn_tensors; size_t device; std::string name; From 5f2e3918f6ac0d597ec5004180814fd14edfed97 Mon Sep 17 00:00:00 2001 From: Hongrui Chen Date: Sun, 7 Jul 2024 23:51:12 +0800 Subject: [PATCH 046/166] refactoring ggml_qnn_tensor --- ggml/src/ggml-qnn.cpp | 67 +++------ ggml/src/ggml-qnn/backend-ops.cpp | 157 ++++++++++++--------- ggml/src/ggml-qnn/backend.hpp | 6 +- ggml/src/ggml-qnn/graph.hpp | 18 +++ ggml/src/ggml-qnn/qnn-types.hpp | 2 - ggml/src/ggml-qnn/qnn.hpp | 15 +- ggml/src/ggml-qnn/tensor.hpp | 224 ++++++++++++++++++++---------- ggml/src/ggml-qnn/utils.hpp | 8 ++ 8 files changed, 293 
insertions(+), 204 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index d4d9e2cd5d202..3584c41120ae6 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -1,5 +1,6 @@ #include "ggml-qnn.h" +#include #include #include #include @@ -81,7 +82,6 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { .threads = 1, .name = "qnn-cpu", .lib = "libQnnCpu.so", - .instance = nullptr, .backend = nullptr, .raw_interface = {}, .raw_system_interface = {}, @@ -91,7 +91,6 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { .threads = 1, .name = "qnn-gpu", .lib = "libQnnGpu.so", - .instance = nullptr, .backend = nullptr, .raw_interface = {}, .raw_system_interface = {}, @@ -101,7 +100,6 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { .threads = 1, .name = "qnn-npu", .lib = "libQnnHtp.so", - .instance = nullptr, .backend = nullptr, .raw_interface = {}, .raw_system_interface = {}, @@ -112,23 +110,16 @@ struct ggml_backend_qnn_buffer_context { ggml_backend_qnn_buffer_context(size_t device) : device(device), name(QNN_BACKEND_NAME + std::to_string(device)) {} ~ggml_backend_qnn_buffer_context() { + tensors.clear(); if (buffer) { free(buffer); } - - for (auto *qnn_tensor : qnn_tensors) { - qnn::device_tensor_free(*qnn_tensor); - free(qnn_tensor); - } - - qnn_tensors.clear(); } - void *buffer = nullptr; + void *buffer = nullptr; struct ggml_backend_qnn_context *backend_ctx = nullptr; - + std::list> tensors; size_t buffer_size = 0; - std::vector qnn_tensors; size_t device; std::string name; }; @@ -235,37 +226,14 @@ GGML_CALL static void *ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t bu GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor) { ggml_backend_qnn_buffer_context *ctx = (ggml_backend_qnn_buffer_context *)buffer->context; - Qnn_Tensor_t *p_qnn_tensor = (Qnn_Tensor_t *)calloc(1, sizeof(Qnn_Tensor_t)); - if (!p_qnn_tensor) { - QNN_LOG_WARN("calloc failed"); - return; - } - - static int idx = 0; - char tensor_name[GGML_MAX_NAME] = { 0 }; - snprintf(tensor_name, GGML_MAX_NAME, "tensor_%04d", idx++); - Qnn_DataType_t qnn_data_type = qnn::device_datatype_from_ggml_datatype(tensor->type); - Qnn_TensorType_t qnn_tensor_type = qnn::device_tensortype_from_ggml_tensor(tensor); - Qnn_TensorMemType_t qnn_mem_type = QNN_TENSORMEMTYPE_RAW; - if (ctx->device == QNN_BACKEND_GPU) { - qnn_mem_type = QNN_TENSORMEMTYPE_MEMHANDLE; - } - - uint32_t dimensions[] = { (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], - (uint32_t)tensor->ne[3] }; - Qnn_Tensor_t qnn_tensor; - qnn::device_tensor_init(qnn_tensor, qnn::get_ggml_tensor_rank(tensor), qnn_mem_type, tensor_name, qnn_tensor_type, - qnn_data_type, dimensions); - - Qnn_ErrorHandle_t error = qnn::device_tensor_deep_copy(qnn_tensor, *p_qnn_tensor); - if (error != QNN_SUCCESS) { - free(p_qnn_tensor); - QNN_LOG_WARN("init tensor failed"); + auto instance = ctx->backend_ctx->instance; + auto qnn_tensor = std::make_unique(tensor, (QNNBackend)(ctx->device), instance); + if (!qnn_tensor->is_valid()) { + QNN_LOG_WARN("Create ggml_qnn_tensor failed"); return; } - tensor->extra = p_qnn_tensor; - ctx->qnn_tensors.push_back(p_qnn_tensor); + ctx->tensors.push_back(std::move(qnn_tensor)); } GGML_CALL static void ggml_backend_qnn_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor, @@ -373,17 +341,16 @@ GGML_CALL static void ggml_backend_qnn_free(ggml_backend_t backend) { 
ggml_backend_qnn_context *ctx = (ggml_backend_qnn_context *)backend->context; QNN_LOG_INFO("idx %d, name:%s", ctx->device, g_qnn_mgr[ctx->device].name); - auto *instance = g_qnn_mgr[ctx->device].instance; - if (instance != nullptr) { - for (const auto &graph_item : ctx->qnn_graph_map) { + auto instance = g_qnn_mgr[ctx->device].instance; + if (instance) { + for (const auto &graph_item : ctx->qnn_binary_graph_cache) { QNN_LOG_INFO("graph type:%s", graph_item.first.c_str()); } - ctx->qnn_graph_map.clear(); + ctx->qnn_binary_graph_cache.clear(); instance->qnn_finalize(); - delete instance; - g_qnn_mgr[ctx->device].instance = nullptr; + g_qnn_mgr[ctx->device].instance.reset(); } if (g_qnn_mgr[ctx->device].backend != nullptr) { @@ -582,17 +549,15 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char *qnn_lib_path) { } } - auto *instance = new qnn::qnn_instance(qnn_lib_path, g_qnn_mgr[device].lib, ""); + auto instance = std::make_shared(qnn_lib_path, g_qnn_mgr[device].lib, ""); result = instance->qnn_init(nullptr); - if (0 != result) { + if (result != 0) { QNN_LOG_WARN("init qnn subsystem failed with qnn backend %s, pls check why\n", qnn::get_backend_name(device)); - delete instance; return nullptr; } auto qnn_interface = instance->get_qnn_interface(); if (!qnn_interface.is_loaded()) { QNN_LOG_WARN("qnn subsystem failure\n"); - delete instance; return nullptr; } diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 79e280fcbe088..1914e64dcff27 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -23,10 +23,10 @@ bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, return false; } - auto *instance = ctx->instance; - auto *tensor0 = src0->extra; - auto *tensor1 = src1->extra; - auto *tensor2 = dst->extra; + auto instance = ctx->instance; + auto *tensor0 = qnn::ggml_qnn_tensor::from_ggml_tensor(src0); + auto *tensor1 = qnn::ggml_qnn_tensor::from_ggml_tensor(src1); + auto *tensor2 = qnn::ggml_qnn_tensor::from_ggml_tensor(dst); if (!instance || !tensor0 || !tensor1 || !tensor2) { QNN_LOG_WARN("invalid tensors\n"); return false; @@ -35,6 +35,80 @@ bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, return true; } +template +bool qnn_bind_tensors_to_graph(qnn::ggml_qnn_graph<_InputSize, _OutputSize> *graph, const std::string &op_name, + const std::array &inputs, + const std::array &outputs) { + std::array qnn_input_tensors; + for (size_t i = 0; i < inputs.size(); ++i) { + auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(inputs[i]); + if (!tensor || !tensor->bind_to_graph(*graph)) { + return false; + } + + qnn_input_tensors[i] = tensor->get_qnn_tensor(); + } + + std::array qnn_output_tensors; + for (size_t i = 0; i < outputs.size(); ++i) { + auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(outputs[i]); + if (!tensor || !tensor->bind_to_graph(*graph)) { + return false; + } + + qnn_output_tensors[i] = tensor->get_qnn_tensor(); + } + + if (!graph->add_nodes(op_name, qnn_input_tensors, qnn_output_tensors)) { + return false; + } + + return true; +} + +template +bool write_to_qnn_tensors(const std::array &inputs) { + for (auto &input : inputs) { + auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(input); + if (!tensor || !tensor->write_to_qnn_tensor()) { + return false; + } + } + + return true; +} + +template +bool read_from_qnn_tensors(const std::array &outputs) { + for (auto &output : outputs) { + auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(output); 
+ if (!tensor || !tensor->read_from_qnn_tensor()) { + return false; + } + } + + return true; +} + +template +bool execute_graph(qnn::ggml_qnn_graph<_InputSize, _OutputSize> *graph, + const std::array &inputs, + const std::array &outputs) { + if (!write_to_qnn_tensors<_InputSize>(inputs)) { + return false; + } + + if (!graph->execute()) { + return false; + } + + if (!read_from_qnn_tensors<_OutputSize>(outputs)) { + return false; + } + + return true; +} + } // namespace #ifndef NDEBUG @@ -61,13 +135,10 @@ static void ggml_qnn_add(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, bool succeed = false; std::string graph_key(ggml_op_name(GGML_OP_ADD)); - auto it = ctx->qnn_graph_map.find(graph_key); - if (it != ctx->qnn_graph_map.end()) { - const auto &graph_item = it->second; - qnn::ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx); - qnn::ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx); - qnn::ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx); - std::get<0>(graph_item)->execute(); + auto it = ctx->qnn_binary_graph_cache.find(graph_key); + qnn::ggml_qnn_graph_binary *graph_ptr = nullptr; + if (it != ctx->qnn_binary_graph_cache.end()) { + graph_ptr = it->second.get(); } else { graph_name = graph_name + "_" + std::to_string(ctx->threads) + "_" + src0->name + "_" + src1->name; auto graph = std::make_unique(graph_name, (QNNBackend)(ctx->device), @@ -78,34 +149,15 @@ static void ggml_qnn_add(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, goto failure; } - qnn::ggml_qnn_tensor_input tensor_input0(src0, graph->get_graph_handler(), ctx); - if (!tensor_input0.is_valid()) { - goto failure; - } - qnn::ggml_qnn_tensor_input tensor_input1(src1, graph->get_graph_handler(), ctx); - if (!tensor_input1.is_valid()) { - goto failure; - } - qnn::ggml_qnn_tensor_output tensor_output(dst, graph->get_graph_handler(), ctx); - if (!tensor_output.is_valid()) { + if (!qnn_bind_tensors_to_graph<2, 1>(graph.get(), QNN_OP_ELEMENT_WISE_ADD, { src0, src1 }, { dst })) { goto failure; } - if (!graph->add_nodes(QNN_OP_ELEMENT_WISE_ADD, - { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() }, - { *tensor_output.get_qnn_tensor() })) { - goto failure; - } - - if (!graph->execute()) { - goto failure; - } - - ctx->qnn_graph_map[graph_key] = std::make_tuple(std::move(graph), tensor_input0.get_qnn_tensor(), - tensor_input1.get_qnn_tensor(), tensor_output.get_qnn_tensor()); + graph_ptr = graph.get(); + ctx->qnn_binary_graph_cache[graph_key] = std::move(graph); } - succeed = true; + succeed = execute_graph<2, 1>(graph_ptr, { src0, src1 }, { dst }); failure: if (!succeed) { @@ -143,13 +195,10 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context *ctx, const ggml_tensor *s bool succeed = false; std::string graph_key(ggml_op_name(GGML_OP_MUL_MAT)); - auto it = ctx->qnn_graph_map.find(graph_key); - if (it != ctx->qnn_graph_map.end()) { - const auto &graph_item = it->second; - qnn::ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx); - qnn::ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx); - qnn::ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx); - std::get<0>(graph_item)->execute(); + auto it = ctx->qnn_binary_graph_cache.find(graph_key); + qnn::ggml_qnn_graph_binary *graph_ptr = nullptr; + if (it != ctx->qnn_binary_graph_cache.end()) { + graph_ptr = it->second.get(); } else { graph_name = graph_name + "_" + std::to_string(ctx->threads) + "_" + src0->name + "_" + src1->name; 
auto graph = std::make_unique(graph_name, (QNNBackend)(ctx->device), @@ -160,33 +209,15 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context *ctx, const ggml_tensor *s goto failure; } - qnn::ggml_qnn_tensor_input tensor_input0(src0, graph->get_graph_handler(), ctx); - if (!tensor_input0.is_valid()) { - goto failure; - } - qnn::ggml_qnn_tensor_input tensor_input1(src1, graph->get_graph_handler(), ctx); - if (!tensor_input1.is_valid()) { - goto failure; - } - qnn::ggml_qnn_tensor_output tensor_output(dst, graph->get_graph_handler(), ctx); - if (!tensor_output.is_valid()) { - goto failure; - } - - if (!graph->add_nodes(QNN_OP_MAT_MUL, { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() }, - { *tensor_output.get_qnn_tensor() })) { - goto failure; - } - - if (!graph->execute()) { + if (!qnn_bind_tensors_to_graph<2, 1>(graph.get(), QNN_OP_MAT_MUL, { src0, src1 }, { dst })) { goto failure; } - ctx->qnn_graph_map[graph_key] = std::make_tuple(std::move(graph), tensor_input0.get_qnn_tensor(), - tensor_input1.get_qnn_tensor(), tensor_output.get_qnn_tensor()); + graph_ptr = graph.get(); + ctx->qnn_binary_graph_cache[graph_key] = std::move(graph); } - succeed = true; + succeed = execute_graph<2, 1>(graph_ptr, { src0, src1 }, { dst }); failure: if (!succeed) { diff --git a/ggml/src/ggml-qnn/backend.hpp b/ggml/src/ggml-qnn/backend.hpp index d60b334c0b2b5..48b243577ca1f 100644 --- a/ggml/src/ggml-qnn/backend.hpp +++ b/ggml/src/ggml-qnn/backend.hpp @@ -16,12 +16,10 @@ struct ggml_backend_qnn_context { int threads; char name[GGML_MAX_NAME]; char lib[GGML_MAX_NAME]; - qnn::qnn_instance *instance; + std::shared_ptr instance; ggml_backend *backend; QNN_INTERFACE_VER_TYPE raw_interface; QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; qnn::qcom_socinfo socinfo; - std::unordered_map, Qnn_Tensor_t *, - Qnn_Tensor_t *, Qnn_Tensor_t *>> - qnn_graph_map; + std::unordered_map> qnn_binary_graph_cache; }; diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index 651fc1c5301ec..6f9628cbd7739 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -73,6 +73,22 @@ class ggml_qnn_graph { _graph_handle = graph_handle; } + bool create_graph_tensor(Qnn_Tensor_t &tensor) { + if (!is_valid()) { + QNN_LOG_ERROR("Invalid graph\n"); + return false; + } + + auto err = _qnn_interface.tensorCreateGraphTensor(_graph_handle, &tensor); + if (err != QNN_SUCCESS) { + QNN_LOG_INFO("error = %d\n", err); + QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, QNN_TENSOR_GET_NAME(*_qnn_tensor)); + return false; + } + + return true; + } + bool add_nodes(const std::string &op_name, const input_tensor_array_t &tensor_inputs, const output_tensor_array_t &tensor_outputs) { if (!is_valid()) { @@ -124,6 +140,8 @@ class ggml_qnn_graph { Qnn_GraphHandle_t get_graph_handler() const { return _graph_handle; } + const std::string &get_name() const { return _graph_name; } + private: const std::string _graph_name; const QNNBackend _device; diff --git a/ggml/src/ggml-qnn/qnn-types.hpp b/ggml/src/ggml-qnn/qnn-types.hpp index 7c245651032c0..58ca8648b0b03 100644 --- a/ggml/src/ggml-qnn/qnn-types.hpp +++ b/ggml/src/ggml-qnn/qnn-types.hpp @@ -49,7 +49,5 @@ using pfn_qnninterface_getproviders = decltype(QnnInterface_getProviders); using pfn_qnnsysteminterface_getproviders = decltype(QnnSystemInterface_getProviders); } // namespace qnn -#define QNN_VER_PTR(x) (&((x).v1)) // TODO: remove this macro after we have a separate header for QNN - #define RPCMEM_DEFAULT_FLAGS 1 #define RPCMEM_HEAP_ID_SYSTEM 25 diff 
--git a/ggml/src/ggml-qnn/qnn.hpp b/ggml/src/ggml-qnn/qnn.hpp index 26465c96a0793..400ce005bfe2b 100644 --- a/ggml/src/ggml-qnn/qnn.hpp +++ b/ggml/src/ggml-qnn/qnn.hpp @@ -637,20 +637,20 @@ class qnn_instance { return 3; } - if (is_rpcmem_registered((QNN_VER_PTR(*p_tensor)->memHandle))) { - QNN_LOG_WARN("tensor %s has been registered shared memory\n", (QNN_VER_PTR(*p_tensor)->name)); + if (is_rpcmem_registered(QNN_TENSOR_GET_MEM_HANDLE(*p_tensor))) { + QNN_LOG_WARN("tensor %s has been registered shared memory\n", QNN_TENSOR_GET_NAME(*p_tensor)); return 4; } int32_t mem_fd = rpcmem_to_fd(p_data); - if (-1 == mem_fd) { + if (mem_fd == -1) { QNN_LOG_WARN("failed to get file descriptor\n"); return 5; } QNN_LOG_INFO("mem_fd %d\n", mem_fd); - Qnn_MemDescriptor_t descriptor = { { QNN_VER_PTR(*p_tensor)->rank, QNN_VER_PTR(*p_tensor)->dimensions, + Qnn_MemDescriptor_t descriptor = { { QNN_TENSOR_GET_RANK(*p_tensor), QNN_TENSOR_GET_DIMENSIONS(*p_tensor), nullptr }, - QNN_VER_PTR(*p_tensor)->dataType, + QNN_TENSOR_GET_DATA_TYPE(*p_tensor), QNN_MEM_TYPE_ION, { { mem_fd } } }; Qnn_MemHandle_t handle = nullptr; @@ -662,9 +662,10 @@ class qnn_instance { strerror(error)); return 6; } else { - QNN_LOG_INFO("tensor %s successfully register shared memory\n", (QNN_VER_PTR(*p_tensor)->name)); + QNN_LOG_INFO("tensor %s successfully register shared memory\n", QNN_TENSOR_GET_NAME(*p_tensor)); } - QNN_VER_PTR(*p_tensor)->memHandle = handle; + + QNN_TENSOR_SET_MEM_HANDLE(*p_tensor, handle); _qnn_mem_set.insert((std::pair(p_data, handle))); return 0; diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index 8a9196616fcae..335aafe533d0d 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -1,127 +1,197 @@ #pragma once +#include +#include +#include + #include "ggml-qnn.h" #include "QnnTensor.h" #include "System/QnnSystemInterface.h" #include "backend.hpp" +#include "graph.hpp" #include "qnn.hpp" +#include "utils.hpp" namespace qnn { -template -class ggml_qnn_tensor_readwrite { +class ggml_qnn_tensor { public: - explicit ggml_qnn_tensor_readwrite(const ggml_tensor *tensor, Qnn_GraphHandle_t graph_handle, - ggml_backend_qnn_context *ctx) : - _tensor(tensor), _qnn_tensor(reinterpret_cast(tensor->extra)), _context(ctx) { - _old_dimensions = QNN_VER_PTR(*_qnn_tensor)->dimensions; - const auto qnn_data_type = device_datatype_from_ggml_datatype(tensor->type); - const bool is_npu = ctx->device == QNN_BACKEND_NPU; - QNN_VER_PTR(*_qnn_tensor)->type = _tensorType; - if (is_npu) { - QNN_VER_PTR(*_qnn_tensor)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; - QNN_VER_PTR(*_qnn_tensor)->clientBuf = { .data = nullptr, .dataSize = 0 }; + static ggml_qnn_tensor *from_ggml_tensor(const ggml_tensor *tensor) { + if (!tensor) { + return nullptr; } - auto err = ctx->raw_interface.tensorCreateGraphTensor(graph_handle, _qnn_tensor); - if (err != QNN_SUCCESS) { - QNN_LOG_INFO("error = %d\n", err); - QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, QNN_TENSOR_GET_NAME(*_qnn_tensor)); - _context = nullptr; - return; + return static_cast(tensor->extra); + } + + explicit ggml_qnn_tensor(ggml_tensor *tensor, QNNBackend device, std::shared_ptr qnn_instance) : + _tensor(tensor), _device(device), _qnn_instance(qnn_instance) { + _tensor_name = ggml_get_name(tensor); + if (_tensor_name.empty()) { + static std::atomic_uint32_t unnamed_tensor_count = 0; + char buffer[GGML_MAX_NAME] = {}; + snprintf(buffer, sizeof(buffer), "unnamed_%p", unnamed_tensor_count++); + _tensor_name = buffer; } + 
QNN_TENSOR_SET_NAME(_qnn_tensor, _tensor_name.c_str()); _dimensions[0] = (uint32_t)tensor->ne[0]; _dimensions[1] = (uint32_t)tensor->ne[1]; _dimensions[2] = (uint32_t)tensor->ne[2]; _dimensions[3] = (uint32_t)tensor->ne[3]; - QNN_VER_PTR(*_qnn_tensor)->dimensions = _dimensions; - QNN_VER_PTR(*_qnn_tensor)->rank = qnn::get_ggml_tensor_rank(tensor); - QNN_VER_PTR(*_qnn_tensor)->dataType = qnn_data_type; - + QNN_TENSOR_SET_DIMENSIONS(_qnn_tensor, _dimensions); + QNN_TENSOR_SET_TYPE(_qnn_tensor, device_tensortype_from_ggml_tensor(tensor)); + QNN_TENSOR_SET_DATA_FORMAT(_qnn_tensor, QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER); + QNN_TENSOR_SET_DATA_TYPE(_qnn_tensor, device_datatype_from_ggml_datatype(tensor->type)); + // TODO: set the quantizeParams base on the tensor type + QNN_TENSOR_SET_RANK(_qnn_tensor, qnn::get_ggml_tensor_rank(tensor)); + + const bool is_npu = device == QNN_BACKEND_NPU; if (is_npu) { - auto *instance = ctx->instance; - uint8_t *qnn_buffer = static_cast(instance->alloc_rpcmem(ggml_nbytes(tensor), alignof(void *))); - if (!qnn_buffer) { - QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); - QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, QNN_TENSOR_GET_NAME(*_qnn_tensor)); - _context = nullptr; - // No free for _qnn_tensor, because it's not registered. - return; + QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_MEMHANDLE); + QNN_TENSOR_SET_MEM_HANDLE(_qnn_tensor, nullptr); + } else { + QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); + Qnn_ClientBuffer_t client_buf = { tensor->data, get_ggml_tensor_data_size(tensor) }; + QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); + } + + tensor->extra = this; + } + + template + bool bind_to_graph(ggml_qnn_graph<_InputSize, _OutputSize> &graph) { + if (!is_valid()) { + QNN_LOG_WARN("tensor %s not valid", _tensor_name.c_str()); + return false; + } + + if (_graph_handle) { + if (_graph_handle != graph.get_graph_handler()) { + QNN_LOG_WARN("tensor %s has been bound to another graph", _tensor_name.c_str()); + return false; } else { - QNN_LOG_INFO("alloc rpcmem successfully\n"); + QNN_LOG_INFO("tensor %s already bound to same graph %s", _tensor_name.c_str(), + graph.get_name().c_str()); + return true; } + } - instance->register_rpcmem(qnn_buffer, _qnn_tensor); - if (_tensorType == QNN_TENSOR_TYPE_APP_WRITE || _tensorType == QNN_TENSOR_TYPE_APP_READWRITE) { - memcpy(qnn_buffer, tensor->data, ggml_nbytes(tensor)); - } - } else { - QNN_VER_PTR(*_qnn_tensor)->clientBuf = { tensor->data, get_ggml_tensor_data_size(tensor) }; + Qnn_Tensor_t tensor = _qnn_tensor; + if (!graph.create_graph_tensor(tensor)) { + QNN_LOG_WARN("create graph tensor failed, tensor %s", _tensor_name.c_str()); + return false; } + + if (!alloc_rpc_mem()) { + QNN_LOG_WARN("alloc rpc mem failed, tensor %s", _tensor_name.c_str()); + return false; + } + + QNN_TENSOR_SET_ID(_qnn_tensor, QNN_TENSOR_GET_ID(tensor)); + _graph_handle = graph.get_graph_handler(); + return true; } - explicit ggml_qnn_tensor_readwrite(const ggml_tensor *tensor, Qnn_Tensor_t *qnn_tensor, - ggml_backend_qnn_context *ctx) : - _tensor(tensor), _qnn_tensor(qnn_tensor), _context(ctx) { - _old_dimensions = QNN_VER_PTR(*_qnn_tensor)->dimensions; - const auto qnn_data_type = device_datatype_from_ggml_datatype(tensor->type); - const bool is_npu = ctx->device == QNN_BACKEND_NPU; + bool write_to_qnn_tensor() { + if (!is_valid()) { + QNN_LOG_WARN("tensor %s not valid", _tensor_name.c_str()); + return false; + } - _dimensions[0] = (uint32_t)tensor->ne[0]; - _dimensions[1] = 
(uint32_t)tensor->ne[1]; - _dimensions[2] = (uint32_t)tensor->ne[2]; - _dimensions[3] = (uint32_t)tensor->ne[3]; - QNN_VER_PTR(*_qnn_tensor)->dimensions = _dimensions; - QNN_VER_PTR(*_qnn_tensor)->rank = get_ggml_tensor_rank(tensor); - QNN_VER_PTR(*_qnn_tensor)->dataType = qnn_data_type; + auto tensor_type = QNN_TENSOR_GET_TYPE(_qnn_tensor); + if (tensor_type != QNN_TENSOR_TYPE_APP_WRITE && tensor_type != QNN_TENSOR_TYPE_APP_READWRITE) { + QNN_LOG_WARN("tensor %s not writable", _tensor_name.c_str()); + return false; + } - if (is_npu) { - uint8_t *qnn_buffer = - static_cast(ctx->instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*_qnn_tensor)->memHandle)); + if (should_use_mem_handle()) { + uint8_t *qnn_buffer = static_cast( + _qnn_instance->get_rpcmem_from_memhandle(QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor))); if (qnn_buffer) { - memcpy(qnn_buffer, tensor->data, ggml_nbytes(tensor)); + memcpy(qnn_buffer, _tensor->data, ggml_nbytes(_tensor)); } else { QNN_LOG_WARN("can't find rpcmem from qnn mem handle\n"); - QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, QNN_TENSOR_GET_NAME(*_qnn_tensor)); - _context = nullptr; - return; + return false; } - } else { - QNN_VER_PTR(*_qnn_tensor)->clientBuf = { tensor->data, get_ggml_tensor_data_size(tensor) }; } + + // For CPU and GPU, the data is already in the tensor. + return true; } - ~ggml_qnn_tensor_readwrite() { - if ((_tensorType == QNN_TENSOR_TYPE_APP_READWRITE || _tensorType == QNN_TENSOR_TYPE_APP_READ) && _context && - _context->device == QNN_BACKEND_NPU) { + bool read_from_qnn_tensor() { + if (!is_valid()) { + QNN_LOG_WARN("tensor %s not valid", _tensor_name.c_str()); + return false; + } + + auto tensor_type = QNN_TENSOR_GET_TYPE(_qnn_tensor); + if (tensor_type != QNN_TENSOR_TYPE_APP_READ && tensor_type != QNN_TENSOR_TYPE_APP_READWRITE) { + QNN_LOG_WARN("tensor %s not readable", _tensor_name.c_str()); + return false; + } + + if (should_use_mem_handle()) { uint8_t *qnn_buffer = static_cast( - _context->instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*_qnn_tensor)->memHandle)); - memcpy(_tensor->data, qnn_buffer, ggml_nbytes(_tensor)); + _qnn_instance->get_rpcmem_from_memhandle(QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor))); + if (qnn_buffer) { + memcpy(_tensor->data, qnn_buffer, ggml_nbytes(_tensor)); + } else { + QNN_LOG_WARN("can't find rpcmem from qnn mem handle\n"); + return false; + } } - QNN_VER_PTR(*_qnn_tensor)->dimensions = _old_dimensions; + // For CPU and GPU, the data is already in the tensor. 
+ return true; } - bool is_valid() const { return _context; } - Qnn_Tensor_t *get_qnn_tensor() const { return _qnn_tensor; } + bool is_valid() const { return _tensor; } + const Qnn_Tensor_t &get_qnn_tensor() const { return _qnn_tensor; } private: + bool alloc_rpc_mem() { + if (!should_use_mem_handle()) { + return true; + } + + uint8_t *qnn_buffer = + static_cast(_qnn_instance->alloc_rpcmem(ggml_nbytes(_tensor), alignof(void *))); + if (!qnn_buffer) { + QNN_LOG_WARN("alloc rpc mem failure, %s\n", strerror(errno)); + QNN_LOG_DEBUG("tensor name %s", _tensor_name.c_str()); + return false; + } else { + QNN_LOG_INFO("alloc rpcmem successfully\n"); + } + + auto error = _qnn_instance->register_rpcmem(qnn_buffer, &_qnn_tensor); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("register rpc mem failure, %d\n", (int)error); + QNN_LOG_DEBUG("tensor name %s", _tensor_name.c_str()); + return false; + } + + return true; + } + + bool should_use_mem_handle() const { return _device == QNN_BACKEND_NPU; } + const ggml_tensor *_tensor; - Qnn_Tensor_t *_qnn_tensor; - ggml_backend_qnn_context *_context; - uint32_t *_old_dimensions; + QNNBackend _device; + std::shared_ptr _qnn_instance; + Qnn_Tensor_t _qnn_tensor = QNN_TENSOR_INIT; uint32_t _dimensions[4] = {}; + std::string _tensor_name; + Qnn_GraphHandle_t _graph_handle = nullptr; - ggml_qnn_tensor_readwrite(const ggml_qnn_tensor_readwrite &) = delete; - void operator=(const ggml_qnn_tensor_readwrite &) = delete; - ggml_qnn_tensor_readwrite(ggml_qnn_tensor_readwrite &&) = delete; - void operator=(ggml_qnn_tensor_readwrite &&) = delete; + ggml_qnn_tensor(const ggml_qnn_tensor &) = delete; + void operator=(const ggml_qnn_tensor &) = delete; + ggml_qnn_tensor(ggml_qnn_tensor &&) = delete; + void operator=(ggml_qnn_tensor &&) = delete; }; -using ggml_qnn_tensor_output = ggml_qnn_tensor_readwrite; -using ggml_qnn_tensor_input = ggml_qnn_tensor_readwrite; - } // namespace qnn diff --git a/ggml/src/ggml-qnn/utils.hpp b/ggml/src/ggml-qnn/utils.hpp index 87d908f1e15fb..84cd8354e2d59 100644 --- a/ggml/src/ggml-qnn/utils.hpp +++ b/ggml/src/ggml-qnn/utils.hpp @@ -102,6 +102,13 @@ inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t &tensor) { return QNN_TENSORMEMTYPE_UNDEFINED; } +inline Qnn_MemHandle_t get_qnn_tensor_memhandle(const Qnn_Tensor_t &tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.memHandle; + } + return nullptr; +} + inline void set_qnn_tensor_id(Qnn_Tensor_t &tensor, uint32_t id) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.id = id; @@ -224,6 +231,7 @@ class qnn_perf { #define QNN_TENSOR_GET_RANK(tensor) qnn::get_qnn_tensor_rank(tensor) #define QNN_TENSOR_GET_DIMENSIONS(tensor) qnn::get_qnn_tensor_dimensions(tensor) #define QNN_TENSOR_GET_MEM_TYPE(tensor) qnn::get_qnn_tensor_memtype(tensor) +#define QNN_TENSOR_GET_MEM_HANDLE(tensor) qnn::get_qnn_tensor_memhandle(tensor) #define QNN_TENSOR_SET_ID(tensor, value) qnn::set_qnn_tensor_id(tensor, value) #define QNN_TENSOR_SET_NAME(tensor, value) qnn::set_qnn_tensor_name(tensor, value) From af869fd636fdfe0656dd94b4d9fc9d6f254207ea Mon Sep 17 00:00:00 2001 From: hongruichen Date: Tue, 9 Jul 2024 23:21:55 +0800 Subject: [PATCH 047/166] fix compiling error in debug build --- ggml/src/ggml-qnn/graph.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index 6f9628cbd7739..01c44fe374eef 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -82,7 +82,7 @@ class 
ggml_qnn_graph { auto err = _qnn_interface.tensorCreateGraphTensor(_graph_handle, &tensor); if (err != QNN_SUCCESS) { QNN_LOG_INFO("error = %d\n", err); - QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, QNN_TENSOR_GET_NAME(*_qnn_tensor)); + QNN_LOG_DEBUG("tensor%p name %s", &tensor, QNN_TENSOR_GET_NAME(tensor)); return false; } From a7be0693ba1645ec1cf32bc13117229dde668e86 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Tue, 9 Jul 2024 20:35:58 +0800 Subject: [PATCH 048/166] add log --- ggml/src/ggml-qnn/tensor.hpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index 335aafe533d0d..e023fb7fc0157 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -11,6 +11,7 @@ #include "System/QnnSystemInterface.h" #include "backend.hpp" #include "graph.hpp" +#include "logger.hpp" #include "qnn.hpp" #include "utils.hpp" @@ -59,6 +60,7 @@ class ggml_qnn_tensor { } tensor->extra = this; + QNN_LOG_DEBUG("create tensor %s with device %d", _tensor_name.c_str(), device); } template @@ -92,6 +94,8 @@ class ggml_qnn_tensor { QNN_TENSOR_SET_ID(_qnn_tensor, QNN_TENSOR_GET_ID(tensor)); _graph_handle = graph.get_graph_handler(); + + QNN_LOG_DEBUG("bind tensor %s to graph %s", _tensor_name.c_str(), graph.get_name().c_str()); return true; } @@ -164,10 +168,10 @@ class ggml_qnn_tensor { QNN_LOG_WARN("alloc rpc mem failure, %s\n", strerror(errno)); QNN_LOG_DEBUG("tensor name %s", _tensor_name.c_str()); return false; - } else { - QNN_LOG_INFO("alloc rpcmem successfully\n"); } + QNN_LOG_INFO("tensor %s: alloc rpcmem(%p) successfully\n", _tensor_name.c_str(), qnn_buffer); + auto error = _qnn_instance->register_rpcmem(qnn_buffer, &_qnn_tensor); if (error != QNN_SUCCESS) { QNN_LOG_WARN("register rpc mem failure, %d\n", (int)error); From 9add256efee043640952127c947efedc014fc79e Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 10 Jul 2024 00:31:13 +0800 Subject: [PATCH 049/166] use helper function instead --- ggml/src/ggml-qnn/tensor.hpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index e023fb7fc0157..e966e638bee1f 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -49,8 +49,7 @@ class ggml_qnn_tensor { // TODO: set the quantizeParams base on the tensor type QNN_TENSOR_SET_RANK(_qnn_tensor, qnn::get_ggml_tensor_rank(tensor)); - const bool is_npu = device == QNN_BACKEND_NPU; - if (is_npu) { + if (should_use_mem_handle()) { QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_MEMHANDLE); QNN_TENSOR_SET_MEM_HANDLE(_qnn_tensor, nullptr); } else { @@ -171,7 +170,7 @@ class ggml_qnn_tensor { } QNN_LOG_INFO("tensor %s: alloc rpcmem(%p) successfully\n", _tensor_name.c_str(), qnn_buffer); - + auto error = _qnn_instance->register_rpcmem(qnn_buffer, &_qnn_tensor); if (error != QNN_SUCCESS) { QNN_LOG_WARN("register rpc mem failure, %d\n", (int)error); From dc7d83e121e48b5df6af52986723ad8f4946846d Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 10 Jul 2024 00:33:23 +0800 Subject: [PATCH 050/166] add log --- ggml/src/ggml-qnn/logger.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-qnn/logger.cpp b/ggml/src/ggml-qnn/logger.cpp index 8b29979224866..5a1ad13ba40ce 100644 --- a/ggml/src/ggml-qnn/logger.cpp +++ b/ggml/src/ggml-qnn/logger.cpp @@ -67,7 +67,7 @@ void qnn::sdk_logcallback(const char *fmt, QnnLog_Level_t level, uint64_t timest memset(s_ggml_qnn_logbuf, 0, 
QNN_LOGBUF_LEN); vsnprintf(reinterpret_cast(s_ggml_qnn_logbuf), QNN_LOGBUF_LEN, fmt, argp); - QNN_LOG_INFO("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggml_qnn_logbuf); + QNN_LOG_INFO("%8.1fms [%-7s] %s", ms, log_level_desc, s_ggml_qnn_logbuf); } #endif } From e97d3a6c48941451b7281766371092f6987b285e Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 10 Jul 2024 11:56:01 +0800 Subject: [PATCH 051/166] fix tensor buffer allocation add log commit qnn buffer after changed add log register_rpc_mem 2 times update input tensors before graph finalize default to QNN_TENSORMEMTYPE_RAW set new tensors at execute move write input tensors to exec check if mem registered before actual do register rpc mem once allocated --- ggml/src/ggml-qnn/backend-ops.cpp | 49 ++++++++++---------- ggml/src/ggml-qnn/graph.hpp | 6 ++- ggml/src/ggml-qnn/qnn.hpp | 3 +- ggml/src/ggml-qnn/tensor.hpp | 76 +++++++++++++++++++------------ 4 files changed, 77 insertions(+), 57 deletions(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 1914e64dcff27..bafe5ca160e66 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -66,44 +66,43 @@ bool qnn_bind_tensors_to_graph(qnn::ggml_qnn_graph<_InputSize, _OutputSize> *gra return true; } -template -bool write_to_qnn_tensors(const std::array &inputs) { - for (auto &input : inputs) { - auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(input); +template +bool execute_graph(qnn::ggml_qnn_graph<_InputSize, _OutputSize> *graph, + const std::array &inputs, + const std::array &outputs) { + + std::array qnn_input_tensors; + for (size_t i = 0; i < inputs.size(); ++i) { + auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(inputs[i]); if (!tensor || !tensor->write_to_qnn_tensor()) { + QNN_LOG_WARN("write_to_qnn_tensor failed\n"); return false; } - } - return true; -} + qnn_input_tensors[i] = tensor->get_qnn_tensor(); + } -template -bool read_from_qnn_tensors(const std::array &outputs) { - for (auto &output : outputs) { - auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(output); - if (!tensor || !tensor->read_from_qnn_tensor()) { + std::array qnn_output_tensors; + for (size_t i = 0; i < outputs.size(); ++i) { + auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(outputs[i]); + if (!tensor) { return false; } - } - - return true; -} -template -bool execute_graph(qnn::ggml_qnn_graph<_InputSize, _OutputSize> *graph, - const std::array &inputs, - const std::array &outputs) { - if (!write_to_qnn_tensors<_InputSize>(inputs)) { - return false; + qnn_output_tensors[i] = tensor->get_qnn_tensor(); } - if (!graph->execute()) { + if (!graph->execute(qnn_input_tensors, qnn_output_tensors)) { + QNN_LOG_WARN("execute failed\n"); return false; } - if (!read_from_qnn_tensors<_OutputSize>(outputs)) { - return false; + for (auto &output : outputs) { + auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(output); + if (!tensor || !tensor->read_from_qnn_tensor()) { + QNN_LOG_WARN("read_from_qnn_tensors failed\n"); + return false; + } } return true; diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index 01c44fe374eef..cb04b1efda0fc 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -96,6 +96,7 @@ class ggml_qnn_graph { return false; } + QNN_LOG_DEBUG("graph name %s, add_nodes start", _graph_name.c_str()); _tensor_inputs = tensor_inputs; _tensor_outputs = tensor_outputs; @@ -116,10 +117,13 @@ class ggml_qnn_graph { return false; } + QNN_LOG_DEBUG("graph name %s, add_nodes succeed", 
_graph_name.c_str()); return true; } - bool execute() { + bool execute(const input_tensor_array_t &tensor_inputs, const output_tensor_array_t &tensor_outputs) { + _tensor_inputs = tensor_inputs; + _tensor_outputs = tensor_outputs; auto error = _qnn_interface.graphExecute(_graph_handle, _tensor_inputs.data(), _tensor_inputs.size(), _tensor_outputs.data(), _tensor_outputs.size(), nullptr, nullptr); if (_device == QNN_BACKEND_NPU) { diff --git a/ggml/src/ggml-qnn/qnn.hpp b/ggml/src/ggml-qnn/qnn.hpp index 400ce005bfe2b..9d60d2f6c551c 100644 --- a/ggml/src/ggml-qnn/qnn.hpp +++ b/ggml/src/ggml-qnn/qnn.hpp @@ -661,13 +661,12 @@ class qnn_instance { QNN_LOG_WARN("failed to register shared memory, error %d, %s\n", QNN_GET_ERROR_CODE(error), strerror(error)); return 6; - } else { - QNN_LOG_INFO("tensor %s successfully register shared memory\n", QNN_TENSOR_GET_NAME(*p_tensor)); } QNN_TENSOR_SET_MEM_HANDLE(*p_tensor, handle); _qnn_mem_set.insert((std::pair(p_data, handle))); + QNN_LOG_INFO("tensor %s successfully register shared memory handler: %p\n", QNN_TENSOR_GET_NAME(*p_tensor), handle); return 0; } diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index e966e638bee1f..aeab605693caf 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -49,14 +49,9 @@ class ggml_qnn_tensor { // TODO: set the quantizeParams base on the tensor type QNN_TENSOR_SET_RANK(_qnn_tensor, qnn::get_ggml_tensor_rank(tensor)); - if (should_use_mem_handle()) { - QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_MEMHANDLE); - QNN_TENSOR_SET_MEM_HANDLE(_qnn_tensor, nullptr); - } else { - QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); - Qnn_ClientBuffer_t client_buf = { tensor->data, get_ggml_tensor_data_size(tensor) }; - QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); - } + QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); + Qnn_ClientBuffer_t client_buf = {}; + QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); tensor->extra = this; QNN_LOG_DEBUG("create tensor %s with device %d", _tensor_name.c_str(), device); @@ -86,9 +81,26 @@ class ggml_qnn_tensor { return false; } - if (!alloc_rpc_mem()) { - QNN_LOG_WARN("alloc rpc mem failed, tensor %s", _tensor_name.c_str()); - return false; + if (should_use_mem_handle()) { + _qnn_rpc_buffer = alloc_rpc_mem(); + if (!_qnn_rpc_buffer) { + QNN_LOG_WARN("alloc rpc mem failed, tensor %s", _tensor_name.c_str()); + return false; + } + + auto tensor_type = QNN_TENSOR_GET_TYPE(_qnn_tensor); + if (!register_rpc_mem(_qnn_rpc_buffer)) { + QNN_LOG_WARN("commit rpc mem failure\n"); + return false; + } + + QNN_LOG_DEBUG("tensor %s, use mem handle %p", _tensor_name.c_str(), QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor)); + } else { + QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); + Qnn_ClientBuffer_t client_buf = { _tensor->data, get_ggml_tensor_data_size(_tensor) }; + QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); + QNN_LOG_DEBUG("tensor %s, use client buffer %p size %d", _tensor_name.c_str(), client_buf.data, + (int)client_buf.dataSize); } QNN_TENSOR_SET_ID(_qnn_tensor, QNN_TENSOR_GET_ID(tensor)); @@ -111,10 +123,8 @@ class ggml_qnn_tensor { } if (should_use_mem_handle()) { - uint8_t *qnn_buffer = static_cast( - _qnn_instance->get_rpcmem_from_memhandle(QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor))); - if (qnn_buffer) { - memcpy(qnn_buffer, _tensor->data, ggml_nbytes(_tensor)); + if (_qnn_rpc_buffer) { + memcpy(_qnn_rpc_buffer, _tensor->data, ggml_nbytes(_tensor)); } else { QNN_LOG_WARN("can't find rpcmem from qnn 
mem handle\n"); return false; @@ -122,6 +132,7 @@ class ggml_qnn_tensor { } // For CPU and GPU, the data is already in the tensor. + QNN_LOG_DEBUG("write tensor %s to qnn", _tensor_name.c_str()); return true; } @@ -138,10 +149,8 @@ class ggml_qnn_tensor { } if (should_use_mem_handle()) { - uint8_t *qnn_buffer = static_cast( - _qnn_instance->get_rpcmem_from_memhandle(QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor))); - if (qnn_buffer) { - memcpy(_tensor->data, qnn_buffer, ggml_nbytes(_tensor)); + if (_qnn_rpc_buffer) { + memcpy(_tensor->data, _qnn_rpc_buffer, ggml_nbytes(_tensor)); } else { QNN_LOG_WARN("can't find rpcmem from qnn mem handle\n"); return false; @@ -149,6 +158,7 @@ class ggml_qnn_tensor { } // For CPU and GPU, the data is already in the tensor. + QNN_LOG_DEBUG("read tensor %s from qnn", _tensor_name.c_str()); return true; } @@ -156,28 +166,35 @@ class ggml_qnn_tensor { const Qnn_Tensor_t &get_qnn_tensor() const { return _qnn_tensor; } private: - bool alloc_rpc_mem() { - if (!should_use_mem_handle()) { - return true; - } - - uint8_t *qnn_buffer = + uint8_t *alloc_rpc_mem() { + uint8_t *qnn_rpc_buffer = static_cast(_qnn_instance->alloc_rpcmem(ggml_nbytes(_tensor), alignof(void *))); - if (!qnn_buffer) { + if (!qnn_rpc_buffer) { QNN_LOG_WARN("alloc rpc mem failure, %s\n", strerror(errno)); QNN_LOG_DEBUG("tensor name %s", _tensor_name.c_str()); - return false; + return nullptr; } - QNN_LOG_INFO("tensor %s: alloc rpcmem(%p) successfully\n", _tensor_name.c_str(), qnn_buffer); + QNN_LOG_INFO("tensor %s: alloc rpcmem(%p) successfully\n", _tensor_name.c_str(), qnn_rpc_buffer); + return qnn_rpc_buffer; + } + + bool register_rpc_mem(uint8_t *qnn_rpc_buffer) { + if (_qnn_instance->is_rpcmem_registered(QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor))) { + QNN_LOG_INFO("tensor %s: rpcmem(%p) already registered\n", _tensor_name.c_str(), qnn_rpc_buffer); + return true; + } - auto error = _qnn_instance->register_rpcmem(qnn_buffer, &_qnn_tensor); + auto error = _qnn_instance->register_rpcmem(qnn_rpc_buffer, &_qnn_tensor); if (error != QNN_SUCCESS) { QNN_LOG_WARN("register rpc mem failure, %d\n", (int)error); QNN_LOG_DEBUG("tensor name %s", _tensor_name.c_str()); return false; } + // The mem handle will be set at qnn_instance::register_rpcmem + QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_MEMHANDLE); + QNN_LOG_INFO("tensor %s: register rpcmem(%p) successfully\n", _tensor_name.c_str(), qnn_rpc_buffer); return true; } @@ -190,6 +207,7 @@ class ggml_qnn_tensor { uint32_t _dimensions[4] = {}; std::string _tensor_name; Qnn_GraphHandle_t _graph_handle = nullptr; + uint8_t *_qnn_rpc_buffer = nullptr; ggml_qnn_tensor(const ggml_qnn_tensor &) = delete; void operator=(const ggml_qnn_tensor &) = delete; From 3feb574bf05191f2d2306f6b56bc7c81805f7f0d Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 10 Jul 2024 19:40:02 +0800 Subject: [PATCH 052/166] merge register_rpc_mem into alloc_rpc_mem --- ggml/src/ggml-qnn/tensor.hpp | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index aeab605693caf..8a825b57de57b 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -2,6 +2,7 @@ #pragma once #include +#include #include #include @@ -9,7 +10,6 @@ #include "QnnTensor.h" #include "System/QnnSystemInterface.h" -#include "backend.hpp" #include "graph.hpp" #include "logger.hpp" #include "qnn.hpp" @@ -88,12 +88,6 @@ class ggml_qnn_tensor { return false; } - auto tensor_type = 
QNN_TENSOR_GET_TYPE(_qnn_tensor); - if (!register_rpc_mem(_qnn_rpc_buffer)) { - QNN_LOG_WARN("commit rpc mem failure\n"); - return false; - } - QNN_LOG_DEBUG("tensor %s, use mem handle %p", _tensor_name.c_str(), QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor)); } else { QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); @@ -176,26 +170,18 @@ class ggml_qnn_tensor { } QNN_LOG_INFO("tensor %s: alloc rpcmem(%p) successfully\n", _tensor_name.c_str(), qnn_rpc_buffer); - return qnn_rpc_buffer; - } - - bool register_rpc_mem(uint8_t *qnn_rpc_buffer) { - if (_qnn_instance->is_rpcmem_registered(QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor))) { - QNN_LOG_INFO("tensor %s: rpcmem(%p) already registered\n", _tensor_name.c_str(), qnn_rpc_buffer); - return true; - } - auto error = _qnn_instance->register_rpcmem(qnn_rpc_buffer, &_qnn_tensor); if (error != QNN_SUCCESS) { QNN_LOG_WARN("register rpc mem failure, %d\n", (int)error); QNN_LOG_DEBUG("tensor name %s", _tensor_name.c_str()); - return false; + _qnn_instance->free_rpcmem(qnn_rpc_buffer); + return nullptr; } // The mem handle will be set at qnn_instance::register_rpcmem QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_MEMHANDLE); QNN_LOG_INFO("tensor %s: register rpcmem(%p) successfully\n", _tensor_name.c_str(), qnn_rpc_buffer); - return true; + return qnn_rpc_buffer; } bool should_use_mem_handle() const { return _device == QNN_BACKEND_NPU; } From b49b501e267f96747554f321196275b4f81ae5f9 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 10 Jul 2024 19:48:57 +0800 Subject: [PATCH 053/166] fix sprintf type --- ggml/src/ggml-qnn/tensor.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index 8a825b57de57b..e6bb63c54481c 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -33,7 +33,7 @@ class ggml_qnn_tensor { if (_tensor_name.empty()) { static std::atomic_uint32_t unnamed_tensor_count = 0; char buffer[GGML_MAX_NAME] = {}; - snprintf(buffer, sizeof(buffer), "unnamed_%p", unnamed_tensor_count++); + snprintf(buffer, sizeof(buffer), "unnamed_%d", (int)(unnamed_tensor_count++)); _tensor_name = buffer; } From 80051cfc4d9f340d4bb5eed1eddcdbdf98e5da51 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 10 Jul 2024 19:57:47 +0800 Subject: [PATCH 054/166] remove unused variables --- ggml/src/ggml-qnn/backend-ops.cpp | 2 -- ggml/src/ggml-qnn/qnn.hpp | 3 --- 2 files changed, 5 deletions(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index bafe5ca160e66..c84c59e1e0c2b 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -122,8 +122,6 @@ bool execute_graph(qnn::ggml_qnn_graph<_InputSize, _OutputSize> *graph, #define CHECK_PARAMS(ctx, src0, src1, dst) #endif -// TODO: this function can be removed later because there are duplicated codes with ggml_qnn_mul_mat -// keep it for illustrate how to implement a specified GGMPL OP using QNN API + QNN RPC static void ggml_qnn_add(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst) { CHECK_PARAMS(ctx, src0, src1, dst); diff --git a/ggml/src/ggml-qnn/qnn.hpp b/ggml/src/ggml-qnn/qnn.hpp index 9d60d2f6c551c..10549a6c5e413 100644 --- a/ggml/src/ggml-qnn/qnn.hpp +++ b/ggml/src/ggml-qnn/qnn.hpp @@ -901,9 +901,6 @@ class qnn_instance { std::string _model_name; // Qualcomm's dedicated prebuilt model name, keep it for further usage BackendIdType _backend_id; - bool _debug_tensor = false; - bool 
_do_node_validations = true; - QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; qnn::sdk_profile_level _profile_level = qnn::sdk_profile_level::profile_detail; From b6f29273f0bffbb59e5d7d1d99e479983da6a740 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 10 Jul 2024 23:00:31 +0800 Subject: [PATCH 055/166] add function to get graph from cache --- ggml/src/ggml-qnn/backend-ops.cpp | 106 +++++++++++++----------------- ggml/src/ggml-qnn/utils.hpp | 2 + 2 files changed, 47 insertions(+), 61 deletions(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index c84c59e1e0c2b..2627e23fd8e3c 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -108,6 +108,41 @@ bool execute_graph(qnn::ggml_qnn_graph<_InputSize, _OutputSize> *graph, return true; } +template +qnn::ggml_qnn_graph_binary *get_qnn_graph_from_cache(ggml_backend_qnn_context *ctx, ggml_op op, + const std::string &qnn_op, + const std::array &inputs, + const std::array &outputs) { + const std::string graph_key(ggml_op_name(op)); + auto it = ctx->qnn_binary_graph_cache.find(graph_key); + qnn::ggml_qnn_graph_binary *graph_ptr = nullptr; + if (it != ctx->qnn_binary_graph_cache.end()) { + graph_ptr = it->second.get(); + } else { + std::string graph_name = graph_key + "_" + std::to_string(ctx->threads); + for (auto &input: inputs) { + graph_name += "_"; + graph_name += input->name; + } + auto graph = std::make_unique(graph_name, (QNNBackend)(ctx->device), + ctx->instance->get_qnn_context_handle(), + ctx->raw_interface, ctx->socinfo.vtcm_size_in_mb); + + if (!graph->is_valid()) { + return nullptr; + } + + if (!qnn_bind_tensors_to_graph<2, 1>(graph.get(), qnn_op.c_str(), inputs, outputs)) { + return nullptr; + } + + graph_ptr = graph.get(); + ctx->qnn_binary_graph_cache[graph_key] = std::move(graph); + } + + return graph_ptr; +} + } // namespace #ifndef NDEBUG @@ -126,44 +161,21 @@ static void ggml_qnn_add(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, ggml_tensor *dst) { CHECK_PARAMS(ctx, src0, src1, dst); - std::string graph_name = "ggml_op_qnn_add"; - qnn::qnn_perf perf(graph_name); + qnn::qnn_perf perf("ggml_op_qnn_add"); perf.start(); bool succeed = false; - std::string graph_key(ggml_op_name(GGML_OP_ADD)); - auto it = ctx->qnn_binary_graph_cache.find(graph_key); - qnn::ggml_qnn_graph_binary *graph_ptr = nullptr; - if (it != ctx->qnn_binary_graph_cache.end()) { - graph_ptr = it->second.get(); - } else { - graph_name = graph_name + "_" + std::to_string(ctx->threads) + "_" + src0->name + "_" + src1->name; - auto graph = std::make_unique(graph_name, (QNNBackend)(ctx->device), - ctx->instance->get_qnn_context_handle(), - ctx->raw_interface, ctx->socinfo.vtcm_size_in_mb); - - if (!graph->is_valid()) { - goto failure; - } - - if (!qnn_bind_tensors_to_graph<2, 1>(graph.get(), QNN_OP_ELEMENT_WISE_ADD, { src0, src1 }, { dst })) { - goto failure; - } - - graph_ptr = graph.get(); - ctx->qnn_binary_graph_cache[graph_key] = std::move(graph); + qnn::ggml_qnn_graph_binary *graph_ptr = + get_qnn_graph_from_cache<2, 1>(ctx, GGML_OP_ADD, QNN_OP_ELEMENT_WISE_ADD, { src0, src1 }, { dst }); + if (graph_ptr) { + succeed = execute_graph<2, 1>(graph_ptr, { src0, src1 }, { dst }); } - succeed = execute_graph<2, 1>(graph_ptr, { src0, src1 }, { dst }); - -failure: if (!succeed) { print_ggml_tensor(src0); print_ggml_tensor(src1); print_ggml_tensor(dst); } - - perf.info(); } /* @@ -181,49 +193,21 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context *ctx, const ggml_tensor *s 
ggml_tensor *dst) { CHECK_PARAMS(ctx, src0, src1, dst); - std::string graph_name = "ggml_op_qnn_mul_mat"; - qnn::qnn_perf perf(graph_name); + qnn::qnn_perf perf("ggml_op_qnn_mul_mat"); perf.start(); - // TODO: for scenarios of quantized data in src0 - // pass-1: dequantize src0 to FP32 - // pass-2: dq-src0 * src1 - // the performance gains is worth although there is performance loss in pass-1 - bool succeed = false; - std::string graph_key(ggml_op_name(GGML_OP_MUL_MAT)); - auto it = ctx->qnn_binary_graph_cache.find(graph_key); - qnn::ggml_qnn_graph_binary *graph_ptr = nullptr; - if (it != ctx->qnn_binary_graph_cache.end()) { - graph_ptr = it->second.get(); - } else { - graph_name = graph_name + "_" + std::to_string(ctx->threads) + "_" + src0->name + "_" + src1->name; - auto graph = std::make_unique(graph_name, (QNNBackend)(ctx->device), - ctx->instance->get_qnn_context_handle(), - ctx->raw_interface, ctx->socinfo.vtcm_size_in_mb); - - if (!graph->is_valid()) { - goto failure; - } - - if (!qnn_bind_tensors_to_graph<2, 1>(graph.get(), QNN_OP_MAT_MUL, { src0, src1 }, { dst })) { - goto failure; - } - - graph_ptr = graph.get(); - ctx->qnn_binary_graph_cache[graph_key] = std::move(graph); + qnn::ggml_qnn_graph_binary *graph_ptr = + get_qnn_graph_from_cache<2, 1>(ctx, GGML_OP_MUL_MAT, QNN_OP_MAT_MUL, { src0, src1 }, { dst }); + if (graph_ptr) { + succeed = execute_graph<2, 1>(graph_ptr, { src0, src1 }, { dst }); } - succeed = execute_graph<2, 1>(graph_ptr, { src0, src1 }, { dst }); - -failure: if (!succeed) { print_ggml_tensor(src0); print_ggml_tensor(src1); print_ggml_tensor(dst); } - - perf.info(); } static void ggml_qnn_repeat(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, diff --git a/ggml/src/ggml-qnn/utils.hpp b/ggml/src/ggml-qnn/utils.hpp index 84cd8354e2d59..4a01347d0fc1b 100644 --- a/ggml/src/ggml-qnn/utils.hpp +++ b/ggml/src/ggml-qnn/utils.hpp @@ -189,6 +189,7 @@ void device_tensor_free(Qnn_Tensor_t &tensor); class qnn_perf { public: qnn_perf(const std::string &perf_name) : _perf_name(std::move(perf_name)) {}; + ~qnn_perf() { info(); } qnn_perf() = delete; qnn_perf(const qnn_perf &) = delete; qnn_perf &operator=(const qnn_perf &) = delete; @@ -211,6 +212,7 @@ class qnn_perf { class qnn_perf { public: qnn_perf(const std::string &perf_name) {} + ~qnn_perf() { info(); } qnn_perf() = delete; qnn_perf(const qnn_perf &) = delete; qnn_perf &operator=(const qnn_perf &) = delete; From 7ea28a6fac55a6723a02a8a873fc830b581b36c9 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 10 Jul 2024 23:39:03 +0800 Subject: [PATCH 056/166] add helper function for binary op --- ggml/src/ggml-qnn/backend-ops.cpp | 150 +++++++++++++++++++++++------- 1 file changed, 116 insertions(+), 34 deletions(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 2627e23fd8e3c..5871a7b6ef211 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -120,7 +120,7 @@ qnn::ggml_qnn_graph_binary *get_qnn_graph_from_cache(ggml_backend_qnn_context *c graph_ptr = it->second.get(); } else { std::string graph_name = graph_key + "_" + std::to_string(ctx->threads); - for (auto &input: inputs) { + for (auto &input : inputs) { graph_name += "_"; graph_name += input->name; } @@ -143,6 +143,116 @@ qnn::ggml_qnn_graph_binary *get_qnn_graph_from_cache(ggml_backend_qnn_context *c return graph_ptr; } +constexpr const char *kGgmlOpToQnnOp[] = { + nullptr, // GGML_OP_NONE + nullptr, // GGML_OP_DUP + QNN_OP_ELEMENT_WISE_ADD, // GGML_OP_ADD + 
nullptr, // GGML_OP_ADD1 + nullptr, // GGML_OP_ACC + nullptr, // GGML_OP_SUB + nullptr, // GGML_OP_MUL + nullptr, // GGML_OP_DIV + nullptr, // GGML_OP_SQR + nullptr, // GGML_OP_SQRT + nullptr, // GGML_OP_LOG + nullptr, // GGML_OP_SUM + nullptr, // GGML_OP_SUM_ROWS + nullptr, // GGML_OP_MEAN + nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_REPEAT + nullptr, // GGML_OP_REPEAT_BACK + nullptr, // GGML_OP_CONCAT + nullptr, // GGML_OP_SILU_BACK + nullptr, // GGML_OP_NORM + nullptr, // GGML_OP_RMS_NORM + nullptr, // GGML_OP_RMS_NORM_BACK + nullptr, // GGML_OP_GROUP_NORM + + QNN_OP_MAT_MUL, // GGML_OP_MUL_MAT + nullptr, // GGML_OP_MUL_MAT_ID + nullptr, // GGML_OP_OUT_PROD + + nullptr, // GGML_OP_SCALE + nullptr, // GGML_OP_SET + nullptr, // GGML_OP_CPY + nullptr, // GGML_OP_CONT + nullptr, // GGML_OP_RESHAPE + nullptr, // GGML_OP_VIEW + nullptr, // GGML_OP_PERMUTE + nullptr, // GGML_OP_TRANSPOSE + nullptr, // GGML_OP_GET_ROWS + nullptr, // GGML_OP_GET_ROWS_BACK + nullptr, // GGML_OP_DIAG + nullptr, // GGML_OP_DIAG_MASK_INF + nullptr, // GGML_OP_DIAG_MASK_ZERO + nullptr, // GGML_OP_SOFT_MAX + nullptr, // GGML_OP_SOFT_MAX_BACK + nullptr, // GGML_OP_ROPE + nullptr, // GGML_OP_ROPE_BACK + nullptr, // GGML_OP_CLAMP + nullptr, // GGML_OP_CONV_TRANSPOSE_1D + nullptr, // GGML_OP_IM2COL + nullptr, // GGML_OP_CONV_TRANSPOSE_2D + nullptr, // GGML_OP_POOL_1D + nullptr, // GGML_OP_POOL_2D + nullptr, // GGML_OP_UPSCALE + nullptr, // GGML_OP_PAD + nullptr, // GGML_OP_ARANGE + nullptr, // GGML_OP_TIMESTEP_EMBEDDING + nullptr, // GGML_OP_ARGSORT + nullptr, // GGML_OP_LEAKY_RELU + + nullptr, // GGML_OP_FLASH_ATTN_EXT + nullptr, // GGML_OP_FLASH_ATTN_BACK + nullptr, // GGML_OP_SSM_CONV + nullptr, // GGML_OP_SSM_SCAN + nullptr, // GGML_OP_WIN_PART + nullptr, // GGML_OP_WIN_UNPART + nullptr, // GGML_OP_GET_REL_POS + nullptr, // GGML_OP_ADD_REL_POS + + nullptr, // GGML_OP_UNARY + + nullptr, // GGML_OP_MAP_UNARY + nullptr, // GGML_OP_MAP_BINARY + + nullptr, // GGML_OP_MAP_CUSTOM1_F32 + nullptr, // GGML_OP_MAP_CUSTOM2_F32 + nullptr, // GGML_OP_MAP_CUSTOM3_F32 + + nullptr, // GGML_OP_MAP_CUSTOM1 + nullptr, // GGML_OP_MAP_CUSTOM2 + nullptr, // GGML_OP_MAP_CUSTOM3 + + nullptr, // GGML_OP_CROSS_ENTROPY_LOSS + nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK +}; + +static_assert(sizeof(kGgmlOpToQnnOp) / sizeof(kGgmlOpToQnnOp[0]) == GGML_OP_COUNT, + "GGML_OP_COUNT does not match the size of the ops table"); + +template +void qnn_binary_op_impl(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) { + static_assert(kGgmlOpToQnnOp[_GgmlOp] != nullptr, "GGML_OP does not have a corresponding QNN_OP"); + + qnn::qnn_perf perf(ggml_op_name(_GgmlOp)); + perf.start(); + + bool succeed = false; + qnn::ggml_qnn_graph_binary *graph_ptr = + get_qnn_graph_from_cache<2, 1>(ctx, _GgmlOp, kGgmlOpToQnnOp[_GgmlOp], { src0, src1 }, { dst }); + if (graph_ptr) { + succeed = execute_graph<2, 1>(graph_ptr, { src0, src1 }, { dst }); + } + + if (!succeed) { + print_ggml_tensor(src0); + print_ggml_tensor(src1); + print_ggml_tensor(dst); + } +} + } // namespace #ifndef NDEBUG @@ -160,22 +270,7 @@ qnn::ggml_qnn_graph_binary *get_qnn_graph_from_cache(ggml_backend_qnn_context *c static void ggml_qnn_add(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst) { CHECK_PARAMS(ctx, src0, src1, dst); - - qnn::qnn_perf perf("ggml_op_qnn_add"); - perf.start(); - - bool succeed = false; - qnn::ggml_qnn_graph_binary *graph_ptr = - get_qnn_graph_from_cache<2, 1>(ctx, GGML_OP_ADD, 
QNN_OP_ELEMENT_WISE_ADD, { src0, src1 }, { dst }); - if (graph_ptr) { - succeed = execute_graph<2, 1>(graph_ptr, { src0, src1 }, { dst }); - } - - if (!succeed) { - print_ggml_tensor(src0); - print_ggml_tensor(src1); - print_ggml_tensor(dst); - } + qnn_binary_op_impl(ctx, src0, src1, dst); } /* @@ -192,22 +287,7 @@ static void ggml_qnn_add(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, static void ggml_qnn_mul_mat(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst) { CHECK_PARAMS(ctx, src0, src1, dst); - - qnn::qnn_perf perf("ggml_op_qnn_mul_mat"); - perf.start(); - - bool succeed = false; - qnn::ggml_qnn_graph_binary *graph_ptr = - get_qnn_graph_from_cache<2, 1>(ctx, GGML_OP_MUL_MAT, QNN_OP_MAT_MUL, { src0, src1 }, { dst }); - if (graph_ptr) { - succeed = execute_graph<2, 1>(graph_ptr, { src0, src1 }, { dst }); - } - - if (!succeed) { - print_ggml_tensor(src0); - print_ggml_tensor(src1); - print_ggml_tensor(dst); - } + qnn_binary_op_impl(ctx, src0, src1, dst); } static void ggml_qnn_repeat(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, @@ -320,7 +400,7 @@ static void ggml_qnn_nop(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, } qnn::ggml_qnn_op_array_t qnn::ggml_qnn_op_array() { - static constexpr const qnn::ggml_qnn_op_t kQnnOpsTable[GGML_OP_COUNT] = { + static constexpr const qnn::ggml_qnn_op_t kQnnOpsTable[] = { nullptr, // GGML_OP_NONE nullptr, // GGML_OP_DUP ggml_qnn_add, // GGML_OP_ADD @@ -405,5 +485,7 @@ qnn::ggml_qnn_op_array_t qnn::ggml_qnn_op_array() { nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK }; + static_assert(sizeof(kQnnOpsTable) / sizeof(kQnnOpsTable[0]) == GGML_OP_COUNT, + "GGML_OP_COUNT does not match the size of the ops table"); return kQnnOpsTable; } From 8932135fdb8ee05a9e4c44a64137a28c35bf05bc Mon Sep 17 00:00:00 2001 From: hongruichen Date: Thu, 11 Jul 2024 00:07:00 +0800 Subject: [PATCH 057/166] add sqrt and mul ops --- ggml/src/ggml-qnn/backend-ops.cpp | 137 +++++++++++++++--------------- 1 file changed, 70 insertions(+), 67 deletions(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 5871a7b6ef211..ca48f79bbc44b 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -8,6 +8,18 @@ #include "tensor.hpp" #include "utils.hpp" +#ifndef NDEBUG +#define CHECK_PARAMS(ctx, src0, src1, dst) \ + do { \ + if (!qnn_is_valid_params((ctx), (src0), (src1), (dst))) { \ + return; \ + } \ + } while (0) + +#else +#define CHECK_PARAMS(ctx, src0, src1, dst) +#endif + namespace { void print_ggml_tensor(const ggml_tensor *tensor) { @@ -144,29 +156,29 @@ qnn::ggml_qnn_graph_binary *get_qnn_graph_from_cache(ggml_backend_qnn_context *c } constexpr const char *kGgmlOpToQnnOp[] = { - nullptr, // GGML_OP_NONE - nullptr, // GGML_OP_DUP - QNN_OP_ELEMENT_WISE_ADD, // GGML_OP_ADD - nullptr, // GGML_OP_ADD1 - nullptr, // GGML_OP_ACC - nullptr, // GGML_OP_SUB - nullptr, // GGML_OP_MUL - nullptr, // GGML_OP_DIV - nullptr, // GGML_OP_SQR - nullptr, // GGML_OP_SQRT - nullptr, // GGML_OP_LOG - nullptr, // GGML_OP_SUM - nullptr, // GGML_OP_SUM_ROWS - nullptr, // GGML_OP_MEAN - nullptr, // GGML_OP_ARGMAX - nullptr, // GGML_OP_REPEAT - nullptr, // GGML_OP_REPEAT_BACK - nullptr, // GGML_OP_CONCAT - nullptr, // GGML_OP_SILU_BACK - nullptr, // GGML_OP_NORM - nullptr, // GGML_OP_RMS_NORM - nullptr, // GGML_OP_RMS_NORM_BACK - nullptr, // GGML_OP_GROUP_NORM + nullptr, // GGML_OP_NONE + nullptr, // GGML_OP_DUP + QNN_OP_ELEMENT_WISE_ADD, // 
GGML_OP_ADD + nullptr, // GGML_OP_ADD1 + nullptr, // GGML_OP_ACC + nullptr, // GGML_OP_SUB + QNN_OP_ELEMENT_WISE_MULTIPLY, // GGML_OP_MUL + nullptr, // GGML_OP_DIV + nullptr, // GGML_OP_SQR + QNN_OP_ELEMENT_WISE_SQUARE_ROOT, // GGML_OP_SQRT + nullptr, // GGML_OP_LOG + nullptr, // GGML_OP_SUM + nullptr, // GGML_OP_SUM_ROWS + nullptr, // GGML_OP_MEAN + nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_REPEAT + nullptr, // GGML_OP_REPEAT_BACK + nullptr, // GGML_OP_CONCAT + nullptr, // GGML_OP_SILU_BACK + nullptr, // GGML_OP_NORM + nullptr, // GGML_OP_RMS_NORM + nullptr, // GGML_OP_RMS_NORM_BACK + nullptr, // GGML_OP_GROUP_NORM QNN_OP_MAT_MUL, // GGML_OP_MUL_MAT nullptr, // GGML_OP_MUL_MAT_ID @@ -236,6 +248,8 @@ void qnn_binary_op_impl(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, ggml_tensor *dst) { static_assert(kGgmlOpToQnnOp[_GgmlOp] != nullptr, "GGML_OP does not have a corresponding QNN_OP"); + CHECK_PARAMS(ctx, src0, src1, dst); + qnn::qnn_perf perf(ggml_op_name(_GgmlOp)); perf.start(); @@ -255,24 +269,16 @@ void qnn_binary_op_impl(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, } // namespace -#ifndef NDEBUG -#define CHECK_PARAMS(ctx, src0, src1, dst) \ - do { \ - if (!qnn_is_valid_params((ctx), (src0), (src1), (dst))) { \ - return; \ - } \ - } while (0) - -#else -#define CHECK_PARAMS(ctx, src0, src1, dst) -#endif - static void ggml_qnn_add(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst) { - CHECK_PARAMS(ctx, src0, src1, dst); qnn_binary_op_impl(ctx, src0, src1, dst); } +static void ggml_qnn_mul(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) { + qnn_binary_op_impl(ctx, src0, src1, dst); +} + /* * ggml_qnn_mul_mat was re-added as a standalone function because * the following comments came from https://github.com/ggerganov/llama.cpp/pull/1632 @@ -286,7 +292,6 @@ static void ggml_qnn_add(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, */ static void ggml_qnn_mul_mat(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst) { - CHECK_PARAMS(ctx, src0, src1, dst); qnn_binary_op_impl(ctx, src0, src1, dst); } @@ -329,6 +334,11 @@ static void ggml_qnn_leaky_relu(ggml_backend_qnn_context *ctx, const ggml_tensor static void ggml_qnn_sqr(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst) {} +static void ggml_qnn_sqrt(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) { + qnn_binary_op_impl(ctx, src0, src1, dst); +} + static void ggml_qnn_norm(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst) {} @@ -392,38 +402,31 @@ static void ggml_qnn_argsort(ggml_backend_qnn_context *ctx, const ggml_tensor *s GGML_ASSERT(ggml_is_contiguous(src0)); } -static void ggml_qnn_nop(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) { - (void)src0; - (void)src1; - (void)dst; -} - qnn::ggml_qnn_op_array_t qnn::ggml_qnn_op_array() { static constexpr const qnn::ggml_qnn_op_t kQnnOpsTable[] = { - nullptr, // GGML_OP_NONE - nullptr, // GGML_OP_DUP - ggml_qnn_add, // GGML_OP_ADD - nullptr, // GGML_OP_ADD1 - nullptr, // GGML_OP_ACC - nullptr, // GGML_OP_SUB - nullptr, // GGML_OP_MUL - nullptr, // GGML_OP_DIV - nullptr, // GGML_OP_SQR - nullptr, // GGML_OP_SQRT - nullptr, // GGML_OP_LOG - nullptr, // GGML_OP_SUM - nullptr, // GGML_OP_SUM_ROWS - nullptr, // 
GGML_OP_MEAN - nullptr, // GGML_OP_ARGMAX - nullptr, // GGML_OP_REPEAT - nullptr, // GGML_OP_REPEAT_BACK - nullptr, // GGML_OP_CONCAT - nullptr, // GGML_OP_SILU_BACK - nullptr, // GGML_OP_NORM - nullptr, // GGML_OP_RMS_NORM - nullptr, // GGML_OP_RMS_NORM_BACK - nullptr, // GGML_OP_GROUP_NORM + nullptr, // GGML_OP_NONE + nullptr, // GGML_OP_DUP + ggml_qnn_add, // GGML_OP_ADD + nullptr, // GGML_OP_ADD1 + nullptr, // GGML_OP_ACC + nullptr, // GGML_OP_SUB + ggml_qnn_mul, // GGML_OP_MUL + nullptr, // GGML_OP_DIV + nullptr, // GGML_OP_SQR + ggml_qnn_sqrt, // GGML_OP_SQRT + nullptr, // GGML_OP_LOG + nullptr, // GGML_OP_SUM + nullptr, // GGML_OP_SUM_ROWS + nullptr, // GGML_OP_MEAN + nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_REPEAT + nullptr, // GGML_OP_REPEAT_BACK + nullptr, // GGML_OP_CONCAT + nullptr, // GGML_OP_SILU_BACK + nullptr, // GGML_OP_NORM + nullptr, // GGML_OP_RMS_NORM + nullptr, // GGML_OP_RMS_NORM_BACK + nullptr, // GGML_OP_GROUP_NORM ggml_qnn_mul_mat, // GGML_OP_MUL_MAT nullptr, // GGML_OP_MUL_MAT_ID From be3aa9631fb43fd39a078f8f55b3646ffe0492d9 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Thu, 11 Jul 2024 00:09:56 +0800 Subject: [PATCH 058/166] use template function directly --- ggml/src/ggml-qnn/backend-ops.cpp | 187 +++++------------------------- 1 file changed, 27 insertions(+), 160 deletions(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index ca48f79bbc44b..1aaba32c93176 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -269,168 +269,35 @@ void qnn_binary_op_impl(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, } // namespace -static void ggml_qnn_add(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) { - qnn_binary_op_impl(ctx, src0, src1, dst); -} - -static void ggml_qnn_mul(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) { - qnn_binary_op_impl(ctx, src0, src1, dst); -} - -/* - * ggml_qnn_mul_mat was re-added as a standalone function because - * the following comments came from https://github.com/ggerganov/llama.cpp/pull/1632 - * MUL_MAT take most of the compute time (about 95%). - * So to speed up llama, we have to focus on MUL_MAT. - * - * We have three kinds of MUL_MAT to compute: - * mul_mat_f32: both src0 and src1 are F32. - * mul_mat_f16_f32: src0 is F16 and src1 is F32. - * mul_mat_q_f32: src0 is quantized (Q4_0, Q4_1, ...), and src1 is F32. 
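 * As a rough illustration of those three cases, the dispatch could be sketched
 * like this (the helper names are the hypothetical ones listed above, not
 * functions that exist in this backend):
 *
 *   if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
 *       mul_mat_f32(src0, src1, dst);        // both inputs are FP32
 *   } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
 *       mul_mat_f16_f32(src0, src1, dst);    // FP16 weights, FP32 activations
 *   } else if (ggml_is_quantized(src0->type) && src1->type == GGML_TYPE_F32) {
 *       mul_mat_q_f32(src0, src1, dst);      // quantized weights, FP32 activations
 *   }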
- */ -static void ggml_qnn_mul_mat(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) { - qnn_binary_op_impl(ctx, src0, src1, dst); -} - -static void ggml_qnn_repeat(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void ggml_qnn_get_rows(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void ggml_qnn_acc(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void ggml_qnn_div(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void ggml_qnn_gelu(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void ggml_qnn_silu(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void ggml_qnn_gelu_quick(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void ggml_qnn_tanh(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void ggml_qnn_relu(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void ggml_qnn_hardsigmoid(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void ggml_qnn_hardswish(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void ggml_qnn_leaky_relu(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void ggml_qnn_sqr(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void ggml_qnn_sqrt(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) { - qnn_binary_op_impl(ctx, src0, src1, dst); -} - -static void ggml_qnn_norm(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void ggml_qnn_group_norm(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void ggml_qnn_concat(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void ggml_qnn_upscale(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void ggml_qnn_pad(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void ggml_qnn_rms_norm(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void ggml_qnn_cpy(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void ggml_qnn_dup(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) { - ggml_qnn_cpy(ctx, src0, dst, nullptr); - (void)src1; -} - -static void ggml_qnn_mul_mat_id(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void ggml_qnn_scale(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void 
ggml_qnn_clamp(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void ggml_qnn_diag_mask_inf(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void ggml_qnn_soft_max(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void ggml_qnn_rope(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) { - GGML_ASSERT(ggml_is_contiguous(src0)); -} - -static void ggml_qnn_pool2d(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void ggml_qnn_im2col(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void ggml_qnn_sum_rows(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) { - GGML_ASSERT(ggml_is_contiguous(src0)); -} - -static void ggml_qnn_argsort(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) { - GGML_ASSERT(ggml_is_contiguous(src0)); -} - qnn::ggml_qnn_op_array_t qnn::ggml_qnn_op_array() { static constexpr const qnn::ggml_qnn_op_t kQnnOpsTable[] = { - nullptr, // GGML_OP_NONE - nullptr, // GGML_OP_DUP - ggml_qnn_add, // GGML_OP_ADD - nullptr, // GGML_OP_ADD1 - nullptr, // GGML_OP_ACC - nullptr, // GGML_OP_SUB - ggml_qnn_mul, // GGML_OP_MUL - nullptr, // GGML_OP_DIV - nullptr, // GGML_OP_SQR - ggml_qnn_sqrt, // GGML_OP_SQRT - nullptr, // GGML_OP_LOG - nullptr, // GGML_OP_SUM - nullptr, // GGML_OP_SUM_ROWS - nullptr, // GGML_OP_MEAN - nullptr, // GGML_OP_ARGMAX - nullptr, // GGML_OP_REPEAT - nullptr, // GGML_OP_REPEAT_BACK - nullptr, // GGML_OP_CONCAT - nullptr, // GGML_OP_SILU_BACK - nullptr, // GGML_OP_NORM - nullptr, // GGML_OP_RMS_NORM - nullptr, // GGML_OP_RMS_NORM_BACK - nullptr, // GGML_OP_GROUP_NORM - - ggml_qnn_mul_mat, // GGML_OP_MUL_MAT - nullptr, // GGML_OP_MUL_MAT_ID - nullptr, // GGML_OP_OUT_PROD + nullptr, // GGML_OP_NONE + nullptr, // GGML_OP_DUP + qnn_binary_op_impl, // GGML_OP_ADD + nullptr, // GGML_OP_ADD1 + nullptr, // GGML_OP_ACC + nullptr, // GGML_OP_SUB + qnn_binary_op_impl, // GGML_OP_MUL + nullptr, // GGML_OP_DIV + nullptr, // GGML_OP_SQR + qnn_binary_op_impl, // GGML_OP_SQRT + nullptr, // GGML_OP_LOG + nullptr, // GGML_OP_SUM + nullptr, // GGML_OP_SUM_ROWS + nullptr, // GGML_OP_MEAN + nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_REPEAT + nullptr, // GGML_OP_REPEAT_BACK + nullptr, // GGML_OP_CONCAT + nullptr, // GGML_OP_SILU_BACK + nullptr, // GGML_OP_NORM + nullptr, // GGML_OP_RMS_NORM + nullptr, // GGML_OP_RMS_NORM_BACK + nullptr, // GGML_OP_GROUP_NORM + + qnn_binary_op_impl, // GGML_OP_MUL_MAT + nullptr, // GGML_OP_MUL_MAT_ID + nullptr, // GGML_OP_OUT_PROD nullptr, // GGML_OP_SCALE nullptr, // GGML_OP_SET From f0894d897a587f2244a49fd3161feeb6244c9e01 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 12 Jul 2024 19:40:55 +0800 Subject: [PATCH 059/166] wip wip --- ggml/src/ggml-qnn/backend-ops.cpp | 92 +++++++++++++++---------------- 1 file changed, 46 insertions(+), 46 deletions(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 1aaba32c93176..94a5d3c28a9d1 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -156,29 +156,29 @@ qnn::ggml_qnn_graph_binary *get_qnn_graph_from_cache(ggml_backend_qnn_context *c } constexpr const char 
*kGgmlOpToQnnOp[] = { - nullptr, // GGML_OP_NONE - nullptr, // GGML_OP_DUP - QNN_OP_ELEMENT_WISE_ADD, // GGML_OP_ADD - nullptr, // GGML_OP_ADD1 - nullptr, // GGML_OP_ACC - nullptr, // GGML_OP_SUB - QNN_OP_ELEMENT_WISE_MULTIPLY, // GGML_OP_MUL - nullptr, // GGML_OP_DIV - nullptr, // GGML_OP_SQR - QNN_OP_ELEMENT_WISE_SQUARE_ROOT, // GGML_OP_SQRT - nullptr, // GGML_OP_LOG - nullptr, // GGML_OP_SUM - nullptr, // GGML_OP_SUM_ROWS - nullptr, // GGML_OP_MEAN - nullptr, // GGML_OP_ARGMAX - nullptr, // GGML_OP_REPEAT - nullptr, // GGML_OP_REPEAT_BACK - nullptr, // GGML_OP_CONCAT - nullptr, // GGML_OP_SILU_BACK - nullptr, // GGML_OP_NORM - nullptr, // GGML_OP_RMS_NORM - nullptr, // GGML_OP_RMS_NORM_BACK - nullptr, // GGML_OP_GROUP_NORM + nullptr, // GGML_OP_NONE + nullptr, // GGML_OP_DUP + QNN_OP_ELEMENT_WISE_ADD, // GGML_OP_ADD + nullptr, // GGML_OP_ADD1 + nullptr, // GGML_OP_ACC + nullptr, // GGML_OP_SUB + QNN_OP_ELEMENT_WISE_MULTIPLY, // GGML_OP_MUL + nullptr, // GGML_OP_DIV + nullptr, // GGML_OP_SQR + nullptr, // GGML_OP_SQRT + nullptr, // GGML_OP_LOG + nullptr, // GGML_OP_SUM + nullptr, // GGML_OP_SUM_ROWS + nullptr, // GGML_OP_MEAN + nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_REPEAT + nullptr, // GGML_OP_REPEAT_BACK + nullptr, // GGML_OP_CONCAT + nullptr, // GGML_OP_SILU_BACK + nullptr, // GGML_OP_NORM + nullptr, // GGML_OP_RMS_NORM + nullptr, // GGML_OP_RMS_NORM_BACK + nullptr, // GGML_OP_GROUP_NORM QNN_OP_MAT_MUL, // GGML_OP_MUL_MAT nullptr, // GGML_OP_MUL_MAT_ID @@ -271,29 +271,29 @@ void qnn_binary_op_impl(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, qnn::ggml_qnn_op_array_t qnn::ggml_qnn_op_array() { static constexpr const qnn::ggml_qnn_op_t kQnnOpsTable[] = { - nullptr, // GGML_OP_NONE - nullptr, // GGML_OP_DUP - qnn_binary_op_impl, // GGML_OP_ADD - nullptr, // GGML_OP_ADD1 - nullptr, // GGML_OP_ACC - nullptr, // GGML_OP_SUB - qnn_binary_op_impl, // GGML_OP_MUL - nullptr, // GGML_OP_DIV - nullptr, // GGML_OP_SQR - qnn_binary_op_impl, // GGML_OP_SQRT - nullptr, // GGML_OP_LOG - nullptr, // GGML_OP_SUM - nullptr, // GGML_OP_SUM_ROWS - nullptr, // GGML_OP_MEAN - nullptr, // GGML_OP_ARGMAX - nullptr, // GGML_OP_REPEAT - nullptr, // GGML_OP_REPEAT_BACK - nullptr, // GGML_OP_CONCAT - nullptr, // GGML_OP_SILU_BACK - nullptr, // GGML_OP_NORM - nullptr, // GGML_OP_RMS_NORM - nullptr, // GGML_OP_RMS_NORM_BACK - nullptr, // GGML_OP_GROUP_NORM + nullptr, // GGML_OP_NONE + nullptr, // GGML_OP_DUP + qnn_binary_op_impl, // GGML_OP_ADD + nullptr, // GGML_OP_ADD1 + nullptr, // GGML_OP_ACC + nullptr, // GGML_OP_SUB + qnn_binary_op_impl, // GGML_OP_MUL + nullptr, // GGML_OP_DIV + nullptr, // GGML_OP_SQR + nullptr, // GGML_OP_SQRT + nullptr, // GGML_OP_LOG + nullptr, // GGML_OP_SUM + nullptr, // GGML_OP_SUM_ROWS + nullptr, // GGML_OP_MEAN + nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_REPEAT + nullptr, // GGML_OP_REPEAT_BACK + nullptr, // GGML_OP_CONCAT + nullptr, // GGML_OP_SILU_BACK + nullptr, // GGML_OP_NORM + nullptr, // GGML_OP_RMS_NORM + nullptr, // GGML_OP_RMS_NORM_BACK + nullptr, // GGML_OP_GROUP_NORM qnn_binary_op_impl, // GGML_OP_MUL_MAT nullptr, // GGML_OP_MUL_MAT_ID From 0eb595cc6e7691d8fbf30b7b04cdf8dd7eb108e3 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 12 Jul 2024 19:52:35 +0800 Subject: [PATCH 060/166] use table to simpilify the op mapping --- tests/ggml-qnn/ggml-qnn-ut.cpp | 69 +++++++++++++++++++++++++--------- 1 file changed, 52 insertions(+), 17 deletions(-) diff --git a/tests/ggml-qnn/ggml-qnn-ut.cpp b/tests/ggml-qnn/ggml-qnn-ut.cpp index 
ff01e62f983c7..0c3fbf71ebdbf 100644 --- a/tests/ggml-qnn/ggml-qnn-ut.cpp +++ b/tests/ggml-qnn/ggml-qnn-ut.cpp @@ -327,6 +327,41 @@ static void show_usage() { ); } + +typedef ggml_tensor * (*ggml_op_binary_t)( + ggml_context * ctx, + ggml_tensor * a, + ggml_tensor * b); + +static constexpr const ggml_op_binary_t kBinaryOps[] = { + nullptr, // GGML_OP_NONE + nullptr, // GGML_OP_DUP + ggml_add, // GGML_OP_ADD + nullptr, // GGML_OP_ADD1 + nullptr, // GGML_OP_ACC + nullptr, // GGML_OP_SUB + ggml_mul, // GGML_OP_MUL + nullptr, // GGML_OP_DIV + nullptr, // GGML_OP_SQR + nullptr, // GGML_OP_SQRT + nullptr, // GGML_OP_LOG + nullptr, // GGML_OP_SUM + nullptr, // GGML_OP_SUM_ROWS + nullptr, // GGML_OP_MEAN + nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_REPEAT + nullptr, // GGML_OP_REPEAT_BACK + nullptr, // GGML_OP_CONCAT + nullptr, // GGML_OP_SILU_BACK + nullptr, // GGML_OP_NORM + nullptr, // GGML_OP_RMS_NORM + nullptr, // GGML_OP_RMS_NORM_BACK + nullptr, // GGML_OP_GROUP_NORM + ggml_mul_mat, // GGML_OP_MUL_MAT +}; + +static_assert(kBinaryOps[GGML_OP_MUL_MAT] == ggml_mul_mat, "ggml_mul_mat at wrong index, check kBinaryOps"); + static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { int64_t n_begin_time = 0LL; int64_t n_end_time = 0LL; @@ -398,19 +433,15 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { ggml_set_input(src0); ggml_set_input(src1); - switch (n_ggml_op_type) { - case GGML_OP_ADD: - dst = ggml_add(ctx, src0, src1); - break; - case GGML_OP_MUL_MAT: - dst = ggml_mul_mat(ctx, src0, src1); - break; - default: - QNN_LOG_WARN("ggml op %d(%s) not supported", n_ggml_op_type, - ggml_op_name((enum ggml_op) n_ggml_op_type)); - ggml_free(ctx); - ggml_backend_free(backend); - return 3; + auto binary_op = kBinaryOps[n_ggml_op_type]; + if (binary_op) { + dst = binary_op(ctx, src0, src1); + } else { + QNN_LOG_WARN("ggml op %d(%s) not supported", n_ggml_op_type, + ggml_op_name((enum ggml_op) n_ggml_op_type)); + ggml_free(ctx); + ggml_backend_free(backend); + return 3; } ggml_set_output(dst); @@ -473,6 +504,11 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { return 0; } +static const std::unordered_map kMapStringToGGMLOp = { + {"GGML_OP_ADD", GGML_OP_ADD}, + {"GGML_OP_MUL_MAT", GGML_OP_MUL_MAT}, +}; + int main(int argc, char * argv[]) { int num_threads = 4; int n_backend_type = QNN_BACKEND_CPU; @@ -481,10 +517,9 @@ int main(int argc, char * argv[]) { for (int i = 1; i < argc; i++) { if (0 == strcmp(argv[i], "-t")) { if (i + 1 < argc) { - if (0 == memcmp(argv[i + 1], "GGML_OP_ADD", 11)) { - n_ggml_op_type = GGML_OP_ADD; - } else if (0 == memcmp(argv[i + 1], "GGML_OP_MUL_MAT", 15)) { - n_ggml_op_type = GGML_OP_MUL_MAT; + auto it = kMapStringToGGMLOp.find(argv[i + 1]); + if (it != kMapStringToGGMLOp.end()) { + n_ggml_op_type = it->second; } else { show_usage(); return 1; From e3aa43adbdabc75b9f6fcf5bf5bf4ab9899df0a7 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 12 Jul 2024 23:26:11 +0800 Subject: [PATCH 061/166] suppress warning --- ggml/src/ggml-qnn/backend-ops.cpp | 37 ++++++++++++++++++------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 94a5d3c28a9d1..30f2e402cf16f 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -9,25 +9,9 @@ #include "utils.hpp" #ifndef NDEBUG -#define CHECK_PARAMS(ctx, src0, src1, dst) \ - do { \ - if (!qnn_is_valid_params((ctx), (src0), (src1), 
(dst))) { \ - return; \ - } \ - } while (0) - -#else -#define CHECK_PARAMS(ctx, src0, src1, dst) -#endif namespace { -void print_ggml_tensor(const ggml_tensor *tensor) { - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - tensor->name, tensor->type, ggml_type_name(tensor->type), tensor->ne[0], tensor->ne[1], tensor->ne[2], - tensor->nb[0], tensor->nb[1], tensor->nb[2]); -} - bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst) { if (!ctx || !src0 || !src1 || !dst) { @@ -47,6 +31,27 @@ bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, return true; } +} // namespace + +#define CHECK_PARAMS(ctx, src0, src1, dst) \ + do { \ + if (!qnn_is_valid_params((ctx), (src0), (src1), (dst))) { \ + return; \ + } \ + } while (0) + +#else +#define CHECK_PARAMS(ctx, src0, src1, dst) +#endif + +namespace { + +void print_ggml_tensor(const ggml_tensor *tensor) { + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + tensor->name, tensor->type, ggml_type_name(tensor->type), tensor->ne[0], tensor->ne[1], tensor->ne[2], + tensor->nb[0], tensor->nb[1], tensor->nb[2]); +} + template bool qnn_bind_tensors_to_graph(qnn::ggml_qnn_graph<_InputSize, _OutputSize> *graph, const std::string &op_name, const std::array &inputs, From 7cbc4fbd8c9bd15513d0b47e1fe88e722bd863d5 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 12 Jul 2024 23:26:38 +0800 Subject: [PATCH 062/166] add mul --- tests/ggml-qnn/ggml-qnn-ut.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/ggml-qnn/ggml-qnn-ut.cpp b/tests/ggml-qnn/ggml-qnn-ut.cpp index 0c3fbf71ebdbf..96dfa2bcfe27e 100644 --- a/tests/ggml-qnn/ggml-qnn-ut.cpp +++ b/tests/ggml-qnn/ggml-qnn-ut.cpp @@ -507,6 +507,7 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { static const std::unordered_map kMapStringToGGMLOp = { {"GGML_OP_ADD", GGML_OP_ADD}, {"GGML_OP_MUL_MAT", GGML_OP_MUL_MAT}, + {"GGML_OP_MUL", GGML_OP_MUL}, }; int main(int argc, char * argv[]) { From 100ccd5e7fb5bafa92d57ea87108461f91bcfcc6 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sat, 13 Jul 2024 00:06:58 +0800 Subject: [PATCH 063/166] add unary op template and more ops --- ggml/src/ggml-qnn.cpp | 37 ++--- ggml/src/ggml-qnn/backend-ops.cpp | 243 ++++++++++++++++++++++++------ ggml/src/ggml-qnn/backend-ops.hpp | 11 +- ggml/src/ggml-qnn/backend.hpp | 8 +- 4 files changed, 225 insertions(+), 74 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index 3584c41120ae6..de1fefe497e58 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -1,6 +1,5 @@ #include "ggml-qnn.h" -#include #include #include #include @@ -15,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -142,7 +142,8 @@ struct ggml_backend_qnn_buffer_type_context { // ================================================================================================= static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context *ctx, const struct ggml_tensor *tensor, bool b_dump_tensor_info) { - if (ggml_is_empty(tensor) || !qnn::ggml_qnn_op_array()[tensor->op]) { + if (ggml_is_empty(tensor) || + (!qnn::ggml_qnn_unary_op_array()[tensor->op] && !qnn::ggml_qnn_binary_op_array()[tensor->op])) { return false; } @@ -161,19 +162,6 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context *ctx, const struct g return false; } - // TODO: support 
other GGML OPs using QNN API - // a GENERAL approach could fix this problem in a standalone PR of refine ggml backend - // subsystem for hybrid inference between CPU&GPU / CPU&NPU easily(less the 100 LoC and no - // side-effect to the existing codes) for ANY ggml backends which the backend's - // ggml_backend_xxx_buffer_is_host return true. this approach could be found at: - // https://github.com/ggerganov/llama.cpp/pull/7641 - bool supported_op = false; - supported_op = (tensor->op == GGML_OP_ADD); - supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL_MAT)); - if (!supported_op) { - return false; - } - // TODO: support other quantized data type if (ggml_is_quantized(src0->type)) { if (src0->type != GGML_TYPE_Q8_0 && src0->type != GGML_TYPE_Q4_0) { @@ -192,14 +180,18 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context *ctx, const struct g } bool ggml_qnn_compute_forward(ggml_backend_qnn_context *ctx, struct ggml_tensor *tensor) { - auto func = qnn::ggml_qnn_op_array()[tensor->op]; - if (!func) { - QNN_LOG_WARN("unsupported op %d", tensor->op); - return false; + auto unary_op = qnn::ggml_qnn_unary_op_array()[tensor->op]; + if (unary_op) { + return unary_op(ctx, tensor->src[0], tensor); } - func(ctx, tensor->src[0], tensor->src[1], tensor); - return true; + auto binary_op = qnn::ggml_qnn_binary_op_array()[tensor->op]; + if (binary_op) { + return binary_op(ctx, tensor->src[0], tensor->src[1], tensor); + } + + QNN_LOG_WARN("unsupported op %d", tensor->op); + return false; } static const char *ggml_backend_qnn_buffer_get_name(ggml_backend_buffer_t buffer) { @@ -232,7 +224,7 @@ GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t QNN_LOG_WARN("Create ggml_qnn_tensor failed"); return; } - + ctx->tensors.push_back(std::move(qnn_tensor)); } @@ -343,6 +335,7 @@ GGML_CALL static void ggml_backend_qnn_free(ggml_backend_t backend) { auto instance = g_qnn_mgr[ctx->device].instance; if (instance) { + ctx->qnn_unary_graph_cache.clear(); for (const auto &graph_item : ctx->qnn_binary_graph_cache) { QNN_LOG_INFO("graph type:%s", graph_item.first.c_str()); } diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 30f2e402cf16f..a516d8b06c344 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -12,6 +12,23 @@ namespace { +bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src, ggml_tensor *dst) { + if (!ctx || !src || !dst) { + QNN_LOG_WARN("invalid params\n"); + return false; + } + + auto instance = ctx->instance; + auto *tensor0 = qnn::ggml_qnn_tensor::from_ggml_tensor(src); + auto *tensor1 = qnn::ggml_qnn_tensor::from_ggml_tensor(dst); + if (!instance || !tensor0 || !tensor1) { + QNN_LOG_WARN("invalid tensors\n"); + return false; + } + + return true; +} + bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst) { if (!ctx || !src0 || !src1 || !dst) { @@ -33,15 +50,13 @@ bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, } // namespace -#define CHECK_PARAMS(ctx, src0, src1, dst) \ - do { \ - if (!qnn_is_valid_params((ctx), (src0), (src1), (dst))) { \ - return; \ - } \ - } while (0) +#define CHECK_PARAMS(ctx, ...) \ + if (!qnn_is_valid_params((ctx), __VA_ARGS__)) { \ + return false; \ + } #else -#define CHECK_PARAMS(ctx, src0, src1, dst) +#define CHECK_PARAMS(ctx, ...) 
#endif namespace { @@ -125,15 +140,33 @@ bool execute_graph(qnn::ggml_qnn_graph<_InputSize, _OutputSize> *graph, return true; } +qnn::ggml_qnn_unary_graph_cache_t &get_qnn_graph_cache(ggml_backend_qnn_context *ctx, + const std::array &inputs, + const std::array &outputs) { + GGML_UNUSED(inputs); + GGML_UNUSED(outputs); + return ctx->qnn_unary_graph_cache; +} + +qnn::ggml_qnn_binary_graph_cache_t &get_qnn_graph_cache(ggml_backend_qnn_context *ctx, + const std::array &inputs, + const std::array &outputs) { + GGML_UNUSED(inputs); + GGML_UNUSED(outputs); + return ctx->qnn_binary_graph_cache; +} + template -qnn::ggml_qnn_graph_binary *get_qnn_graph_from_cache(ggml_backend_qnn_context *ctx, ggml_op op, - const std::string &qnn_op, - const std::array &inputs, - const std::array &outputs) { +qnn::ggml_qnn_graph<_InputSize, _OutputSize> *get_qnn_graph_from_cache( + ggml_backend_qnn_context *ctx, ggml_op op, const std::string &qnn_op, + const std::array &inputs, const std::array &outputs) { + using graph_t = qnn::ggml_qnn_graph<_InputSize, _OutputSize>; + + auto &graph_cache = get_qnn_graph_cache(ctx, inputs, outputs); const std::string graph_key(ggml_op_name(op)); - auto it = ctx->qnn_binary_graph_cache.find(graph_key); - qnn::ggml_qnn_graph_binary *graph_ptr = nullptr; - if (it != ctx->qnn_binary_graph_cache.end()) { + auto it = graph_cache.find(graph_key); + graph_t *graph_ptr = nullptr; + if (it != graph_cache.end()) { graph_ptr = it->second.get(); } else { std::string graph_name = graph_key + "_" + std::to_string(ctx->threads); @@ -141,49 +174,49 @@ qnn::ggml_qnn_graph_binary *get_qnn_graph_from_cache(ggml_backend_qnn_context *c graph_name += "_"; graph_name += input->name; } - auto graph = std::make_unique(graph_name, (QNNBackend)(ctx->device), - ctx->instance->get_qnn_context_handle(), - ctx->raw_interface, ctx->socinfo.vtcm_size_in_mb); + auto graph = + std::make_unique(graph_name, (QNNBackend)(ctx->device), ctx->instance->get_qnn_context_handle(), + ctx->raw_interface, ctx->socinfo.vtcm_size_in_mb); if (!graph->is_valid()) { return nullptr; } - if (!qnn_bind_tensors_to_graph<2, 1>(graph.get(), qnn_op.c_str(), inputs, outputs)) { + if (!qnn_bind_tensors_to_graph<_InputSize, _OutputSize>(graph.get(), qnn_op.c_str(), inputs, outputs)) { return nullptr; } graph_ptr = graph.get(); - ctx->qnn_binary_graph_cache[graph_key] = std::move(graph); + graph_cache[graph_key] = std::move(graph); } return graph_ptr; } constexpr const char *kGgmlOpToQnnOp[] = { - nullptr, // GGML_OP_NONE - nullptr, // GGML_OP_DUP - QNN_OP_ELEMENT_WISE_ADD, // GGML_OP_ADD - nullptr, // GGML_OP_ADD1 - nullptr, // GGML_OP_ACC - nullptr, // GGML_OP_SUB - QNN_OP_ELEMENT_WISE_MULTIPLY, // GGML_OP_MUL - nullptr, // GGML_OP_DIV - nullptr, // GGML_OP_SQR - nullptr, // GGML_OP_SQRT - nullptr, // GGML_OP_LOG - nullptr, // GGML_OP_SUM - nullptr, // GGML_OP_SUM_ROWS - nullptr, // GGML_OP_MEAN - nullptr, // GGML_OP_ARGMAX - nullptr, // GGML_OP_REPEAT - nullptr, // GGML_OP_REPEAT_BACK - nullptr, // GGML_OP_CONCAT - nullptr, // GGML_OP_SILU_BACK - nullptr, // GGML_OP_NORM - nullptr, // GGML_OP_RMS_NORM - nullptr, // GGML_OP_RMS_NORM_BACK - nullptr, // GGML_OP_GROUP_NORM + nullptr, // GGML_OP_NONE + nullptr, // GGML_OP_DUP + QNN_OP_ELEMENT_WISE_ADD, // GGML_OP_ADD + nullptr, // GGML_OP_ADD1 + nullptr, // GGML_OP_ACC + QNN_OP_ELEMENT_WISE_SUBTRACT, // GGML_OP_SUB + QNN_OP_ELEMENT_WISE_MULTIPLY, // GGML_OP_MUL + QNN_OP_ELEMENT_WISE_DIVIDE, // GGML_OP_DIV + nullptr, // GGML_OP_SQR + QNN_OP_ELEMENT_WISE_SQUARE_ROOT, // GGML_OP_SQRT + nullptr, // 
GGML_OP_LOG + nullptr, // GGML_OP_SUM + nullptr, // GGML_OP_SUM_ROWS + nullptr, // GGML_OP_MEAN + nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_REPEAT + nullptr, // GGML_OP_REPEAT_BACK + nullptr, // GGML_OP_CONCAT + nullptr, // GGML_OP_SILU_BACK + nullptr, // GGML_OP_NORM + nullptr, // GGML_OP_RMS_NORM + nullptr, // GGML_OP_RMS_NORM_BACK + nullptr, // GGML_OP_GROUP_NORM QNN_OP_MAT_MUL, // GGML_OP_MUL_MAT nullptr, // GGML_OP_MUL_MAT_ID @@ -249,7 +282,7 @@ static_assert(sizeof(kGgmlOpToQnnOp) / sizeof(kGgmlOpToQnnOp[0]) == GGML_OP_COUN "GGML_OP_COUNT does not match the size of the ops table"); template -void qnn_binary_op_impl(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, +bool qnn_binary_op_impl(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst) { static_assert(kGgmlOpToQnnOp[_GgmlOp] != nullptr, "GGML_OP does not have a corresponding QNN_OP"); @@ -270,21 +303,137 @@ void qnn_binary_op_impl(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, print_ggml_tensor(src1); print_ggml_tensor(dst); } + + return succeed; +} + +template +bool qnn_unary_op_impl(ggml_backend_qnn_context *ctx, const ggml_tensor *src, ggml_tensor *dst) { + static_assert(kGgmlOpToQnnOp[_GgmlOp] != nullptr, "GGML_OP does not have a corresponding QNN_OP"); + + CHECK_PARAMS(ctx, src, dst); + + qnn::qnn_perf perf(ggml_op_name(_GgmlOp)); + perf.start(); + + bool succeed = false; + auto *graph_ptr = get_qnn_graph_from_cache<1, 1>(ctx, _GgmlOp, kGgmlOpToQnnOp[_GgmlOp], { src }, { dst }); + if (graph_ptr) { + succeed = execute_graph<1, 1>(graph_ptr, { src }, { dst }); + } + + if (!succeed) { + print_ggml_tensor(src); + print_ggml_tensor(dst); + } + + return succeed; } } // namespace -qnn::ggml_qnn_op_array_t qnn::ggml_qnn_op_array() { - static constexpr const qnn::ggml_qnn_op_t kQnnOpsTable[] = { +qnn::ggml_qnn_unary_op_array_t qnn::ggml_qnn_unary_op_array() { + static constexpr const qnn::ggml_qnn_unary_op_t kQnnOpsTable[] = { nullptr, // GGML_OP_NONE nullptr, // GGML_OP_DUP - qnn_binary_op_impl, // GGML_OP_ADD + nullptr, // GGML_OP_ADD nullptr, // GGML_OP_ADD1 nullptr, // GGML_OP_ACC nullptr, // GGML_OP_SUB - qnn_binary_op_impl, // GGML_OP_MUL + nullptr, // GGML_OP_MUL nullptr, // GGML_OP_DIV nullptr, // GGML_OP_SQR + qnn_unary_op_impl, // GGML_OP_SQRT + nullptr, // GGML_OP_LOG + nullptr, // GGML_OP_SUM + nullptr, // GGML_OP_SUM_ROWS + nullptr, // GGML_OP_MEAN + nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_REPEAT + nullptr, // GGML_OP_REPEAT_BACK + nullptr, // GGML_OP_CONCAT + nullptr, // GGML_OP_SILU_BACK + nullptr, // GGML_OP_NORM + nullptr, // GGML_OP_RMS_NORM + nullptr, // GGML_OP_RMS_NORM_BACK + nullptr, // GGML_OP_GROUP_NORM + + nullptr, // GGML_OP_MUL_MAT + nullptr, // GGML_OP_MUL_MAT_ID + nullptr, // GGML_OP_OUT_PROD + + nullptr, // GGML_OP_SCALE + nullptr, // GGML_OP_SET + nullptr, // GGML_OP_CPY + nullptr, // GGML_OP_CONT + nullptr, // GGML_OP_RESHAPE + nullptr, // GGML_OP_VIEW + nullptr, // GGML_OP_PERMUTE + nullptr, // GGML_OP_TRANSPOSE + nullptr, // GGML_OP_GET_ROWS + nullptr, // GGML_OP_GET_ROWS_BACK + nullptr, // GGML_OP_DIAG + nullptr, // GGML_OP_DIAG_MASK_INF + nullptr, // GGML_OP_DIAG_MASK_ZERO + nullptr, // GGML_OP_SOFT_MAX + nullptr, // GGML_OP_SOFT_MAX_BACK + nullptr, // GGML_OP_ROPE + nullptr, // GGML_OP_ROPE_BACK + nullptr, // GGML_OP_CLAMP + nullptr, // GGML_OP_CONV_TRANSPOSE_1D + nullptr, // GGML_OP_IM2COL + nullptr, // GGML_OP_CONV_TRANSPOSE_2D + nullptr, // GGML_OP_POOL_1D + nullptr, // GGML_OP_POOL_2D + 
nullptr, // GGML_OP_UPSCALE + nullptr, // GGML_OP_PAD + nullptr, // GGML_OP_ARANGE + nullptr, // GGML_OP_TIMESTEP_EMBEDDING + nullptr, // GGML_OP_ARGSORT + nullptr, // GGML_OP_LEAKY_RELU + + nullptr, // GGML_OP_FLASH_ATTN_EXT + nullptr, // GGML_OP_FLASH_ATTN_BACK + nullptr, // GGML_OP_SSM_CONV + nullptr, // GGML_OP_SSM_SCAN + nullptr, // GGML_OP_WIN_PART + nullptr, // GGML_OP_WIN_UNPART + nullptr, // GGML_OP_GET_REL_POS + nullptr, // GGML_OP_ADD_REL_POS + + nullptr, // GGML_OP_UNARY + + nullptr, // GGML_OP_MAP_UNARY + nullptr, // GGML_OP_MAP_BINARY + + nullptr, // GGML_OP_MAP_CUSTOM1_F32 + nullptr, // GGML_OP_MAP_CUSTOM2_F32 + nullptr, // GGML_OP_MAP_CUSTOM3_F32 + + nullptr, // GGML_OP_MAP_CUSTOM1 + nullptr, // GGML_OP_MAP_CUSTOM2 + nullptr, // GGML_OP_MAP_CUSTOM3 + + nullptr, // GGML_OP_CROSS_ENTROPY_LOSS + nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK + }; + + static_assert(sizeof(kQnnOpsTable) / sizeof(kQnnOpsTable[0]) == GGML_OP_COUNT, + "GGML_OP_COUNT does not match the size of the ops table"); + return kQnnOpsTable; +} + +qnn::ggml_qnn_binary_op_array_t qnn::ggml_qnn_binary_op_array() { + static constexpr const qnn::ggml_qnn_binary_op_t kQnnOpsTable[] = { + nullptr, // GGML_OP_NONE + nullptr, // GGML_OP_DUP + qnn_binary_op_impl, // GGML_OP_ADD + nullptr, // GGML_OP_ADD1 + nullptr, // GGML_OP_ACC + qnn_binary_op_impl, // GGML_OP_SUB + qnn_binary_op_impl, // GGML_OP_MUL + qnn_binary_op_impl, // GGML_OP_DIV + nullptr, // GGML_OP_SQR nullptr, // GGML_OP_SQRT nullptr, // GGML_OP_LOG nullptr, // GGML_OP_SUM diff --git a/ggml/src/ggml-qnn/backend-ops.hpp b/ggml/src/ggml-qnn/backend-ops.hpp index 01c23ecff9b16..8d94fc6c25424 100644 --- a/ggml/src/ggml-qnn/backend-ops.hpp +++ b/ggml/src/ggml-qnn/backend-ops.hpp @@ -6,11 +6,14 @@ namespace qnn { -typedef void (*ggml_qnn_op_t)(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst); +typedef bool (*ggml_qnn_unary_op_t)(ggml_backend_qnn_context *ctx, const ggml_tensor *src, ggml_tensor *dst); +typedef bool (*ggml_qnn_binary_op_t)(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst); -typedef const ggml_qnn_op_t (&ggml_qnn_op_array_t)[GGML_OP_COUNT]; +typedef const ggml_qnn_unary_op_t (&ggml_qnn_unary_op_array_t)[GGML_OP_COUNT]; +typedef const ggml_qnn_binary_op_t (&ggml_qnn_binary_op_array_t)[GGML_OP_COUNT]; -ggml_qnn_op_array_t ggml_qnn_op_array(); +ggml_qnn_unary_op_array_t ggml_qnn_unary_op_array(); +ggml_qnn_binary_op_array_t ggml_qnn_binary_op_array(); } // namespace qnn diff --git a/ggml/src/ggml-qnn/backend.hpp b/ggml/src/ggml-qnn/backend.hpp index 48b243577ca1f..0ec927779cc31 100644 --- a/ggml/src/ggml-qnn/backend.hpp +++ b/ggml/src/ggml-qnn/backend.hpp @@ -11,6 +11,11 @@ #include "graph.hpp" #include "qnn.hpp" +namespace qnn { +typedef std::unordered_map> ggml_qnn_unary_graph_cache_t; +typedef std::unordered_map> ggml_qnn_binary_graph_cache_t; +} // namespace qnn + struct ggml_backend_qnn_context { int device; int threads; @@ -21,5 +26,6 @@ struct ggml_backend_qnn_context { QNN_INTERFACE_VER_TYPE raw_interface; QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; qnn::qcom_socinfo socinfo; - std::unordered_map> qnn_binary_graph_cache; + qnn::ggml_qnn_unary_graph_cache_t qnn_unary_graph_cache; + qnn::ggml_qnn_binary_graph_cache_t qnn_binary_graph_cache; }; From c1e2283887c3fb6d09b2b3fdedd3847f1060ddfa Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sat, 13 Jul 2024 10:55:36 +0800 Subject: [PATCH 064/166] expose op at unit test --- 
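For reference, assuming the test binary built from tests/ggml-qnn/ggml-qnn-ut.cpp
is named ggml-qnn-ut, an op listed in kMapStringToGGMLOp can be exercised through
the existing -t option, for example:

    ./ggml-qnn-ut -t GGML_OP_SQRT

Ops not present in the table still fall through to show_usage().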
tests/ggml-qnn/ggml-qnn-ut.cpp | 50 ++++++++++++++++++++++++++++++---- 1 file changed, 44 insertions(+), 6 deletions(-) diff --git a/tests/ggml-qnn/ggml-qnn-ut.cpp b/tests/ggml-qnn/ggml-qnn-ut.cpp index 96dfa2bcfe27e..dea336966061c 100644 --- a/tests/ggml-qnn/ggml-qnn-ut.cpp +++ b/tests/ggml-qnn/ggml-qnn-ut.cpp @@ -327,21 +327,51 @@ static void show_usage() { ); } +typedef ggml_tensor * (*ggml_op_unary_t)( + ggml_context * ctx, + ggml_tensor * a); typedef ggml_tensor * (*ggml_op_binary_t)( ggml_context * ctx, ggml_tensor * a, ggml_tensor * b); +static constexpr const ggml_op_unary_t kUnaryOps[] = { + nullptr, // GGML_OP_NONE + nullptr, // GGML_OP_DUP + nullptr, // GGML_OP_ADD + nullptr, // GGML_OP_ADD1 + nullptr, // GGML_OP_ACC + nullptr, // GGML_OP_SUB + nullptr, // GGML_OP_MUL + nullptr, // GGML_OP_DIV + nullptr, // GGML_OP_SQR + ggml_sqrt, // GGML_OP_SQRT + nullptr, // GGML_OP_LOG + nullptr, // GGML_OP_SUM + nullptr, // GGML_OP_SUM_ROWS + nullptr, // GGML_OP_MEAN + nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_REPEAT + nullptr, // GGML_OP_REPEAT_BACK + nullptr, // GGML_OP_CONCAT + nullptr, // GGML_OP_SILU_BACK + nullptr, // GGML_OP_NORM + nullptr, // GGML_OP_RMS_NORM + nullptr, // GGML_OP_RMS_NORM_BACK + nullptr, // GGML_OP_GROUP_NORM + nullptr, // GGML_OP_MUL_MAT +}; + static constexpr const ggml_op_binary_t kBinaryOps[] = { nullptr, // GGML_OP_NONE nullptr, // GGML_OP_DUP ggml_add, // GGML_OP_ADD nullptr, // GGML_OP_ADD1 nullptr, // GGML_OP_ACC - nullptr, // GGML_OP_SUB + ggml_sub, // GGML_OP_SUB ggml_mul, // GGML_OP_MUL - nullptr, // GGML_OP_DIV + ggml_div, // GGML_OP_DIV nullptr, // GGML_OP_SQR nullptr, // GGML_OP_SQRT nullptr, // GGML_OP_LOG @@ -433,8 +463,11 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { ggml_set_input(src0); ggml_set_input(src1); + auto unary_op = kUnaryOps[n_ggml_op_type]; auto binary_op = kBinaryOps[n_ggml_op_type]; - if (binary_op) { + if (unary_op) { + dst = unary_op(ctx, src0); + } else if (binary_op) { dst = binary_op(ctx, src0, src1); } else { QNN_LOG_WARN("ggml op %d(%s) not supported", n_ggml_op_type, @@ -504,10 +537,15 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { return 0; } +#define DEFINE_OP(op) { #op, op } + static const std::unordered_map kMapStringToGGMLOp = { - {"GGML_OP_ADD", GGML_OP_ADD}, - {"GGML_OP_MUL_MAT", GGML_OP_MUL_MAT}, - {"GGML_OP_MUL", GGML_OP_MUL}, + DEFINE_OP(GGML_OP_ADD), + DEFINE_OP(GGML_OP_SUB), + DEFINE_OP(GGML_OP_MUL), + DEFINE_OP(GGML_OP_DIV), + DEFINE_OP(GGML_OP_SQRT), + DEFINE_OP(GGML_OP_MUL_MAT), }; int main(int argc, char * argv[]) { From 148ceab70c8b08a93345819b817cf08e19a5316a Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sun, 14 Jul 2024 22:57:09 +0800 Subject: [PATCH 065/166] add log op --- ggml/src/ggml-qnn/backend-ops.cpp | 4 ++-- tests/ggml-qnn/ggml-qnn-ut.cpp | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index a516d8b06c344..711f707531228 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -204,7 +204,7 @@ constexpr const char *kGgmlOpToQnnOp[] = { QNN_OP_ELEMENT_WISE_DIVIDE, // GGML_OP_DIV nullptr, // GGML_OP_SQR QNN_OP_ELEMENT_WISE_SQUARE_ROOT, // GGML_OP_SQRT - nullptr, // GGML_OP_LOG + QNN_OP_ELEMENT_WISE_LOG, // GGML_OP_LOG nullptr, // GGML_OP_SUM nullptr, // GGML_OP_SUM_ROWS nullptr, // GGML_OP_MEAN @@ -344,7 +344,7 @@ qnn::ggml_qnn_unary_op_array_t qnn::ggml_qnn_unary_op_array() { nullptr, // 
GGML_OP_DIV nullptr, // GGML_OP_SQR qnn_unary_op_impl, // GGML_OP_SQRT - nullptr, // GGML_OP_LOG + qnn_unary_op_impl, // GGML_OP_LOG nullptr, // GGML_OP_SUM nullptr, // GGML_OP_SUM_ROWS nullptr, // GGML_OP_MEAN diff --git a/tests/ggml-qnn/ggml-qnn-ut.cpp b/tests/ggml-qnn/ggml-qnn-ut.cpp index dea336966061c..59e561f130e75 100644 --- a/tests/ggml-qnn/ggml-qnn-ut.cpp +++ b/tests/ggml-qnn/ggml-qnn-ut.cpp @@ -347,7 +347,7 @@ static constexpr const ggml_op_unary_t kUnaryOps[] = { nullptr, // GGML_OP_DIV nullptr, // GGML_OP_SQR ggml_sqrt, // GGML_OP_SQRT - nullptr, // GGML_OP_LOG + ggml_log, // GGML_OP_LOG nullptr, // GGML_OP_SUM nullptr, // GGML_OP_SUM_ROWS nullptr, // GGML_OP_MEAN @@ -546,6 +546,7 @@ static const std::unordered_map kMapStringToGGMLOp = { DEFINE_OP(GGML_OP_DIV), DEFINE_OP(GGML_OP_SQRT), DEFINE_OP(GGML_OP_MUL_MAT), + DEFINE_OP(GGML_OP_LOG), }; int main(int argc, char * argv[]) { From 30b40006cc3371ef7c5b4d43b44a5a4d4ec3d907 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sun, 14 Jul 2024 23:50:11 +0800 Subject: [PATCH 066/166] remove unused declarations --- ggml/src/ggml-qnn.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index de1fefe497e58..f1de6b18591c6 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -142,7 +142,7 @@ struct ggml_backend_qnn_buffer_type_context { // ================================================================================================= static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context *ctx, const struct ggml_tensor *tensor, bool b_dump_tensor_info) { - if (ggml_is_empty(tensor) || + if (ggml_is_empty(tensor) || (!qnn::ggml_qnn_unary_op_array()[tensor->op] && !qnn::ggml_qnn_binary_op_array()[tensor->op])) { return false; } @@ -569,9 +569,7 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char *qnn_lib_path) { return qnn_backend; } -extern "C" GGML_CALL int ggml_backend_qnn_reg_devices(void); - -GGML_CALL int ggml_backend_qnn_reg_devices() { +int ggml_backend_qnn_reg_devices() { for (size_t idx = 0; idx < GGML_QNN_MAX_DEVICES; idx++) { char name[GGML_MAX_NAME]; ggml_backend_qnn_get_device_description(idx, name, GGML_MAX_NAME); From c46b4deea9bbf02000a355f37fcddc27a3a0ad76 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Mon, 15 Jul 2024 10:23:12 +0800 Subject: [PATCH 067/166] [unit test] init all tensor by one function --- tests/ggml-qnn/ggml-qnn-ut.cpp | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/tests/ggml-qnn/ggml-qnn-ut.cpp b/tests/ggml-qnn/ggml-qnn-ut.cpp index 59e561f130e75..f19a6355d30fe 100644 --- a/tests/ggml-qnn/ggml-qnn-ut.cpp +++ b/tests/ggml-qnn/ggml-qnn-ut.cpp @@ -80,11 +80,11 @@ static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const static const char * get_qnn_backend_name(int n_backend_type) { switch (n_backend_type) { - case 0: + case QNN_BACKEND_CPU: return "QNN-CPU"; - case 1: + case QNN_BACKEND_GPU: return "QNN-GPU"; - case 2: + case QNN_BACKEND_NPU: return "QNN-NPU"; case 3: return "ggml"; @@ -494,16 +494,7 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { gf = ggml_new_graph(ctx); ggml_build_forward_expand(gf, dst); - if (n_backend_type != QNN_BACKEND_GGML) { - initialize_tensors(ctx); - } else { - if (qtype == GGML_TYPE_F32) { - ggml_set_f32(src0, 2.f); - } else { - initialize_tensors(ctx); - } - ggml_set_f32(src1, 3.f); - } + initialize_tensors(ctx); ggml_graph_compute_helper(backend, gf, work_buffer, num_threads, 
nullptr, nullptr); From 4410fd65630be9f782fe0d2e484de08ada18dcaa Mon Sep 17 00:00:00 2001 From: hongruichen Date: Mon, 15 Jul 2024 10:30:57 +0800 Subject: [PATCH 068/166] format with clang-format --- tests/ggml-qnn/ggml-qnn-ut.cpp | 371 ++++++++++++++++----------------- 1 file changed, 174 insertions(+), 197 deletions(-) diff --git a/tests/ggml-qnn/ggml-qnn-ut.cpp b/tests/ggml-qnn/ggml-qnn-ut.cpp index f19a6355d30fe..fefb262445dc0 100644 --- a/tests/ggml-qnn/ggml-qnn-ut.cpp +++ b/tests/ggml-qnn/ggml-qnn-ut.cpp @@ -1,67 +1,67 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include #include -#include +#include #include +#include #include -#include +#include +#include +#include +#include +#include +#include #include +#include +#include -#include -#include -#include -#include -#include -#include -#include -#include +#include +#include +#include #include -#include +#include #include -#include -#include +#include +#include #include -#include +#include +#include #include -#include +#include +#include +#include +#include +#include +#include #include -#include -#include #include #include +#include #include "ggml.h" + #include "ggml-alloc.h" #include "ggml-backend.h" #include "ggml-qnn.h" -#define GGML_QNN_DEBUG 1 +#define GGML_QNN_DEBUG 1 #define GGML_QNN_LOGBUF_LEN 4096 -#define QNN_LOG_ERROR(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#define QNN_LOG_WARN(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#define QNN_LOG_INFO(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define QNN_LOG_ERROR(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define QNN_LOG_WARN(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define QNN_LOG_INFO(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) #if GGML_QNN_DEBUG -#define QNN_LOG_DEBUG(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define QNN_LOG_DEBUG(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) #else #define QNN_LOG_DEBUG(...) #endif -static void tensor_dump(const ggml_tensor * tensor, const char * name); +static void tensor_dump(const ggml_tensor *tensor, const char *name); #define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor) -static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...) { +static void ggml_qnn_log_internal(ggml_log_level level, const char *file, const char *func, int line, + const char *format, ...) 
{ static std::mutex ggml_qnn_log_internal_mutex; static char s_ggml_qnn_log_internal_buf[GGML_QNN_LOGBUF_LEN]; @@ -78,7 +78,7 @@ static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const } } -static const char * get_qnn_backend_name(int n_backend_type) { +static const char *get_qnn_backend_name(int n_backend_type) { switch (n_backend_type) { case QNN_BACKEND_CPU: return "QNN-CPU"; @@ -93,13 +93,9 @@ static const char * get_qnn_backend_name(int n_backend_type) { } } -static bool ggml_graph_compute_helper( - struct ggml_backend * backend, - struct ggml_cgraph * graph, - std::vector & buf, - int n_threads, - ggml_abort_callback abort_callback, - void * abort_callback_data) { +static bool ggml_graph_compute_helper(struct ggml_backend *backend, struct ggml_cgraph *graph, + std::vector &buf, int n_threads, ggml_abort_callback abort_callback, + void *abort_callback_data) { struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); plan.abort_callback = abort_callback; @@ -129,8 +125,8 @@ static bool ggml_graph_compute_helper( #define QK8_0 32 typedef struct { - uint16_t d; // delta - int8_t qs[QK8_0]; // quants + uint16_t d; // delta + int8_t qs[QK8_0]; // quants } block_q8_0; static inline float ggml_compute_fp16_to_fp32(uint16_t h) { @@ -141,12 +137,11 @@ static inline float ggml_compute_fp16_to_fp32(uint16_t h) { #define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) -static void tensor_dump(const ggml_tensor * tensor, const char * name) { - QNN_LOG_DEBUG("dump ggml tensor %s(%s): type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - name, tensor->name, - tensor->type, ggml_type_name(tensor->type), - tensor->ne[0], tensor->ne[1], tensor->ne[2], - tensor->nb[0], tensor->nb[1], tensor->nb[2]); +static void tensor_dump(const ggml_tensor *tensor, const char *name) { + QNN_LOG_DEBUG("dump ggml tensor %s(%s): type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 + ", nb = (%5zi, %5zi, %5zi)\n", + name, tensor->name, tensor->type, ggml_type_name(tensor->type), tensor->ne[0], tensor->ne[1], + tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]); float value = 0; std::ostringstream tmposs; @@ -160,10 +155,8 @@ static void tensor_dump(const ggml_tensor * tensor, const char * name) { for (int i = 0; i < tensor->ne[2]; i++) { for (int j = 0; j < tensor->ne[1]; j++) { for (int k = 0; k < tensor->ne[0]; k++) { - value = ((int8_t *) tensor->data)[h * tensor->ne[2] + i * tensor->ne[1] + - j * tensor->ne[0] + k]; - tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value - << " "; + value = ((int8_t *)tensor->data)[h * tensor->ne[2] + i * tensor->ne[1] + j * tensor->ne[0] + k]; + tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value << " "; } tmposs << "\n"; } @@ -181,10 +174,8 @@ static void tensor_dump(const ggml_tensor * tensor, const char * name) { for (int i = 0; i < tensor->ne[2]; i++) { for (int j = 0; j < tensor->ne[1]; j++) { for (int k = 0; k < tensor->ne[0]; k++) { - value = ((float *) tensor->data)[h * tensor->ne[2] + i * tensor->ne[1] + - j * tensor->ne[0] + k]; - tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value - << " "; + value = ((float *)tensor->data)[h * tensor->ne[2] + i * tensor->ne[1] + j * tensor->ne[0] + k]; + tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value << " "; } tmposs << "\n"; } @@ -202,11 +193,11 @@ static void tensor_dump(const ggml_tensor * tensor, const char * name) { for (int i = 0; i < tensor->ne[2]; i++) { for (int j = 0; 
j < tensor->ne[1]; j++) { for (int k = 0; k < tensor->ne[0]; k++) { - unsigned short tmpvalue = ((unsigned short *) tensor->data)[h * tensor->ne[2] + i * tensor->ne[1] + - j * tensor->ne[0] + k]; + unsigned short tmpvalue = + ((unsigned short *) + tensor->data)[h * tensor->ne[2] + i * tensor->ne[1] + j * tensor->ne[0] + k]; value = GGML_FP16_TO_FP32(tmpvalue); - tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value - << " "; + tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value << " "; } tmposs << "\n"; } @@ -220,15 +211,14 @@ static void tensor_dump(const ggml_tensor * tensor, const char * name) { } if (tensor->type == GGML_TYPE_Q8_0) { - block_q8_0 * tmp = ((block_q8_0 *)tensor->data); - for (int j = 0; j < tensor->ne[1]; j++) { - int n = tensor->ne[0] / QK8_0; //blocks per row + block_q8_0 *tmp = ((block_q8_0 *)tensor->data); + for (int j = 0; j < tensor->ne[1]; j++) { + int n = tensor->ne[0] / QK8_0; // blocks per row for (int z = 0; z < n; z++) { - const float d = GGML_FP16_TO_FP32(tmp[ j * n + z ].d); + const float d = GGML_FP16_TO_FP32(tmp[j * n + z].d); for (int k = 0; k < QK8_0; k++) { value = tmp[j * n + z].qs[k] * d; - tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value - << " "; + tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value << " "; } } tmposs << "\n"; @@ -241,7 +231,7 @@ static void tensor_dump(const ggml_tensor * tensor, const char * name) { } } -static uint32_t get_tensor_rank(const ggml_tensor * tensor) { +static uint32_t get_tensor_rank(const ggml_tensor *tensor) { uint32_t rank = 0; for (int i = 0; i < GGML_MAX_DIMS; i++) { if ((0 != tensor->ne[i]) && (1 != tensor->ne[i])) { @@ -251,7 +241,7 @@ static uint32_t get_tensor_rank(const ggml_tensor * tensor) { return rank; } -static uint32_t get_tensor_data_size(const ggml_tensor * tensor) { +static uint32_t get_tensor_data_size(const ggml_tensor *tensor) { size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); size_t n_dims = get_tensor_rank(tensor); for (int i = 1; i < n_dims; i++) { @@ -264,8 +254,8 @@ static uint32_t get_tensor_data_size(const ggml_tensor * tensor) { return ggml_nbytes(tensor); } -//ref: https://github.com/ggerganov/llama.cpp/blob/master/tests/test-backend-ops.cpp#L20 -static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) { +// ref: https://github.com/ggerganov/llama.cpp/blob/master/tests/test-backend-ops.cpp#L20 +static void init_tensor_uniform(ggml_tensor *tensor, float min = -1.0f, float max = 1.0f) { size_t size = ggml_nelements(tensor); std::vector data(size); for (size_t i = 0; i < size; i++) { @@ -274,7 +264,7 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) { #ifdef GGML_USE_QNN - memcpy((char*)tensor->data, data.data(), size * sizeof(float)); + memcpy((char *)tensor->data, data.data(), size * sizeof(float)); #else ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float)); #endif @@ -282,25 +272,25 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0); std::vector dataq(ggml_row_size(tensor->type, size)); std::vector imatrix(tensor->ne[0], 1.0f); // dummy importance matrix - const float * im = imatrix.data(); + const float *im = imatrix.data(); if (!ggml_quantize_requires_imatrix(tensor->type)) { // when the imatrix is optional, we want to test both quantization with and without 
imatrix // use one of the random numbers to decide - if (data[0] > 0.5f*(min + max)) { + if (data[0] > 0.5f * (min + max)) { im = nullptr; } } - ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], im); + ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size / tensor->ne[0], tensor->ne[0], im); GGML_ASSERT(ggml_validate_row_data(tensor->type, dataq.data(), dataq.size())); #ifdef GGML_USE_QNN - memcpy((char*)tensor->data, dataq.data(), dataq.size()); + memcpy((char *)tensor->data, dataq.data(), dataq.size()); #else ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size()); #endif } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) { // This is going to create some weird integers though. #ifdef GGML_USE_QNN - memcpy((char*)tensor->data, data.data(), ggml_nbytes(tensor)); + memcpy((char *)tensor->data, data.data(), ggml_nbytes(tensor)); #else ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor)); #endif @@ -309,125 +299,117 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m } } -//ref: https://github.com/ggerganov/llama.cpp/blob/master/tests/test-backend-ops.cpp#L310 -static void initialize_tensors(ggml_context * ctx) { - for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { +// ref: https://github.com/ggerganov/llama.cpp/blob/master/tests/test-backend-ops.cpp#L310 +static void initialize_tensors(ggml_context *ctx) { + for (ggml_tensor *t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { init_tensor_uniform(t); } } static void show_usage() { - printf(" " \ - "\nUsage: test_qnn_ops [options]\n" \ - "\n" \ - "Options:\n" \ - " -t GGML_OP_ADD / GGML_OP_MULMAT\n" \ - " -b 0(QNN_CPU) 1(QNN_GPU) 2(QNN_NPU) 3(ggml)\n" \ - " ?/h print usage infomation\n\n" - ); + printf( + " " + "\nUsage: test_qnn_ops [options]\n" + "\n" + "Options:\n" + " -t GGML_OP_ADD / GGML_OP_MULMAT\n" + " -b 0(QNN_CPU) 1(QNN_GPU) 2(QNN_NPU) 3(ggml)\n" + " ?/h print usage infomation\n\n"); } -typedef ggml_tensor * (*ggml_op_unary_t)( - ggml_context * ctx, - ggml_tensor * a); +typedef ggml_tensor *(*ggml_op_unary_t)(ggml_context *ctx, ggml_tensor *a); -typedef ggml_tensor * (*ggml_op_binary_t)( - ggml_context * ctx, - ggml_tensor * a, - ggml_tensor * b); +typedef ggml_tensor *(*ggml_op_binary_t)(ggml_context *ctx, ggml_tensor *a, ggml_tensor *b); static constexpr const ggml_op_unary_t kUnaryOps[] = { - nullptr, // GGML_OP_NONE - nullptr, // GGML_OP_DUP - nullptr, // GGML_OP_ADD - nullptr, // GGML_OP_ADD1 - nullptr, // GGML_OP_ACC - nullptr, // GGML_OP_SUB - nullptr, // GGML_OP_MUL - nullptr, // GGML_OP_DIV - nullptr, // GGML_OP_SQR - ggml_sqrt, // GGML_OP_SQRT - ggml_log, // GGML_OP_LOG - nullptr, // GGML_OP_SUM - nullptr, // GGML_OP_SUM_ROWS - nullptr, // GGML_OP_MEAN - nullptr, // GGML_OP_ARGMAX - nullptr, // GGML_OP_REPEAT - nullptr, // GGML_OP_REPEAT_BACK - nullptr, // GGML_OP_CONCAT - nullptr, // GGML_OP_SILU_BACK - nullptr, // GGML_OP_NORM - nullptr, // GGML_OP_RMS_NORM - nullptr, // GGML_OP_RMS_NORM_BACK - nullptr, // GGML_OP_GROUP_NORM - nullptr, // GGML_OP_MUL_MAT + nullptr, // GGML_OP_NONE + nullptr, // GGML_OP_DUP + nullptr, // GGML_OP_ADD + nullptr, // GGML_OP_ADD1 + nullptr, // GGML_OP_ACC + nullptr, // GGML_OP_SUB + nullptr, // GGML_OP_MUL + nullptr, // GGML_OP_DIV + nullptr, // GGML_OP_SQR + ggml_sqrt, // GGML_OP_SQRT + ggml_log, // GGML_OP_LOG + nullptr, // GGML_OP_SUM + 
nullptr, // GGML_OP_SUM_ROWS + nullptr, // GGML_OP_MEAN + nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_REPEAT + nullptr, // GGML_OP_REPEAT_BACK + nullptr, // GGML_OP_CONCAT + nullptr, // GGML_OP_SILU_BACK + nullptr, // GGML_OP_NORM + nullptr, // GGML_OP_RMS_NORM + nullptr, // GGML_OP_RMS_NORM_BACK + nullptr, // GGML_OP_GROUP_NORM + nullptr, // GGML_OP_MUL_MAT }; static constexpr const ggml_op_binary_t kBinaryOps[] = { - nullptr, // GGML_OP_NONE - nullptr, // GGML_OP_DUP - ggml_add, // GGML_OP_ADD - nullptr, // GGML_OP_ADD1 - nullptr, // GGML_OP_ACC - ggml_sub, // GGML_OP_SUB - ggml_mul, // GGML_OP_MUL - ggml_div, // GGML_OP_DIV - nullptr, // GGML_OP_SQR - nullptr, // GGML_OP_SQRT - nullptr, // GGML_OP_LOG - nullptr, // GGML_OP_SUM - nullptr, // GGML_OP_SUM_ROWS - nullptr, // GGML_OP_MEAN - nullptr, // GGML_OP_ARGMAX - nullptr, // GGML_OP_REPEAT - nullptr, // GGML_OP_REPEAT_BACK - nullptr, // GGML_OP_CONCAT - nullptr, // GGML_OP_SILU_BACK - nullptr, // GGML_OP_NORM - nullptr, // GGML_OP_RMS_NORM - nullptr, // GGML_OP_RMS_NORM_BACK - nullptr, // GGML_OP_GROUP_NORM - ggml_mul_mat, // GGML_OP_MUL_MAT + nullptr, // GGML_OP_NONE + nullptr, // GGML_OP_DUP + ggml_add, // GGML_OP_ADD + nullptr, // GGML_OP_ADD1 + nullptr, // GGML_OP_ACC + ggml_sub, // GGML_OP_SUB + ggml_mul, // GGML_OP_MUL + ggml_div, // GGML_OP_DIV + nullptr, // GGML_OP_SQR + nullptr, // GGML_OP_SQRT + nullptr, // GGML_OP_LOG + nullptr, // GGML_OP_SUM + nullptr, // GGML_OP_SUM_ROWS + nullptr, // GGML_OP_MEAN + nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_REPEAT + nullptr, // GGML_OP_REPEAT_BACK + nullptr, // GGML_OP_CONCAT + nullptr, // GGML_OP_SILU_BACK + nullptr, // GGML_OP_NORM + nullptr, // GGML_OP_RMS_NORM + nullptr, // GGML_OP_RMS_NORM_BACK + nullptr, // GGML_OP_GROUP_NORM + ggml_mul_mat, // GGML_OP_MUL_MAT }; static_assert(kBinaryOps[GGML_OP_MUL_MAT] == ggml_mul_mat, "ggml_mul_mat at wrong index, check kBinaryOps"); static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { - int64_t n_begin_time = 0LL; - int64_t n_end_time = 0LL; - int64_t n_duration = 0LL; - size_t ctx_size = 0; - int sizey = 4; - int sizex = 4; - - struct ggml_context * ctx = nullptr; - struct ggml_cgraph * gf = nullptr; - struct ggml_tensor * src0 = nullptr; - struct ggml_tensor * src1 = nullptr; - struct ggml_tensor * dst = nullptr; - ggml_backend_t backend = nullptr; - ggml_backend_buffer_t buffer= nullptr; - - ggml_type qtype = GGML_TYPE_I8; - qtype = GGML_TYPE_F16; - qtype = GGML_TYPE_Q8_0; - qtype = GGML_TYPE_F32; + int64_t n_begin_time = 0LL; + int64_t n_end_time = 0LL; + int64_t n_duration = 0LL; + size_t ctx_size = 0; + int sizey = 4; + int sizex = 4; + + struct ggml_context *ctx = nullptr; + struct ggml_cgraph *gf = nullptr; + struct ggml_tensor *src0 = nullptr; + struct ggml_tensor *src1 = nullptr; + struct ggml_tensor *dst = nullptr; + ggml_backend_t backend = nullptr; + ggml_backend_buffer_t buffer = nullptr; + + ggml_type qtype = GGML_TYPE_I8; + qtype = GGML_TYPE_F16; + qtype = GGML_TYPE_Q8_0; + qtype = GGML_TYPE_F32; std::vector work_buffer; QNN_LOG_DEBUG("enter qnn_ggml_op\n"); - QNN_LOG_DEBUG("ggml op:%d(%s)\n", n_ggml_op_type, ggml_op_name((enum ggml_op) n_ggml_op_type)); + QNN_LOG_DEBUG("ggml op:%d(%s)\n", n_ggml_op_type, ggml_op_name((enum ggml_op)n_ggml_op_type)); n_begin_time = ggml_time_us(); ctx_size += 1024 * 1024 * 32; - QNN_LOG_DEBUG("Allocating Memory of size %zi bytes, %zi MB\n", ctx_size, - (ctx_size / 1024 / 1024)); + QNN_LOG_DEBUG("Allocating Memory of size %zi bytes, %zi MB\n", ctx_size, 
(ctx_size / 1024 / 1024)); - struct ggml_init_params params = { - /*.mem_size =*/ ctx_size, - /*.mem_buffer =*/ NULL, - /* no_alloc =*/ 0 - }; + struct ggml_init_params params = { /*.mem_size =*/ctx_size, + /*.mem_buffer =*/NULL, + /* no_alloc =*/0 }; if (n_backend_type != QNN_BACKEND_GGML) { params.no_alloc = true; @@ -470,8 +452,7 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { } else if (binary_op) { dst = binary_op(ctx, src0, src1); } else { - QNN_LOG_WARN("ggml op %d(%s) not supported", n_ggml_op_type, - ggml_op_name((enum ggml_op) n_ggml_op_type)); + QNN_LOG_WARN("ggml op %d(%s) not supported", n_ggml_op_type, ggml_op_name((enum ggml_op)n_ggml_op_type)); ggml_free(ctx); ggml_backend_free(backend); return 3; @@ -504,17 +485,17 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { TENSOR_DUMP(src1); TENSOR_DUMP(dst); } else { - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, - src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 + ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], src0->nb[1], src0->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, - src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 + ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], src1->nb[1], src1->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - dst->name, - dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 + ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); } @@ -524,26 +505,22 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { n_end_time = ggml_time_us(); n_duration = (n_end_time - n_begin_time) / 1000; - QNN_LOG_DEBUG("duration of ut GGML_OP_%s using QNN backend %s: %lld milliseconds\n", ggml_op_name((enum ggml_op)n_ggml_op_type), get_qnn_backend_name(n_backend_type), n_duration); + QNN_LOG_DEBUG("duration of ut GGML_OP_%s using QNN backend %s: %lld milliseconds\n", + ggml_op_name((enum ggml_op)n_ggml_op_type), get_qnn_backend_name(n_backend_type), n_duration); return 0; } #define DEFINE_OP(op) { #op, op } static const std::unordered_map kMapStringToGGMLOp = { - DEFINE_OP(GGML_OP_ADD), - DEFINE_OP(GGML_OP_SUB), - DEFINE_OP(GGML_OP_MUL), - DEFINE_OP(GGML_OP_DIV), - DEFINE_OP(GGML_OP_SQRT), - DEFINE_OP(GGML_OP_MUL_MAT), - DEFINE_OP(GGML_OP_LOG), + DEFINE_OP(GGML_OP_ADD), DEFINE_OP(GGML_OP_SUB), DEFINE_OP(GGML_OP_MUL), DEFINE_OP(GGML_OP_DIV), + DEFINE_OP(GGML_OP_SQRT), DEFINE_OP(GGML_OP_MUL_MAT), DEFINE_OP(GGML_OP_LOG), }; -int main(int argc, char * argv[]) { - int num_threads = 4; - int n_backend_type = QNN_BACKEND_CPU; - int n_ggml_op_type = GGML_OP_ADD; +int main(int argc, char *argv[]) { + int num_threads = 4; + int n_backend_type = QNN_BACKEND_CPU; + int 
n_ggml_op_type = GGML_OP_ADD; for (int i = 1; i < argc; i++) { if (0 == strcmp(argv[i], "-t")) { @@ -561,7 +538,7 @@ int main(int argc, char * argv[]) { if (i + 1 < argc) { int backend = atoi(argv[i + 1]); if (backend <= QNN_BACKEND_GGML) - n_backend_type = backend; + n_backend_type = backend; else { show_usage(); return 1; @@ -575,9 +552,9 @@ int main(int argc, char * argv[]) { } QNN_LOG_DEBUG("enter qnn_ggml_op\n"); - QNN_LOG_DEBUG("backend %d, ggml op:%d(%s)", n_backend_type, n_ggml_op_type, ggml_op_name((enum ggml_op) n_ggml_op_type)); + QNN_LOG_DEBUG("backend %d, ggml op:%d(%s)", n_backend_type, n_ggml_op_type, + ggml_op_name((enum ggml_op)n_ggml_op_type)); qnn_op_ut(num_threads, n_backend_type, n_ggml_op_type); - return 0; } From cd5a7331f7cd1d79ab482c2a454e2ef963fff0ee Mon Sep 17 00:00:00 2001 From: hongruichen Date: Mon, 15 Jul 2024 10:50:33 +0800 Subject: [PATCH 069/166] add cpu backend as cross reference --- tests/ggml-qnn/ggml-qnn-ut.cpp | 39 ++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/tests/ggml-qnn/ggml-qnn-ut.cpp b/tests/ggml-qnn/ggml-qnn-ut.cpp index fefb262445dc0..a87781e52b070 100644 --- a/tests/ggml-qnn/ggml-qnn-ut.cpp +++ b/tests/ggml-qnn/ggml-qnn-ut.cpp @@ -244,7 +244,7 @@ static uint32_t get_tensor_rank(const ggml_tensor *tensor) { static uint32_t get_tensor_data_size(const ggml_tensor *tensor) { size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); size_t n_dims = get_tensor_rank(tensor); - for (int i = 1; i < n_dims; i++) { + for (size_t i = 1; i < n_dims; i++) { data_size *= tensor->ne[i]; } @@ -377,7 +377,8 @@ static constexpr const ggml_op_binary_t kBinaryOps[] = { static_assert(kBinaryOps[GGML_OP_MUL_MAT] == ggml_mul_mat, "ggml_mul_mat at wrong index, check kBinaryOps"); -static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { +static void qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type, ggml_type qtype, + std::vector &results) { int64_t n_begin_time = 0LL; int64_t n_end_time = 0LL; int64_t n_duration = 0LL; @@ -393,11 +394,6 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { ggml_backend_t backend = nullptr; ggml_backend_buffer_t buffer = nullptr; - ggml_type qtype = GGML_TYPE_I8; - qtype = GGML_TYPE_F16; - qtype = GGML_TYPE_Q8_0; - qtype = GGML_TYPE_F32; - std::vector work_buffer; QNN_LOG_DEBUG("enter qnn_ggml_op\n"); QNN_LOG_DEBUG("ggml op:%d(%s)\n", n_ggml_op_type, ggml_op_name((enum ggml_op)n_ggml_op_type)); @@ -416,14 +412,14 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { backend = ggml_backend_qnn_init(n_backend_type, "/data/local/tmp/"); if (nullptr == backend) { QNN_LOG_ERROR("create qnn backend %d(%s) failed\n", n_backend_type, get_qnn_backend_name(n_backend_type)); - return 1; + return; } } ctx = ggml_init(params); if (!ctx) { QNN_LOG_ERROR("%s: ggml_init() failed\n"); - return 2; + return; } QNN_LOG_DEBUG("creating new tensors\n"); @@ -455,7 +451,7 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { QNN_LOG_WARN("ggml op %d(%s) not supported", n_ggml_op_type, ggml_op_name((enum ggml_op)n_ggml_op_type)); ggml_free(ctx); ggml_backend_free(backend); - return 3; + return; } ggml_set_output(dst); @@ -466,7 +462,7 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { QNN_LOG_ERROR("%s: failed to allocate backend buffer\n", __func__); ggml_free(ctx); ggml_backend_free(backend); - return 4; + return; } } #endif @@ -484,6 +480,8 @@ 
static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { TENSOR_DUMP(src0); TENSOR_DUMP(src1); TENSOR_DUMP(dst); + results.resize(ggml_nbytes(dst)); + memcpy(results.data(), ggml_get_data(dst), ggml_nbytes(dst)); } else { QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", @@ -507,7 +505,6 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { n_duration = (n_end_time - n_begin_time) / 1000; QNN_LOG_DEBUG("duration of ut GGML_OP_%s using QNN backend %s: %lld milliseconds\n", ggml_op_name((enum ggml_op)n_ggml_op_type), get_qnn_backend_name(n_backend_type), n_duration); - return 0; } #define DEFINE_OP(op) { #op, op } @@ -517,6 +514,10 @@ static const std::unordered_map kMapStringToGGMLOp = { DEFINE_OP(GGML_OP_SQRT), DEFINE_OP(GGML_OP_MUL_MAT), DEFINE_OP(GGML_OP_LOG), }; +#define CONSOLE_RED "\033[31m" +#define CONSOLE_GREEN "\033[32m" +#define CONSOLE_RESET "\033[0m" + int main(int argc, char *argv[]) { int num_threads = 4; int n_backend_type = QNN_BACKEND_CPU; @@ -554,7 +555,17 @@ int main(int argc, char *argv[]) { QNN_LOG_DEBUG("enter qnn_ggml_op\n"); QNN_LOG_DEBUG("backend %d, ggml op:%d(%s)", n_backend_type, n_ggml_op_type, ggml_op_name((enum ggml_op)n_ggml_op_type)); - qnn_op_ut(num_threads, n_backend_type, n_ggml_op_type); - return 0; + std::vector results; + qnn_op_ut(num_threads, n_backend_type, n_ggml_op_type, GGML_TYPE_F32, results); + std::vector cpu_results; + qnn_op_ut(num_threads, QNN_BACKEND_GGML, n_ggml_op_type, GGML_TYPE_F32, cpu_results); + + if (results == cpu_results) { + QNN_LOG_INFO(CONSOLE_GREEN "[Result] results equal!" CONSOLE_RESET); + return 0; + } else { + QNN_LOG_ERROR(CONSOLE_RED "[Result] results not equal!" CONSOLE_RESET); + return 1; + } } From f32327e2b2182b170013f123b5469cab6d731d22 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Mon, 15 Jul 2024 11:19:01 +0800 Subject: [PATCH 070/166] remove multiply declearation of log in unit test --- ggml/src/ggml-qnn/logger.cpp | 2 - ggml/src/ggml-qnn/logger.hpp | 2 + tests/ggml-qnn/CMakeLists.txt | 4 ++ tests/ggml-qnn/ggml-qnn-ut.cpp | 71 ++++++++++------------------------ 4 files changed, 26 insertions(+), 53 deletions(-) diff --git a/ggml/src/ggml-qnn/logger.cpp b/ggml/src/ggml-qnn/logger.cpp index 5a1ad13ba40ce..8b74b90edf476 100644 --- a/ggml/src/ggml-qnn/logger.cpp +++ b/ggml/src/ggml-qnn/logger.cpp @@ -9,8 +9,6 @@ #include #endif -#define QNN_LOGBUF_LEN 4096 - void qnn::internal_log(ggml_log_level level, const char *file, const char *func, int line, const char *format, ...) 
{ static std::mutex qnn_internal_log_mutex; static char s_qnn_internal_log_buf[QNN_LOGBUF_LEN]; diff --git a/ggml/src/ggml-qnn/logger.hpp b/ggml/src/ggml-qnn/logger.hpp index f81a1814e9756..b4bab0c006691 100644 --- a/ggml/src/ggml-qnn/logger.hpp +++ b/ggml/src/ggml-qnn/logger.hpp @@ -9,6 +9,8 @@ #include "QnnTypes.h" #include "System/QnnSystemInterface.h" +#define QNN_LOGBUF_LEN 4096 + namespace qnn { void internal_log(ggml_log_level level, const char *file, const char *func, int line, const char *format, ...); diff --git a/tests/ggml-qnn/CMakeLists.txt b/tests/ggml-qnn/CMakeLists.txt index b4f1bd6c07482..e72cc13e78ce4 100644 --- a/tests/ggml-qnn/CMakeLists.txt +++ b/tests/ggml-qnn/CMakeLists.txt @@ -61,3 +61,7 @@ link_libraries(${LOG_LIB} android) add_executable(${TARGET_NAME} ${SOURCE_FILES} ) + +target_include_directories(${TARGET_NAME} PRIVATE + ../../ggml/src/ggml-qnn/ +) diff --git a/tests/ggml-qnn/ggml-qnn-ut.cpp b/tests/ggml-qnn/ggml-qnn-ut.cpp index a87781e52b070..2fea53e620dd8 100644 --- a/tests/ggml-qnn/ggml-qnn-ut.cpp +++ b/tests/ggml-qnn/ggml-qnn-ut.cpp @@ -43,40 +43,7 @@ #include "ggml-backend.h" #include "ggml-qnn.h" -#define GGML_QNN_DEBUG 1 -#define GGML_QNN_LOGBUF_LEN 4096 - -#define QNN_LOG_ERROR(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#define QNN_LOG_WARN(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#define QNN_LOG_INFO(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) - -#if GGML_QNN_DEBUG -#define QNN_LOG_DEBUG(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#else -#define QNN_LOG_DEBUG(...) -#endif - -static void tensor_dump(const ggml_tensor *tensor, const char *name); - -#define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor) - -static void ggml_qnn_log_internal(ggml_log_level level, const char *file, const char *func, int line, - const char *format, ...) 
{ - static std::mutex ggml_qnn_log_internal_mutex; - static char s_ggml_qnn_log_internal_buf[GGML_QNN_LOGBUF_LEN]; - - { - std::lock_guard lock(ggml_qnn_log_internal_mutex); - va_list args; - va_start(args, format); - int len_prefix = snprintf(s_ggml_qnn_log_internal_buf, GGML_QNN_LOGBUF_LEN, "[%s, %d]: ", func, line); - int len = vsnprintf(s_ggml_qnn_log_internal_buf + len_prefix, GGML_QNN_LOGBUF_LEN - len_prefix, format, args); - if (len < (GGML_QNN_LOGBUF_LEN - len_prefix)) { - printf("%s\n", s_ggml_qnn_log_internal_buf); - } - va_end(args); - } -} +#include "logger.hpp" static const char *get_qnn_backend_name(int n_backend_type) { switch (n_backend_type) { @@ -86,7 +53,7 @@ static const char *get_qnn_backend_name(int n_backend_type) { return "QNN-GPU"; case QNN_BACKEND_NPU: return "QNN-NPU"; - case 3: + case QNN_BACKEND_GGML: return "ggml"; default: return "unknown"; @@ -137,11 +104,13 @@ static inline float ggml_compute_fp16_to_fp32(uint16_t h) { #define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) +#define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor) + static void tensor_dump(const ggml_tensor *tensor, const char *name) { - QNN_LOG_DEBUG("dump ggml tensor %s(%s): type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 - ", nb = (%5zi, %5zi, %5zi)\n", - name, tensor->name, tensor->type, ggml_type_name(tensor->type), tensor->ne[0], tensor->ne[1], - tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]); + QNN_LOG_INFO("dump ggml tensor %s(%s): type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 + ", nb = (%5zi, %5zi, %5zi)\n", + name, tensor->name, tensor->type, ggml_type_name(tensor->type), tensor->ne[0], tensor->ne[1], + tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]); float value = 0; std::ostringstream tmposs; @@ -162,8 +131,8 @@ static void tensor_dump(const ggml_tensor *tensor, const char *name) { } } } - if (strlen(tmposs.str().c_str()) <= (GGML_QNN_LOGBUF_LEN - 96)) { - QNN_LOG_DEBUG("\n%s\n", tmposs.str().c_str()); + if (strlen(tmposs.str().c_str()) <= (QNN_LOGBUF_LEN - 96)) { + QNN_LOG_INFO("\n%s\n", tmposs.str().c_str()); tmposs.clear(); tmposs.str(""); } @@ -181,8 +150,8 @@ static void tensor_dump(const ggml_tensor *tensor, const char *name) { } } } - if (strlen(tmposs.str().c_str()) <= (GGML_QNN_LOGBUF_LEN - 96)) { - QNN_LOG_DEBUG("\n%s\n", tmposs.str().c_str()); + if (strlen(tmposs.str().c_str()) <= (QNN_LOGBUF_LEN - 96)) { + QNN_LOG_INFO("\n%s\n", tmposs.str().c_str()); tmposs.clear(); tmposs.str(""); } @@ -203,8 +172,8 @@ static void tensor_dump(const ggml_tensor *tensor, const char *name) { } } } - if (strlen(tmposs.str().c_str()) <= (GGML_QNN_LOGBUF_LEN - 96)) { - QNN_LOG_DEBUG("\n%s\n", tmposs.str().c_str()); + if (strlen(tmposs.str().c_str()) <= (QNN_LOGBUF_LEN - 96)) { + QNN_LOG_INFO("\n%s\n", tmposs.str().c_str()); tmposs.clear(); tmposs.str(""); } @@ -223,8 +192,8 @@ static void tensor_dump(const ggml_tensor *tensor, const char *name) { } tmposs << "\n"; } - if (strlen(tmposs.str().c_str()) <= (GGML_QNN_LOGBUF_LEN - 96)) { - QNN_LOG_DEBUG("\n%s\n", tmposs.str().c_str()); + if (strlen(tmposs.str().c_str()) <= (QNN_LOGBUF_LEN - 96)) { + QNN_LOG_INFO("\n%s\n", tmposs.str().c_str()); tmposs.clear(); tmposs.str(""); } @@ -480,8 +449,6 @@ static void qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type, g TENSOR_DUMP(src0); TENSOR_DUMP(src1); TENSOR_DUMP(dst); - results.resize(ggml_nbytes(dst)); - memcpy(results.data(), ggml_get_data(dst), ggml_nbytes(dst)); } else { QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" 
PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", @@ -497,6 +464,8 @@ static void qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type, g dst->nb[1], dst->nb[2]); } + results.resize(ggml_nbytes(dst)); + memcpy(results.data(), ggml_get_data(dst), ggml_nbytes(dst)); ggml_free(ctx); ggml_backend_buffer_free(buffer); ggml_backend_free(backend); @@ -562,10 +531,10 @@ int main(int argc, char *argv[]) { qnn_op_ut(num_threads, QNN_BACKEND_GGML, n_ggml_op_type, GGML_TYPE_F32, cpu_results); if (results == cpu_results) { - QNN_LOG_INFO(CONSOLE_GREEN "[Result] results equal!" CONSOLE_RESET); + QNN_LOG_INFO(CONSOLE_GREEN "[Success] results equal to CPU backend!" CONSOLE_RESET); return 0; } else { - QNN_LOG_ERROR(CONSOLE_RED "[Result] results not equal!" CONSOLE_RESET); + QNN_LOG_ERROR(CONSOLE_RED "[Failed] results mismatch with CPU backend!" CONSOLE_RESET); return 1; } } From ff601abc1ce99cef1de1787add056b406f09a544 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Tue, 16 Jul 2024 00:05:40 +0800 Subject: [PATCH 071/166] add todo --- tests/ggml-qnn/ggml-qnn-ut.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/ggml-qnn/ggml-qnn-ut.cpp b/tests/ggml-qnn/ggml-qnn-ut.cpp index 2fea53e620dd8..71cb86a71bdf1 100644 --- a/tests/ggml-qnn/ggml-qnn-ut.cpp +++ b/tests/ggml-qnn/ggml-qnn-ut.cpp @@ -211,6 +211,7 @@ static uint32_t get_tensor_rank(const ggml_tensor *tensor) { } static uint32_t get_tensor_data_size(const ggml_tensor *tensor) { +#if ENABLE_QNNSDK_LOG size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); size_t n_dims = get_tensor_rank(tensor); for (size_t i = 1; i < n_dims; i++) { @@ -219,6 +220,7 @@ static uint32_t get_tensor_data_size(const ggml_tensor *tensor) { QNN_LOG_DEBUG("get_tensor_data_size %d", data_size); QNN_LOG_DEBUG("ggml_nbytes(tensor) %d", ggml_nbytes(tensor)); +#endif return ggml_nbytes(tensor); } @@ -530,6 +532,8 @@ int main(int argc, char *argv[]) { std::vector cpu_results; qnn_op_ut(num_threads, QNN_BACKEND_GGML, n_ggml_op_type, GGML_TYPE_F32, cpu_results); + // TODO: theoretically, the results should be the same, but the results may be different due to the different hardware + // a better way to compare the results is to compare the floating point numbers with allowed error if (results == cpu_results) { QNN_LOG_INFO(CONSOLE_GREEN "[Success] results equal to CPU backend!" 
CONSOLE_RESET); return 0; From 0301b500cd2ce15935b4c3139427e72872f231cb Mon Sep 17 00:00:00 2001 From: hongruichen Date: Tue, 16 Jul 2024 22:52:16 +0800 Subject: [PATCH 072/166] refactoring: prevent leak the QNN_INTERFACE_VER_TYPE and QNN_SYSTEM_INTERFACE_VER_TYPE outside of qnn.hpp --- ggml/include/ggml-qnn.h | 26 +- ggml/src/ggml-qnn.cpp | 39 +-- ggml/src/ggml-qnn/backend-ops.cpp | 2 +- ggml/src/ggml-qnn/backend.hpp | 15 +- ggml/src/ggml-qnn/graph.hpp | 22 +- ggml/src/ggml-qnn/qnn-lib.cpp | 35 +++ ggml/src/ggml-qnn/{qnn.hpp => qnn-lib.hpp} | 304 +++++++++------------ ggml/src/ggml-qnn/tensor.hpp | 2 +- tests/ggml-qnn/CMakeLists.txt | 1 + 9 files changed, 212 insertions(+), 234 deletions(-) create mode 100644 ggml/src/ggml-qnn/qnn-lib.cpp rename ggml/src/ggml-qnn/{qnn.hpp => qnn-lib.hpp} (81%) diff --git a/ggml/include/ggml-qnn.h b/ggml/include/ggml-qnn.h index 60aaf22179647..026c6ddf06672 100644 --- a/ggml/include/ggml-qnn.h +++ b/ggml/include/ggml-qnn.h @@ -1,8 +1,9 @@ #pragma once -#include "ggml-backend.h" #include "ggml.h" +#include "ggml-backend.h" + #ifdef __cplusplus extern "C" { #endif @@ -10,11 +11,11 @@ extern "C" { #define GGML_QNN_MAX_DEVICES 3 enum QNNBackend { - QNN_BACKEND_CPU, - QNN_BACKEND_GPU, - QNN_BACKEND_NPU, - QNN_BACKEND_GGML, //"fake" QNN backend, used for compare performance between - // QNN and original GGML + QNN_BACKEND_CPU = 0, + QNN_BACKEND_GPU, + QNN_BACKEND_NPU, + QNN_BACKEND_GGML, //"fake" QNN backend, used for compare performance between + // QNN and original GGML }; GGML_API int ggml_backend_qnn_reg_devices(void); @@ -27,22 +28,17 @@ GGML_API int ggml_backend_qnn_reg_devices(void); * Android or specified in JNI layer * @return */ -GGML_API ggml_backend_t ggml_backend_qnn_init(size_t dev_num, - const char* qnn_lib_path); +GGML_API ggml_backend_t ggml_backend_qnn_init(size_t dev_num, const char *qnn_lib_path); GGML_API bool ggml_backend_is_qnn(ggml_backend_t backend); -GGML_API void ggml_backend_qnn_set_n_threads(ggml_backend_t backend, - int thread_counts); +GGML_API void ggml_backend_qnn_set_n_threads(ggml_backend_t backend, int thread_counts); GGML_API int ggml_backend_qnn_get_device_count(void); -GGML_API void ggml_backend_qnn_get_device_description(size_t dev_num, - char* description, - size_t description_size); +GGML_API void ggml_backend_qnn_get_device_description(size_t dev_num, char *description, size_t description_size); -GGML_API GGML_CALL ggml_backend_buffer_type_t -ggml_backend_qnn_buffer_type(size_t dev_num); +GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t dev_num); #ifdef __cplusplus } diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index f1de6b18591c6..46f7e64bcdedb 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -78,32 +78,9 @@ static struct qnn::qcom_socinfo g_qnn_soc_info_table[] = { // HMX(Hexagon Matrix eXtensions)/HTA(Hexagon Tensor Accelerator) static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { - [QNN_BACKEND_CPU] = { .device = 0, - .threads = 1, - .name = "qnn-cpu", - .lib = "libQnnCpu.so", - .backend = nullptr, - .raw_interface = {}, - .raw_system_interface = {}, - .socinfo = {} }, - - [QNN_BACKEND_GPU] = { .device = 1, - .threads = 1, - .name = "qnn-gpu", - .lib = "libQnnGpu.so", - .backend = nullptr, - .raw_interface = {}, - .raw_system_interface = {}, - .socinfo = {} }, - - [QNN_BACKEND_NPU] = { .device = 2, - .threads = 1, - .name = "qnn-npu", - .lib = "libQnnHtp.so", - .backend = nullptr, - .raw_interface = {}, - .raw_system_interface = 
{}, - .socinfo = {} }, + ggml_backend_qnn_context(QNN_BACKEND_CPU, 1, "qnn-cpu", "libQnnCpu.so"), /* QNN_BACKEND_CPU */ + ggml_backend_qnn_context(QNN_BACKEND_GPU, 1, "qnn-gpu", "libQnnGpu.so"), /* QNN_BACKEND_GPU */ + ggml_backend_qnn_context(QNN_BACKEND_NPU, 1, "qnn-npu", "libQnnHtp.so"), /* QNN_BACKEND_NPU */ }; struct ggml_backend_qnn_buffer_context { @@ -549,17 +526,17 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char *qnn_lib_path) { return nullptr; } auto qnn_interface = instance->get_qnn_interface(); - if (!qnn_interface.is_loaded()) { + if (!qnn_interface) { QNN_LOG_WARN("qnn subsystem failure\n"); return nullptr; } std::string device_name = qnn::get_backend_name(device); QNN_LOG_INFO("qnn device name %s", device_name.c_str()); - g_qnn_mgr[device].instance = instance; - g_qnn_mgr[device].raw_interface = instance->get_qnn_raw_interface(); - g_qnn_mgr[device].raw_system_interface = instance->get_qnn_raw_system_interface(); - g_qnn_mgr[device].socinfo = instance->get_soc_info(); + auto &qnn_device = g_qnn_mgr[device]; + qnn_device.instance = instance; + qnn_device.qnn_interface = qnn_interface; + qnn_device.socinfo = instance->get_soc_info(); ggml_backend_t qnn_backend = new ggml_backend{ /* .guid = */ ggml_backend_qnn_guid(), /* .iface = */ ggml_backend_qnn_interface, diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 711f707531228..e1a8c4da5ed40 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -176,7 +176,7 @@ qnn::ggml_qnn_graph<_InputSize, _OutputSize> *get_qnn_graph_from_cache( } auto graph = std::make_unique(graph_name, (QNNBackend)(ctx->device), ctx->instance->get_qnn_context_handle(), - ctx->raw_interface, ctx->socinfo.vtcm_size_in_mb); + ctx->qnn_interface, ctx->socinfo.vtcm_size_in_mb); if (!graph->is_valid()) { return nullptr; diff --git a/ggml/src/ggml-qnn/backend.hpp b/ggml/src/ggml-qnn/backend.hpp index 0ec927779cc31..32f3c6cd445f6 100644 --- a/ggml/src/ggml-qnn/backend.hpp +++ b/ggml/src/ggml-qnn/backend.hpp @@ -9,7 +9,7 @@ #include "ggml-backend.h" #include "graph.hpp" -#include "qnn.hpp" +#include "qnn-lib.hpp" namespace qnn { typedef std::unordered_map> ggml_qnn_unary_graph_cache_t; @@ -21,11 +21,16 @@ struct ggml_backend_qnn_context { int threads; char name[GGML_MAX_NAME]; char lib[GGML_MAX_NAME]; + ggml_backend *backend = nullptr; + qnn::qcom_socinfo socinfo = {}; std::shared_ptr instance; - ggml_backend *backend; - QNN_INTERFACE_VER_TYPE raw_interface; - QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; - qnn::qcom_socinfo socinfo; + std::shared_ptr qnn_interface; qnn::ggml_qnn_unary_graph_cache_t qnn_unary_graph_cache; qnn::ggml_qnn_binary_graph_cache_t qnn_binary_graph_cache; + + explicit ggml_backend_qnn_context(int device, int threads, const char *name, const char *lib) : + device(device), threads(threads) { + strncpy(this->name, name, GGML_MAX_NAME); + strncpy(this->lib, lib, GGML_MAX_NAME); + } }; diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index cb04b1efda0fc..e4900906ce3e9 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -2,11 +2,12 @@ #pragma once #include +#include #include "ggml-qnn.h" #include "logger.hpp" -#include "qnn.hpp" +#include "qnn-lib.hpp" namespace qnn { @@ -17,7 +18,7 @@ class ggml_qnn_graph { typedef std::array output_tensor_array_t; explicit ggml_qnn_graph(const std::string &graph_name, QNNBackend device, Qnn_ContextHandle_t qnn_context, - QNN_INTERFACE_VER_TYPE qnn_interface, size_t 
vtcm_size_in_mb) : + std::shared_ptr qnn_interface, size_t vtcm_size_in_mb) : _graph_name(graph_name), _device(device), _qnn_interface(qnn_interface) { QNN_LOG_INFO("graph name %s", graph_name.c_str()); @@ -56,9 +57,9 @@ class ggml_qnn_graph { const QnnGraph_Config_t *graph_configs[] = { &graph_hvx_config, &graph_dlbc_config, &graph_vtcm_config, &graph_opt_config, nullptr }; - error = qnn_interface.graphCreate(qnn_context, graph_name.c_str(), graph_configs, &graph_handle); + error = qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), graph_configs, &graph_handle); } else { - error = qnn_interface.graphCreate(qnn_context, graph_name.c_str(), nullptr, &graph_handle); + error = qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), nullptr, &graph_handle); } if (error != QNN_SUCCESS) { @@ -79,7 +80,7 @@ class ggml_qnn_graph { return false; } - auto err = _qnn_interface.tensorCreateGraphTensor(_graph_handle, &tensor); + auto err = _qnn_interface->qnn_tensor_create_graph_tensor(_graph_handle, &tensor); if (err != QNN_SUCCESS) { QNN_LOG_INFO("error = %d\n", err); QNN_LOG_DEBUG("tensor%p name %s", &tensor, QNN_TENSOR_GET_NAME(tensor)); @@ -105,13 +106,13 @@ class ggml_qnn_graph { .v1 = { _graph_name.c_str(), QNN_OP_PACKAGE_NAME_QTI_AISW, op_name.c_str(), 0, qnn_params, (uint32_t)_tensor_inputs.size(), _tensor_inputs.data(), (uint32_t)_tensor_outputs.size(), _tensor_outputs.data() } }; - auto error = _qnn_interface.graphAddNode(_graph_handle, op_config); + auto error = _qnn_interface->qnn_graph_add_node(_graph_handle, op_config); if (error != QNN_SUCCESS) { QNN_LOG_ERROR("graphAddNode.error = %d\n", error); return false; } - error = _qnn_interface.graphFinalize(_graph_handle, nullptr, nullptr); + error = _qnn_interface->qnn_graph_finalize(_graph_handle, nullptr, nullptr); if (error != QNN_SUCCESS) { QNN_LOG_ERROR("graphFinalize.error = %d\n", error); return false; @@ -124,8 +125,9 @@ class ggml_qnn_graph { bool execute(const input_tensor_array_t &tensor_inputs, const output_tensor_array_t &tensor_outputs) { _tensor_inputs = tensor_inputs; _tensor_outputs = tensor_outputs; - auto error = _qnn_interface.graphExecute(_graph_handle, _tensor_inputs.data(), _tensor_inputs.size(), - _tensor_outputs.data(), _tensor_outputs.size(), nullptr, nullptr); + auto error = + _qnn_interface->qnn_graph_execute(_graph_handle, _tensor_inputs.data(), _tensor_inputs.size(), + _tensor_outputs.data(), _tensor_outputs.size(), nullptr, nullptr); if (_device == QNN_BACKEND_NPU) { if (error == QNN_COMMON_ERROR_SYSTEM_COMMUNICATION) { QNN_LOG_WARN("NPU crashed. SSR detected. 
Caused QNN graph execute error\n"); @@ -149,7 +151,7 @@ class ggml_qnn_graph { private: const std::string _graph_name; const QNNBackend _device; - const QNN_INTERFACE_VER_TYPE _qnn_interface; + std::shared_ptr _qnn_interface; Qnn_GraphHandle_t _graph_handle = nullptr; std::array _tensor_inputs; std::array _tensor_outputs; diff --git a/ggml/src/ggml-qnn/qnn-lib.cpp b/ggml/src/ggml-qnn/qnn-lib.cpp new file mode 100644 index 0000000000000..a7553c4ac2b75 --- /dev/null +++ b/ggml/src/ggml-qnn/qnn-lib.cpp @@ -0,0 +1,35 @@ + +#include "qnn-lib.hpp" + +namespace qnn { + +qnn_system_interface::qnn_system_interface(const QnnSystemInterface_t &qnn_sys_interface, dl_handler_t lib_handle) : + _qnn_sys_interface(qnn_sys_interface), _lib_handle(lib_handle) { + qnn_system_context_create(&_qnn_system_handle); + if (_qnn_system_handle) { + QNN_LOG_INFO("initialize qnn system successfully\n"); + } else { + QNN_LOG_WARN("can not create QNN system contenxt\n"); + } +} + +qnn_system_interface::~qnn_system_interface() { + if (_qnn_system_handle) { + if (qnn_system_context_free(_qnn_system_handle) != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN system context\n"); + } + } else { + QNN_LOG_WARN("system handle is null\n"); + } + + if (_lib_handle) { + int dlclose_error = dl_unload(_lib_handle); + if (dlclose_error != 0) { + QNN_LOG_WARN("failed to close QnnSystem library, error %s\n", dl_error()); + } + } else { + QNN_LOG_WARN("system lib handle is null\n"); + } +} + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/qnn.hpp b/ggml/src/ggml-qnn/qnn-lib.hpp similarity index 81% rename from ggml/src/ggml-qnn/qnn.hpp rename to ggml/src/ggml-qnn/qnn-lib.hpp index 10549a6c5e413..7307c9f63e75f 100644 --- a/ggml/src/ggml-qnn/qnn.hpp +++ b/ggml/src/ggml-qnn/qnn-lib.hpp @@ -25,29 +25,64 @@ namespace qnn { +// TODO: those function should be moved to a separate file, and have separate implementation for each platform +typedef void *dl_handler_t; + +inline dl_handler_t dl_load(const std::string &lib_path) { return dlopen(lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); } + +inline void *dl_sym(dl_handler_t handle, const std::string &symbol) { return dlsym(handle, symbol.c_str()); } + +inline int dl_unload(dl_handler_t handle) { return dlclose(handle); } + +inline const char *dl_error() { return dlerror(); } + // ================================================================================================= // // wrapper class of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK // ref:https://github.com/pytorch/executorch/tree/main/backends/qualcomm // ================================================================================================= -class qnn_interface { -#define DEFINE_SHIM_FUNCTION_INTERFACE(F, pointer_name) \ - template \ - inline auto qnn_##F(Args... args) const { \ - return (_qnn_interface->QNN_INTERFACE_VER_NAME.pointer_name)(std::forward(args)...); \ - } +class qnn_system_interface { -#define DEFINE_SHIM_FUNCTION_SYS_INTERFACE(F, pointer_name) \ - template \ - inline auto qnn_##F(Args... args) const { \ - return (_qnn_sys_interface->QNN_SYSTEM_INTERFACE_VER_NAME.pointer_name)(std::forward(args)...); \ +#define DEFINE_SHIM_FUNCTION_SYS_INTERFACE(F, pointer_name) \ + template \ + inline auto qnn_##F(Args... 
args) const { \ + return (_qnn_sys_interface.QNN_SYSTEM_INTERFACE_VER_NAME.pointer_name)(std::forward(args)...); \ } - friend class qnn_instance; +public: + qnn_system_interface(const QnnSystemInterface_t &qnn_sys_interface, dl_handler_t lib_handle); + ~qnn_system_interface(); + bool is_valid() const { return _qnn_system_handle != nullptr; } + + // QnnSystem + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_create, systemContextCreate); + + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_get_binary_info, systemContextGetBinaryInfo); + + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_free, systemContextFree); + +private: + qnn_system_interface(const qnn_system_interface &) = delete; + void operator=(const qnn_system_interface &) = delete; + qnn_system_interface(qnn_system_interface &&) = delete; + void operator=(qnn_system_interface &&) = delete; + + const QnnSystemInterface_t _qnn_sys_interface = {}; + dl_handler_t _lib_handle = nullptr; + QnnSystemContext_Handle_t _qnn_system_handle = nullptr; +}; + +class qnn_interface { + +#define DEFINE_SHIM_FUNCTION_INTERFACE(F, pointer_name) \ + template \ + inline auto qnn_##F(Args... args) const { \ + return (_qnn_interface.QNN_INTERFACE_VER_NAME.pointer_name)(std::forward(args)...); \ + } public: - qnn_interface() = default; + qnn_interface(const QnnInterface_t &qnn_interface) : _qnn_interface(qnn_interface) {} // QnnBackend DEFINE_SHIM_FUNCTION_INTERFACE(backend_create, backendCreate); @@ -59,7 +94,6 @@ class qnn_interface { DEFINE_SHIM_FUNCTION_INTERFACE(backend_validate_op_config, backendValidateOpConfig); DEFINE_SHIM_FUNCTION_INTERFACE(backend_get_api_version, backendGetApiVersion); - // QnnDevice DEFINE_SHIM_FUNCTION_INTERFACE(device_create, deviceCreate); @@ -69,6 +103,8 @@ class qnn_interface { DEFINE_SHIM_FUNCTION_INTERFACE(device_get_platform_info, deviceGetPlatformInfo); + DEFINE_SHIM_FUNCTION_INTERFACE(device_free_platform_info, deviceFreePlatformInfo); + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_info, deviceGetInfo); // QnnContext @@ -124,27 +160,15 @@ class qnn_interface { DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_graph_tensor, tensorCreateGraphTensor); - // QnnSystem - DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_create, systemContextCreate); - - DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_get_binary_info, systemContextGetBinaryInfo); - - DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_free, systemContextFree); - - void set_qnn_interface(const QnnInterface_t *qnn_interface) { _qnn_interface = qnn_interface; } - - void set_qnn_system_interface(const QnnSystemInterface_t *qnn_sys_interface) { - _qnn_sys_interface = qnn_sys_interface; - } - - uint32_t get_backend_id() const { return _qnn_interface->backendId; } - - bool is_loaded() const { return ((_qnn_sys_interface != nullptr) && (_qnn_interface != nullptr)); } + uint32_t get_backend_id() const { return _qnn_interface.backendId; } private: - const QnnInterface_t *_qnn_interface = nullptr; + qnn_interface(const qnn_interface &) = delete; + void operator=(const qnn_interface &) = delete; + qnn_interface(qnn_interface &&) = delete; + void operator=(qnn_interface &&) = delete; - const QnnSystemInterface_t *_qnn_sys_interface = nullptr; + const QnnInterface_t _qnn_interface = {}; }; class qnn_instance { @@ -161,8 +185,7 @@ class qnn_instance { QNN_LOG_DEBUG("enter qni_init\n"); std::lock_guard lock(_init_mutex); - - if (0 != load_system()) { + if (load_system() != 0) { QNN_LOG_WARN("can not load QNN system lib, pls check why?\n"); return 1; } else { @@ -170,16 
+193,16 @@ class qnn_instance { } std::string backend_lib_path = _lib_path + _backend_name; - if (0 == _lib_path_to_backend_id.count(backend_lib_path)) { + if (_lib_path_to_backend_id.count(backend_lib_path) == 0) { int is_load_ok = load_backend(backend_lib_path, saver_config); - if (0 != is_load_ok) { + if (is_load_ok != 0) { QNN_LOG_WARN("failed to load QNN backend\n"); return 2; } } backend_id = _lib_path_to_backend_id[backend_lib_path]; - if (0 == _loaded_backend.count(backend_id) || 0 == _loaded_lib_handle.count(backend_id)) { + if (_loaded_backend.count(backend_id) == 0 || _loaded_lib_handle.count(backend_id) == 0) { QNN_LOG_WARN( "library %s is loaded but loaded backend count=%zu, " "loaded lib_handle count=%zu\n", @@ -187,9 +210,8 @@ class qnn_instance { return 3; } - _qnn_interface.set_qnn_interface(_loaded_backend[backend_id]); - - _qnn_interface.qnn_log_create(qnn::sdk_logcallback, _qnn_log_level, &_qnn_log_handle); + _qnn_interface = std::make_shared(*_loaded_backend[backend_id]); + _qnn_interface->qnn_log_create(qnn::sdk_logcallback, _qnn_log_level, &_qnn_log_handle); if (nullptr == _qnn_log_handle) { // NPU backend not work on Qualcomm SoC equipped low-end phone QNN_LOG_WARN("why failed to initialize qnn log\n"); @@ -199,7 +221,7 @@ class qnn_instance { } std::vector temp_backend_config; - _qnn_interface.qnn_backend_create( + _qnn_interface->qnn_backend_create( _qnn_log_handle, temp_backend_config.empty() ? nullptr : temp_backend_config.data(), &_qnn_backend_handle); if (nullptr == _qnn_backend_handle) { QNN_LOG_WARN("why failed to initialize qnn backend\n"); @@ -208,20 +230,18 @@ class qnn_instance { QNN_LOG_DEBUG("initialize qnn backend successfully\n"); } - if (nullptr != _qnn_raw_interface.propertyHasCapability) { - Qnn_ErrorHandle_t qnn_status = _qnn_raw_interface.propertyHasCapability(QNN_PROPERTY_GROUP_DEVICE); - if (QNN_PROPERTY_NOT_SUPPORTED == qnn_status) { - QNN_LOG_WARN("device property is not supported\n"); - } - if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnn_status) { - QNN_LOG_WARN("device property is not known to backend\n"); - } + Qnn_ErrorHandle_t qnn_status = _qnn_interface->qnn_property_has_capability(QNN_PROPERTY_GROUP_DEVICE); + if (QNN_PROPERTY_NOT_SUPPORTED == qnn_status) { + QNN_LOG_WARN("device property is not supported\n"); + } + if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnn_status) { + QNN_LOG_WARN("device property is not known to backend\n"); } - Qnn_ErrorHandle_t qnn_status = QNN_SUCCESS; + qnn_status = QNN_SUCCESS; if (_backend_name.find("Htp") != std::variant_npos) { const QnnDevice_PlatformInfo_t *p_info = nullptr; - _qnn_raw_interface.deviceGetPlatformInfo(nullptr, &p_info); + _qnn_interface->qnn_device_get_platform_info(nullptr, &p_info); QNN_LOG_INFO("device counts %d", p_info->v1.numHwDevices); QnnDevice_HardwareDeviceInfo_t *infos = p_info->v1.hwDevices; QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = {}; @@ -238,7 +258,7 @@ class qnn_instance { chipinfo.vtcmSize); _soc_info = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize }; } - _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info); + _qnn_interface->qnn_device_free_platform_info(nullptr, p_info); QnnHtpDevice_CustomConfig_t soc_customconfig; soc_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_SOC; @@ -256,9 +276,9 @@ class qnn_instance { arch_devconfig.customConfig = &arch_customconfig; const QnnDevice_Config_t *p_deviceconfig[] = { &soc_devconfig, &arch_devconfig, nullptr }; - qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, p_deviceconfig, &_qnn_device_handle); + 
qnn_status = _qnn_interface->qnn_device_create(_qnn_log_handle, p_deviceconfig, &_qnn_device_handle); } else { - qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, nullptr, &_qnn_device_handle); + qnn_status = _qnn_interface->qnn_device_create(_qnn_log_handle, nullptr, &_qnn_device_handle); } if (QNN_SUCCESS != qnn_status && QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnn_status) { QNN_LOG_WARN("failed to create QNN device\n"); @@ -270,7 +290,7 @@ class qnn_instance { QNN_LOG_INFO("profiling turned on; level = %d", _profile_level); if (qnn::sdk_profile_level::profile_basic == _profile_level) { QNN_LOG_INFO("basic profiling requested. creating Qnn Profile object\n"); - if (QNN_PROFILE_NO_ERROR != _qnn_raw_interface.profileCreate( + if (QNN_PROFILE_NO_ERROR != _qnn_interface->qnn_profile_create( _qnn_backend_handle, QNN_PROFILE_LEVEL_BASIC, &_qnn_profile_handle)) { QNN_LOG_WARN("unable to create profile handle in the backend\n"); return 6; @@ -279,9 +299,9 @@ class qnn_instance { } } else if (qnn::sdk_profile_level::profile_detail == _profile_level) { QNN_LOG_INFO("detailed profiling requested. Creating Qnn Profile object\n"); - if (QNN_PROFILE_NO_ERROR != _qnn_raw_interface.profileCreate(_qnn_backend_handle, - QNN_PROFILE_LEVEL_DETAILED, - &_qnn_profile_handle)) { + if (QNN_PROFILE_NO_ERROR != _qnn_interface->qnn_profile_create(_qnn_backend_handle, + QNN_PROFILE_LEVEL_DETAILED, + &_qnn_profile_handle)) { QNN_LOG_WARN("unable to create profile handle in the backend\n"); return 7; } else { @@ -290,22 +310,22 @@ class qnn_instance { } } - _rpc_lib_handle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL); + _rpc_lib_handle = dl_load("libcdsprpc.so"); if (nullptr == _rpc_lib_handle) { - QNN_LOG_WARN("failed to load qualcomm's rpc lib, error:%s\n", dlerror()); + QNN_LOG_WARN("failed to load qualcomm's rpc lib, error:%s\n", dl_error()); return 8; } else { QNN_LOG_DEBUG("load rpcmem lib successfully\n"); set_rpcmem_initialized(true); } - _pfn_rpc_mem_init = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_init")); - _pfn_rpc_mem_deinit = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_deinit")); - _pfn_rpc_mem_alloc = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_alloc")); - _pfn_rpc_mem_free = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_free")); - _pfn_rpc_mem_to_fd = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_to_fd")); + _pfn_rpc_mem_init = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_init")); + _pfn_rpc_mem_deinit = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_deinit")); + _pfn_rpc_mem_alloc = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_alloc")); + _pfn_rpc_mem_free = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_free")); + _pfn_rpc_mem_to_fd = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_to_fd")); if (nullptr == _pfn_rpc_mem_alloc || nullptr == _pfn_rpc_mem_free || nullptr == _pfn_rpc_mem_to_fd) { - QNN_LOG_WARN("unable to access symbols in QNN RPC lib. dlerror(): %s", dlerror()); - dlclose(_rpc_lib_handle); + QNN_LOG_WARN("unable to access symbols in QNN RPC lib. 
error: %s", dl_error()); + dl_unload(_rpc_lib_handle); return 9; } @@ -318,7 +338,7 @@ class qnn_instance { qnn_context_config.priority = QNN_PRIORITY_DEFAULT; const QnnContext_Config_t * context_configs[] = {&qnn_context_config, nullptr}; */ - _qnn_interface.qnn_context_create(_qnn_backend_handle, _qnn_device_handle, nullptr, &_qnn_context_handle); + _qnn_interface->qnn_context_create(_qnn_backend_handle, _qnn_device_handle, nullptr, &_qnn_context_handle); if (nullptr == _qnn_context_handle) { QNN_LOG_WARN("why failed to initialize qnn context\n"); return 10; @@ -370,8 +390,8 @@ class qnn_instance { if (nullptr != _pfn_rpc_mem_deinit) // make Qualcomm's SoC equipped low-end phone happy _pfn_rpc_mem_deinit(); - if (dlclose(_rpc_lib_handle) != 0) { - QNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s\n", dlerror()); + if (dl_unload(_rpc_lib_handle) != 0) { + QNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s\n", dl_error()); } else { QNN_LOG_DEBUG("succeed to close rpcmem lib\n"); } @@ -381,45 +401,45 @@ class qnn_instance { } if (nullptr != _qnn_context_handle) { - error = _qnn_interface.qnn_context_free(_qnn_context_handle, _qnn_profile_handle); + error = _qnn_interface->qnn_context_free(_qnn_context_handle, _qnn_profile_handle); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d\n", _qnn_interface.get_backend_id(), + QNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(), QNN_GET_ERROR_CODE(error)); } _qnn_context_handle = nullptr; } if (nullptr != _qnn_profile_handle) { - error = _qnn_interface.qnn_profile_free(_qnn_profile_handle); + error = _qnn_interface->qnn_profile_free(_qnn_profile_handle); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d\n", _qnn_interface.get_backend_id(), + QNN_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(), QNN_GET_ERROR_CODE(error)); } _qnn_profile_handle = nullptr; } if (nullptr != _qnn_device_handle) { - error = _qnn_interface.qnn_device_free(_qnn_device_handle); + error = _qnn_interface->qnn_device_free(_qnn_device_handle); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN device_handle: ID %u, error %d\n", _qnn_interface.get_backend_id(), + QNN_LOG_WARN("failed to free QNN device_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(), QNN_GET_ERROR_CODE(error)); } _qnn_device_handle = nullptr; } if (nullptr != _qnn_backend_handle) { - error = _qnn_interface.qnn_backend_free(_qnn_backend_handle); + error = _qnn_interface->qnn_backend_free(_qnn_backend_handle); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d\n", _qnn_interface.get_backend_id(), + QNN_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(), QNN_GET_ERROR_CODE(error)); } _qnn_backend_handle = nullptr; } if (nullptr != _qnn_log_handle) { - error = _qnn_interface.qnn_log_free(_qnn_log_handle); + error = _qnn_interface->qnn_log_free(_qnn_log_handle); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN log_handle: ID %u, error %d\n", _qnn_interface.get_backend_id(), + QNN_LOG_WARN("failed to free QNN log_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(), QNN_GET_ERROR_CODE(error)); } _qnn_log_handle = nullptr; @@ -427,32 +447,18 @@ class qnn_instance { unload_backend(); - unload_system(); + _qnn_sys_interface.reset(); return ret_status; } - const qnn_interface 
&get_qnn_interface() { - if (!_qnn_interface.is_loaded()) { + std::shared_ptr get_qnn_interface() { + if (!_qnn_interface) { QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); } return _qnn_interface; } - const QNN_INTERFACE_VER_TYPE &get_qnn_raw_interface() { - if (!_qnn_interface.is_loaded()) { - QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); - } - return _qnn_raw_interface; - } - - const QNN_SYSTEM_INTERFACE_VER_TYPE &get_qnn_raw_system_interface() { - if (!_qnn_interface.is_loaded()) { - QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); - } - return _qnn_raw_system_interface; - } - const Qnn_LogHandle_t get_qnn_log_handle() { return _qnn_log_handle; } const Qnn_ProfileHandle_t get_qnn_profile_handle() { return _qnn_profile_handle; } @@ -463,13 +469,11 @@ class qnn_instance { const Qnn_ContextHandle_t get_qnn_context_handle() { return _qnn_context_handle; } - const QnnSystemContext_Handle_t get_qnn_system_handle() { return _qnn_system_handle; } - const Qnn_GraphHandle_t get_qnn_graph_handle() { return _qnn_graph_handle; } int init_htp_perfinfra() { QnnDevice_Infrastructure_t device_infra = nullptr; - int error = _qnn_raw_interface.deviceGetInfrastructure(&device_infra); + int error = _qnn_interface->qnn_device_get_infrastructure(&device_infra); if (error != QNN_SUCCESS) { QNN_LOG_WARN("failed to get qnn device infra\n"); return 1; @@ -655,8 +659,8 @@ class qnn_instance { { { mem_fd } } }; Qnn_MemHandle_t handle = nullptr; int error = QNN_SUCCESS; - error = _qnn_interface.qnn_mem_register(_qnn_context_handle, &descriptor, - /*numDescriptors=*/1, &handle); + error = _qnn_interface->qnn_mem_register(_qnn_context_handle, &descriptor, + /*numDescriptors=*/1, &handle); if (error != QNN_SUCCESS) { QNN_LOG_WARN("failed to register shared memory, error %d, %s\n", QNN_GET_ERROR_CODE(error), strerror(error)); @@ -666,7 +670,8 @@ class qnn_instance { QNN_TENSOR_SET_MEM_HANDLE(*p_tensor, handle); _qnn_mem_set.insert((std::pair(p_data, handle))); - QNN_LOG_INFO("tensor %s successfully register shared memory handler: %p\n", QNN_TENSOR_GET_NAME(*p_tensor), handle); + QNN_LOG_INFO("tensor %s successfully register shared memory handler: %p\n", QNN_TENSOR_GET_NAME(*p_tensor), + handle); return 0; } @@ -692,7 +697,7 @@ class qnn_instance { for (std::unordered_map::iterator it = _qnn_mem_set.begin(); it != _qnn_mem_set.end(); it++) { Qnn_MemHandle_t mem_handle = it->second; - error = _qnn_interface.qnn_mem_de_register(&mem_handle, 1); + error = _qnn_interface->qnn_mem_de_register(&mem_handle, 1); if (error != QNN_SUCCESS) { QNN_LOG_WARN("failed to unregister shared memory, error %d\n", QNN_GET_ERROR_CODE(error)); } @@ -711,16 +716,16 @@ class qnn_instance { std::string system_lib_path = _lib_path + "libQnnSystem.so"; QNN_LOG_DEBUG("system_lib_path:%s\n", system_lib_path.c_str()); - _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); - if (nullptr == _system_lib_handle) { - QNN_LOG_WARN("can not open QNN library %s, error: %s\n", system_lib_path.c_str(), dlerror()); + auto system_lib_handle = dl_load(system_lib_path); + if (!system_lib_handle) { + QNN_LOG_WARN("can not load QNN library %s, error: %s\n", system_lib_path.c_str(), dl_error()); return 1; } auto *get_providers = reinterpret_cast( - dlsym(_system_lib_handle, "QnnSystemInterface_getProviders")); - if (nullptr == get_providers) { - QNN_LOG_WARN("can not load QNN symbol QnnSystemInterface_getProviders: %s\n", dlerror()); + dl_sym(system_lib_handle, "QnnSystemInterface_getProviders")); + 
if (!get_providers) { + QNN_LOG_WARN("can not load QNN symbol QnnSystemInterface_getProviders: %s\n", dl_error()); return 2; } @@ -737,7 +742,7 @@ class qnn_instance { return 4; } - if (nullptr == provider_list) { + if (!provider_list) { QNN_LOG_WARN("can not get providers\n"); return 5; } @@ -758,61 +763,31 @@ class qnn_instance { } else { QNN_LOG_INFO("find a valid qnn system interface\n"); } - set_qnn_raw_system_interface(qnn_system_interface); - _qnn_interface.set_qnn_system_interface(provider_list[0]); - - _qnn_interface.qnn_system_context_create(&_qnn_system_handle); - if (nullptr == _qnn_system_handle) { - QNN_LOG_WARN("can not create QNN system contenxt\n"); - } else { - QNN_LOG_INFO("initialize qnn system successfully\n"); + auto qnn_sys_interface = std::make_shared(*provider_list[0], system_lib_handle); + if (!qnn_sys_interface->is_valid()) { + QNN_LOG_WARN("failed to create QNN system interface\n"); + return 7; } + _qnn_sys_interface = qnn_sys_interface; return 0; } - int unload_system() { - int result = 0; - - if (nullptr == _system_lib_handle) { - QNN_LOG_WARN("system lib handle is null\n"); - return 1; - } - - if (nullptr != _qnn_system_handle) { - result = _qnn_interface.qnn_system_context_free(_qnn_system_handle); - if (result != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN system context\n"); - } - _qnn_system_handle = nullptr; - } - - int dlclose_error = dlclose(_system_lib_handle); - if (dlclose_error != 0) { - QNN_LOG_WARN("failed to close QnnSystem library, error %s\n", dlerror()); - return 2; - } - - _system_lib_handle = nullptr; - - return result; - } - int load_backend(std::string &lib_path, const QnnSaver_Config_t **saver_config) { Qnn_ErrorHandle_t error = QNN_SUCCESS; QNN_LOG_DEBUG("lib_path:%s\n", lib_path.c_str()); - void *lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL); - if (nullptr == lib_handle) { - QNN_LOG_WARN("can not open QNN library %s, with error: %s", lib_path.c_str(), dlerror()); + auto lib_handle = dl_load(lib_path.c_str()); + if (!lib_handle) { + QNN_LOG_WARN("can not open QNN library %s, with error: %s", lib_path.c_str(), dl_error()); return 1; } auto get_providers = qnn::load_qnn_functionpointers( lib_handle, "QnnInterface_getProviders"); - if (nullptr == get_providers) { - QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", dlerror()); + if (!get_providers) { + QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", dl_error()); return 2; } @@ -829,7 +804,7 @@ class qnn_instance { return 4; } - if (nullptr == provider_list) { + if (!provider_list) { QNN_LOG_WARN("failed to get qnn interface providers\n"); return 5; } @@ -850,7 +825,6 @@ class qnn_instance { } else { QNN_LOG_INFO("find a valid qnn interface\n"); } - set_qnn_raw_interface(qnn_interface); BackendIdType backend_id = provider_list[0]->backendId; _lib_path_to_backend_id[lib_path] = backend_id; @@ -860,9 +834,9 @@ class qnn_instance { _loaded_backend[backend_id] = provider_list[0]; if (_loaded_lib_handle.count(backend_id) > 0) { QNN_LOG_WARN("closing %p\n", _loaded_lib_handle[backend_id]); - int dlclose_error = dlclose(_loaded_lib_handle[backend_id]); + int dlclose_error = dl_unload(_loaded_lib_handle[backend_id]); if (dlclose_error != 0) { - QNN_LOG_WARN("fail to close %p with error %s\n", _loaded_lib_handle[backend_id], dlerror()); + QNN_LOG_WARN("fail to close %p with error %s\n", _loaded_lib_handle[backend_id], dl_error()); } } _loaded_lib_handle[backend_id] = lib_handle; @@ -874,9 +848,9 @@ class qnn_instance { int unload_backend() { 
int dlclose_error = 0; for (auto &it : _loaded_lib_handle) { - dlclose_error = dlclose(it.second); + dlclose_error = dl_unload(it.second); if (dlclose_error != 0) { - QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, dlerror()); + QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, dl_error()); } } @@ -887,12 +861,6 @@ class qnn_instance { return 0; } - void set_qnn_raw_interface(QNN_INTERFACE_VER_TYPE &raw_interface) { _qnn_raw_interface = raw_interface; } - - void set_qnn_raw_system_interface(QNN_SYSTEM_INTERFACE_VER_TYPE &raw_interface) { - _qnn_raw_system_interface = raw_interface; - } - private: static constexpr const int _required_num_providers = 1; @@ -905,9 +873,8 @@ class qnn_instance { qnn::sdk_profile_level _profile_level = qnn::sdk_profile_level::profile_detail; - qnn_interface _qnn_interface; - - void *_system_lib_handle = nullptr; + std::shared_ptr _qnn_sys_interface; + std::shared_ptr _qnn_interface; Qnn_GraphHandle_t _qnn_graph_handle = nullptr; @@ -921,14 +888,9 @@ class qnn_instance { Qnn_ContextHandle_t _qnn_context_handle = nullptr; - QnnSystemContext_Handle_t _qnn_system_handle = nullptr; - QnnHtpDevice_PerfInfrastructure_t *_qnn_htp_perfinfra = nullptr; uint32_t _qnn_power_configid = 1; - QNN_INTERFACE_VER_TYPE _qnn_raw_interface; - QNN_SYSTEM_INTERFACE_VER_TYPE _qnn_raw_system_interface; - std::unordered_map _qnn_mem_set; std::mutex _init_mutex; @@ -936,7 +898,7 @@ class qnn_instance { std::unordered_map _lib_path_to_backend_id; std::unordered_map _loaded_backend; - void *_rpc_lib_handle = nullptr; + dl_handler_t _rpc_lib_handle = nullptr; std::atomic_bool _rpcmem_initialized{ false }; qnn::pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; qnn::pfn_rpc_mem_free _pfn_rpc_mem_free; diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index e6bb63c54481c..e5dc436adaa5c 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -12,7 +12,7 @@ #include "System/QnnSystemInterface.h" #include "graph.hpp" #include "logger.hpp" -#include "qnn.hpp" +#include "qnn-lib.hpp" #include "utils.hpp" namespace qnn { diff --git a/tests/ggml-qnn/CMakeLists.txt b/tests/ggml-qnn/CMakeLists.txt index e72cc13e78ce4..f9678d3d88f00 100644 --- a/tests/ggml-qnn/CMakeLists.txt +++ b/tests/ggml-qnn/CMakeLists.txt @@ -20,6 +20,7 @@ set(SOURCE_FILES ../../ggml/src/ggml-alloc.c ../../ggml/src/ggml-backend.c ../../ggml/src/ggml-quants.c + ../../ggml/src/ggml-qnn/qnn-lib.cpp ../../ggml/src/ggml-qnn/logger.cpp ../../ggml/src/ggml-qnn/utils.cpp ../../ggml/src/ggml-qnn/backend-ops.cpp From b1ef302991577ce1ab0913e71848436542f71ad1 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 17 Jul 2024 12:21:33 +0800 Subject: [PATCH 073/166] refactoring: remove depend of dlsym at utils.hpp --- ggml/src/ggml-qnn/qnn-lib.hpp | 13 +++++++++---- ggml/src/ggml-qnn/utils.hpp | 5 ----- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/ggml/src/ggml-qnn/qnn-lib.hpp b/ggml/src/ggml-qnn/qnn-lib.hpp index 7307c9f63e75f..a676f989566e5 100644 --- a/ggml/src/ggml-qnn/qnn-lib.hpp +++ b/ggml/src/ggml-qnn/qnn-lib.hpp @@ -36,6 +36,11 @@ inline int dl_unload(dl_handler_t handle) { return dlclose(handle); } inline const char *dl_error() { return dlerror(); } +template +Fn dl_sym_typed(dl_handler_t handle, const std::string &function_name) { + return reinterpret_cast(dl_sym(handle, function_name)); +} + // ================================================================================================= // // wrapper class of Qualcomm QNN(Qualcomm Neural 
Network, aka Qualcomm AI Engine Direct) SDK @@ -722,8 +727,8 @@ class qnn_instance { return 1; } - auto *get_providers = reinterpret_cast( - dl_sym(system_lib_handle, "QnnSystemInterface_getProviders")); + auto *get_providers = dl_sym_typed( + system_lib_handle, "QnnSystemInterface_getProviders"); if (!get_providers) { QNN_LOG_WARN("can not load QNN symbol QnnSystemInterface_getProviders: %s\n", dl_error()); return 2; @@ -784,8 +789,8 @@ class qnn_instance { return 1; } - auto get_providers = qnn::load_qnn_functionpointers( - lib_handle, "QnnInterface_getProviders"); + auto get_providers = + qnn::dl_sym_typed(lib_handle, "QnnInterface_getProviders"); if (!get_providers) { QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", dl_error()); return 2; diff --git a/ggml/src/ggml-qnn/utils.hpp b/ggml/src/ggml-qnn/utils.hpp index 4a01347d0fc1b..66c3eeba471e2 100644 --- a/ggml/src/ggml-qnn/utils.hpp +++ b/ggml/src/ggml-qnn/utils.hpp @@ -24,11 +24,6 @@ uint32_t get_ggml_tensor_data_size(const ggml_tensor *tensor); const char *opname_from_ggmlop(enum ggml_op ggmlop); -template -Fn load_qnn_functionpointers(void *handle, const char *function_name) { - return reinterpret_cast(dlsym(handle, function_name)); -} - inline int validate_tensor_version(const Qnn_Tensor_t &tensor) { if (tensor.version != QNN_TENSOR_VERSION_1) { QNN_LOG_WARN("validate_tensor_version() tensor %s, got unsupported version %d\n", tensor.v1.name, From 63dc587dffae40c0cd7f1468859f2d430039a29e Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 17 Jul 2024 13:34:05 +0800 Subject: [PATCH 074/166] refactoring: make the buffer alloc and free stay in same class --- ggml/src/ggml-qnn.cpp | 86 +++++++++++++++++++++---------------- ggml/src/ggml-qnn/utils.cpp | 21 ++++++++- ggml/src/ggml-qnn/utils.hpp | 3 ++ 3 files changed, 73 insertions(+), 37 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index 46f7e64bcdedb..46fdf87a64a9c 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -83,22 +83,54 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { ggml_backend_qnn_context(QNN_BACKEND_NPU, 1, "qnn-npu", "libQnnHtp.so"), /* QNN_BACKEND_NPU */ }; -struct ggml_backend_qnn_buffer_context { - ggml_backend_qnn_buffer_context(size_t device) : device(device), name(QNN_BACKEND_NAME + std::to_string(device)) {} +class ggml_backend_qnn_buffer_context { +public: + ggml_backend_qnn_buffer_context(QNNBackend device, std::shared_ptr instance, size_t size) : + _device(device), _instance(instance), _name(QNN_BACKEND_NAME + std::to_string(device)) { + + size_t size_page = sysconf(_SC_PAGESIZE); + + // TODO: for qnn npu, a better way here is to reuse the buffer allocated by qnn rpc, will save an extra copy + _buffer = qnn::align_alloc(size_page, size); + + if (!_buffer) { + QNN_LOG_WARN("failed to allocate %.2f MiB\n", float(size / (1 << 20))); + return; + } + + _buffer_size = size; + } ~ggml_backend_qnn_buffer_context() { - tensors.clear(); - if (buffer) { - free(buffer); + _tensors.clear(); + + // the free will do nothing if the _buffer is nullptr + qnn::align_free(_buffer); + } + + bool is_valid() const { return _buffer != nullptr; } + + bool init_tensor(ggml_tensor *tensor) { + auto qnn_tensor = std::make_unique(tensor, _device, _instance); + if (!qnn_tensor->is_valid()) { + QNN_LOG_WARN("Create ggml_qnn_tensor failed"); + return false; } + + _tensors.push_back(std::move(qnn_tensor)); + return true; } - void *buffer = nullptr; - struct ggml_backend_qnn_context *backend_ctx = nullptr; 
- std::list> tensors; - size_t buffer_size = 0; - size_t device; - std::string name; + void *get_buffer() { return _buffer; } + size_t get_buffer_size() { return _buffer_size; } + +private: + QNNBackend _device; + std::shared_ptr _instance; + std::string _name; + std::list> _tensors; + void *_buffer = nullptr; + size_t _buffer_size = 0; }; struct ggml_backend_qnn_buffer_type_context { @@ -189,20 +221,16 @@ GGML_CALL static void ggml_backend_qnn_buffer_free_buffer(ggml_backend_buffer_t GGML_CALL static void *ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t buffer) { ggml_backend_qnn_buffer_context *ctx = (ggml_backend_qnn_buffer_context *)buffer->context; - return ctx->buffer; + return ctx->get_buffer(); } GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor) { ggml_backend_qnn_buffer_context *ctx = (ggml_backend_qnn_buffer_context *)buffer->context; - auto instance = ctx->backend_ctx->instance; - auto qnn_tensor = std::make_unique(tensor, (QNNBackend)(ctx->device), instance); - if (!qnn_tensor->is_valid()) { - QNN_LOG_WARN("Create ggml_qnn_tensor failed"); + if (!ctx->init_tensor(tensor)) { + QNN_LOG_WARN("init ggml_qnn_tensor failed"); return; } - - ctx->tensors.push_back(std::move(qnn_tensor)); } GGML_CALL static void ggml_backend_qnn_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor, @@ -232,7 +260,7 @@ GGML_CALL static bool ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t b GGML_CALL static void ggml_backend_qnn_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { ggml_backend_qnn_buffer_context *ctx = (ggml_backend_qnn_buffer_context *)buffer->context; - memset(ctx->buffer, value, ctx->buffer_size); + memset(ctx->get_buffer(), value, ctx->get_buffer_size()); } static ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = { @@ -263,23 +291,9 @@ static void *ggml_qnn_host_malloc(size_t n) { GGML_CALL static ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { ggml_backend_qnn_buffer_type_context *buft_ctx = (ggml_backend_qnn_buffer_type_context *)buft->context; - ggml_backend_qnn_buffer_context *ctx = new ggml_backend_qnn_buffer_context(buft_ctx->device); - - size_t size_page = sysconf(_SC_PAGESIZE); - - size_t size_aligned = size; - if ((size_aligned % size_page) != 0) { - size_aligned += (size_page - (size_aligned % size_page)); - } - - // TODO:use pre-allocated buffer in internal memory pool - ctx->buffer = ggml_qnn_host_malloc(size_aligned); - ctx->buffer_size = size_aligned; - - ctx->backend_ctx = &g_qnn_mgr[buft_ctx->device]; - - if (nullptr == ctx->buffer) { - QNN_LOG_WARN("%s: failed to allocate %.2f MiB\n", __func__, size / (1 << 20)); + ggml_backend_qnn_buffer_context *ctx = + new ggml_backend_qnn_buffer_context((QNNBackend)buft_ctx->device, g_qnn_mgr[buft_ctx->device].instance, size); + if (!ctx->is_valid()) { return nullptr; } diff --git a/ggml/src/ggml-qnn/utils.cpp b/ggml/src/ggml-qnn/utils.cpp index 7c25314f731f0..2b594bfa0503b 100644 --- a/ggml/src/ggml-qnn/utils.cpp +++ b/ggml/src/ggml-qnn/utils.cpp @@ -1,6 +1,8 @@ #include "utils.hpp" +#include + #include "ggml-qnn.h" #include "qnn-types.hpp" @@ -111,7 +113,7 @@ const char *get_htparch_desc(size_t htp_arch) { intptr_t align_to(size_t alignment, intptr_t offset) { return offset % alignment == 0 ? 
offset - : offset + (static_cast(alignment) - offset % static_cast(alignment)); + : offset + (static_cast(alignment) - (offset % static_cast(alignment))); } uint32_t get_ggml_tensor_data_size(const ggml_tensor *tensor) { @@ -127,6 +129,23 @@ uint32_t get_ggml_tensor_data_size(const ggml_tensor *tensor) { return ggml_nbytes(tensor); } +void *align_alloc(size_t alignment, size_t size) { + size_t size_aligned = size; + if ((size_aligned % alignment) != 0) { + size_aligned += (alignment - (size_aligned % alignment)); + } + + void *data = std::aligned_alloc(alignment, size_aligned); + if (!data) { + QNN_LOG_WARN("aligned_alloc failed\n"); + return nullptr; + } + + return data; +} + +void align_free(void *ptr) { std::free(ptr); } + // ================================================================================================= // // QNN backend internal helper functions diff --git a/ggml/src/ggml-qnn/utils.hpp b/ggml/src/ggml-qnn/utils.hpp index 66c3eeba471e2..b264f2326c7b2 100644 --- a/ggml/src/ggml-qnn/utils.hpp +++ b/ggml/src/ggml-qnn/utils.hpp @@ -22,6 +22,9 @@ const char *get_htparch_desc(size_t htp_arch); intptr_t align_to(size_t alignment, intptr_t offset); uint32_t get_ggml_tensor_data_size(const ggml_tensor *tensor); +void *align_alloc(size_t alignment, size_t size); +void align_free(void *ptr); + const char *opname_from_ggmlop(enum ggml_op ggmlop); inline int validate_tensor_version(const Qnn_Tensor_t &tensor) { From bb13795dce15c783c75ad92d1ea50ea214912324 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 17 Jul 2024 14:13:42 +0800 Subject: [PATCH 075/166] refactoring: remove unused functions and variables --- ggml/src/ggml-qnn.cpp | 26 -------- ggml/src/ggml-qnn/utils.cpp | 115 ------------------------------------ ggml/src/ggml-qnn/utils.hpp | 7 --- 3 files changed, 148 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index 46fdf87a64a9c..13998a73ef7aa 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -49,21 +49,6 @@ #define QNN_BACKEND_NAME "qnn" -static struct qnn::qcom_socinfo g_qnn_soc_info_table[] = { - /* Qualcomm SnapDragon 8 Gen 1 */ - [qnn::SM8450] = { .soc_model = qnn::SM8450, .htp_arch = qnn::V69, .vtcm_size_in_mb = 8 }, - - /* Qualcomm SnapDragon 8 Gen 1+ */ - [qnn::SM8475] = { .soc_model = qnn::SM8475, .htp_arch = qnn::V69, .vtcm_size_in_mb = 8 }, - - /* Qualcomm SnapDragon 8 Gen 2 */ - [qnn::SM8550] = { .soc_model = qnn::SM8550, .htp_arch = qnn::V73, .vtcm_size_in_mb = 8 }, - - /* Qualcomm SnapDragon 8 Gen 3 */ - [qnn::SM8650] = { .soc_model = qnn::SM8650, .htp_arch = qnn::V75, .vtcm_size_in_mb = 8 }, - -}; - // according to the QNN SDK Reference Guide, // CPU - Choose a non-quantized model.Quantized models are currently incompatible with the CPU backend // GPU - Choose a non-quantized model.Quantized models are currently incompatible with the GPU backend @@ -277,17 +262,6 @@ static ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = { GGML_CALL static const char *ggml_backend_qnn_buffer_type_name(ggml_backend_buffer_type_t buft) { return "QNN"; } -static void *ggml_qnn_host_malloc(size_t n) { - void *data = nullptr; - int result = posix_memalign((void **)&data, sysconf(_SC_PAGESIZE), n); - if (result != 0) { - QNN_LOG_WARN("%s: error: posix_memalign failed\n", __func__); - return nullptr; - } - - return data; -} - GGML_CALL static ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { ggml_backend_qnn_buffer_type_context *buft_ctx = 
(ggml_backend_qnn_buffer_type_context *)buft->context; diff --git a/ggml/src/ggml-qnn/utils.cpp b/ggml/src/ggml-qnn/utils.cpp index 2b594bfa0503b..11358395219ca 100644 --- a/ggml/src/ggml-qnn/utils.cpp +++ b/ggml/src/ggml-qnn/utils.cpp @@ -7,20 +7,6 @@ #include "qnn-types.hpp" -namespace { - -size_t memscpy(void *dst, size_t dst_size, const void *src, size_t copy_size) { - if (!dst || !src || !dst_size || !copy_size) return 0; - - size_t min_size = dst_size < copy_size ? dst_size : copy_size; - - memcpy(dst, src, min_size); - - return min_size; -} - -} // namespace - namespace qnn { // TODO: mapping more ggml data type to QNN data type @@ -166,105 +152,4 @@ const char *opname_from_ggmlop(enum ggml_op ggmlop) { return nullptr; } -void device_tensor_init(Qnn_Tensor_t &tensor, uint32_t rank, Qnn_TensorMemType_t mem_type, const char *tensor_name, - Qnn_TensorType_t qnn_tensor_type, Qnn_DataType_t qnn_data_type, uint32_t *dimensions) { - tensor = QNN_TENSOR_INIT; - tensor = { .version = QNN_TENSOR_VERSION_1, - { .v1 = { .id = 0, - .name = tensor_name, - .type = qnn_tensor_type, - .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, - .dataType = qnn_data_type, - .quantizeParams = { QNN_DEFINITION_UNDEFINED, - QNN_QUANTIZATION_ENCODING_UNDEFINED, - { .scaleOffsetEncoding = { .scale = 0.0000000000000000f, .offset = 0 } } }, - .rank = rank, - .dimensions = dimensions, - .memType = mem_type, - { .clientBuf = {} } } } }; -} - -Qnn_ErrorHandle_t device_tensor_deep_copy(const Qnn_Tensor_t &src, Qnn_Tensor_t &dst) { - Qnn_ErrorHandle_t err = validate_tensor_version(src); - if (err != QNN_SUCCESS) { - QNN_LOG_WARN("validate_tensor_version expected QNN_SUCCESS\n"); - return err; - } - - dst.version = src.version; - QNN_TENSOR_SET_NAME(dst, ::strndup(QNN_TENSOR_GET_NAME(src), std::string(QNN_TENSOR_GET_NAME(src)).size())); - if (nullptr == QNN_TENSOR_GET_NAME(dst)) { - return (Qnn_ErrorHandle_t)1; - } - QNN_TENSOR_SET_ID(dst, QNN_TENSOR_GET_ID(src)); - QNN_TENSOR_SET_TYPE(dst, QNN_TENSOR_GET_TYPE(src)); - QNN_TENSOR_SET_DATA_FORMAT(dst, QNN_TENSOR_GET_DATA_FORMAT(src)); - QNN_TENSOR_SET_DATA_TYPE(dst, QNN_TENSOR_GET_DATA_TYPE(src)); - QNN_TENSOR_SET_MEM_TYPE(dst, QNN_TENSOR_GET_MEM_TYPE(src)); - - if (QNN_TENSOR_GET_MEM_TYPE(src) == QNN_TENSORMEMTYPE_RAW) { - Qnn_ClientBuffer_t client_buf = { nullptr, 0 }; - QNN_TENSOR_SET_CLIENT_BUF(dst, client_buf); - } else if (QNN_TENSOR_GET_MEM_TYPE(src) == QNN_TENSORMEMTYPE_MEMHANDLE) { - QNN_TENSOR_SET_MEM_HANDLE(dst, nullptr); - } else { - return (Qnn_ErrorHandle_t)1; - } - - Qnn_QuantizeParams_t src_qparam = QNN_TENSOR_GET_QUANT_PARAMS(src); - Qnn_QuantizationEncoding_t encoding = src_qparam.quantizationEncoding; - if (encoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) { - Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; - Qnn_AxisScaleOffset_t &axis_scale_offset = src_qparam_cpy.axisScaleOffsetEncoding; - Qnn_ScaleOffset_t **scaleOffset = &axis_scale_offset.scaleOffset; - size_t scaleOffsetSize = axis_scale_offset.numScaleOffsets * sizeof(Qnn_ScaleOffset_t); - *scaleOffset = (Qnn_ScaleOffset_t *)malloc(scaleOffsetSize); - memscpy(*scaleOffset, scaleOffsetSize, src_qparam.axisScaleOffsetEncoding.scaleOffset, scaleOffsetSize); - QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam_cpy); - } else if (encoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) { - Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; - Qnn_BwAxisScaleOffset_t &bwaxis_scale_offset = src_qparam_cpy.bwAxisScaleOffsetEncoding; - size_t scaleSize = bwaxis_scale_offset.numElements * sizeof(float); 
- float **scales = &bwaxis_scale_offset.scales; - int32_t **offsets = &bwaxis_scale_offset.offsets; - *scales = (float *)malloc(scaleSize); - memscpy(*scales, scaleSize, src_qparam.bwAxisScaleOffsetEncoding.scales, scaleSize); - - if (bwaxis_scale_offset.offsets != nullptr) { - size_t offsetSize = bwaxis_scale_offset.numElements * sizeof(int32_t); - *offsets = (int32_t *)malloc(offsetSize); - memscpy(*offsets, offsetSize, src_qparam.bwAxisScaleOffsetEncoding.offsets, offsetSize); - } - QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam_cpy); - } else { - QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam); - } - - uint32_t rank = QNN_TENSOR_GET_RANK(src); - QNN_TENSOR_SET_RANK(dst, rank); - size_t dim_size = rank * sizeof(uint32_t); - uint32_t *dimensions = (uint32_t *)malloc(dim_size); - if (dimensions == nullptr) { - QNN_LOG_WARN( - "deep_copy_qnn_tensors() allocation error while copying " - "tensor %s\n", - QNN_TENSOR_GET_NAME(src)); - return (Qnn_ErrorHandle_t)1; - } - memscpy(dimensions, dim_size, QNN_TENSOR_GET_DIMENSIONS(src), dim_size); - QNN_TENSOR_SET_DIMENSIONS(dst, dimensions); - - return err; -} - -void device_tensor_free(Qnn_Tensor_t &tensor) { - if (validate_tensor_version(tensor) != QNN_SUCCESS) { - QNN_LOG_WARN("validate_tensor_version expected QNN_SUCCESS\n"); - return; - } - - free((void *)QNN_TENSOR_GET_NAME(tensor)); - free(QNN_TENSOR_GET_DIMENSIONS(tensor)); -} - } // namespace qnn diff --git a/ggml/src/ggml-qnn/utils.hpp b/ggml/src/ggml-qnn/utils.hpp index b264f2326c7b2..d00673e9a47ce 100644 --- a/ggml/src/ggml-qnn/utils.hpp +++ b/ggml/src/ggml-qnn/utils.hpp @@ -176,13 +176,6 @@ inline void set_qnn_tensor_memhandle(Qnn_Tensor_t &tensor, Qnn_MemHandle_t handl Qnn_DataType_t device_datatype_from_ggml_datatype(ggml_type ggml_type); Qnn_TensorType_t device_tensortype_from_ggml_tensor(ggml_tensor *ggml_tensor); -void device_tensor_init(Qnn_Tensor_t &tensor, uint32_t rank, Qnn_TensorMemType_t mem_type, const char *tensor_name, - Qnn_TensorType_t qnn_tensor_type, Qnn_DataType_t qnn_data_type, uint32_t *dimensions); - -Qnn_ErrorHandle_t device_tensor_deep_copy(const Qnn_Tensor_t &src, Qnn_Tensor_t &dst); - -void device_tensor_free(Qnn_Tensor_t &tensor); - #if ENABLE_QNNBACKEND_PERF class qnn_perf { public: From eed960575fa8a4819c9a0e240a302ab9f1119a77 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 17 Jul 2024 19:43:01 +0800 Subject: [PATCH 076/166] add build step of QNN backend at ggml --- CMakeLists.txt | 1 + ggml/CMakeLists.txt | 3 ++- ggml/src/CMakeLists.txt | 28 ++++++++++++++++++++++++++++ 3 files changed, 31 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 67dcf86d4fab7..1afc63c639089 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -106,6 +106,7 @@ llama_option_depr(WARNING LLAMA_NATIVE GGML_NATIVE) llama_option_depr(WARNING LLAMA_RPC GGML_RPC) llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL) llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16) +llama_option_depr(WARNING LLAMA_QNN GGML_QNN) # # build the library diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 649ac3dcc4f63..294653804b5f8 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -145,6 +145,7 @@ option(GGML_SYCL "ggml: use SYCL" option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF) set (GGML_SYCL_TARGET "INTEL" CACHE STRING "ggml: sycl target device") +option(GGML_QNN "ggml: use QNN" OFF) # extra artifacts option(GGML_BUILD_TESTS "ggml: build tests" ${GGML_STANDALONE}) @@ -157,7 +158,7 @@ option(GGML_BUILD_EXAMPLES "ggml: build examples" 
${GGML_STANDALONE}) set(CMAKE_C_STANDARD 11) set(CMAKE_C_STANDARD_REQUIRED true) -if (GGML_SYCL) +if (GGML_SYCL OR GGML_QNN) set(CMAKE_CXX_STANDARD 17) else() set(CMAKE_CXX_STANDARD 11) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index cbadaf4d931c3..e2ba88a1781e0 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -770,6 +770,33 @@ if (GGML_CPU_HBM) target_link_libraries(ggml PUBLIC memkind) endif() +if (GGML_QNN) + if (CMAKE_SYSTEM_NAME STREQUAL "Android") + find_library(LOG_LIB log) + find_library(ANDROID_LIB android) + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${LOG_LIB} ${ANDROID_LIB}) + else() + message(FATAL_ERROR "QNN now only available on Android") + endif() + + if (NOT DEFINED GGML_QNN_SDK_PATH) + # try read from environment variable + if (DEFINED ENV{QNN_SDK_PATH}) + set(GGML_QNN_SDK_PATH $ENV{QNN_SDK_PATH}) + else() + message(FATAL_ERROR "GGML_QNN_SDK_PATH not defined") + endif() + endif() + + message("QNN_SDK_PATH: ${GGML_QNN_SDK_PATH}") + file(GLOB GGML_SOURCES_QNN "ggml-qnn/*.cpp") + list(APPEND GGML_SOURCES_QNN "ggml-qnn.cpp") + set(GGML_HEADERS_QNN ../include/ggml-qnn.h) + set(QNN_INC_PATH ${GGML_QNN_SDK_PATH}/include/QNN) + set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${QNN_INC_PATH} "ggml-qnn") + list(APPEND GGML_CDEF_PUBLIC GGML_USE_QNN) +endif() + function(get_flags CCID CCVER) set(C_FLAGS "") set(CXX_FLAGS "") @@ -1184,6 +1211,7 @@ add_library(ggml ${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM} ${GGML_SOURCES_BLAS} ${GGML_HEADERS_BLAS} ${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE} + ${GGML_SOURCES_QNN} ${GGML_HEADERS_QNN} ggml-aarch64.c ggml-aarch64.h ) From 454deef83c14ae33543d289ac40d3a6ad277a3cf Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 17 Jul 2024 20:53:53 +0800 Subject: [PATCH 077/166] register qnn backend --- ggml/include/ggml-qnn.h | 2 -- ggml/src/ggml-backend.c | 5 +++++ ggml/src/ggml-qnn.cpp | 4 +--- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/ggml/include/ggml-qnn.h b/ggml/include/ggml-qnn.h index 026c6ddf06672..2433af1668408 100644 --- a/ggml/include/ggml-qnn.h +++ b/ggml/include/ggml-qnn.h @@ -18,8 +18,6 @@ enum QNNBackend { // QNN and original GGML }; -GGML_API int ggml_backend_qnn_reg_devices(void); - /** * * @param device 0: QNN_BACKEND_CPU 1: QNN_BACKEND_GPU 2: diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.c index dbbaa3941febe..80272855de860 100644 --- a/ggml/src/ggml-backend.c +++ b/ggml/src/ggml-backend.c @@ -445,6 +445,11 @@ GGML_CALL static void ggml_backend_registry_init(void) { extern GGML_CALL void ggml_backend_kompute_reg_devices(void); ggml_backend_kompute_reg_devices(); #endif + +#ifdef GGML_USE_QNN + extern GGML_CALL void ggml_backend_qnn_reg_devices(void); + ggml_backend_qnn_reg_devices(); +#endif } GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) { diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index 13998a73ef7aa..f8031bb0fd516 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -534,13 +534,11 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char *qnn_lib_path) { return qnn_backend; } -int ggml_backend_qnn_reg_devices() { +extern "C" GGML_CALL void ggml_backend_qnn_reg_devices() { for (size_t idx = 0; idx < GGML_QNN_MAX_DEVICES; idx++) { char name[GGML_MAX_NAME]; ggml_backend_qnn_get_device_description(idx, name, GGML_MAX_NAME); ggml_backend_register(name, ggml_backend_qnn_reg_init, ggml_backend_qnn_buffer_type(idx), 
(void *)(intptr_t)idx); } - - return GGML_QNN_MAX_DEVICES; } From 2502b57203c69916eb7fde14ed46a3b2199ebbcc Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 17 Jul 2024 21:39:25 +0800 Subject: [PATCH 078/166] fix warnings --- ggml/src/ggml-qnn.cpp | 18 ++++++++++++------ ggml/src/ggml-qnn/graph.hpp | 3 +-- ggml/src/ggml-qnn/logger.cpp | 9 ++++++--- ggml/src/ggml-qnn/qnn-lib.hpp | 24 +++++++++++++++--------- ggml/src/ggml-qnn/utils.hpp | 2 +- 5 files changed, 35 insertions(+), 21 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index f8031bb0fd516..8ba258d632f38 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -173,7 +173,7 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context *ctx, const struct g return true; } -bool ggml_qnn_compute_forward(ggml_backend_qnn_context *ctx, struct ggml_tensor *tensor) { +static bool ggml_qnn_compute_forward(ggml_backend_qnn_context *ctx, struct ggml_tensor *tensor) { auto unary_op = qnn::ggml_qnn_unary_op_array()[tensor->op]; if (unary_op) { return unary_op(ctx, tensor->src[0], tensor); @@ -260,7 +260,10 @@ static ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = { /* .reset = */ nullptr, }; -GGML_CALL static const char *ggml_backend_qnn_buffer_type_name(ggml_backend_buffer_type_t buft) { return "QNN"; } +GGML_CALL static const char *ggml_backend_qnn_buffer_type_name(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + return "QNN"; +} GGML_CALL static ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { @@ -291,7 +294,10 @@ GGML_CALL static bool ggml_backend_qnn_buffer_is_host(ggml_backend_buffer_type_t return true; } -GGML_CALL static const char *ggml_backend_qnn_name(ggml_backend_t backend) { return "QNN"; } +GGML_CALL static const char *ggml_backend_qnn_name(ggml_backend_t backend) { + GGML_UNUSED(backend); + return "QNN"; +} GGML_CALL static void ggml_backend_qnn_free(ggml_backend_t backend) { QNN_LOG_INFO("enter %s", __func__); @@ -408,8 +414,6 @@ void ggml_backend_qnn_set_n_threads(ggml_backend_t backend, int n_threads) { ctx->threads = n_threads; } -const char *ggml_backend_qnn_get_name(ggml_backend_t backend) { return backend->iface.get_name(backend); } - int ggml_backend_qnn_get_device_count() { return GGML_QNN_MAX_DEVICES; } void ggml_backend_qnn_get_device_description(size_t dev_num, char *description, size_t description_size) { @@ -534,7 +538,9 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char *qnn_lib_path) { return qnn_backend; } -extern "C" GGML_CALL void ggml_backend_qnn_reg_devices() { +extern "C" GGML_CALL void ggml_backend_qnn_reg_devices(); + +GGML_CALL void ggml_backend_qnn_reg_devices() { for (size_t idx = 0; idx < GGML_QNN_MAX_DEVICES; idx++) { char name[GGML_MAX_NAME]; ggml_backend_qnn_get_device_description(idx, name, GGML_MAX_NAME); diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index e4900906ce3e9..9621ad1b4dd68 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -101,10 +101,9 @@ class ggml_qnn_graph { _tensor_inputs = tensor_inputs; _tensor_outputs = tensor_outputs; - Qnn_Param_t qnn_params[] = {}; Qnn_OpConfig_t op_config = { .version = QNN_OPCONFIG_VERSION_1, .v1 = { _graph_name.c_str(), QNN_OP_PACKAGE_NAME_QTI_AISW, op_name.c_str(), 0, - qnn_params, (uint32_t)_tensor_inputs.size(), _tensor_inputs.data(), + nullptr, (uint32_t)_tensor_inputs.size(), _tensor_inputs.data(), (uint32_t)_tensor_outputs.size(), _tensor_outputs.data() } }; auto 
error = _qnn_interface->qnn_graph_add_node(_graph_handle, op_config); if (error != QNN_SUCCESS) { diff --git a/ggml/src/ggml-qnn/logger.cpp b/ggml/src/ggml-qnn/logger.cpp index 8b74b90edf476..fc37161edba17 100644 --- a/ggml/src/ggml-qnn/logger.cpp +++ b/ggml/src/ggml-qnn/logger.cpp @@ -9,7 +9,8 @@ #include #endif -void qnn::internal_log(ggml_log_level level, const char *file, const char *func, int line, const char *format, ...) { +void qnn::internal_log(ggml_log_level level, const char * /*file*/, const char *func, int line, const char *format, + ...) { static std::mutex qnn_internal_log_mutex; static char s_qnn_internal_log_buf[QNN_LOGBUF_LEN]; @@ -32,8 +33,8 @@ void qnn::internal_log(ggml_log_level level, const char *file, const char *func, } } -void qnn::sdk_logcallback(const char *fmt, QnnLog_Level_t level, uint64_t timestamp, va_list argp) { #if ENABLE_QNNSDK_LOG +void qnn::sdk_logcallback(const char *fmt, QnnLog_Level_t level, uint64_t timestamp, va_list argp) { static std::mutex log_mutex; static unsigned char s_ggml_qnn_logbuf[QNN_LOGBUF_LEN]; @@ -67,5 +68,7 @@ void qnn::sdk_logcallback(const char *fmt, QnnLog_Level_t level, uint64_t timest vsnprintf(reinterpret_cast(s_ggml_qnn_logbuf), QNN_LOGBUF_LEN, fmt, argp); QNN_LOG_INFO("%8.1fms [%-7s] %s", ms, log_level_desc, s_ggml_qnn_logbuf); } -#endif } +#else +void qnn::sdk_logcallback(const char *, QnnLog_Level_t, uint64_t, va_list) {} +#endif diff --git a/ggml/src/ggml-qnn/qnn-lib.hpp b/ggml/src/ggml-qnn/qnn-lib.hpp index a676f989566e5..a46901695aa6d 100644 --- a/ggml/src/ggml-qnn/qnn-lib.hpp +++ b/ggml/src/ggml-qnn/qnn-lib.hpp @@ -47,6 +47,10 @@ Fn dl_sym_typed(dl_handler_t handle, const std::string &function_name) { // ref:https://github.com/pytorch/executorch/tree/main/backends/qualcomm // ================================================================================================= +// TODO: fix this for other compilers +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wextra-semi" + class qnn_system_interface { #define DEFINE_SHIM_FUNCTION_SYS_INTERFACE(F, pointer_name) \ @@ -176,12 +180,14 @@ class qnn_interface { const QnnInterface_t _qnn_interface = {}; }; +#pragma GCC diagnostic pop + class qnn_instance { public: using BackendIdType = decltype(QnnInterface_t{}.backendId); explicit qnn_instance(const std::string &lib_path, const std::string &backend_name, const std::string &model_name) : - _lib_path(std::move(lib_path)), _backend_name(std::move(backend_name)), _model_name(std::move(model_name)) {}; + _lib_path(std::move(lib_path)), _backend_name(std::move(backend_name)), _model_name(std::move(model_name)) {} ~qnn_instance() {} @@ -250,7 +256,7 @@ class qnn_instance { QNN_LOG_INFO("device counts %d", p_info->v1.numHwDevices); QnnDevice_HardwareDeviceInfo_t *infos = p_info->v1.hwDevices; QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = {}; - for (int i = 0; i < p_info->v1.numHwDevices; i++) { + for (uint32_t i = 0; i < p_info->v1.numHwDevices; i++) { QNN_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d", infos[i].v1.deviceId, infos[i].v1.deviceType, infos[i].v1.numCores); QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; @@ -464,17 +470,17 @@ class qnn_instance { return _qnn_interface; } - const Qnn_LogHandle_t get_qnn_log_handle() { return _qnn_log_handle; } + Qnn_LogHandle_t get_qnn_log_handle() { return _qnn_log_handle; } - const Qnn_ProfileHandle_t get_qnn_profile_handle() { return _qnn_profile_handle; } + Qnn_ProfileHandle_t get_qnn_profile_handle() { return _qnn_profile_handle; 
} - const Qnn_DeviceHandle_t get_qnn_device_handle() { return _qnn_device_handle; } + Qnn_DeviceHandle_t get_qnn_device_handle() { return _qnn_device_handle; } - const Qnn_BackendHandle_t get_qnn_backend_handle() { return _qnn_backend_handle; } + Qnn_BackendHandle_t get_qnn_backend_handle() { return _qnn_backend_handle; } - const Qnn_ContextHandle_t get_qnn_context_handle() { return _qnn_context_handle; } + Qnn_ContextHandle_t get_qnn_context_handle() { return _qnn_context_handle; } - const Qnn_GraphHandle_t get_qnn_graph_handle() { return _qnn_graph_handle; } + Qnn_GraphHandle_t get_qnn_graph_handle() { return _qnn_graph_handle; } int init_htp_perfinfra() { QnnDevice_Infrastructure_t device_infra = nullptr; @@ -779,7 +785,7 @@ class qnn_instance { return 0; } - int load_backend(std::string &lib_path, const QnnSaver_Config_t **saver_config) { + int load_backend(std::string &lib_path, const QnnSaver_Config_t ** /*saver_config*/) { Qnn_ErrorHandle_t error = QNN_SUCCESS; QNN_LOG_DEBUG("lib_path:%s\n", lib_path.c_str()); diff --git a/ggml/src/ggml-qnn/utils.hpp b/ggml/src/ggml-qnn/utils.hpp index d00673e9a47ce..e8f1bf71e88be 100644 --- a/ggml/src/ggml-qnn/utils.hpp +++ b/ggml/src/ggml-qnn/utils.hpp @@ -202,7 +202,7 @@ class qnn_perf { #else class qnn_perf { public: - qnn_perf(const std::string &perf_name) {} + qnn_perf(const std::string &) {} ~qnn_perf() { info(); } qnn_perf() = delete; qnn_perf(const qnn_perf &) = delete; From b7d781ec81eb2bdeedabdf540fdbec37cfb02e90 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 17 Jul 2024 23:08:16 +0800 Subject: [PATCH 079/166] remove qnn dedicated unit tests since we're now using the `test-backend-ops` to cross-validate backend ops --- tests/ggml-qnn/CMakeLists.txt | 68 --- tests/ggml-qnn/ggml-qnn-ut-build-run.sh | 207 --------- tests/ggml-qnn/ggml-qnn-ut.cpp | 544 ------------------------ 3 files changed, 819 deletions(-) delete mode 100644 tests/ggml-qnn/CMakeLists.txt delete mode 100755 tests/ggml-qnn/ggml-qnn-ut-build-run.sh delete mode 100644 tests/ggml-qnn/ggml-qnn-ut.cpp diff --git a/tests/ggml-qnn/CMakeLists.txt b/tests/ggml-qnn/CMakeLists.txt deleted file mode 100644 index f9678d3d88f00..0000000000000 --- a/tests/ggml-qnn/CMakeLists.txt +++ /dev/null @@ -1,68 +0,0 @@ -cmake_minimum_required(VERSION 3.22.1) -project(ggml-qnn-test) - -set(CMAKE_VERBOSE_MAKEFILE on) -set(CMAKE_CXX_STANDARD 17) -set(CMAKE_CXX_STANDARD_REQUIRED ON) -set(CMAKE_POSITION_INDEPENDENT_CODE ON) - -#set to OFF if target Android phone is not equipped with Qualcomm Snapdragon 8 Gen 3 -set(TARGET_SNAPDRAGON_8_GEN3 ON) - -set(QNN_INC_PATH ${QNN_SDK_PATH}/include/QNN) -set(QNN_LIB_PATH ${QNN_SDK_PATH}/lib/aarch64-android) - -include_directories(${QNN_INC_PATH}) -include_directories(../../ggml/include) # ggml.h, ggml-qnn.h - -set(SOURCE_FILES - ../../ggml/src/ggml.c - ../../ggml/src/ggml-alloc.c - ../../ggml/src/ggml-backend.c - ../../ggml/src/ggml-quants.c - ../../ggml/src/ggml-qnn/qnn-lib.cpp - ../../ggml/src/ggml-qnn/logger.cpp - ../../ggml/src/ggml-qnn/utils.cpp - ../../ggml/src/ggml-qnn/backend-ops.cpp - ../../ggml/src/ggml-qnn.cpp - ggml-qnn-ut.cpp -) - - -message("QNN_SDK_PATH : ${QNN_SDK_PATH}") -message("QNN_INC_PATH : ${QNN_INC_PATH}") -message("QNN_LIB_PATH : ${QNN_LIB_PATH}") - -add_definitions(-D__ARM_NEON) -add_definitions(-DGGML_USE_QNN) - -if(CMAKE_BUILD_TYPE STREQUAL "Release") - add_definitions(-DNDEBUG) - add_definitions(-O3) -else() - add_definitions(-O3) -endif() - -if (TARGET_SNAPDRAGON_8_GEN3) - # the below build optimization only verified and works 
well on Qualcomm SM8650-AB Snapdragon 8 Gen 3 - add_definitions(-march=armv8.7-a) - add_definitions(-mcpu=cortex-x1) - add_definitions(-mtune=cortex-x1) -else() - # the below build optimization might be works well on ALL Android phone equipped with Qualcomm mainstream mobile SoC - add_definitions(-mcpu=cortex-a72) -endif() - -add_compile_options("-Wall" "-Wno-sign-compare") - -find_library(LOG_LIB log) - -link_libraries(${LOG_LIB} android) - -add_executable(${TARGET_NAME} - ${SOURCE_FILES} -) - -target_include_directories(${TARGET_NAME} PRIVATE - ../../ggml/src/ggml-qnn/ -) diff --git a/tests/ggml-qnn/ggml-qnn-ut-build-run.sh b/tests/ggml-qnn/ggml-qnn-ut-build-run.sh deleted file mode 100755 index e12b987b8d69d..0000000000000 --- a/tests/ggml-qnn/ggml-qnn-ut-build-run.sh +++ /dev/null @@ -1,207 +0,0 @@ -#!/bin/bash - -set -e - -#https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct -#https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools -#QNN SDK released on 20240531 -QNN_SDK_PATH=/opt/qcom/aistack/qairt/2.23.0.240531/ - -ANDROID_NDK=`pwd`/android-ndk-r26c -ANDROID_PLATFORM=android-34 - -GGML_QNN_UT=ggml-qnn-ut -REMOTE_PATH=/data/local/tmp/ -BUILDTYPE=Release -BUILDTYPE=Debug - - -function dump_vars() -{ - echo -e "ANDROID_NDK: ${ANDROID_NDK}" - echo -e "QNN_SDK_PATH: ${QNN_SDK_PATH}" -} - - -function show_pwd() -{ - echo -e "current working path:$(pwd)\n" -} - - -function check_qnn_sdk() -{ - if [ ! -d ${QNN_SDK_PATH} ]; then - echo -e "QNN_SDK_PATH ${QNN_SDK_PATH} not exist, pls check or download it from https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct...\n" - exit 1 - fi -} - - -function check_and_download_ndk() -{ - is_android_ndk_exist=1 - - if [ ! -d ${ANDROID_NDK} ]; then - is_android_ndk_exist=0 - fi - - if [ ! -f ${ANDROID_NDK}/build/cmake/android.toolchain.cmake ]; then - is_android_ndk_exist=0 - fi - - if [ ${is_android_ndk_exist} -eq 0 ]; then - - if [ ! -f android-ndk-r26c-linux.zip ]; then - wget --no-config --quiet --show-progress -O android-ndk-r26c-linux.zip https://dl.google.com/android/repository/android-ndk-r26c-linux.zip - fi - - unzip android-ndk-r26c-linux.zip - - if [ $? -ne 0 ]; then - printf "failed to download android ndk to %s \n" "${ANDROID_NDK}" - exit 1 - fi - - printf "android ndk saved to ${ANDROID_NDK} \n\n" - else - printf "android ndk already exist:${ANDROID_NDK} \n\n" - fi -} - - -function build_arm64 -{ - cmake -H. 
-B./out/arm64-v8a -DTARGET_NAME=${GGML_QNN_UT} -DCMAKE_BUILD_TYPE=${BUILDTYPE} -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=${ANDROID_PLATFORM} -DANDROID_NDK=${ANDROID_NDK} -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake -DQNN_SDK_PATH=${QNN_SDK_PATH} - - cd ./out/arm64-v8a - make - - ls -lah ${GGML_QNN_UT} - /bin/cp ${GGML_QNN_UT} ../../ - cd - -} - - -function remove_temp_dir() -{ - if [ -d out ]; then - echo "remove out directory in `pwd`" - rm -rf out - fi -} - - -function update_qnn_libs() -{ - check_qnn_sdk - - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnSystem.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnCpu.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnGpu.so ${REMOTE_PATH}/ - - #the QNN NPU(aka HTP) backend only verified on Qualcomm Snapdragon 8 Gen 3 equipped Android phone - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtp.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpNetRunExtensions.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpPrepare.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpV75Stub.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so ${REMOTE_PATH}/ -} - - -function check_qnn_libs() -{ - #reuse the cached qnn libs in Android phone - adb shell ls ${REMOTE_PATH}/libQnnCpu.so - if [ $? -eq 0 ]; then - printf "QNN libs already exist on Android phone\n" - else - update_qnn_libs - fi -} - - -function build_ggml_qnn_ut() -{ - show_pwd - check_and_download_ndk - check_qnn_sdk - dump_vars - remove_temp_dir - build_arm64 -} - - -function run_ggml_qnn_ut() -{ - check_qnn_libs - - #upload the latest ggml_qnn_test - adb push ${GGML_QNN_UT} ${REMOTE_PATH} - adb shell chmod +x ${REMOTE_PATH}/${GGML_QNN_UT} - - case "$ggmlop" in - GGML_OP_ADD) - adb shell ${REMOTE_PATH}/${GGML_QNN_UT} -t GGML_OP_ADD -b $qnnbackend - ;; - - GGML_OP_MUL_MAT) - adb shell ${REMOTE_PATH}/${GGML_QNN_UT} -t GGML_OP_MUL_MAT -b $qnnbackend - ;; - - *) - printf " \n$arg not supported currently\n" - show_usage - exit 1 - ;; - esac -} - - -function show_usage() -{ - echo "Usage:" - echo " $0 build (build Android command line UT program)" - echo " $0 updateqnnlibs (upload the latest QNN libs to Android phone)" - echo " $0 GGML_OP_ADD 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU) / 3(ggml)" - echo " $0 GGML_OP_MUL_MAT 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU) / 3(ggml)" - echo -e "\n\n\n" -} - - -unset ggmlop -unset qnnbackend - -check_qnn_sdk - -if [ $# == 0 ]; then - show_usage - exit 1 -elif [ $# == 1 ]; then - if [ "$1" == "-h" ]; then - #avoid upload command line program to Android phone in this scenario - show_usage - exit 1 - elif [ "$1" == "help" ]; then - #avoid upload command line program to Android phone in this scenario - show_usage - exit 1 - elif [ "$1" == "build" ]; then - build_ggml_qnn_ut - exit 0 - elif [ "$1" == "updateqnnlibs" ]; then - update_qnn_libs - exit 0 - else - ggmlop=$1 - qnnbackend=0 - run_ggml_qnn_ut - fi -elif [ $# == 2 ]; then - ggmlop=$1 - qnnbackend=$2 - run_ggml_qnn_ut -else - show_usage - exit 1 -fi diff --git a/tests/ggml-qnn/ggml-qnn-ut.cpp b/tests/ggml-qnn/ggml-qnn-ut.cpp deleted file mode 100644 index 71cb86a71bdf1..0000000000000 --- a/tests/ggml-qnn/ggml-qnn-ut.cpp +++ /dev/null @@ -1,544 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include 
-#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "ggml.h" - -#include "ggml-alloc.h" -#include "ggml-backend.h" -#include "ggml-qnn.h" - -#include "logger.hpp" - -static const char *get_qnn_backend_name(int n_backend_type) { - switch (n_backend_type) { - case QNN_BACKEND_CPU: - return "QNN-CPU"; - case QNN_BACKEND_GPU: - return "QNN-GPU"; - case QNN_BACKEND_NPU: - return "QNN-NPU"; - case QNN_BACKEND_GGML: - return "ggml"; - default: - return "unknown"; - } -} - -static bool ggml_graph_compute_helper(struct ggml_backend *backend, struct ggml_cgraph *graph, - std::vector &buf, int n_threads, ggml_abort_callback abort_callback, - void *abort_callback_data) { - struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); - - plan.abort_callback = abort_callback; - plan.abort_callback_data = abort_callback_data; - - if (plan.work_size > 0) { - buf.resize(plan.work_size); - plan.work_data = buf.data(); - } - - if (ggml_backend_is_cpu(backend)) { - ggml_backend_cpu_set_n_threads(backend, n_threads); - } - -#ifdef GGML_USE_QNN - if (ggml_backend_is_qnn(backend)) { - ggml_backend_qnn_set_n_threads(backend, n_threads); - } -#endif - - if (nullptr != backend) - return ggml_backend_graph_compute(backend, graph) == GGML_STATUS_SUCCESS; - else - return ggml_graph_compute(graph, &plan); -} - -#define QK8_0 32 - -typedef struct { - uint16_t d; // delta - int8_t qs[QK8_0]; // quants -} block_q8_0; - -static inline float ggml_compute_fp16_to_fp32(uint16_t h) { - __fp16 tmp; - memcpy(&tmp, &h, sizeof(uint16_t)); - return (float)tmp; -} - -#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) - -#define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor) - -static void tensor_dump(const ggml_tensor *tensor, const char *name) { - QNN_LOG_INFO("dump ggml tensor %s(%s): type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 - ", nb = (%5zi, %5zi, %5zi)\n", - name, tensor->name, tensor->type, ggml_type_name(tensor->type), tensor->ne[0], tensor->ne[1], - tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]); - - float value = 0; - std::ostringstream tmposs; - if (nullptr == tensor) { - QNN_LOG_WARN("tensor is null"); - return; - } - - if (tensor->type == GGML_TYPE_I8) { - for (int h = 0; h < tensor->ne[3]; h++) { - for (int i = 0; i < tensor->ne[2]; i++) { - for (int j = 0; j < tensor->ne[1]; j++) { - for (int k = 0; k < tensor->ne[0]; k++) { - value = ((int8_t *)tensor->data)[h * tensor->ne[2] + i * tensor->ne[1] + j * tensor->ne[0] + k]; - tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value << " "; - } - tmposs << "\n"; - } - } - } - if (strlen(tmposs.str().c_str()) <= (QNN_LOGBUF_LEN - 96)) { - QNN_LOG_INFO("\n%s\n", tmposs.str().c_str()); - tmposs.clear(); - tmposs.str(""); - } - } - - if (tensor->type == GGML_TYPE_F32) { - for (int h = 0; h < tensor->ne[3]; h++) { - for (int i = 0; i < tensor->ne[2]; i++) { - for (int j = 0; j < tensor->ne[1]; j++) { - for (int k = 0; k < tensor->ne[0]; k++) { - value = ((float *)tensor->data)[h * tensor->ne[2] + i * tensor->ne[1] + j * tensor->ne[0] + k]; - tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value << " "; - } - tmposs << "\n"; - } - } - } - if (strlen(tmposs.str().c_str()) <= (QNN_LOGBUF_LEN - 96)) { - QNN_LOG_INFO("\n%s\n", tmposs.str().c_str()); - tmposs.clear(); - tmposs.str(""); - } - } - - if (tensor->type == GGML_TYPE_F16) { - for (int h = 0; h < 
tensor->ne[3]; h++) { - for (int i = 0; i < tensor->ne[2]; i++) { - for (int j = 0; j < tensor->ne[1]; j++) { - for (int k = 0; k < tensor->ne[0]; k++) { - unsigned short tmpvalue = - ((unsigned short *) - tensor->data)[h * tensor->ne[2] + i * tensor->ne[1] + j * tensor->ne[0] + k]; - value = GGML_FP16_TO_FP32(tmpvalue); - tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value << " "; - } - tmposs << "\n"; - } - } - } - if (strlen(tmposs.str().c_str()) <= (QNN_LOGBUF_LEN - 96)) { - QNN_LOG_INFO("\n%s\n", tmposs.str().c_str()); - tmposs.clear(); - tmposs.str(""); - } - } - - if (tensor->type == GGML_TYPE_Q8_0) { - block_q8_0 *tmp = ((block_q8_0 *)tensor->data); - for (int j = 0; j < tensor->ne[1]; j++) { - int n = tensor->ne[0] / QK8_0; // blocks per row - for (int z = 0; z < n; z++) { - const float d = GGML_FP16_TO_FP32(tmp[j * n + z].d); - for (int k = 0; k < QK8_0; k++) { - value = tmp[j * n + z].qs[k] * d; - tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value << " "; - } - } - tmposs << "\n"; - } - if (strlen(tmposs.str().c_str()) <= (QNN_LOGBUF_LEN - 96)) { - QNN_LOG_INFO("\n%s\n", tmposs.str().c_str()); - tmposs.clear(); - tmposs.str(""); - } - } -} - -static uint32_t get_tensor_rank(const ggml_tensor *tensor) { - uint32_t rank = 0; - for (int i = 0; i < GGML_MAX_DIMS; i++) { - if ((0 != tensor->ne[i]) && (1 != tensor->ne[i])) { - rank++; - } - } - return rank; -} - -static uint32_t get_tensor_data_size(const ggml_tensor *tensor) { -#if ENABLE_QNNSDK_LOG - size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); - size_t n_dims = get_tensor_rank(tensor); - for (size_t i = 1; i < n_dims; i++) { - data_size *= tensor->ne[i]; - } - - QNN_LOG_DEBUG("get_tensor_data_size %d", data_size); - QNN_LOG_DEBUG("ggml_nbytes(tensor) %d", ggml_nbytes(tensor)); -#endif - - return ggml_nbytes(tensor); -} - -// ref: https://github.com/ggerganov/llama.cpp/blob/master/tests/test-backend-ops.cpp#L20 -static void init_tensor_uniform(ggml_tensor *tensor, float min = -1.0f, float max = 1.0f) { - size_t size = ggml_nelements(tensor); - std::vector data(size); - for (size_t i = 0; i < size; i++) { - data[i] = i + 1; - } - - if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) { -#ifdef GGML_USE_QNN - memcpy((char *)tensor->data, data.data(), size * sizeof(float)); -#else - ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float)); -#endif - } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) { - GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0); - std::vector dataq(ggml_row_size(tensor->type, size)); - std::vector imatrix(tensor->ne[0], 1.0f); // dummy importance matrix - const float *im = imatrix.data(); - if (!ggml_quantize_requires_imatrix(tensor->type)) { - // when the imatrix is optional, we want to test both quantization with and without imatrix - // use one of the random numbers to decide - if (data[0] > 0.5f * (min + max)) { - im = nullptr; - } - } - ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size / tensor->ne[0], tensor->ne[0], im); - GGML_ASSERT(ggml_validate_row_data(tensor->type, dataq.data(), dataq.size())); -#ifdef GGML_USE_QNN - memcpy((char *)tensor->data, dataq.data(), dataq.size()); -#else - ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size()); -#endif - } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) { - // This is going to create some weird integers though. 
-#ifdef GGML_USE_QNN - memcpy((char *)tensor->data, data.data(), ggml_nbytes(tensor)); -#else - ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor)); -#endif - } else { - GGML_ASSERT(false); - } -} - -// ref: https://github.com/ggerganov/llama.cpp/blob/master/tests/test-backend-ops.cpp#L310 -static void initialize_tensors(ggml_context *ctx) { - for (ggml_tensor *t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { - init_tensor_uniform(t); - } -} - -static void show_usage() { - printf( - " " - "\nUsage: test_qnn_ops [options]\n" - "\n" - "Options:\n" - " -t GGML_OP_ADD / GGML_OP_MULMAT\n" - " -b 0(QNN_CPU) 1(QNN_GPU) 2(QNN_NPU) 3(ggml)\n" - " ?/h print usage infomation\n\n"); -} - -typedef ggml_tensor *(*ggml_op_unary_t)(ggml_context *ctx, ggml_tensor *a); - -typedef ggml_tensor *(*ggml_op_binary_t)(ggml_context *ctx, ggml_tensor *a, ggml_tensor *b); - -static constexpr const ggml_op_unary_t kUnaryOps[] = { - nullptr, // GGML_OP_NONE - nullptr, // GGML_OP_DUP - nullptr, // GGML_OP_ADD - nullptr, // GGML_OP_ADD1 - nullptr, // GGML_OP_ACC - nullptr, // GGML_OP_SUB - nullptr, // GGML_OP_MUL - nullptr, // GGML_OP_DIV - nullptr, // GGML_OP_SQR - ggml_sqrt, // GGML_OP_SQRT - ggml_log, // GGML_OP_LOG - nullptr, // GGML_OP_SUM - nullptr, // GGML_OP_SUM_ROWS - nullptr, // GGML_OP_MEAN - nullptr, // GGML_OP_ARGMAX - nullptr, // GGML_OP_REPEAT - nullptr, // GGML_OP_REPEAT_BACK - nullptr, // GGML_OP_CONCAT - nullptr, // GGML_OP_SILU_BACK - nullptr, // GGML_OP_NORM - nullptr, // GGML_OP_RMS_NORM - nullptr, // GGML_OP_RMS_NORM_BACK - nullptr, // GGML_OP_GROUP_NORM - nullptr, // GGML_OP_MUL_MAT -}; - -static constexpr const ggml_op_binary_t kBinaryOps[] = { - nullptr, // GGML_OP_NONE - nullptr, // GGML_OP_DUP - ggml_add, // GGML_OP_ADD - nullptr, // GGML_OP_ADD1 - nullptr, // GGML_OP_ACC - ggml_sub, // GGML_OP_SUB - ggml_mul, // GGML_OP_MUL - ggml_div, // GGML_OP_DIV - nullptr, // GGML_OP_SQR - nullptr, // GGML_OP_SQRT - nullptr, // GGML_OP_LOG - nullptr, // GGML_OP_SUM - nullptr, // GGML_OP_SUM_ROWS - nullptr, // GGML_OP_MEAN - nullptr, // GGML_OP_ARGMAX - nullptr, // GGML_OP_REPEAT - nullptr, // GGML_OP_REPEAT_BACK - nullptr, // GGML_OP_CONCAT - nullptr, // GGML_OP_SILU_BACK - nullptr, // GGML_OP_NORM - nullptr, // GGML_OP_RMS_NORM - nullptr, // GGML_OP_RMS_NORM_BACK - nullptr, // GGML_OP_GROUP_NORM - ggml_mul_mat, // GGML_OP_MUL_MAT -}; - -static_assert(kBinaryOps[GGML_OP_MUL_MAT] == ggml_mul_mat, "ggml_mul_mat at wrong index, check kBinaryOps"); - -static void qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type, ggml_type qtype, - std::vector &results) { - int64_t n_begin_time = 0LL; - int64_t n_end_time = 0LL; - int64_t n_duration = 0LL; - size_t ctx_size = 0; - int sizey = 4; - int sizex = 4; - - struct ggml_context *ctx = nullptr; - struct ggml_cgraph *gf = nullptr; - struct ggml_tensor *src0 = nullptr; - struct ggml_tensor *src1 = nullptr; - struct ggml_tensor *dst = nullptr; - ggml_backend_t backend = nullptr; - ggml_backend_buffer_t buffer = nullptr; - - std::vector work_buffer; - QNN_LOG_DEBUG("enter qnn_ggml_op\n"); - QNN_LOG_DEBUG("ggml op:%d(%s)\n", n_ggml_op_type, ggml_op_name((enum ggml_op)n_ggml_op_type)); - - n_begin_time = ggml_time_us(); - - ctx_size += 1024 * 1024 * 32; - QNN_LOG_DEBUG("Allocating Memory of size %zi bytes, %zi MB\n", ctx_size, (ctx_size / 1024 / 1024)); - - struct ggml_init_params params = { /*.mem_size =*/ctx_size, - /*.mem_buffer =*/NULL, - /* no_alloc =*/0 }; - - if (n_backend_type != QNN_BACKEND_GGML) { - 
params.no_alloc = true; - backend = ggml_backend_qnn_init(n_backend_type, "/data/local/tmp/"); - if (nullptr == backend) { - QNN_LOG_ERROR("create qnn backend %d(%s) failed\n", n_backend_type, get_qnn_backend_name(n_backend_type)); - return; - } - } - - ctx = ggml_init(params); - if (!ctx) { - QNN_LOG_ERROR("%s: ggml_init() failed\n"); - return; - } - - QNN_LOG_DEBUG("creating new tensors\n"); - QNN_LOG_DEBUG("ggml_blck_size(%s) %d\n", ggml_type_name(qtype), ggml_blck_size(qtype)); - QNN_LOG_DEBUG("ggml_type_size(%s) %d\n", ggml_type_name(qtype), ggml_type_size(qtype)); - if (ggml_is_quantized(qtype)) { - sizex = ggml_blck_size(qtype); - - if (n_ggml_op_type == GGML_OP_MUL_MAT) { - sizex = ggml_blck_size(qtype) * 2; - } - } - QNN_LOG_DEBUG("sizex: %d\n", sizex); - QNN_LOG_DEBUG("sizey: %d\n", sizey); - - src0 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey); - src1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); - - ggml_set_input(src0); - ggml_set_input(src1); - - auto unary_op = kUnaryOps[n_ggml_op_type]; - auto binary_op = kBinaryOps[n_ggml_op_type]; - if (unary_op) { - dst = unary_op(ctx, src0); - } else if (binary_op) { - dst = binary_op(ctx, src0, src1); - } else { - QNN_LOG_WARN("ggml op %d(%s) not supported", n_ggml_op_type, ggml_op_name((enum ggml_op)n_ggml_op_type)); - ggml_free(ctx); - ggml_backend_free(backend); - return; - } - - ggml_set_output(dst); -#ifdef GGML_USE_QNN - if (n_backend_type != QNN_BACKEND_GGML) { - buffer = ggml_backend_alloc_ctx_tensors(ctx, backend); - if (!buffer) { - QNN_LOG_ERROR("%s: failed to allocate backend buffer\n", __func__); - ggml_free(ctx); - ggml_backend_free(backend); - return; - } - } -#endif - - QNN_LOG_DEBUG("creating compute graph\n"); - gf = ggml_new_graph(ctx); - ggml_build_forward_expand(gf, dst); - - initialize_tensors(ctx); - - ggml_graph_compute_helper(backend, gf, work_buffer, num_threads, nullptr, nullptr); - - if (get_tensor_data_size(dst) < (32 * 32)) { - QNN_LOG_DEBUG("dump tensors:\n"); - TENSOR_DUMP(src0); - TENSOR_DUMP(src1); - TENSOR_DUMP(dst); - } else { - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 - ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], - src0->nb[0], src0->nb[1], src0->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 - ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], - src1->nb[0], src1->nb[1], src1->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 - ", nb = (%5zi, %5zi, %5zi)\n", - dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], - dst->nb[1], dst->nb[2]); - } - - results.resize(ggml_nbytes(dst)); - memcpy(results.data(), ggml_get_data(dst), ggml_nbytes(dst)); - ggml_free(ctx); - ggml_backend_buffer_free(buffer); - ggml_backend_free(backend); - - n_end_time = ggml_time_us(); - n_duration = (n_end_time - n_begin_time) / 1000; - QNN_LOG_DEBUG("duration of ut GGML_OP_%s using QNN backend %s: %lld milliseconds\n", - ggml_op_name((enum ggml_op)n_ggml_op_type), get_qnn_backend_name(n_backend_type), n_duration); -} - -#define DEFINE_OP(op) { #op, op } - -static const std::unordered_map kMapStringToGGMLOp = { - DEFINE_OP(GGML_OP_ADD), DEFINE_OP(GGML_OP_SUB), DEFINE_OP(GGML_OP_MUL), DEFINE_OP(GGML_OP_DIV), - DEFINE_OP(GGML_OP_SQRT), DEFINE_OP(GGML_OP_MUL_MAT), DEFINE_OP(GGML_OP_LOG), -}; - 
-#define CONSOLE_RED "\033[31m" -#define CONSOLE_GREEN "\033[32m" -#define CONSOLE_RESET "\033[0m" - -int main(int argc, char *argv[]) { - int num_threads = 4; - int n_backend_type = QNN_BACKEND_CPU; - int n_ggml_op_type = GGML_OP_ADD; - - for (int i = 1; i < argc; i++) { - if (0 == strcmp(argv[i], "-t")) { - if (i + 1 < argc) { - auto it = kMapStringToGGMLOp.find(argv[i + 1]); - if (it != kMapStringToGGMLOp.end()) { - n_ggml_op_type = it->second; - } else { - show_usage(); - return 1; - } - i++; - } - } else if (0 == strcmp(argv[i], "-b")) { - if (i + 1 < argc) { - int backend = atoi(argv[i + 1]); - if (backend <= QNN_BACKEND_GGML) - n_backend_type = backend; - else { - show_usage(); - return 1; - } - i++; - } - } else { - show_usage(); - return 1; - } - } - - QNN_LOG_DEBUG("enter qnn_ggml_op\n"); - QNN_LOG_DEBUG("backend %d, ggml op:%d(%s)", n_backend_type, n_ggml_op_type, - ggml_op_name((enum ggml_op)n_ggml_op_type)); - - std::vector results; - qnn_op_ut(num_threads, n_backend_type, n_ggml_op_type, GGML_TYPE_F32, results); - std::vector cpu_results; - qnn_op_ut(num_threads, QNN_BACKEND_GGML, n_ggml_op_type, GGML_TYPE_F32, cpu_results); - - // TODO: theoretically, the results should be the same, but the results may be different due to the different hardware - // a better way to compare the results is to compare the floating point numbers with allowed error - if (results == cpu_results) { - QNN_LOG_INFO(CONSOLE_GREEN "[Success] results equal to CPU backend!" CONSOLE_RESET); - return 0; - } else { - QNN_LOG_ERROR(CONSOLE_RED "[Failed] results mismatch with CPU backend!" CONSOLE_RESET); - return 1; - } -} From 6457a68bd7273eef0843d3ed6faf70f7012d0731 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 17 Jul 2024 23:24:29 +0800 Subject: [PATCH 080/166] disable qnn profiling in release build --- ggml/src/ggml-qnn/qnn-lib.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ggml/src/ggml-qnn/qnn-lib.hpp b/ggml/src/ggml-qnn/qnn-lib.hpp index a46901695aa6d..136b1af08b086 100644 --- a/ggml/src/ggml-qnn/qnn-lib.hpp +++ b/ggml/src/ggml-qnn/qnn-lib.hpp @@ -882,7 +882,11 @@ class qnn_instance { QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; +#ifdef NDEBUG + qnn::sdk_profile_level _profile_level = qnn::sdk_profile_level::profile_off; +#else qnn::sdk_profile_level _profile_level = qnn::sdk_profile_level::profile_detail; +#endif std::shared_ptr _qnn_sys_interface; std::shared_ptr _qnn_interface; From c76fc9aa2f8585d9840366f1dad387eae30b2c4c Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 17 Jul 2024 23:30:14 +0800 Subject: [PATCH 081/166] fix warnings --- ggml/src/ggml-qnn.cpp | 7 +++---- ggml/src/ggml-qnn/graph.hpp | 8 ++++---- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index 8ba258d632f38..0e5e86e4add4e 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -134,8 +134,7 @@ struct ggml_backend_qnn_buffer_type_context { // implementation of QNN backend for GGML // // ================================================================================================= -static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context *ctx, const struct ggml_tensor *tensor, - bool b_dump_tensor_info) { +static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context *ctx, const struct ggml_tensor *tensor) { if (ggml_is_empty(tensor) || (!qnn::ggml_qnn_unary_op_array()[tensor->op] && !qnn::ggml_qnn_binary_op_array()[tensor->op])) { return false; @@ -353,13 +352,13 @@ GGML_CALL static ggml_status 
ggml_backend_qnn_graph_compute(ggml_backend_t backe GGML_CALL static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const ggml_tensor *op) { ggml_backend_qnn_context *ctx = (ggml_backend_qnn_context *)backend->context; - return (ggml_qnn_can_handle_op(ctx, op, false)); + return ggml_qnn_can_handle_op(ctx, op); } GGML_CALL static bool ggml_backend_qnn_offload_op(ggml_backend_t backend, const ggml_tensor *tensor) { ggml_backend_qnn_context *ctx = (ggml_backend_qnn_context *)backend->context; - return ggml_qnn_can_handle_op(ctx, tensor, false); + return ggml_qnn_can_handle_op(ctx, tensor); } static ggml_backend_i ggml_backend_qnn_interface = { diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index 9621ad1b4dd68..462ed92034b2c 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -101,10 +101,10 @@ class ggml_qnn_graph { _tensor_inputs = tensor_inputs; _tensor_outputs = tensor_outputs; - Qnn_OpConfig_t op_config = { .version = QNN_OPCONFIG_VERSION_1, - .v1 = { _graph_name.c_str(), QNN_OP_PACKAGE_NAME_QTI_AISW, op_name.c_str(), 0, - nullptr, (uint32_t)_tensor_inputs.size(), _tensor_inputs.data(), - (uint32_t)_tensor_outputs.size(), _tensor_outputs.data() } }; + Qnn_OpConfig_t op_config = { /*.version = */ QNN_OPCONFIG_VERSION_1, + /*.v1 = */ { _graph_name.c_str(), QNN_OP_PACKAGE_NAME_QTI_AISW, op_name.c_str(), 0, + nullptr, (uint32_t)_tensor_inputs.size(), _tensor_inputs.data(), + (uint32_t)_tensor_outputs.size(), _tensor_outputs.data() } }; auto error = _qnn_interface->qnn_graph_add_node(_graph_handle, op_config); if (error != QNN_SUCCESS) { QNN_LOG_ERROR("graphAddNode.error = %d\n", error); From ce199b2de788a5e314761f29868d724070beb254 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 17 Jul 2024 23:43:22 +0800 Subject: [PATCH 082/166] refactoring: downgrade some log to debug level --- ggml/src/ggml-qnn/qnn-lib.hpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-qnn/qnn-lib.hpp b/ggml/src/ggml-qnn/qnn-lib.hpp index 136b1af08b086..6d0ee05671a8c 100644 --- a/ggml/src/ggml-qnn/qnn-lib.hpp +++ b/ggml/src/ggml-qnn/qnn-lib.hpp @@ -2,6 +2,7 @@ #include +#include #include #include #include @@ -366,8 +367,8 @@ class qnn_instance { size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); for (size_t idx = 0; idx < probe_counts; idx++) { rpc_buffer = static_cast(alloc_rpcmem(probe_slots[idx] * size_in_mb, 4)); - if (nullptr == rpc_buffer) { - QNN_LOG_INFO("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], strerror(errno)); + if (!rpc_buffer) { + QNN_LOG_DEBUG("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], strerror(errno)); break; } else { candidate_size = probe_slots[idx]; @@ -375,7 +376,8 @@ class qnn_instance { rpc_buffer = nullptr; } } - if (candidate_size > _rpcmem_capacity) _rpcmem_capacity = candidate_size; + + _rpcmem_capacity = std::max(candidate_size, _rpcmem_capacity); QNN_LOG_INFO("capacity of QNN rpc ion memory is about %d MB\n", _rpcmem_capacity); if (0 != init_htp_perfinfra()) { @@ -600,7 +602,7 @@ class qnn_instance { auto allocate_bytes = static_cast(bytes + alignment); void *buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, allocate_bytes); if (buf == nullptr) { - QNN_LOG_WARN("failed to allocate rpc memory\n"); + QNN_LOG_WARN("failed to allocate rpc memory, size: %d MB\n", (int)(allocate_bytes / (1 << 20))); return nullptr; } From d82b3a0bdb3ad491e22b4a5b182ff75a5a0597d3 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Thu, 18 Jul 2024 10:25:45 
+0800 Subject: [PATCH 083/166] feat: add GGML_UNARY_OP_GELU --- ggml/src/ggml-qnn.cpp | 25 ++++++++------ ggml/src/ggml-qnn/backend-ops.cpp | 56 ++++++++++++++++++++++++------- ggml/src/ggml-qnn/backend-ops.hpp | 4 ++- 3 files changed, 60 insertions(+), 25 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index 0e5e86e4add4e..282a3d85941b8 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -135,14 +135,19 @@ struct ggml_backend_qnn_buffer_type_context { // // ================================================================================================= static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context *ctx, const struct ggml_tensor *tensor) { - if (ggml_is_empty(tensor) || - (!qnn::ggml_qnn_unary_op_array()[tensor->op] && !qnn::ggml_qnn_binary_op_array()[tensor->op])) { + if (ggml_is_empty(tensor)) { + return false; + } + + if (!qnn::ggml_qnn_unary_op_array()[tensor->op] && !qnn::ggml_qnn_binary_op_array()[tensor->op] && + (tensor->op != GGML_OP_UNARY || + qnn::ggml_qnn_unary_op_array()[qnn::kGgmlUnaryOpStart + ggml_get_unary_op(tensor)])) { return false; } const struct ggml_tensor *src0 = tensor->src[0]; const struct ggml_tensor *src1 = tensor->src[1]; - if (nullptr == src0 || nullptr == src1) { + if (!src0 || !src1) { return false; } @@ -162,18 +167,16 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context *ctx, const struct g } } - if (tensor->op == GGML_OP_MUL_MAT) { - if (ne00 <= 32 || ne01 <= 32 || ne10 <= 32 || ne11 <= 32) { - // comment it for make UT of mul_mat with QNN RPC happy - // return false; - } - } - return true; } static bool ggml_qnn_compute_forward(ggml_backend_qnn_context *ctx, struct ggml_tensor *tensor) { - auto unary_op = qnn::ggml_qnn_unary_op_array()[tensor->op]; + size_t unary_op_idx = tensor->op; + if (tensor->op == GGML_OP_UNARY) { + unary_op_idx = qnn::kGgmlUnaryOpStart + ggml_get_unary_op(tensor); + } + + auto unary_op = qnn::ggml_qnn_unary_op_array()[unary_op_idx]; if (unary_op) { return unary_op(ctx, tensor->src[0], tensor); } diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index e1a8c4da5ed40..6367e7c7064d1 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -158,12 +158,16 @@ qnn::ggml_qnn_binary_graph_cache_t &get_qnn_graph_cache(ggml_backend_qnn_context template qnn::ggml_qnn_graph<_InputSize, _OutputSize> *get_qnn_graph_from_cache( - ggml_backend_qnn_context *ctx, ggml_op op, const std::string &qnn_op, + ggml_backend_qnn_context *ctx, size_t op, const std::string &qnn_op, const std::array &inputs, const std::array &outputs) { using graph_t = qnn::ggml_qnn_graph<_InputSize, _OutputSize>; + GGML_ASSERT(op < (GGML_OP_COUNT + GGML_UNARY_OP_COUNT)); + auto &graph_cache = get_qnn_graph_cache(ctx, inputs, outputs); - const std::string graph_key(ggml_op_name(op)); + const auto *op_name = op < qnn::kGgmlUnaryOpStart ? 
ggml_op_name(ggml_op(op)) + : ggml_unary_op_name(ggml_unary_op(op - qnn::kGgmlUnaryOpStart)); + const std::string graph_key(op_name); auto it = graph_cache.find(graph_key); graph_t *graph_ptr = nullptr; if (it != graph_cache.end()) { @@ -276,10 +280,27 @@ constexpr const char *kGgmlOpToQnnOp[] = { nullptr, // GGML_OP_CROSS_ENTROPY_LOSS nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK + + // ggml_unary_op + nullptr, // GGML_UNARY_OP_ABS + nullptr, // GGML_UNARY_OP_SGN + nullptr, // GGML_UNARY_OP_NEG + nullptr, // GGML_UNARY_OP_STEP + nullptr, // GGML_UNARY_OP_TANH + nullptr, // GGML_UNARY_OP_ELU + nullptr, // GGML_UNARY_OP_RELU + nullptr, // GGML_UNARY_OP_SIGMOID + QNN_OP_GELU, // GGML_UNARY_OP_GELU + nullptr, // GGML_UNARY_OP_GELU_QUICK + nullptr, // GGML_UNARY_OP_SILU + nullptr, // GGML_UNARY_OP_HARDSWISH + nullptr, // GGML_UNARY_OP_HARDSIGMOID }; -static_assert(sizeof(kGgmlOpToQnnOp) / sizeof(kGgmlOpToQnnOp[0]) == GGML_OP_COUNT, - "GGML_OP_COUNT does not match the size of the ops table"); +static_assert(sizeof(kGgmlOpToQnnOp) / sizeof(kGgmlOpToQnnOp[0]) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), + "GGML_OP_COUNT does not match the size of the kGgmlOpToQnnOp table"); +static_assert(kGgmlOpToQnnOp[GGML_UNARY_OP_GELU + qnn::kGgmlUnaryOpStart] != nullptr, + "GGML_UNARY_OP_GELU does not correspond to QNN_OP_GELU"); template bool qnn_binary_op_impl(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, @@ -288,9 +309,6 @@ bool qnn_binary_op_impl(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, CHECK_PARAMS(ctx, src0, src1, dst); - qnn::qnn_perf perf(ggml_op_name(_GgmlOp)); - perf.start(); - bool succeed = false; qnn::ggml_qnn_graph_binary *graph_ptr = get_qnn_graph_from_cache<2, 1>(ctx, _GgmlOp, kGgmlOpToQnnOp[_GgmlOp], { src0, src1 }, { dst }); @@ -307,15 +325,12 @@ bool qnn_binary_op_impl(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, return succeed; } -template +template bool qnn_unary_op_impl(ggml_backend_qnn_context *ctx, const ggml_tensor *src, ggml_tensor *dst) { static_assert(kGgmlOpToQnnOp[_GgmlOp] != nullptr, "GGML_OP does not have a corresponding QNN_OP"); CHECK_PARAMS(ctx, src, dst); - qnn::qnn_perf perf(ggml_op_name(_GgmlOp)); - perf.start(); - bool succeed = false; auto *graph_ptr = get_qnn_graph_from_cache<1, 1>(ctx, _GgmlOp, kGgmlOpToQnnOp[_GgmlOp], { src }, { dst }); if (graph_ptr) { @@ -416,10 +431,25 @@ qnn::ggml_qnn_unary_op_array_t qnn::ggml_qnn_unary_op_array() { nullptr, // GGML_OP_CROSS_ENTROPY_LOSS nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK + + // ggml_unary_op + nullptr, // GGML_UNARY_OP_ABS + nullptr, // GGML_UNARY_OP_SGN + nullptr, // GGML_UNARY_OP_NEG + nullptr, // GGML_UNARY_OP_STEP + nullptr, // GGML_UNARY_OP_TANH + nullptr, // GGML_UNARY_OP_ELU + nullptr, // GGML_UNARY_OP_RELU + nullptr, // GGML_UNARY_OP_SIGMOID + qnn_unary_op_impl, // GGML_UNARY_OP_GELU + nullptr, // GGML_UNARY_OP_GELU_QUICK + nullptr, // GGML_UNARY_OP_SILU + nullptr, // GGML_UNARY_OP_HARDSWISH + nullptr, // GGML_UNARY_OP_HARDSIGMOID }; - static_assert(sizeof(kQnnOpsTable) / sizeof(kQnnOpsTable[0]) == GGML_OP_COUNT, - "GGML_OP_COUNT does not match the size of the ops table"); + static_assert(sizeof(kQnnOpsTable) / sizeof(kQnnOpsTable[0]) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), + "GGML_OP_COUNT does not match the size of the kQnnOpsTable table"); return kQnnOpsTable; } diff --git a/ggml/src/ggml-qnn/backend-ops.hpp b/ggml/src/ggml-qnn/backend-ops.hpp index 8d94fc6c25424..8cc2dc366fbfa 100644 --- a/ggml/src/ggml-qnn/backend-ops.hpp +++ 
b/ggml/src/ggml-qnn/backend-ops.hpp @@ -10,9 +10,11 @@ typedef bool (*ggml_qnn_unary_op_t)(ggml_backend_qnn_context *ctx, const ggml_te typedef bool (*ggml_qnn_binary_op_t)(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst); -typedef const ggml_qnn_unary_op_t (&ggml_qnn_unary_op_array_t)[GGML_OP_COUNT]; +typedef const ggml_qnn_unary_op_t (&ggml_qnn_unary_op_array_t)[GGML_OP_COUNT + GGML_UNARY_OP_COUNT]; typedef const ggml_qnn_binary_op_t (&ggml_qnn_binary_op_array_t)[GGML_OP_COUNT]; +constexpr const size_t kGgmlUnaryOpStart = GGML_OP_COUNT; + ggml_qnn_unary_op_array_t ggml_qnn_unary_op_array(); ggml_qnn_binary_op_array_t ggml_qnn_binary_op_array(); From 15f5cc450c53c890a4656b01bc3f220d3d27095a Mon Sep 17 00:00:00 2001 From: hongruichen Date: Thu, 18 Jul 2024 19:44:05 +0800 Subject: [PATCH 084/166] bug: fix allocation size overflow at log --- ggml/src/ggml-qnn/qnn-lib.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-qnn/qnn-lib.hpp b/ggml/src/ggml-qnn/qnn-lib.hpp index 6d0ee05671a8c..517df493ccb16 100644 --- a/ggml/src/ggml-qnn/qnn-lib.hpp +++ b/ggml/src/ggml-qnn/qnn-lib.hpp @@ -599,9 +599,9 @@ class qnn_instance { return nullptr; } - auto allocate_bytes = static_cast(bytes + alignment); - void *buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, allocate_bytes); - if (buf == nullptr) { + auto allocate_bytes = static_cast(bytes + alignment); + void *buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, (int)allocate_bytes); + if (!buf) { QNN_LOG_WARN("failed to allocate rpc memory, size: %d MB\n", (int)(allocate_bytes / (1 << 20))); return nullptr; } From 665f823748d13feab4cc747caec1d6896e83ec87 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Thu, 18 Jul 2024 20:26:05 +0800 Subject: [PATCH 085/166] fix op checker --- ggml/src/ggml-qnn.cpp | 88 ++++++++++++++++++++++--------------------- 1 file changed, 45 insertions(+), 43 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index 282a3d85941b8..3f228935c6fbb 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -134,42 +134,6 @@ struct ggml_backend_qnn_buffer_type_context { // implementation of QNN backend for GGML // // ================================================================================================= -static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context *ctx, const struct ggml_tensor *tensor) { - if (ggml_is_empty(tensor)) { - return false; - } - - if (!qnn::ggml_qnn_unary_op_array()[tensor->op] && !qnn::ggml_qnn_binary_op_array()[tensor->op] && - (tensor->op != GGML_OP_UNARY || - qnn::ggml_qnn_unary_op_array()[qnn::kGgmlUnaryOpStart + ggml_get_unary_op(tensor)])) { - return false; - } - - const struct ggml_tensor *src0 = tensor->src[0]; - const struct ggml_tensor *src1 = tensor->src[1]; - if (!src0 || !src1) { - return false; - } - - const auto ne00 = src0->ne[0]; - const auto ne01 = src0->ne[1]; - const auto ne10 = src1->ne[0]; - const auto ne11 = src1->ne[1]; - // make qnn_get_ggml_tensor_rank and QNN SDK happy - if (ne00 <= 1 || ne01 <= 1 || ne10 <= 1 || ne11 <= 1) { - return false; - } - - // TODO: support other quantized data type - if (ggml_is_quantized(src0->type)) { - if (src0->type != GGML_TYPE_Q8_0 && src0->type != GGML_TYPE_Q4_0) { - return false; - } - } - - return true; -} - static bool ggml_qnn_compute_forward(ggml_backend_qnn_context *ctx, struct ggml_tensor *tensor) { size_t unary_op_idx = tensor->op; if (tensor->op == GGML_OP_UNARY) { @@ -297,8 
+261,8 @@ GGML_CALL static bool ggml_backend_qnn_buffer_is_host(ggml_backend_buffer_type_t } GGML_CALL static const char *ggml_backend_qnn_name(ggml_backend_t backend) { - GGML_UNUSED(backend); - return "QNN"; + ggml_backend_qnn_context *ctx = (ggml_backend_qnn_context *)backend->context; + return g_qnn_mgr[ctx->device].name; } GGML_CALL static void ggml_backend_qnn_free(ggml_backend_t backend) { @@ -353,15 +317,53 @@ GGML_CALL static ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backe } GGML_CALL static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const ggml_tensor *op) { - ggml_backend_qnn_context *ctx = (ggml_backend_qnn_context *)backend->context; + GGML_UNUSED(backend); + + if (op->op == GGML_OP_NONE) { + return true; + } + + if (op->op == GGML_OP_UNARY) { + if (!qnn::ggml_qnn_unary_op_array()[qnn::kGgmlUnaryOpStart + ggml_get_unary_op(op)]) { + QNN_LOG_DEBUG("unsupported unary op %d", ggml_get_unary_op(op)); + return false; + } + + if (!op->src[0]) { + QNN_LOG_DEBUG("src0 is nullptr"); + return false; + } + } else { + if (!qnn::ggml_qnn_unary_op_array()[op->op] && !qnn::ggml_qnn_binary_op_array()[op->op]) { + QNN_LOG_DEBUG("unsupported op %d", op->op); + return false; + } - return ggml_qnn_can_handle_op(ctx, op); + if (!op->src[0] || !op->src[1]) { + QNN_LOG_DEBUG("src0 or src1 is nullptr"); + return false; + } + } + + switch (op->src[0]->type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_I8: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q4_0: + break; + default: + QNN_LOG_DEBUG("unsupported src0 type %d", op->src[0]->type); + return false; + } + + return true; } -GGML_CALL static bool ggml_backend_qnn_offload_op(ggml_backend_t backend, const ggml_tensor *tensor) { - ggml_backend_qnn_context *ctx = (ggml_backend_qnn_context *)backend->context; +GGML_CALL static bool ggml_backend_qnn_offload_op(ggml_backend_t backend, const ggml_tensor *op) { + GGML_UNUSED(backend); - return ggml_qnn_can_handle_op(ctx, tensor); + return op->ne[0] > 1 && op->ne[1] > 1; } static ggml_backend_i ggml_backend_qnn_interface = { From ce3d09e5f2bfa95ca448e7cc053040108d0373e3 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 19 Jul 2024 10:13:56 +0800 Subject: [PATCH 086/166] tried fix the add node error 6005 --- ggml/src/ggml-qnn.cpp | 6 +----- ggml/src/ggml-qnn/graph.hpp | 6 ++++-- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index 3f228935c6fbb..d62a8074ef823 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -319,10 +319,6 @@ GGML_CALL static ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backe GGML_CALL static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const ggml_tensor *op) { GGML_UNUSED(backend); - if (op->op == GGML_OP_NONE) { - return true; - } - if (op->op == GGML_OP_UNARY) { if (!qnn::ggml_qnn_unary_op_array()[qnn::kGgmlUnaryOpStart + ggml_get_unary_op(op)]) { QNN_LOG_DEBUG("unsupported unary op %d", ggml_get_unary_op(op)); @@ -333,7 +329,7 @@ GGML_CALL static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const QNN_LOG_DEBUG("src0 is nullptr"); return false; } - } else { + } else if (op->op != GGML_OP_NONE) { if (!qnn::ggml_qnn_unary_op_array()[op->op] && !qnn::ggml_qnn_binary_op_array()[op->op]) { QNN_LOG_DEBUG("unsupported op %d", op->op); return false; diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index 462ed92034b2c..5fe5dc83d3a72 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -102,8 
+102,9 @@ class ggml_qnn_graph { _tensor_outputs = tensor_outputs; Qnn_OpConfig_t op_config = { /*.version = */ QNN_OPCONFIG_VERSION_1, - /*.v1 = */ { _graph_name.c_str(), QNN_OP_PACKAGE_NAME_QTI_AISW, op_name.c_str(), 0, - nullptr, (uint32_t)_tensor_inputs.size(), _tensor_inputs.data(), + /*.v1 = */ { _graph_name.c_str(), QNN_OP_PACKAGE_NAME_QTI_AISW, op_name.c_str(), + (uint32_t)_param_types.size(), _param_types.data(), + (uint32_t)_tensor_inputs.size(), _tensor_inputs.data(), (uint32_t)_tensor_outputs.size(), _tensor_outputs.data() } }; auto error = _qnn_interface->qnn_graph_add_node(_graph_handle, op_config); if (error != QNN_SUCCESS) { @@ -154,6 +155,7 @@ class ggml_qnn_graph { Qnn_GraphHandle_t _graph_handle = nullptr; std::array _tensor_inputs; std::array _tensor_outputs; + std::vector _param_types; ggml_qnn_graph(const ggml_qnn_graph &) = delete; void operator=(const ggml_qnn_graph &) = delete; From f45fbec8f43a4c2bf50726fdf777255d232e56a7 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 19 Jul 2024 12:59:38 +0800 Subject: [PATCH 087/166] Revert "tried fix the add node error 6005" This reverts commit ce3d09e5f2bfa95ca448e7cc053040108d0373e3. --- ggml/src/ggml-qnn.cpp | 6 +++++- ggml/src/ggml-qnn/graph.hpp | 6 ++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index d62a8074ef823..3f228935c6fbb 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -319,6 +319,10 @@ GGML_CALL static ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backe GGML_CALL static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const ggml_tensor *op) { GGML_UNUSED(backend); + if (op->op == GGML_OP_NONE) { + return true; + } + if (op->op == GGML_OP_UNARY) { if (!qnn::ggml_qnn_unary_op_array()[qnn::kGgmlUnaryOpStart + ggml_get_unary_op(op)]) { QNN_LOG_DEBUG("unsupported unary op %d", ggml_get_unary_op(op)); @@ -329,7 +333,7 @@ GGML_CALL static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const QNN_LOG_DEBUG("src0 is nullptr"); return false; } - } else if (op->op != GGML_OP_NONE) { + } else { if (!qnn::ggml_qnn_unary_op_array()[op->op] && !qnn::ggml_qnn_binary_op_array()[op->op]) { QNN_LOG_DEBUG("unsupported op %d", op->op); return false; diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index 5fe5dc83d3a72..462ed92034b2c 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -102,9 +102,8 @@ class ggml_qnn_graph { _tensor_outputs = tensor_outputs; Qnn_OpConfig_t op_config = { /*.version = */ QNN_OPCONFIG_VERSION_1, - /*.v1 = */ { _graph_name.c_str(), QNN_OP_PACKAGE_NAME_QTI_AISW, op_name.c_str(), - (uint32_t)_param_types.size(), _param_types.data(), - (uint32_t)_tensor_inputs.size(), _tensor_inputs.data(), + /*.v1 = */ { _graph_name.c_str(), QNN_OP_PACKAGE_NAME_QTI_AISW, op_name.c_str(), 0, + nullptr, (uint32_t)_tensor_inputs.size(), _tensor_inputs.data(), (uint32_t)_tensor_outputs.size(), _tensor_outputs.data() } }; auto error = _qnn_interface->qnn_graph_add_node(_graph_handle, op_config); if (error != QNN_SUCCESS) { @@ -155,7 +154,6 @@ class ggml_qnn_graph { Qnn_GraphHandle_t _graph_handle = nullptr; std::array _tensor_inputs; std::array _tensor_outputs; - std::vector _param_types; ggml_qnn_graph(const ggml_qnn_graph &) = delete; void operator=(const ggml_qnn_graph &) = delete; From 0153a23d3f51f66eff0beceeac1cf287ddc66b7a Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 19 Jul 2024 15:22:23 +0800 Subject: [PATCH 088/166] fix support ops This reverts 
commit f45fbec8f43a4c2bf50726fdf777255d232e56a7. --- ggml/src/ggml-qnn.cpp | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index 3f228935c6fbb..e448d73821cb3 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -319,10 +319,6 @@ GGML_CALL static ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backe GGML_CALL static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const ggml_tensor *op) { GGML_UNUSED(backend); - if (op->op == GGML_OP_NONE) { - return true; - } - if (op->op == GGML_OP_UNARY) { if (!qnn::ggml_qnn_unary_op_array()[qnn::kGgmlUnaryOpStart + ggml_get_unary_op(op)]) { QNN_LOG_DEBUG("unsupported unary op %d", ggml_get_unary_op(op)); @@ -333,7 +329,7 @@ GGML_CALL static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const QNN_LOG_DEBUG("src0 is nullptr"); return false; } - } else { + } else if (op->op != GGML_OP_NONE) { if (!qnn::ggml_qnn_unary_op_array()[op->op] && !qnn::ggml_qnn_binary_op_array()[op->op]) { QNN_LOG_DEBUG("unsupported op %d", op->op); return false; @@ -345,7 +341,7 @@ GGML_CALL static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const } } - switch (op->src[0]->type) { + switch (op->type) { case GGML_TYPE_F32: case GGML_TYPE_F16: case GGML_TYPE_I8: From a607995f95adc182a2d519bc86021776c27d1708 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 19 Jul 2024 15:35:55 +0800 Subject: [PATCH 089/166] Reapply "tried fix the add node error 6005" This reverts commit f45fbec8f43a4c2bf50726fdf777255d232e56a7. --- ggml/src/ggml-qnn/graph.hpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index 462ed92034b2c..5fe5dc83d3a72 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -102,8 +102,9 @@ class ggml_qnn_graph { _tensor_outputs = tensor_outputs; Qnn_OpConfig_t op_config = { /*.version = */ QNN_OPCONFIG_VERSION_1, - /*.v1 = */ { _graph_name.c_str(), QNN_OP_PACKAGE_NAME_QTI_AISW, op_name.c_str(), 0, - nullptr, (uint32_t)_tensor_inputs.size(), _tensor_inputs.data(), + /*.v1 = */ { _graph_name.c_str(), QNN_OP_PACKAGE_NAME_QTI_AISW, op_name.c_str(), + (uint32_t)_param_types.size(), _param_types.data(), + (uint32_t)_tensor_inputs.size(), _tensor_inputs.data(), (uint32_t)_tensor_outputs.size(), _tensor_outputs.data() } }; auto error = _qnn_interface->qnn_graph_add_node(_graph_handle, op_config); if (error != QNN_SUCCESS) { @@ -154,6 +155,7 @@ class ggml_qnn_graph { Qnn_GraphHandle_t _graph_handle = nullptr; std::array _tensor_inputs; std::array _tensor_outputs; + std::vector _param_types; ggml_qnn_graph(const ggml_qnn_graph &) = delete; void operator=(const ggml_qnn_graph &) = delete; From b1b5cc10b1d3d922f9b8e0350458a4c7b3143815 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 19 Jul 2024 22:51:17 +0800 Subject: [PATCH 090/166] add function to convert qnn error into string --- ggml/src/ggml-qnn/graph.hpp | 14 ++++++++++++-- ggml/src/ggml-qnn/utils.cpp | 33 +++++++++++++++++++++++++++++++++ ggml/src/ggml-qnn/utils.hpp | 2 ++ 3 files changed, 47 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index 5fe5dc83d3a72..2d412dffd743a 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -108,13 +108,23 @@ class ggml_qnn_graph { (uint32_t)_tensor_outputs.size(), _tensor_outputs.data() } }; auto error = _qnn_interface->qnn_graph_add_node(_graph_handle, op_config); if (error != 
QNN_SUCCESS) { - QNN_LOG_ERROR("graphAddNode.error = %d\n", error); + auto *error_str = get_qnn_error_string(error); + if (error_str) { + QNN_LOG_ERROR("qnn_graph_add_node.error: %s\n", error_str); + } else { + QNN_LOG_ERROR("qnn_graph_add_node.error: %d\n", error); + } return false; } error = _qnn_interface->qnn_graph_finalize(_graph_handle, nullptr, nullptr); if (error != QNN_SUCCESS) { - QNN_LOG_ERROR("graphFinalize.error = %d\n", error); + auto *error_str = get_qnn_error_string(error); + if (error_str) { + QNN_LOG_ERROR("qnn_graph_finalize.error: %s\n", error_str); + } else { + QNN_LOG_ERROR("qnn_graph_finalize.error: %d\n", error); + } return false; } diff --git a/ggml/src/ggml-qnn/utils.cpp b/ggml/src/ggml-qnn/utils.cpp index 11358395219ca..e36142f283d0d 100644 --- a/ggml/src/ggml-qnn/utils.cpp +++ b/ggml/src/ggml-qnn/utils.cpp @@ -152,4 +152,37 @@ const char *opname_from_ggmlop(enum ggml_op ggmlop) { return nullptr; } +const char *get_qnn_error_string(Qnn_ErrorHandle_t error) { + switch (error) { + case QNN_SUCCESS: + return "QNN_SUCCESS"; + case QNN_GRAPH_ERROR_UNSUPPORTED_FEATURE: + return "QNN_GRAPH_ERROR_UNSUPPORTED_FEATURE"; + case QNN_GRAPH_ERROR_MEM_ALLOC: + return "QNN_GRAPH_ERROR_MEM_ALLOC"; + case QNN_GRAPH_ERROR_GENERAL: + return "QNN_GRAPH_ERROR_GENERAL"; + case QNN_GRAPH_ERROR_INVALID_ARGUMENT: + return "QNN_GRAPH_ERROR_INVALID_ARGUMENT"; + case QNN_GRAPH_ERROR_INVALID_HANDLE: + return "QNN_GRAPH_ERROR_INVALID_HANDLE"; + case QNN_GRAPH_ERROR_GRAPH_DOES_NOT_EXIST: + return "QNN_GRAPH_ERROR_GRAPH_DOES_NOT_EXIST"; + case QNN_GRAPH_ERROR_INVALID_NAME: + return "QNN_GRAPH_ERROR_INVALID_NAME"; + case QNN_GRAPH_ERROR_INVALID_TENSOR: + return "QNN_GRAPH_ERROR_INVALID_TENSOR"; + case QNN_GRAPH_ERROR_INVALID_OP_CONFIG: + return "QNN_GRAPH_ERROR_INVALID_OP_CONFIG"; + case QNN_GRAPH_ERROR_SET_PROFILE: + return "QNN_GRAPH_ERROR_SET_PROFILE"; + case QNN_GRAPH_ERROR_UNCONNECTED_NODE: + return "QNN_GRAPH_ERROR_UNCONNECTED_NODE"; + case QNN_GRAPH_ERROR_CREATE_FAILED: + return "QNN_GRAPH_ERROR_CREATE_FAILED"; + default: + return nullptr; + } +} + } // namespace qnn diff --git a/ggml/src/ggml-qnn/utils.hpp b/ggml/src/ggml-qnn/utils.hpp index e8f1bf71e88be..e91a5ae8730d6 100644 --- a/ggml/src/ggml-qnn/utils.hpp +++ b/ggml/src/ggml-qnn/utils.hpp @@ -27,6 +27,8 @@ void align_free(void *ptr); const char *opname_from_ggmlop(enum ggml_op ggmlop); +const char *get_qnn_error_string(Qnn_ErrorHandle_t error); + inline int validate_tensor_version(const Qnn_Tensor_t &tensor) { if (tensor.version != QNN_TENSOR_VERSION_1) { QNN_LOG_WARN("validate_tensor_version() tensor %s, got unsupported version %d\n", tensor.v1.name, From 1679dcf47ea660254a7b7ccdbbcbd4d858370d5c Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 19 Jul 2024 22:56:00 +0800 Subject: [PATCH 091/166] fix: check all dimentions in `can offload` --- ggml/src/ggml-qnn.cpp | 11 ++++++++++- ggml/src/ggml-qnn/utils.cpp | 2 ++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index e448d73821cb3..aadf53c35d872 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -359,7 +359,16 @@ GGML_CALL static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const GGML_CALL static bool ggml_backend_qnn_offload_op(ggml_backend_t backend, const ggml_tensor *op) { GGML_UNUSED(backend); - return op->ne[0] > 1 && op->ne[1] > 1; + size_t dims = ggml_n_dims(op); + bool can_offload = false; + for (size_t i = 0; i < dims; i++) { + if (op->ne[i] > 1) { + can_offload = true; + break; + } 
+ } + + return can_offload; } static ggml_backend_i ggml_backend_qnn_interface = { diff --git a/ggml/src/ggml-qnn/utils.cpp b/ggml/src/ggml-qnn/utils.cpp index e36142f283d0d..70a898b95a63a 100644 --- a/ggml/src/ggml-qnn/utils.cpp +++ b/ggml/src/ggml-qnn/utils.cpp @@ -153,6 +153,8 @@ const char *opname_from_ggmlop(enum ggml_op ggmlop) { } const char *get_qnn_error_string(Qnn_ErrorHandle_t error) { + // A complete list of error codes can be found at here: + // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/api_error_codes.html switch (error) { case QNN_SUCCESS: return "QNN_SUCCESS"; From 28a00e5e6c9cc691bca6c49d706b7fbf81ba9625 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sat, 20 Jul 2024 14:10:00 +0800 Subject: [PATCH 092/166] fix: try fix QNN_GRAPH_ERROR_INVALID_OP_CONFIG --- ggml/src/ggml-qnn.cpp | 6 ------ ggml/src/ggml-qnn/graph.hpp | 19 +++++++++++++------ 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index aadf53c35d872..3ca0dc607eb60 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -123,12 +123,6 @@ struct ggml_backend_qnn_buffer_type_context { std::string name; }; -// ================================================================================================= -// -// QNN backend internal helper functions -// -// ================================================================================================= - // ================================================================================================= // // implementation of QNN backend for GGML diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index 2d412dffd743a..30f96a994cd84 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -101,12 +101,19 @@ class ggml_qnn_graph { _tensor_inputs = tensor_inputs; _tensor_outputs = tensor_outputs; - Qnn_OpConfig_t op_config = { /*.version = */ QNN_OPCONFIG_VERSION_1, - /*.v1 = */ { _graph_name.c_str(), QNN_OP_PACKAGE_NAME_QTI_AISW, op_name.c_str(), - (uint32_t)_param_types.size(), _param_types.data(), - (uint32_t)_tensor_inputs.size(), _tensor_inputs.data(), - (uint32_t)_tensor_outputs.size(), _tensor_outputs.data() } }; - auto error = _qnn_interface->qnn_graph_add_node(_graph_handle, op_config); + Qnn_OpConfig_t config = QNN_OPCONFIG_INIT; + config.version = QNN_OPCONFIG_VERSION_1; + auto &op_config = config.v1; + op_config.name = _graph_name.c_str(); + op_config.packageName = QNN_OP_PACKAGE_NAME_QTI_AISW; + op_config.typeName = op_name.c_str(); + op_config.numOfParams = (uint32_t)_param_types.size(); + op_config.params = _param_types.data(); + op_config.numOfInputs = (uint32_t)_tensor_inputs.size(); + op_config.inputTensors = _tensor_inputs.data(); + op_config.numOfOutputs = (uint32_t)_tensor_outputs.size(); + op_config.outputTensors = _tensor_outputs.data(); + auto error = _qnn_interface->qnn_graph_add_node(_graph_handle, config); if (error != QNN_SUCCESS) { auto *error_str = get_qnn_error_string(error); if (error_str) { From 27299463ae74b8d72ce84780a61f25ad77634f0f Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sat, 20 Jul 2024 14:23:44 +0800 Subject: [PATCH 093/166] fix: try fix tensor type error --- ggml/src/ggml-qnn/backend-ops.cpp | 4 ++-- ggml/src/ggml-qnn/tensor.hpp | 10 +++++++--- ggml/src/ggml-qnn/utils.cpp | 2 +- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 6367e7c7064d1..1e79205986918 100644 --- 
a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -74,7 +74,7 @@ bool qnn_bind_tensors_to_graph(qnn::ggml_qnn_graph<_InputSize, _OutputSize> *gra std::array qnn_input_tensors; for (size_t i = 0; i < inputs.size(); ++i) { auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(inputs[i]); - if (!tensor || !tensor->bind_to_graph(*graph)) { + if (!tensor || !tensor->bind_to_graph(*graph, true)) { return false; } @@ -84,7 +84,7 @@ bool qnn_bind_tensors_to_graph(qnn::ggml_qnn_graph<_InputSize, _OutputSize> *gra std::array qnn_output_tensors; for (size_t i = 0; i < outputs.size(); ++i) { auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(outputs[i]); - if (!tensor || !tensor->bind_to_graph(*graph)) { + if (!tensor || !tensor->bind_to_graph(*graph, false)) { return false; } diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index e5dc436adaa5c..9137b5d86381c 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -43,7 +43,8 @@ class ggml_qnn_tensor { _dimensions[2] = (uint32_t)tensor->ne[2]; _dimensions[3] = (uint32_t)tensor->ne[3]; QNN_TENSOR_SET_DIMENSIONS(_qnn_tensor, _dimensions); - QNN_TENSOR_SET_TYPE(_qnn_tensor, device_tensortype_from_ggml_tensor(tensor)); + auto qnn_tensor_type = device_tensortype_from_ggml_tensor(tensor); + QNN_TENSOR_SET_TYPE(_qnn_tensor, qnn_tensor_type); QNN_TENSOR_SET_DATA_FORMAT(_qnn_tensor, QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER); QNN_TENSOR_SET_DATA_TYPE(_qnn_tensor, device_datatype_from_ggml_datatype(tensor->type)); // TODO: set the quantizeParams base on the tensor type @@ -54,11 +55,11 @@ class ggml_qnn_tensor { QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); tensor->extra = this; - QNN_LOG_DEBUG("create tensor %s with device %d", _tensor_name.c_str(), device); + QNN_LOG_DEBUG("create tensor %s, device: %d, qnn_type: %d", _tensor_name.c_str(), device, (int)qnn_tensor_type); } template - bool bind_to_graph(ggml_qnn_graph<_InputSize, _OutputSize> &graph) { + bool bind_to_graph(ggml_qnn_graph<_InputSize, _OutputSize> &graph, bool is_input) { if (!is_valid()) { QNN_LOG_WARN("tensor %s not valid", _tensor_name.c_str()); return false; @@ -75,6 +76,9 @@ class ggml_qnn_tensor { } } + Qnn_TensorType_t new_tensor_type = is_input ? 
QNN_TENSOR_TYPE_APP_WRITE : QNN_TENSOR_TYPE_APP_READ; + QNN_TENSOR_SET_TYPE(_qnn_tensor, new_tensor_type); + QNN_LOG_INFO("tensor %s changed to type %d", _tensor_name.c_str(), new_tensor_type); Qnn_Tensor_t tensor = _qnn_tensor; if (!graph.create_graph_tensor(tensor)) { QNN_LOG_WARN("create graph tensor failed, tensor %s", _tensor_name.c_str()); diff --git a/ggml/src/ggml-qnn/utils.cpp b/ggml/src/ggml-qnn/utils.cpp index 70a898b95a63a..820b72b8969f8 100644 --- a/ggml/src/ggml-qnn/utils.cpp +++ b/ggml/src/ggml-qnn/utils.cpp @@ -30,7 +30,7 @@ Qnn_DataType_t device_datatype_from_ggml_datatype(ggml_type ggml_type) { } Qnn_TensorType_t device_tensortype_from_ggml_tensor(ggml_tensor *ggml_tensor) { - Qnn_TensorType_t qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; + Qnn_TensorType_t qnn_tensor_type = QNN_TENSOR_TYPE_NATIVE; if (ggml_tensor->flags & GGML_TENSOR_FLAG_INPUT) { qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; From 51f95d698004def0202d642ac0fe7e49f566d13d Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sat, 20 Jul 2024 16:11:35 +0800 Subject: [PATCH 094/166] fix: dimension could be wrong for tensor liked 1x1x8 --- ggml/src/ggml-qnn/tensor.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index 9137b5d86381c..7fb71de38787f 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -48,7 +48,7 @@ class ggml_qnn_tensor { QNN_TENSOR_SET_DATA_FORMAT(_qnn_tensor, QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER); QNN_TENSOR_SET_DATA_TYPE(_qnn_tensor, device_datatype_from_ggml_datatype(tensor->type)); // TODO: set the quantizeParams base on the tensor type - QNN_TENSOR_SET_RANK(_qnn_tensor, qnn::get_ggml_tensor_rank(tensor)); + QNN_TENSOR_SET_RANK(_qnn_tensor, (uint32_t)ggml_n_dims(tensor)); QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); Qnn_ClientBuffer_t client_buf = {}; From 5f3b1ae3b0997c92399629ed9f98a2dff1b76eac Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sat, 20 Jul 2024 16:21:09 +0800 Subject: [PATCH 095/166] fix: try fix graph cache with append the tensors name --- ggml/src/ggml-qnn.cpp | 7 ++++++- ggml/src/ggml-qnn/backend-ops.cpp | 19 ++++++++++++------- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index 3ca0dc607eb60..46718af09dbcb 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -96,9 +96,14 @@ class ggml_backend_qnn_buffer_context { bool is_valid() const { return _buffer != nullptr; } bool init_tensor(ggml_tensor *tensor) { + if (qnn::ggml_qnn_tensor::from_ggml_tensor(tensor)) { + QNN_LOG_INFO("tensor %s already initialized", tensor->name); + return true; + } + auto qnn_tensor = std::make_unique(tensor, _device, _instance); if (!qnn_tensor->is_valid()) { - QNN_LOG_WARN("Create ggml_qnn_tensor failed"); + QNN_LOG_WARN("create ggml_qnn_tensor failed"); return false; } diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 1e79205986918..f6eb61731381d 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -167,19 +167,23 @@ qnn::ggml_qnn_graph<_InputSize, _OutputSize> *get_qnn_graph_from_cache( auto &graph_cache = get_qnn_graph_cache(ctx, inputs, outputs); const auto *op_name = op < qnn::kGgmlUnaryOpStart ? 
ggml_op_name(ggml_op(op)) : ggml_unary_op_name(ggml_unary_op(op - qnn::kGgmlUnaryOpStart)); - const std::string graph_key(op_name); + std::string graph_key(op_name); + for (auto &input : inputs) { + graph_key += "_"; + graph_key += input->name; + } + for (auto &output : outputs) { + graph_key += "_"; + graph_key += output->name; + } + auto it = graph_cache.find(graph_key); graph_t *graph_ptr = nullptr; if (it != graph_cache.end()) { graph_ptr = it->second.get(); } else { - std::string graph_name = graph_key + "_" + std::to_string(ctx->threads); - for (auto &input : inputs) { - graph_name += "_"; - graph_name += input->name; - } auto graph = - std::make_unique(graph_name, (QNNBackend)(ctx->device), ctx->instance->get_qnn_context_handle(), + std::make_unique(graph_key, (QNNBackend)(ctx->device), ctx->instance->get_qnn_context_handle(), ctx->qnn_interface, ctx->socinfo.vtcm_size_in_mb); if (!graph->is_valid()) { @@ -187,6 +191,7 @@ qnn::ggml_qnn_graph<_InputSize, _OutputSize> *get_qnn_graph_from_cache( } if (!qnn_bind_tensors_to_graph<_InputSize, _OutputSize>(graph.get(), qnn_op.c_str(), inputs, outputs)) { + QNN_LOG_ERROR("qnn_bind_tensors_to_graph failed\n"); return nullptr; } From b173c4e061b1a4bb3bd3ed2e4b968b51a143c4b5 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sat, 20 Jul 2024 16:57:51 +0800 Subject: [PATCH 096/166] feat: update tensor name when bind to graph --- ggml/src/ggml-qnn/backend-ops.cpp | 1 + ggml/src/ggml-qnn/tensor.hpp | 33 +++++++++++++++++++------------ 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index f6eb61731381d..6896454aa55e7 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -180,6 +180,7 @@ qnn::ggml_qnn_graph<_InputSize, _OutputSize> *get_qnn_graph_from_cache( auto it = graph_cache.find(graph_key); graph_t *graph_ptr = nullptr; if (it != graph_cache.end()) { + QNN_LOG_DEBUG("found graph %s in cache\n", graph_key.c_str()); graph_ptr = it->second.get(); } else { auto graph = diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index 7fb71de38787f..49e9258c38a60 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -29,14 +29,7 @@ class ggml_qnn_tensor { explicit ggml_qnn_tensor(ggml_tensor *tensor, QNNBackend device, std::shared_ptr qnn_instance) : _tensor(tensor), _device(device), _qnn_instance(qnn_instance) { - _tensor_name = ggml_get_name(tensor); - if (_tensor_name.empty()) { - static std::atomic_uint32_t unnamed_tensor_count = 0; - char buffer[GGML_MAX_NAME] = {}; - snprintf(buffer, sizeof(buffer), "unnamed_%d", (int)(unnamed_tensor_count++)); - _tensor_name = buffer; - } - + update_tensor_name(); QNN_TENSOR_SET_NAME(_qnn_tensor, _tensor_name.c_str()); _dimensions[0] = (uint32_t)tensor->ne[0]; _dimensions[1] = (uint32_t)tensor->ne[1]; @@ -79,6 +72,7 @@ class ggml_qnn_tensor { Qnn_TensorType_t new_tensor_type = is_input ? 
QNN_TENSOR_TYPE_APP_WRITE : QNN_TENSOR_TYPE_APP_READ; QNN_TENSOR_SET_TYPE(_qnn_tensor, new_tensor_type); QNN_LOG_INFO("tensor %s changed to type %d", _tensor_name.c_str(), new_tensor_type); + update_tensor_name(); Qnn_Tensor_t tensor = _qnn_tensor; if (!graph.create_graph_tensor(tensor)) { QNN_LOG_WARN("create graph tensor failed, tensor %s", _tensor_name.c_str()); @@ -116,15 +110,14 @@ class ggml_qnn_tensor { auto tensor_type = QNN_TENSOR_GET_TYPE(_qnn_tensor); if (tensor_type != QNN_TENSOR_TYPE_APP_WRITE && tensor_type != QNN_TENSOR_TYPE_APP_READWRITE) { - QNN_LOG_WARN("tensor %s not writable", _tensor_name.c_str()); - return false; + QNN_LOG_WARN("tensor %s type(%d) not WRITE", _tensor_name.c_str(), (int)tensor_type); } if (should_use_mem_handle()) { if (_qnn_rpc_buffer) { memcpy(_qnn_rpc_buffer, _tensor->data, ggml_nbytes(_tensor)); } else { - QNN_LOG_WARN("can't find rpcmem from qnn mem handle\n"); + QNN_LOG_WARN("tensor %s: can't find rpcmem from qnn mem handle\n", _tensor_name.c_str()); return false; } } @@ -142,8 +135,7 @@ class ggml_qnn_tensor { auto tensor_type = QNN_TENSOR_GET_TYPE(_qnn_tensor); if (tensor_type != QNN_TENSOR_TYPE_APP_READ && tensor_type != QNN_TENSOR_TYPE_APP_READWRITE) { - QNN_LOG_WARN("tensor %s not readable", _tensor_name.c_str()); - return false; + QNN_LOG_WARN("tensor %s type(%d) not READ", _tensor_name.c_str(), (int)tensor_type); } if (should_use_mem_handle()) { @@ -190,6 +182,21 @@ class ggml_qnn_tensor { bool should_use_mem_handle() const { return _device == QNN_BACKEND_NPU; } + void update_tensor_name() { + auto *tensor_name = ggml_get_name(_tensor); + if (!strnlen(tensor_name, GGML_MAX_NAME)) { + if (_tensor_name.empty()) { + static std::atomic_uint32_t unnamed_tensor_count = 0; + char buffer[GGML_MAX_NAME] = {}; + snprintf(buffer, sizeof(buffer), "unnamed_%d", (int)(unnamed_tensor_count++)); + _tensor_name = buffer; + } + } else { + QNN_LOG_DEBUG("tensor name changed: %s -> %s", _tensor_name.c_str(), tensor_name); + _tensor_name = tensor_name; + } + } + const ggml_tensor *_tensor; QNNBackend _device; std::shared_ptr _qnn_instance; From 3b47056c97a01fd176ce46ce969b95d71884919f Mon Sep 17 00:00:00 2001 From: hongruichen Date: Mon, 22 Jul 2024 12:45:26 +0800 Subject: [PATCH 097/166] refactoring: change the tensor binding mode between qnn tensor and ggml tensor --- ggml/src/ggml-qnn.cpp | 36 +------ ggml/src/ggml-qnn/backend-ops.cpp | 126 ++++------------------ ggml/src/ggml-qnn/backend-ops.hpp | 4 +- ggml/src/ggml-qnn/backend.hpp | 6 +- ggml/src/ggml-qnn/graph.hpp | 120 ++++++++++++++------- ggml/src/ggml-qnn/qnn-lib.hpp | 2 +- ggml/src/ggml-qnn/tensor.hpp | 169 +++++++++++++++--------------- ggml/src/ggml-qnn/utils.hpp | 109 ++++++++++--------- 8 files changed, 256 insertions(+), 316 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index 46718af09dbcb..87653cfb1f741 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -87,30 +87,12 @@ class ggml_backend_qnn_buffer_context { } ~ggml_backend_qnn_buffer_context() { - _tensors.clear(); - // the free will do nothing if the _buffer is nullptr qnn::align_free(_buffer); } bool is_valid() const { return _buffer != nullptr; } - bool init_tensor(ggml_tensor *tensor) { - if (qnn::ggml_qnn_tensor::from_ggml_tensor(tensor)) { - QNN_LOG_INFO("tensor %s already initialized", tensor->name); - return true; - } - - auto qnn_tensor = std::make_unique(tensor, _device, _instance); - if (!qnn_tensor->is_valid()) { - QNN_LOG_WARN("create ggml_qnn_tensor failed"); - return false; - } - - 
_tensors.push_back(std::move(qnn_tensor)); - return true; - } - void *get_buffer() { return _buffer; } size_t get_buffer_size() { return _buffer_size; } @@ -118,7 +100,6 @@ class ggml_backend_qnn_buffer_context { QNNBackend _device; std::shared_ptr _instance; std::string _name; - std::list> _tensors; void *_buffer = nullptr; size_t _buffer_size = 0; }; @@ -175,12 +156,9 @@ GGML_CALL static void *ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t bu } GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor) { - ggml_backend_qnn_buffer_context *ctx = (ggml_backend_qnn_buffer_context *)buffer->context; - - if (!ctx->init_tensor(tensor)) { - QNN_LOG_WARN("init ggml_qnn_tensor failed"); - return; - } + // Do nothing here, the qnn tensor will be create along with the graph. + GGML_UNUSED(buffer); + GGML_UNUSED(tensor); } GGML_CALL static void ggml_backend_qnn_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor, @@ -271,13 +249,7 @@ GGML_CALL static void ggml_backend_qnn_free(ggml_backend_t backend) { auto instance = g_qnn_mgr[ctx->device].instance; if (instance) { - ctx->qnn_unary_graph_cache.clear(); - for (const auto &graph_item : ctx->qnn_binary_graph_cache) { - QNN_LOG_INFO("graph type:%s", graph_item.first.c_str()); - } - - ctx->qnn_binary_graph_cache.clear(); - + ctx->qnn_graph_cache.clear(); instance->qnn_finalize(); g_qnn_mgr[ctx->device].instance.reset(); } diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 6896454aa55e7..bd87cfc9e66eb 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -19,10 +19,8 @@ bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src, } auto instance = ctx->instance; - auto *tensor0 = qnn::ggml_qnn_tensor::from_ggml_tensor(src); - auto *tensor1 = qnn::ggml_qnn_tensor::from_ggml_tensor(dst); - if (!instance || !tensor0 || !tensor1) { - QNN_LOG_WARN("invalid tensors\n"); + if (!instance) { + QNN_LOG_WARN("invalid instance\n"); return false; } @@ -37,11 +35,8 @@ bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, } auto instance = ctx->instance; - auto *tensor0 = qnn::ggml_qnn_tensor::from_ggml_tensor(src0); - auto *tensor1 = qnn::ggml_qnn_tensor::from_ggml_tensor(src1); - auto *tensor2 = qnn::ggml_qnn_tensor::from_ggml_tensor(dst); - if (!instance || !tensor0 || !tensor1 || !tensor2) { - QNN_LOG_WARN("invalid tensors\n"); + if (!instance) { + QNN_LOG_WARN("invalid instance\n"); return false; } @@ -67,104 +62,29 @@ void print_ggml_tensor(const ggml_tensor *tensor) { tensor->nb[0], tensor->nb[1], tensor->nb[2]); } -template -bool qnn_bind_tensors_to_graph(qnn::ggml_qnn_graph<_InputSize, _OutputSize> *graph, const std::string &op_name, - const std::array &inputs, - const std::array &outputs) { - std::array qnn_input_tensors; - for (size_t i = 0; i < inputs.size(); ++i) { - auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(inputs[i]); - if (!tensor || !tensor->bind_to_graph(*graph, true)) { - return false; - } - - qnn_input_tensors[i] = tensor->get_qnn_tensor(); - } - - std::array qnn_output_tensors; - for (size_t i = 0; i < outputs.size(); ++i) { - auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(outputs[i]); - if (!tensor || !tensor->bind_to_graph(*graph, false)) { - return false; - } - - qnn_output_tensors[i] = tensor->get_qnn_tensor(); - } - - if (!graph->add_nodes(op_name, qnn_input_tensors, qnn_output_tensors)) { - return false; - } - - return true; +template 
+qnn::ggml_tensor_array_t to_ggml_tensor_array(const std::array &array) { + return qnn::ggml_tensor_array_t(array.data(), array.data() + _Size); } template -bool execute_graph(qnn::ggml_qnn_graph<_InputSize, _OutputSize> *graph, - const std::array &inputs, +bool execute_graph(qnn::ggml_qnn_graph *graph, const std::array &inputs, const std::array &outputs) { - - std::array qnn_input_tensors; - for (size_t i = 0; i < inputs.size(); ++i) { - auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(inputs[i]); - if (!tensor || !tensor->write_to_qnn_tensor()) { - QNN_LOG_WARN("write_to_qnn_tensor failed\n"); - return false; - } - - qnn_input_tensors[i] = tensor->get_qnn_tensor(); - } - - std::array qnn_output_tensors; - for (size_t i = 0; i < outputs.size(); ++i) { - auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(outputs[i]); - if (!tensor) { - return false; - } - - qnn_output_tensors[i] = tensor->get_qnn_tensor(); - } - - if (!graph->execute(qnn_input_tensors, qnn_output_tensors)) { + if (!graph->execute(to_ggml_tensor_array<_InputSize>(inputs), to_ggml_tensor_array<_OutputSize>(outputs))) { QNN_LOG_WARN("execute failed\n"); return false; } - for (auto &output : outputs) { - auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(output); - if (!tensor || !tensor->read_from_qnn_tensor()) { - QNN_LOG_WARN("read_from_qnn_tensors failed\n"); - return false; - } - } - return true; } -qnn::ggml_qnn_unary_graph_cache_t &get_qnn_graph_cache(ggml_backend_qnn_context *ctx, - const std::array &inputs, - const std::array &outputs) { - GGML_UNUSED(inputs); - GGML_UNUSED(outputs); - return ctx->qnn_unary_graph_cache; -} - -qnn::ggml_qnn_binary_graph_cache_t &get_qnn_graph_cache(ggml_backend_qnn_context *ctx, - const std::array &inputs, - const std::array &outputs) { - GGML_UNUSED(inputs); - GGML_UNUSED(outputs); - return ctx->qnn_binary_graph_cache; -} - template -qnn::ggml_qnn_graph<_InputSize, _OutputSize> *get_qnn_graph_from_cache( - ggml_backend_qnn_context *ctx, size_t op, const std::string &qnn_op, - const std::array &inputs, const std::array &outputs) { - using graph_t = qnn::ggml_qnn_graph<_InputSize, _OutputSize>; - +qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_context *ctx, size_t op, const std::string &qnn_op, + const std::array &inputs, + const std::array &outputs) { GGML_ASSERT(op < (GGML_OP_COUNT + GGML_UNARY_OP_COUNT)); - auto &graph_cache = get_qnn_graph_cache(ctx, inputs, outputs); + auto &graph_cache = ctx->qnn_graph_cache; const auto *op_name = op < qnn::kGgmlUnaryOpStart ? 
ggml_op_name(ggml_op(op)) : ggml_unary_op_name(ggml_unary_op(op - qnn::kGgmlUnaryOpStart)); std::string graph_key(op_name); @@ -178,21 +98,21 @@ qnn::ggml_qnn_graph<_InputSize, _OutputSize> *get_qnn_graph_from_cache( } auto it = graph_cache.find(graph_key); - graph_t *graph_ptr = nullptr; + qnn::ggml_qnn_graph *graph_ptr = nullptr; if (it != graph_cache.end()) { QNN_LOG_DEBUG("found graph %s in cache\n", graph_key.c_str()); graph_ptr = it->second.get(); } else { - auto graph = - std::make_unique(graph_key, (QNNBackend)(ctx->device), ctx->instance->get_qnn_context_handle(), - ctx->qnn_interface, ctx->socinfo.vtcm_size_in_mb); + auto graph = std::make_unique(graph_key, (QNNBackend)(ctx->device), ctx->instance, + ctx->socinfo.vtcm_size_in_mb); if (!graph->is_valid()) { return nullptr; } - if (!qnn_bind_tensors_to_graph<_InputSize, _OutputSize>(graph.get(), qnn_op.c_str(), inputs, outputs)) { - QNN_LOG_ERROR("qnn_bind_tensors_to_graph failed\n"); + if (!graph->build_graph(qnn_op, to_ggml_tensor_array<_InputSize>(inputs), + to_ggml_tensor_array<_OutputSize>(outputs))) { + QNN_LOG_ERROR("build_graph failed\n"); return nullptr; } @@ -309,15 +229,13 @@ static_assert(kGgmlOpToQnnOp[GGML_UNARY_OP_GELU + qnn::kGgmlUnaryOpStart] != nul "GGML_UNARY_OP_GELU does not correspond to QNN_OP_GELU"); template -bool qnn_binary_op_impl(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) { +bool qnn_binary_op_impl(ggml_backend_qnn_context *ctx, ggml_tensor *src0, ggml_tensor *src1, ggml_tensor *dst) { static_assert(kGgmlOpToQnnOp[_GgmlOp] != nullptr, "GGML_OP does not have a corresponding QNN_OP"); CHECK_PARAMS(ctx, src0, src1, dst); bool succeed = false; - qnn::ggml_qnn_graph_binary *graph_ptr = - get_qnn_graph_from_cache<2, 1>(ctx, _GgmlOp, kGgmlOpToQnnOp[_GgmlOp], { src0, src1 }, { dst }); + auto *graph_ptr = get_qnn_graph_from_cache<2, 1>(ctx, _GgmlOp, kGgmlOpToQnnOp[_GgmlOp], { src0, src1 }, { dst }); if (graph_ptr) { succeed = execute_graph<2, 1>(graph_ptr, { src0, src1 }, { dst }); } @@ -332,7 +250,7 @@ bool qnn_binary_op_impl(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, } template -bool qnn_unary_op_impl(ggml_backend_qnn_context *ctx, const ggml_tensor *src, ggml_tensor *dst) { +bool qnn_unary_op_impl(ggml_backend_qnn_context *ctx, ggml_tensor *src, ggml_tensor *dst) { static_assert(kGgmlOpToQnnOp[_GgmlOp] != nullptr, "GGML_OP does not have a corresponding QNN_OP"); CHECK_PARAMS(ctx, src, dst); diff --git a/ggml/src/ggml-qnn/backend-ops.hpp b/ggml/src/ggml-qnn/backend-ops.hpp index 8cc2dc366fbfa..614bcf651b86b 100644 --- a/ggml/src/ggml-qnn/backend-ops.hpp +++ b/ggml/src/ggml-qnn/backend-ops.hpp @@ -6,8 +6,8 @@ namespace qnn { -typedef bool (*ggml_qnn_unary_op_t)(ggml_backend_qnn_context *ctx, const ggml_tensor *src, ggml_tensor *dst); -typedef bool (*ggml_qnn_binary_op_t)(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, +typedef bool (*ggml_qnn_unary_op_t)(ggml_backend_qnn_context *ctx, ggml_tensor *src, ggml_tensor *dst); +typedef bool (*ggml_qnn_binary_op_t)(ggml_backend_qnn_context *ctx, ggml_tensor *src0, ggml_tensor *src1, ggml_tensor *dst); typedef const ggml_qnn_unary_op_t (&ggml_qnn_unary_op_array_t)[GGML_OP_COUNT + GGML_UNARY_OP_COUNT]; diff --git a/ggml/src/ggml-qnn/backend.hpp b/ggml/src/ggml-qnn/backend.hpp index 32f3c6cd445f6..b2f93a8f7a9e5 100644 --- a/ggml/src/ggml-qnn/backend.hpp +++ b/ggml/src/ggml-qnn/backend.hpp @@ -12,8 +12,7 @@ #include "qnn-lib.hpp" namespace qnn { -typedef 
std::unordered_map> ggml_qnn_unary_graph_cache_t; -typedef std::unordered_map> ggml_qnn_binary_graph_cache_t; +typedef std::unordered_map> ggml_qnn_graph_cache_t; } // namespace qnn struct ggml_backend_qnn_context { @@ -25,8 +24,7 @@ struct ggml_backend_qnn_context { qnn::qcom_socinfo socinfo = {}; std::shared_ptr instance; std::shared_ptr qnn_interface; - qnn::ggml_qnn_unary_graph_cache_t qnn_unary_graph_cache; - qnn::ggml_qnn_binary_graph_cache_t qnn_binary_graph_cache; + qnn::ggml_qnn_graph_cache_t qnn_graph_cache; explicit ggml_backend_qnn_context(int device, int threads, const char *name, const char *lib) : device(device), threads(threads) { diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index 30f96a994cd84..9941365f7e897 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -1,27 +1,29 @@ #pragma once -#include +#include #include +#include #include "ggml-qnn.h" #include "logger.hpp" #include "qnn-lib.hpp" +#include "tensor.hpp" namespace qnn { -template +using ggml_tensor_array_t = std::vector; + class ggml_qnn_graph { public: - typedef std::array input_tensor_array_t; - typedef std::array output_tensor_array_t; - - explicit ggml_qnn_graph(const std::string &graph_name, QNNBackend device, Qnn_ContextHandle_t qnn_context, - std::shared_ptr qnn_interface, size_t vtcm_size_in_mb) : - _graph_name(graph_name), _device(device), _qnn_interface(qnn_interface) { + explicit ggml_qnn_graph(const std::string &graph_name, QNNBackend device, + std::shared_ptr qnn_instance, size_t vtcm_size_in_mb) : + _graph_name(graph_name), _device(device), _qnn_instance(qnn_instance) { QNN_LOG_INFO("graph name %s", graph_name.c_str()); + auto qnn_interface = qnn_instance->get_qnn_interface(); + auto qnn_context = qnn_instance->get_qnn_context_handle(); Qnn_ErrorHandle_t error = QNN_SUCCESS; Qnn_GraphHandle_t graph_handle = nullptr; if (device == QNN_BACKEND_NPU) { @@ -72,35 +74,53 @@ class ggml_qnn_graph { QNN_LOG_INFO("create qnn graph handle with graph name %s ok\n", graph_name.c_str()); _graph_handle = graph_handle; + _qnn_interface = qnn_interface; } - bool create_graph_tensor(Qnn_Tensor_t &tensor) { + ~ggml_qnn_graph() { QNN_LOG_DEBUG("graph name %s, destroy", _graph_name.c_str()); } + + bool build_graph(const std::string &op_name, const ggml_tensor_array_t &tensor_inputs, + const ggml_tensor_array_t &tensor_outputs) { if (!is_valid()) { QNN_LOG_ERROR("Invalid graph\n"); return false; } - auto err = _qnn_interface->qnn_tensor_create_graph_tensor(_graph_handle, &tensor); - if (err != QNN_SUCCESS) { - QNN_LOG_INFO("error = %d\n", err); - QNN_LOG_DEBUG("tensor%p name %s", &tensor, QNN_TENSOR_GET_NAME(tensor)); - return false; + QNN_LOG_DEBUG("graph name %s, add_nodes start", _graph_name.c_str()); + _qnn_tensor_inputs.resize(tensor_inputs.size()); + _tensor_inputs.resize(tensor_inputs.size()); + for (size_t i = 0; i < tensor_inputs.size(); i++) { + char buffer[GGML_MAX_NAME] = {}; + snprintf(buffer, GGML_MAX_NAME, "src%d", (int)i); + auto qnn_tensor = + std::make_shared(std::string(buffer), _device, _graph_handle, _qnn_instance); + auto *ggml_tensor = tensor_inputs[i]; + if (!qnn_tensor->bind_ggml_tensor(ggml_tensor, true)) { + QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); + return false; + } + + _qnn_tensor_inputs[i] = qnn_tensor->get_qnn_tensor(); + _tensor_inputs[i] = qnn_tensor; } - return true; - } + _qnn_tensor_outputs.resize(tensor_outputs.size()); + _tensor_outputs.resize(tensor_outputs.size()); + for (size_t i = 0; i < 
tensor_outputs.size(); i++) { + char buffer[GGML_MAX_NAME] = {}; + snprintf(buffer, GGML_MAX_NAME, "dst%d", (int)i); + auto qnn_tensor = + std::make_shared(std::string(buffer), _device, _graph_handle, _qnn_instance); + auto *ggml_tensor = tensor_inputs[i]; + if (!qnn_tensor->bind_ggml_tensor(ggml_tensor, false)) { + QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); + return false; + } - bool add_nodes(const std::string &op_name, const input_tensor_array_t &tensor_inputs, - const output_tensor_array_t &tensor_outputs) { - if (!is_valid()) { - QNN_LOG_ERROR("Invalid graph\n"); - return false; + _qnn_tensor_outputs[i] = qnn_tensor->get_qnn_tensor(); + _tensor_outputs[i] = qnn_tensor; } - QNN_LOG_DEBUG("graph name %s, add_nodes start", _graph_name.c_str()); - _tensor_inputs = tensor_inputs; - _tensor_outputs = tensor_outputs; - Qnn_OpConfig_t config = QNN_OPCONFIG_INIT; config.version = QNN_OPCONFIG_VERSION_1; auto &op_config = config.v1; @@ -109,10 +129,10 @@ class ggml_qnn_graph { op_config.typeName = op_name.c_str(); op_config.numOfParams = (uint32_t)_param_types.size(); op_config.params = _param_types.data(); - op_config.numOfInputs = (uint32_t)_tensor_inputs.size(); - op_config.inputTensors = _tensor_inputs.data(); - op_config.numOfOutputs = (uint32_t)_tensor_outputs.size(); - op_config.outputTensors = _tensor_outputs.data(); + op_config.numOfInputs = (uint32_t)_qnn_tensor_inputs.size(); + op_config.inputTensors = _qnn_tensor_inputs.data(); + op_config.numOfOutputs = (uint32_t)_qnn_tensor_outputs.size(); + op_config.outputTensors = _qnn_tensor_outputs.data(); auto error = _qnn_interface->qnn_graph_add_node(_graph_handle, config); if (error != QNN_SUCCESS) { auto *error_str = get_qnn_error_string(error); @@ -139,12 +159,32 @@ class ggml_qnn_graph { return true; } - bool execute(const input_tensor_array_t &tensor_inputs, const output_tensor_array_t &tensor_outputs) { - _tensor_inputs = tensor_inputs; - _tensor_outputs = tensor_outputs; + bool execute(const ggml_tensor_array_t &tensor_inputs, const ggml_tensor_array_t &tensor_outputs) { + GGML_ASSERT(tensor_inputs.size() == _tensor_inputs.size()); + GGML_ASSERT(tensor_outputs.size() == _tensor_outputs.size()); + for (size_t i = 0; i < tensor_inputs.size(); i++) { + auto *ggml_tensor = tensor_inputs[i]; + if (!_tensor_inputs[i]->bind_ggml_tensor(ggml_tensor, true)) { + QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); + return false; + } + + _qnn_tensor_inputs[i] = _tensor_inputs[i]->get_qnn_tensor(); + } + + for (size_t i = 0; i < tensor_outputs.size(); i++) { + auto *ggml_tensor = tensor_inputs[i]; + if (!_tensor_outputs[i]->bind_ggml_tensor(ggml_tensor, false)) { + QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); + return false; + } + + _qnn_tensor_outputs[i] = _tensor_outputs[i]->get_qnn_tensor(); + } + auto error = - _qnn_interface->qnn_graph_execute(_graph_handle, _tensor_inputs.data(), _tensor_inputs.size(), - _tensor_outputs.data(), _tensor_outputs.size(), nullptr, nullptr); + _qnn_interface->qnn_graph_execute(_graph_handle, _qnn_tensor_inputs.data(), _qnn_tensor_inputs.size(), + _qnn_tensor_outputs.data(), _qnn_tensor_outputs.size(), nullptr, nullptr); if (_device == QNN_BACKEND_NPU) { if (error == QNN_COMMON_ERROR_SYSTEM_COMMUNICATION) { QNN_LOG_WARN("NPU crashed. SSR detected. 
Caused QNN graph execute error\n"); @@ -168,10 +208,13 @@ class ggml_qnn_graph { private: const std::string _graph_name; const QNNBackend _device; - std::shared_ptr _qnn_interface; Qnn_GraphHandle_t _graph_handle = nullptr; - std::array _tensor_inputs; - std::array _tensor_outputs; + std::shared_ptr _qnn_instance; + std::shared_ptr _qnn_interface; + std::vector> _tensor_inputs; + std::vector> _tensor_outputs; + std::vector _qnn_tensor_inputs; + std::vector _qnn_tensor_outputs; std::vector _param_types; ggml_qnn_graph(const ggml_qnn_graph &) = delete; @@ -180,7 +223,4 @@ class ggml_qnn_graph { void operator=(ggml_qnn_graph &&) = delete; }; -using ggml_qnn_graph_binary = ggml_qnn_graph<2, 1>; -using ggml_qnn_graph_unary = ggml_qnn_graph<1, 1>; - } // namespace qnn diff --git a/ggml/src/ggml-qnn/qnn-lib.hpp b/ggml/src/ggml-qnn/qnn-lib.hpp index 517df493ccb16..4e1dcb34c119f 100644 --- a/ggml/src/ggml-qnn/qnn-lib.hpp +++ b/ggml/src/ggml-qnn/qnn-lib.hpp @@ -366,7 +366,7 @@ class qnn_instance { size_t probe_slots[] = { 1024, 1536, 2048 - 48, 2048 }; size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); for (size_t idx = 0; idx < probe_counts; idx++) { - rpc_buffer = static_cast(alloc_rpcmem(probe_slots[idx] * size_in_mb, 4)); + rpc_buffer = static_cast(alloc_rpcmem(probe_slots[idx] * size_in_mb, sizeof(void *))); if (!rpc_buffer) { QNN_LOG_DEBUG("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], strerror(errno)); break; diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index 49e9258c38a60..5e45266b40b9b 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -10,7 +10,6 @@ #include "QnnTensor.h" #include "System/QnnSystemInterface.h" -#include "graph.hpp" #include "logger.hpp" #include "qnn-lib.hpp" #include "utils.hpp" @@ -19,68 +18,47 @@ namespace qnn { class ggml_qnn_tensor { public: - static ggml_qnn_tensor *from_ggml_tensor(const ggml_tensor *tensor) { - if (!tensor) { - return nullptr; - } - - return static_cast(tensor->extra); - } - - explicit ggml_qnn_tensor(ggml_tensor *tensor, QNNBackend device, std::shared_ptr qnn_instance) : - _tensor(tensor), _device(device), _qnn_instance(qnn_instance) { - update_tensor_name(); + explicit ggml_qnn_tensor(const std::string &name, QNNBackend device, Qnn_GraphHandle_t graph_handle, + std::shared_ptr qnn_instance) : + _tensor_name(name), _device(device), _qnn_instance(qnn_instance), _graph_handle(graph_handle) { QNN_TENSOR_SET_NAME(_qnn_tensor, _tensor_name.c_str()); - _dimensions[0] = (uint32_t)tensor->ne[0]; - _dimensions[1] = (uint32_t)tensor->ne[1]; - _dimensions[2] = (uint32_t)tensor->ne[2]; - _dimensions[3] = (uint32_t)tensor->ne[3]; - QNN_TENSOR_SET_DIMENSIONS(_qnn_tensor, _dimensions); - auto qnn_tensor_type = device_tensortype_from_ggml_tensor(tensor); - QNN_TENSOR_SET_TYPE(_qnn_tensor, qnn_tensor_type); + QNN_TENSOR_SET_DIMENSIONS(_qnn_tensor, _dimensions.data()); + QNN_TENSOR_SET_TYPE(_qnn_tensor, QNN_TENSOR_TYPE_NATIVE); QNN_TENSOR_SET_DATA_FORMAT(_qnn_tensor, QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER); - QNN_TENSOR_SET_DATA_TYPE(_qnn_tensor, device_datatype_from_ggml_datatype(tensor->type)); - // TODO: set the quantizeParams base on the tensor type - QNN_TENSOR_SET_RANK(_qnn_tensor, (uint32_t)ggml_n_dims(tensor)); - - QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); - Qnn_ClientBuffer_t client_buf = {}; - QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); - - tensor->extra = this; - QNN_LOG_DEBUG("create tensor %s, device: %d, qnn_type: %d", _tensor_name.c_str(), device, 
(int)qnn_tensor_type); + QNN_LOG_DEBUG("create tensor %s, device: %d", _tensor_name.c_str(), device); } - template - bool bind_to_graph(ggml_qnn_graph<_InputSize, _OutputSize> &graph, bool is_input) { - if (!is_valid()) { - QNN_LOG_WARN("tensor %s not valid", _tensor_name.c_str()); - return false; - } - - if (_graph_handle) { - if (_graph_handle != graph.get_graph_handler()) { - QNN_LOG_WARN("tensor %s has been bound to another graph", _tensor_name.c_str()); + bool bind_ggml_tensor(ggml_tensor *tensor, bool is_input) { + if (_tensor) { + if (_tensor != tensor) { + QNN_LOG_WARN("tensor %s has been bound to another ggml tensor %s", _tensor_name.c_str(), + ggml_get_name(_tensor)); return false; } else { - QNN_LOG_INFO("tensor %s already bound to same graph %s", _tensor_name.c_str(), - graph.get_name().c_str()); + QNN_LOG_INFO("tensor %s already bound to same ggml tensor %s", _tensor_name.c_str(), + ggml_get_name(_tensor)); return true; } } + update_params_from_ggml_tensor(tensor); Qnn_TensorType_t new_tensor_type = is_input ? QNN_TENSOR_TYPE_APP_WRITE : QNN_TENSOR_TYPE_APP_READ; QNN_TENSOR_SET_TYPE(_qnn_tensor, new_tensor_type); QNN_LOG_INFO("tensor %s changed to type %d", _tensor_name.c_str(), new_tensor_type); - update_tensor_name(); - Qnn_Tensor_t tensor = _qnn_tensor; - if (!graph.create_graph_tensor(tensor)) { - QNN_LOG_WARN("create graph tensor failed, tensor %s", _tensor_name.c_str()); - return false; + + if (!QNN_TENSOR_GET_ID(_qnn_tensor)) { + Qnn_Tensor_t qnn_tensor = _qnn_tensor; + auto qnn_interface = _qnn_instance->get_qnn_interface(); + auto error = qnn_interface->qnn_tensor_create_graph_tensor(_graph_handle, &qnn_tensor); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("create graph tensor failed, tensor %s, error: %d\n", _tensor_name.c_str(), error); + return false; + } + QNN_TENSOR_SET_ID(_qnn_tensor, QNN_TENSOR_GET_ID(qnn_tensor)); } if (should_use_mem_handle()) { - _qnn_rpc_buffer = alloc_rpc_mem(); + _qnn_rpc_buffer = alloc_rpc_mem(ggml_nbytes(tensor)); if (!_qnn_rpc_buffer) { QNN_LOG_WARN("alloc rpc mem failed, tensor %s", _tensor_name.c_str()); return false; @@ -89,28 +67,59 @@ class ggml_qnn_tensor { QNN_LOG_DEBUG("tensor %s, use mem handle %p", _tensor_name.c_str(), QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor)); } else { QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); - Qnn_ClientBuffer_t client_buf = { _tensor->data, get_ggml_tensor_data_size(_tensor) }; + Qnn_ClientBuffer_t client_buf = { tensor->data, get_ggml_tensor_data_size(tensor) }; QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); QNN_LOG_DEBUG("tensor %s, use client buffer %p size %d", _tensor_name.c_str(), client_buf.data, (int)client_buf.dataSize); } - QNN_TENSOR_SET_ID(_qnn_tensor, QNN_TENSOR_GET_ID(tensor)); - _graph_handle = graph.get_graph_handler(); + _tensor = tensor; - QNN_LOG_DEBUG("bind tensor %s to graph %s", _tensor_name.c_str(), graph.get_name().c_str()); + if (!write_to_qnn_tensor()) { + QNN_LOG_WARN("write to qnn tensor failed, tensor %s", _tensor_name.c_str()); + return false; + } + + QNN_LOG_DEBUG("bind tensor %s to ggml tensor %s", _tensor_name.c_str(), ggml_get_name(tensor)); return true; } - bool write_to_qnn_tensor() { - if (!is_valid()) { - QNN_LOG_WARN("tensor %s not valid", _tensor_name.c_str()); + bool unbind_ggml_tensor() { + if (!_graph_handle) { + QNN_LOG_WARN("tensor %s not bound to any graph", _tensor_name.c_str()); + return false; + } + + if (!_tensor) { + QNN_LOG_DEBUG("tensor %s not bound to ggml tensor", _tensor_name.c_str()); + return true; + } + + if 
(!read_from_qnn_tensor()) { + QNN_LOG_WARN("read from qnn tensor failed, tensor %s", _tensor_name.c_str()); return false; } + if (!should_use_mem_handle()) { + QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); + Qnn_ClientBuffer_t client_buf = {}; + QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); + QNN_LOG_DEBUG("tensor %s, clear client buffer", _tensor_name.c_str()); + } + + _tensor = nullptr; + QNN_LOG_DEBUG("unbind tensor: %s from ggml tensor: %s", _tensor_name.c_str(), _tensor->name); + return true; + } + + const Qnn_Tensor_t &get_qnn_tensor() const { return _qnn_tensor; } + +private: + bool write_to_qnn_tensor() { auto tensor_type = QNN_TENSOR_GET_TYPE(_qnn_tensor); if (tensor_type != QNN_TENSOR_TYPE_APP_WRITE && tensor_type != QNN_TENSOR_TYPE_APP_READWRITE) { - QNN_LOG_WARN("tensor %s type(%d) not WRITE", _tensor_name.c_str(), (int)tensor_type); + QNN_LOG_DEBUG("tensor %s type(%d) not WRITE", _tensor_name.c_str(), (int)tensor_type); + return true; } if (should_use_mem_handle()) { @@ -128,14 +137,10 @@ class ggml_qnn_tensor { } bool read_from_qnn_tensor() { - if (!is_valid()) { - QNN_LOG_WARN("tensor %s not valid", _tensor_name.c_str()); - return false; - } - auto tensor_type = QNN_TENSOR_GET_TYPE(_qnn_tensor); if (tensor_type != QNN_TENSOR_TYPE_APP_READ && tensor_type != QNN_TENSOR_TYPE_APP_READWRITE) { - QNN_LOG_WARN("tensor %s type(%d) not READ", _tensor_name.c_str(), (int)tensor_type); + QNN_LOG_DEBUG("tensor %s type(%d) not READ", _tensor_name.c_str(), (int)tensor_type); + return true; } if (should_use_mem_handle()) { @@ -152,13 +157,8 @@ class ggml_qnn_tensor { return true; } - bool is_valid() const { return _tensor; } - const Qnn_Tensor_t &get_qnn_tensor() const { return _qnn_tensor; } - -private: - uint8_t *alloc_rpc_mem() { - uint8_t *qnn_rpc_buffer = - static_cast(_qnn_instance->alloc_rpcmem(ggml_nbytes(_tensor), alignof(void *))); + uint8_t *alloc_rpc_mem(size_t bytes) { + uint8_t *qnn_rpc_buffer = static_cast(_qnn_instance->alloc_rpcmem(bytes, alignof(void *))); if (!qnn_rpc_buffer) { QNN_LOG_WARN("alloc rpc mem failure, %s\n", strerror(errno)); QNN_LOG_DEBUG("tensor name %s", _tensor_name.c_str()); @@ -180,29 +180,28 @@ class ggml_qnn_tensor { return qnn_rpc_buffer; } - bool should_use_mem_handle() const { return _device == QNN_BACKEND_NPU; } + void update_params_from_ggml_tensor(ggml_tensor *tensor) { + _dimensions[0] = (uint32_t)tensor->ne[0]; + _dimensions[1] = (uint32_t)tensor->ne[1]; + _dimensions[2] = (uint32_t)tensor->ne[2]; + _dimensions[3] = (uint32_t)tensor->ne[3]; + QNN_TENSOR_SET_DATA_TYPE(_qnn_tensor, device_datatype_from_ggml_datatype(tensor->type)); + // TODO: set the quantizeParams base on the tensor type + QNN_TENSOR_SET_RANK(_qnn_tensor, (uint32_t)ggml_n_dims(tensor)); - void update_tensor_name() { - auto *tensor_name = ggml_get_name(_tensor); - if (!strnlen(tensor_name, GGML_MAX_NAME)) { - if (_tensor_name.empty()) { - static std::atomic_uint32_t unnamed_tensor_count = 0; - char buffer[GGML_MAX_NAME] = {}; - snprintf(buffer, sizeof(buffer), "unnamed_%d", (int)(unnamed_tensor_count++)); - _tensor_name = buffer; - } - } else { - QNN_LOG_DEBUG("tensor name changed: %s -> %s", _tensor_name.c_str(), tensor_name); - _tensor_name = tensor_name; - } + QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); + Qnn_ClientBuffer_t client_buf = {}; + QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); } + bool should_use_mem_handle() const { return _device == QNN_BACKEND_NPU; } + + std::string _tensor_name; const ggml_tensor *_tensor; QNNBackend 
_device; std::shared_ptr _qnn_instance; - Qnn_Tensor_t _qnn_tensor = QNN_TENSOR_INIT; - uint32_t _dimensions[4] = {}; - std::string _tensor_name; + Qnn_Tensor_t _qnn_tensor = qnn_tensor_init(kDefaultQnnTensorVersion); + std::array _dimensions = {}; Qnn_GraphHandle_t _graph_handle = nullptr; uint8_t *_qnn_rpc_buffer = nullptr; diff --git a/ggml/src/ggml-qnn/utils.hpp b/ggml/src/ggml-qnn/utils.hpp index e91a5ae8730d6..c2da6cb27eaf7 100644 --- a/ggml/src/ggml-qnn/utils.hpp +++ b/ggml/src/ggml-qnn/utils.hpp @@ -13,6 +13,8 @@ #include "QnnTypes.h" #include "logger.hpp" +#define QNN_TENSOR_VER(x) ((x).v2) + namespace qnn { uint32_t get_ggml_tensor_rank(const ggml_tensor *tensor); @@ -29,149 +31,159 @@ const char *opname_from_ggmlop(enum ggml_op ggmlop); const char *get_qnn_error_string(Qnn_ErrorHandle_t error); -inline int validate_tensor_version(const Qnn_Tensor_t &tensor) { - if (tensor.version != QNN_TENSOR_VERSION_1) { - QNN_LOG_WARN("validate_tensor_version() tensor %s, got unsupported version %d\n", tensor.v1.name, - tensor.version); - return 1; +constexpr const Qnn_TensorVersion_t kDefaultQnnTensorVersion = QNN_TENSOR_VERSION_2; + +inline Qnn_Tensor_t qnn_tensor_init(Qnn_TensorVersion_t version) { + Qnn_Tensor_t tensor; + tensor.version = version; + if (version == QNN_TENSOR_VERSION_1) { + tensor.v1 = QNN_TENSOR_V1_INIT; + } else if (version == QNN_TENSOR_VERSION_2) { + tensor.v2 = QNN_TENSOR_V2_INIT; } - return 0; + return tensor; } inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t &tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.id; + if (tensor.version == kDefaultQnnTensorVersion) { + return QNN_TENSOR_VER(tensor).id; } return 0u; } inline const char *get_qnn_tensorname(const Qnn_Tensor_t &tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.name; + if (tensor.version == kDefaultQnnTensorVersion) { + return QNN_TENSOR_VER(tensor).name; } return nullptr; } inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t &tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.type; + if (tensor.version == kDefaultQnnTensorVersion) { + return QNN_TENSOR_VER(tensor).type; } return QNN_TENSOR_TYPE_UNDEFINED; } inline Qnn_TensorDataFormat_t get_qnn_tensor_dataformat(const Qnn_Tensor_t &tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.dataFormat; + if (tensor.version == kDefaultQnnTensorVersion) { + return QNN_TENSOR_VER(tensor).dataFormat; } return QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER; } inline Qnn_DataType_t get_qnn_tensor_datatype(const Qnn_Tensor_t &tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.dataType; + if (tensor.version == kDefaultQnnTensorVersion) { + return QNN_TENSOR_VER(tensor).dataType; } return QNN_DATATYPE_UNDEFINED; } inline Qnn_QuantizeParams_t get_qnn_tensor_quantparams(const Qnn_Tensor_t &tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.quantizeParams; + if (tensor.version == kDefaultQnnTensorVersion) { + return QNN_TENSOR_VER(tensor).quantizeParams; } return QNN_QUANTIZE_PARAMS_INIT; } inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t &tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.rank; + if (tensor.version == kDefaultQnnTensorVersion) { + return QNN_TENSOR_VER(tensor).rank; } return 0u; } inline uint32_t *get_qnn_tensor_dimensions(const Qnn_Tensor_t &tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.dimensions; + if (tensor.version == 
kDefaultQnnTensorVersion) { + return QNN_TENSOR_VER(tensor).dimensions; } return nullptr; } inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t &tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.memType; + if (tensor.version == kDefaultQnnTensorVersion) { + return QNN_TENSOR_VER(tensor).memType; } return QNN_TENSORMEMTYPE_UNDEFINED; } inline Qnn_MemHandle_t get_qnn_tensor_memhandle(const Qnn_Tensor_t &tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.memHandle; + if (tensor.version == kDefaultQnnTensorVersion) { + return QNN_TENSOR_VER(tensor).memHandle; } return nullptr; } inline void set_qnn_tensor_id(Qnn_Tensor_t &tensor, uint32_t id) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.id = id; + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).id = id; } } inline void set_qnn_tensor_name(Qnn_Tensor_t &tensor, const char *name) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.name = name; + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).name = name; } } inline void set_qnn_tensor_type(Qnn_Tensor_t &tensor, Qnn_TensorType_t type) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.type = type; + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).type = type; } } inline void set_qnn_tensor_dataformat(Qnn_Tensor_t &tensor, Qnn_TensorDataFormat_t format) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.dataFormat = format; + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).dataFormat = format; } } inline void set_qnn_tensor_datatype(Qnn_Tensor_t &tensor, Qnn_DataType_t dataType) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.dataType = dataType; + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).dataType = dataType; } } inline void set_qnn_tensor_quantparams(Qnn_Tensor_t &tensor, Qnn_QuantizeParams_t params) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.quantizeParams = params; + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).quantizeParams = params; } } inline void set_qnn_tensor_rank(Qnn_Tensor_t &tensor, uint32_t rank) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.rank = rank; + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).rank = rank; } } inline void set_qnn_tensor_dimensions(Qnn_Tensor_t &tensor, uint32_t *dims) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.dimensions = dims; + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).dimensions = dims; } } inline void set_qnn_tensor_memtype(Qnn_Tensor_t &tensor, Qnn_TensorMemType_t mem_type) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.memType = mem_type; + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).memType = mem_type; } } inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t &tensor, Qnn_ClientBuffer_t client_buf) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.clientBuf = client_buf; + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).clientBuf = client_buf; } } inline void set_qnn_tensor_memhandle(Qnn_Tensor_t &tensor, Qnn_MemHandle_t handle) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.memHandle = handle; + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).memHandle = handle; + } +} + +inline void 
set_qnn_tensor_dyn_dimensions(Qnn_Tensor_t &tensor, uint8_t *isDynamicDimensions) { + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).isDynamicDimensions = isDynamicDimensions; } } @@ -239,3 +251,4 @@ class qnn_perf { #define QNN_TENSOR_SET_MEM_TYPE(tensor, value) qnn::set_qnn_tensor_memtype(tensor, value) #define QNN_TENSOR_SET_CLIENT_BUF(tensor, value) qnn::set_qnn_tensor_clientbuf(tensor, value) #define QNN_TENSOR_SET_MEM_HANDLE(tensor, value) qnn::set_qnn_tensor_memhandle(tensor, value) +#define QNN_TENSOR_SET_DYN_DIMENSIONS(tensor, value) qnn::set_qnn_tensor_dyn_dimensions(tensor, value) From 706793f078acab0d952a088fd78ebb8af342df7b Mon Sep 17 00:00:00 2001 From: hongruichen Date: Mon, 22 Jul 2024 21:34:33 +0800 Subject: [PATCH 098/166] fix: back to qnn tensor v1 to fix the create tensor error --- ggml/src/ggml-qnn/utils.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-qnn/utils.hpp b/ggml/src/ggml-qnn/utils.hpp index c2da6cb27eaf7..b7f29bdaa5663 100644 --- a/ggml/src/ggml-qnn/utils.hpp +++ b/ggml/src/ggml-qnn/utils.hpp @@ -13,7 +13,7 @@ #include "QnnTypes.h" #include "logger.hpp" -#define QNN_TENSOR_VER(x) ((x).v2) +#define QNN_TENSOR_VER(x) ((x).v1) namespace qnn { @@ -31,7 +31,7 @@ const char *opname_from_ggmlop(enum ggml_op ggmlop); const char *get_qnn_error_string(Qnn_ErrorHandle_t error); -constexpr const Qnn_TensorVersion_t kDefaultQnnTensorVersion = QNN_TENSOR_VERSION_2; +constexpr const Qnn_TensorVersion_t kDefaultQnnTensorVersion = QNN_TENSOR_VERSION_1; inline Qnn_Tensor_t qnn_tensor_init(Qnn_TensorVersion_t version) { Qnn_Tensor_t tensor; @@ -182,8 +182,8 @@ inline void set_qnn_tensor_memhandle(Qnn_Tensor_t &tensor, Qnn_MemHandle_t handl } inline void set_qnn_tensor_dyn_dimensions(Qnn_Tensor_t &tensor, uint8_t *isDynamicDimensions) { - if (tensor.version == kDefaultQnnTensorVersion) { - QNN_TENSOR_VER(tensor).isDynamicDimensions = isDynamicDimensions; + if (tensor.version == QNN_TENSOR_VERSION_2) { + tensor.v2.isDynamicDimensions = isDynamicDimensions; } } From f843e5aaf5a6f03af71984c197a2e1b92ef5e707 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Mon, 22 Jul 2024 23:41:23 +0800 Subject: [PATCH 099/166] fix: 1.free up rpc memory at destruct 2. 
unbind tensor --- ggml/src/ggml-qnn/graph.hpp | 16 ++++++++++++---- ggml/src/ggml-qnn/tensor.hpp | 24 ++++++++++++++++-------- 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index 9941365f7e897..c82b7d66ae1cf 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -86,7 +86,7 @@ class ggml_qnn_graph { return false; } - QNN_LOG_DEBUG("graph name %s, add_nodes start", _graph_name.c_str()); + QNN_LOG_DEBUG("graph name %s, build_graph start", _graph_name.c_str()); _qnn_tensor_inputs.resize(tensor_inputs.size()); _tensor_inputs.resize(tensor_inputs.size()); for (size_t i = 0; i < tensor_inputs.size(); i++) { @@ -111,7 +111,7 @@ class ggml_qnn_graph { snprintf(buffer, GGML_MAX_NAME, "dst%d", (int)i); auto qnn_tensor = std::make_shared(std::string(buffer), _device, _graph_handle, _qnn_instance); - auto *ggml_tensor = tensor_inputs[i]; + auto *ggml_tensor = tensor_outputs[i]; if (!qnn_tensor->bind_ggml_tensor(ggml_tensor, false)) { QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); return false; @@ -155,7 +155,7 @@ class ggml_qnn_graph { return false; } - QNN_LOG_DEBUG("graph name %s, add_nodes succeed", _graph_name.c_str()); + QNN_LOG_DEBUG("graph name %s, build_graph succeed", _graph_name.c_str()); return true; } @@ -173,7 +173,7 @@ class ggml_qnn_graph { } for (size_t i = 0; i < tensor_outputs.size(); i++) { - auto *ggml_tensor = tensor_inputs[i]; + auto *ggml_tensor = tensor_outputs[i]; if (!_tensor_outputs[i]->bind_ggml_tensor(ggml_tensor, false)) { QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); return false; @@ -191,6 +191,14 @@ class ggml_qnn_graph { } } + for (auto tensor : _tensor_inputs) { + tensor->unbind_ggml_tensor(); + } + + for (auto tensor : _tensor_outputs) { + tensor->unbind_ggml_tensor(); + } + if (error != QNN_SUCCESS) { QNN_LOG_INFO("error = %d\n", error); return false; diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index 5e45266b40b9b..7709936ed9618 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -28,17 +28,22 @@ class ggml_qnn_tensor { QNN_LOG_DEBUG("create tensor %s, device: %d", _tensor_name.c_str(), device); } + ~ggml_qnn_tensor() { + if (_qnn_instance && _qnn_rpc_buffer) { + _qnn_instance->free_rpcmem(_qnn_rpc_buffer); + } + } + bool bind_ggml_tensor(ggml_tensor *tensor, bool is_input) { if (_tensor) { if (_tensor != tensor) { QNN_LOG_WARN("tensor %s has been bound to another ggml tensor %s", _tensor_name.c_str(), ggml_get_name(_tensor)); return false; - } else { - QNN_LOG_INFO("tensor %s already bound to same ggml tensor %s", _tensor_name.c_str(), - ggml_get_name(_tensor)); - return true; } + QNN_LOG_INFO("tensor %s already bound to same ggml tensor %s", _tensor_name.c_str(), + ggml_get_name(_tensor)); + return true; } update_params_from_ggml_tensor(tensor); @@ -55,13 +60,16 @@ class ggml_qnn_tensor { return false; } QNN_TENSOR_SET_ID(_qnn_tensor, QNN_TENSOR_GET_ID(qnn_tensor)); + QNN_LOG_DEBUG("create graph tensor %s, id: %d", _tensor_name.c_str(), QNN_TENSOR_GET_ID(qnn_tensor)); } if (should_use_mem_handle()) { - _qnn_rpc_buffer = alloc_rpc_mem(ggml_nbytes(tensor)); if (!_qnn_rpc_buffer) { - QNN_LOG_WARN("alloc rpc mem failed, tensor %s", _tensor_name.c_str()); - return false; + _qnn_rpc_buffer = alloc_rpc_mem(ggml_nbytes(tensor)); + if (!_qnn_rpc_buffer) { + QNN_LOG_WARN("alloc rpc mem failed, tensor %s", _tensor_name.c_str()); + return false; + } } QNN_LOG_DEBUG("tensor 
%s, use mem handle %p", _tensor_name.c_str(), QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor)); @@ -107,8 +115,8 @@ class ggml_qnn_tensor { QNN_LOG_DEBUG("tensor %s, clear client buffer", _tensor_name.c_str()); } + QNN_LOG_DEBUG("unbind tensor: %s from ggml tensor: %s", _tensor_name.c_str(), ggml_get_name(_tensor)); _tensor = nullptr; - QNN_LOG_DEBUG("unbind tensor: %s from ggml tensor: %s", _tensor_name.c_str(), _tensor->name); return true; } From ee305cc17158b35c20e137c2a55fdde2108edcbd Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 26 Jul 2024 22:33:30 +0800 Subject: [PATCH 100/166] refactoring: split qnn rpc buffer into dedicated class --- ggml/src/ggml-qnn/buffer.hpp | 56 +++++++++++++++++++++++ ggml/src/ggml-qnn/qnn-lib.hpp | 84 ++++++++++++++--------------------- ggml/src/ggml-qnn/tensor.hpp | 46 ++++++------------- 3 files changed, 103 insertions(+), 83 deletions(-) create mode 100644 ggml/src/ggml-qnn/buffer.hpp diff --git a/ggml/src/ggml-qnn/buffer.hpp b/ggml/src/ggml-qnn/buffer.hpp new file mode 100644 index 0000000000000..db8e8ccaf24fa --- /dev/null +++ b/ggml/src/ggml-qnn/buffer.hpp @@ -0,0 +1,56 @@ +#pragma once + +#include + +#include "logger.hpp" +#include "qnn-lib.hpp" + +namespace qnn { +class ggml_qnn_rpc_buffer { +public: + ggml_qnn_rpc_buffer(std::shared_ptr qnn_instance, size_t size, uint32_t rank, uint32_t *dimensions, + Qnn_DataType_t data_type) : + _qnn_instance(qnn_instance), _size(size) { + + auto *qnn_rpc_buffer = static_cast(qnn_instance->alloc_rpcmem(size, alignof(void *))); + _qnn_rpc_mem_handle = qnn_instance->register_rpcmem(qnn_rpc_buffer, rank, dimensions, data_type); + if (!_qnn_rpc_mem_handle) { + qnn_instance->free_rpcmem(qnn_rpc_buffer); + QNN_LOG_WARN("register rpc mem failure\n"); + return; + } + + _qnn_rpc_buffer = qnn_rpc_buffer; + QNN_LOG_DEBUG("alloc rpcmem(%p) successfully, size %d\n", _qnn_rpc_buffer, (int)size); + } + ~ggml_qnn_rpc_buffer() { + if (_qnn_instance) { + if (_qnn_rpc_mem_handle) { + _qnn_instance->unregister_rpcmem(_qnn_rpc_mem_handle); + } + + if (_qnn_rpc_buffer) { + _qnn_instance->free_rpcmem(_qnn_rpc_buffer); + } + } + } + + bool is_valid() const { return _qnn_rpc_buffer && _qnn_rpc_mem_handle; } + + uint8_t *get_buffer() const { return _qnn_rpc_buffer; } + size_t get_size() const { return _size; } + Qnn_MemHandle_t get_mem_handle() const { return _qnn_rpc_mem_handle; } + +private: + std::shared_ptr _qnn_instance; + size_t _size = 0; + uint8_t *_qnn_rpc_buffer = nullptr; + Qnn_MemHandle_t _qnn_rpc_mem_handle = nullptr; + + ggml_qnn_rpc_buffer(const ggml_qnn_rpc_buffer &) = delete; + void operator=(const ggml_qnn_rpc_buffer &) = delete; + ggml_qnn_rpc_buffer(ggml_qnn_rpc_buffer &&) = delete; + void operator=(ggml_qnn_rpc_buffer &&) = delete; +}; + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/qnn-lib.hpp b/ggml/src/ggml-qnn/qnn-lib.hpp index 4e1dcb34c119f..aa142c74adf82 100644 --- a/ggml/src/ggml-qnn/qnn-lib.hpp +++ b/ggml/src/ggml-qnn/qnn-lib.hpp @@ -638,84 +638,68 @@ class qnn_instance { return mem_fd; } - int register_rpcmem(void *p_data, Qnn_Tensor_t *p_tensor) { - if (nullptr == p_data || (nullptr == p_tensor)) { + void *get_rpcmem_from_memhandle(Qnn_MemHandle_t mem_handle) { + for (std::unordered_map::iterator it = _qnn_mem_set.begin(); it != _qnn_mem_set.end(); + it++) { + Qnn_MemHandle_t mem_handle = it->second; + if (it->second == mem_handle) { + return it->first; + } + } + QNN_LOG_WARN("can't find rpcmem from qnn mem handle %p", mem_handle); + return nullptr; + } + + Qnn_MemHandle_t register_rpcmem(void *p_data, 
uint32_t rank, uint32_t *dimensions, Qnn_DataType_t data_type) { + if (!p_data) { QNN_LOG_WARN("invalid param\n"); - return 1; + return nullptr; } if (!is_rpcmem_initialized()) { QNN_LOG_WARN("rpc memory not initialized\n"); - return 2; + return nullptr; } if (is_rpcmem_allocated(p_data)) { QNN_LOG_WARN("rpc memory already allocated\n"); - return 3; - } - - if (is_rpcmem_registered(QNN_TENSOR_GET_MEM_HANDLE(*p_tensor))) { - QNN_LOG_WARN("tensor %s has been registered shared memory\n", QNN_TENSOR_GET_NAME(*p_tensor)); - return 4; + return nullptr; } - int32_t mem_fd = rpcmem_to_fd(p_data); + auto mem_fd = rpcmem_to_fd(p_data); if (mem_fd == -1) { QNN_LOG_WARN("failed to get file descriptor\n"); - return 5; + return nullptr; } + QNN_LOG_INFO("mem_fd %d\n", mem_fd); - Qnn_MemDescriptor_t descriptor = { { QNN_TENSOR_GET_RANK(*p_tensor), QNN_TENSOR_GET_DIMENSIONS(*p_tensor), - nullptr }, - QNN_TENSOR_GET_DATA_TYPE(*p_tensor), - QNN_MEM_TYPE_ION, - { { mem_fd } } }; + Qnn_MemDescriptor_t descriptor = { { rank, dimensions, nullptr }, data_type, QNN_MEM_TYPE_ION, { { mem_fd } } }; Qnn_MemHandle_t handle = nullptr; - int error = QNN_SUCCESS; - error = _qnn_interface->qnn_mem_register(_qnn_context_handle, &descriptor, - /*numDescriptors=*/1, &handle); + auto error = _qnn_interface->qnn_mem_register(_qnn_context_handle, &descriptor, + /*numDescriptors=*/1, &handle); if (error != QNN_SUCCESS) { QNN_LOG_WARN("failed to register shared memory, error %d, %s\n", QNN_GET_ERROR_CODE(error), strerror(error)); - return 6; + return nullptr; } - QNN_TENSOR_SET_MEM_HANDLE(*p_tensor, handle); _qnn_mem_set.insert((std::pair(p_data, handle))); - QNN_LOG_INFO("tensor %s successfully register shared memory handler: %p\n", QNN_TENSOR_GET_NAME(*p_tensor), - handle); - return 0; + QNN_LOG_INFO("successfully register shared memory handler: %p\n", handle); + return handle; } - void *get_rpcmem_from_memhandle(Qnn_MemHandle_t mem_handle) { - for (std::unordered_map::iterator it = _qnn_mem_set.begin(); it != _qnn_mem_set.end(); - it++) { - Qnn_MemHandle_t mem_handle = it->second; - if (it->second == mem_handle) { - return it->first; - } - } - QNN_LOG_WARN("can't find rpcmem from qnn mem handle %p", mem_handle); - return nullptr; - } - - void unregister_rpcmem() { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - - if (_qnn_mem_set.empty()) { - QNN_LOG_WARN("no rpcmem registered\n"); + void unregister_rpcmem(Qnn_MemHandle_t mem_handle) { + Qnn_ErrorHandle_t error = _qnn_interface->qnn_mem_de_register(&mem_handle, 1); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to unregister shared memory, error %d\n", QNN_GET_ERROR_CODE(error)); } - for (std::unordered_map::iterator it = _qnn_mem_set.begin(); it != _qnn_mem_set.end(); - it++) { - Qnn_MemHandle_t mem_handle = it->second; - error = _qnn_interface->qnn_mem_de_register(&mem_handle, 1); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to unregister shared memory, error %d\n", QNN_GET_ERROR_CODE(error)); - } + auto it = std::find_if(_qnn_mem_set.begin(), _qnn_mem_set.end(), + [mem_handle](const auto &kv) { return kv.second == mem_handle; }); + if (it != _qnn_mem_set.end()) { + _qnn_mem_set.erase(it); } - _qnn_mem_set.clear(); } bool is_rpcmem_allocated(void *buf) { return _qnn_mem_set.count(buf) != 0U; } diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index 7709936ed9618..c4ea7a4095d5f 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -10,6 +10,7 @@ #include "QnnTensor.h" #include "System/QnnSystemInterface.h" +#include 
"buffer.hpp" #include "logger.hpp" #include "qnn-lib.hpp" #include "utils.hpp" @@ -28,11 +29,7 @@ class ggml_qnn_tensor { QNN_LOG_DEBUG("create tensor %s, device: %d", _tensor_name.c_str(), device); } - ~ggml_qnn_tensor() { - if (_qnn_instance && _qnn_rpc_buffer) { - _qnn_instance->free_rpcmem(_qnn_rpc_buffer); - } - } + ~ggml_qnn_tensor() { _qnn_rpc_buffer.reset(); } bool bind_ggml_tensor(ggml_tensor *tensor, bool is_input) { if (_tensor) { @@ -65,13 +62,19 @@ class ggml_qnn_tensor { if (should_use_mem_handle()) { if (!_qnn_rpc_buffer) { - _qnn_rpc_buffer = alloc_rpc_mem(ggml_nbytes(tensor)); - if (!_qnn_rpc_buffer) { + auto qnn_rpc_buffer = std::make_unique( + _qnn_instance, ggml_nbytes(tensor), QNN_TENSOR_GET_RANK(_qnn_tensor), + QNN_TENSOR_GET_DIMENSIONS(_qnn_tensor), QNN_TENSOR_GET_DATA_TYPE(_qnn_tensor)); + if (!qnn_rpc_buffer->is_valid()) { QNN_LOG_WARN("alloc rpc mem failed, tensor %s", _tensor_name.c_str()); return false; } + + _qnn_rpc_buffer = std::move(qnn_rpc_buffer); } + QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_MEMHANDLE); + QNN_TENSOR_SET_MEM_HANDLE(_qnn_tensor, _qnn_rpc_buffer->get_mem_handle()); QNN_LOG_DEBUG("tensor %s, use mem handle %p", _tensor_name.c_str(), QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor)); } else { QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); @@ -132,7 +135,7 @@ class ggml_qnn_tensor { if (should_use_mem_handle()) { if (_qnn_rpc_buffer) { - memcpy(_qnn_rpc_buffer, _tensor->data, ggml_nbytes(_tensor)); + memcpy(_qnn_rpc_buffer->get_buffer(), _tensor->data, ggml_nbytes(_tensor)); } else { QNN_LOG_WARN("tensor %s: can't find rpcmem from qnn mem handle\n", _tensor_name.c_str()); return false; @@ -153,7 +156,7 @@ class ggml_qnn_tensor { if (should_use_mem_handle()) { if (_qnn_rpc_buffer) { - memcpy(_tensor->data, _qnn_rpc_buffer, ggml_nbytes(_tensor)); + memcpy(_tensor->data, _qnn_rpc_buffer->get_buffer(), ggml_nbytes(_tensor)); } else { QNN_LOG_WARN("can't find rpcmem from qnn mem handle\n"); return false; @@ -165,29 +168,6 @@ class ggml_qnn_tensor { return true; } - uint8_t *alloc_rpc_mem(size_t bytes) { - uint8_t *qnn_rpc_buffer = static_cast(_qnn_instance->alloc_rpcmem(bytes, alignof(void *))); - if (!qnn_rpc_buffer) { - QNN_LOG_WARN("alloc rpc mem failure, %s\n", strerror(errno)); - QNN_LOG_DEBUG("tensor name %s", _tensor_name.c_str()); - return nullptr; - } - - QNN_LOG_INFO("tensor %s: alloc rpcmem(%p) successfully\n", _tensor_name.c_str(), qnn_rpc_buffer); - auto error = _qnn_instance->register_rpcmem(qnn_rpc_buffer, &_qnn_tensor); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("register rpc mem failure, %d\n", (int)error); - QNN_LOG_DEBUG("tensor name %s", _tensor_name.c_str()); - _qnn_instance->free_rpcmem(qnn_rpc_buffer); - return nullptr; - } - - // The mem handle will be set at qnn_instance::register_rpcmem - QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_MEMHANDLE); - QNN_LOG_INFO("tensor %s: register rpcmem(%p) successfully\n", _tensor_name.c_str(), qnn_rpc_buffer); - return qnn_rpc_buffer; - } - void update_params_from_ggml_tensor(ggml_tensor *tensor) { _dimensions[0] = (uint32_t)tensor->ne[0]; _dimensions[1] = (uint32_t)tensor->ne[1]; @@ -211,7 +191,7 @@ class ggml_qnn_tensor { Qnn_Tensor_t _qnn_tensor = qnn_tensor_init(kDefaultQnnTensorVersion); std::array _dimensions = {}; Qnn_GraphHandle_t _graph_handle = nullptr; - uint8_t *_qnn_rpc_buffer = nullptr; + std::unique_ptr _qnn_rpc_buffer; ggml_qnn_tensor(const ggml_qnn_tensor &) = delete; void operator=(const ggml_qnn_tensor &) = delete; From 
47735cb5896332745508f74a81d9fe4c45e96de1 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 26 Jul 2024 23:03:09 +0800 Subject: [PATCH 101/166] fix: try fix error in 2nd run by appending dimension into graph key --- ggml/src/ggml-qnn/backend-ops.cpp | 36 ++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index bd87cfc9e66eb..b138257b8c9f6 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -78,6 +78,31 @@ bool execute_graph(qnn::ggml_qnn_graph *graph, const std::array +std::string get_graph_key(const std::string &op_name, const std::array &inputs, + const std::array &outputs) { + constexpr static const auto append_dimensions = [](std::string &key, const ggml_tensor *tensor) { + key += "_"; + key += std::to_string(tensor->ne[0]); + key += "x"; + key += std::to_string(tensor->ne[1]); + key += "x"; + key += std::to_string(tensor->ne[2]); + key += "x"; + key += std::to_string(tensor->ne[3]); + }; + + std::string graph_key(op_name); + for (auto &input : inputs) { + append_dimensions(graph_key, input); + } + for (auto &output : outputs) { + append_dimensions(graph_key, output); + } + + return graph_key; +} + template qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_context *ctx, size_t op, const std::string &qnn_op, const std::array &inputs, @@ -87,16 +112,7 @@ qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_context *ctx, siz auto &graph_cache = ctx->qnn_graph_cache; const auto *op_name = op < qnn::kGgmlUnaryOpStart ? ggml_op_name(ggml_op(op)) : ggml_unary_op_name(ggml_unary_op(op - qnn::kGgmlUnaryOpStart)); - std::string graph_key(op_name); - for (auto &input : inputs) { - graph_key += "_"; - graph_key += input->name; - } - for (auto &output : outputs) { - graph_key += "_"; - graph_key += output->name; - } - + auto graph_key = get_graph_key<_InputSize, _OutputSize>(op_name, inputs, outputs); auto it = graph_cache.find(graph_key); qnn::ggml_qnn_graph *graph_ptr = nullptr; if (it != graph_cache.end()) { From be9a8c73a0d5822d75c07f551d6efe0280287924 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 26 Jul 2024 23:07:25 +0800 Subject: [PATCH 102/166] fix: suppress warning --- ggml/src/ggml-qnn.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index 87653cfb1f741..6472d3e154367 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -71,7 +71,7 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { class ggml_backend_qnn_buffer_context { public: ggml_backend_qnn_buffer_context(QNNBackend device, std::shared_ptr instance, size_t size) : - _device(device), _instance(instance), _name(QNN_BACKEND_NAME + std::to_string(device)) { + _instance(instance), _name(QNN_BACKEND_NAME + std::to_string(device)) { size_t size_page = sysconf(_SC_PAGESIZE); @@ -97,7 +97,6 @@ class ggml_backend_qnn_buffer_context { size_t get_buffer_size() { return _buffer_size; } private: - QNNBackend _device; std::shared_ptr _instance; std::string _name; void *_buffer = nullptr; From 18aa6654d5c2af7c5dcea7d57abeb8f260ab0678 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sat, 27 Jul 2024 10:38:43 +0800 Subject: [PATCH 103/166] refactoring: opt graph key gen --- ggml/src/ggml-qnn/backend-ops.cpp | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 
b138257b8c9f6..1f8b75e5e3e0e 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -82,14 +82,10 @@ template std::string get_graph_key(const std::string &op_name, const std::array &inputs, const std::array &outputs) { constexpr static const auto append_dimensions = [](std::string &key, const ggml_tensor *tensor) { - key += "_"; - key += std::to_string(tensor->ne[0]); - key += "x"; - key += std::to_string(tensor->ne[1]); - key += "x"; - key += std::to_string(tensor->ne[2]); - key += "x"; - key += std::to_string(tensor->ne[3]); + char buffer[256] = {}; + snprintf(buffer, sizeof(buffer), "_%ldx%ldx%ldx%ld", (long)tensor->ne[0], (long)tensor->ne[1], + (long)tensor->ne[2], (long)tensor->ne[3]); + key += buffer; }; std::string graph_key(op_name); @@ -99,7 +95,7 @@ std::string get_graph_key(const std::string &op_name, const std::array Date: Sat, 27 Jul 2024 10:47:18 +0800 Subject: [PATCH 104/166] refactoring: remove dup code --- ggml/src/ggml-qnn/buffer.hpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-qnn/buffer.hpp b/ggml/src/ggml-qnn/buffer.hpp index db8e8ccaf24fa..4b4b2daaa75b4 100644 --- a/ggml/src/ggml-qnn/buffer.hpp +++ b/ggml/src/ggml-qnn/buffer.hpp @@ -12,15 +12,14 @@ class ggml_qnn_rpc_buffer { Qnn_DataType_t data_type) : _qnn_instance(qnn_instance), _size(size) { - auto *qnn_rpc_buffer = static_cast(qnn_instance->alloc_rpcmem(size, alignof(void *))); - _qnn_rpc_mem_handle = qnn_instance->register_rpcmem(qnn_rpc_buffer, rank, dimensions, data_type); - if (!_qnn_rpc_mem_handle) { - qnn_instance->free_rpcmem(qnn_rpc_buffer); + _qnn_rpc_buffer = static_cast(qnn_instance->alloc_rpcmem(size, alignof(void *))); + _qnn_rpc_mem_handle = qnn_instance->register_rpcmem(_qnn_rpc_buffer, rank, dimensions, data_type); + if (!_qnn_rpc_buffer || !_qnn_rpc_mem_handle) { QNN_LOG_WARN("register rpc mem failure\n"); + // let the destructor free the buffer return; } - _qnn_rpc_buffer = qnn_rpc_buffer; QNN_LOG_DEBUG("alloc rpcmem(%p) successfully, size %d\n", _qnn_rpc_buffer, (int)size); } ~ggml_qnn_rpc_buffer() { From ccfec7010657313bb030c1f58d7a78433f4435b7 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sat, 27 Jul 2024 11:22:29 +0800 Subject: [PATCH 105/166] refactoring: remove unused get_rpcmem_from_memhandle func --- ggml/src/ggml-qnn/qnn-lib.hpp | 39 +++++++++++++---------------------- 1 file changed, 14 insertions(+), 25 deletions(-) diff --git a/ggml/src/ggml-qnn/qnn-lib.hpp b/ggml/src/ggml-qnn/qnn-lib.hpp index aa142c74adf82..da986e2e4c4ff 100644 --- a/ggml/src/ggml-qnn/qnn-lib.hpp +++ b/ggml/src/ggml-qnn/qnn-lib.hpp @@ -591,8 +591,6 @@ class qnn_instance { size_t get_rpcmem_capacity() { return _rpcmem_capacity; } - bool is_rpcmem_registered(Qnn_MemHandle_t handle) { return _qnn_mem_set.count(handle) != 0U; } - void *alloc_rpcmem(size_t bytes, size_t alignment) { if (!_rpcmem_initialized) { QNN_LOG_WARN("rpc memory not initialized\n"); @@ -619,7 +617,7 @@ class qnn_instance { void free_rpcmem(void *buf) { if (!_rpcmem_initialized) { QNN_LOG_WARN("rpc memory not initialized\n"); - } else if (0 == _rpcmem_store_map.count(buf)) { + } else if (_rpcmem_store_map.count(buf) == 0) { QNN_LOG_WARN("no allocated tensor\n"); } else { _pfn_rpc_mem_free(_rpcmem_store_map[buf]); @@ -638,18 +636,6 @@ class qnn_instance { return mem_fd; } - void *get_rpcmem_from_memhandle(Qnn_MemHandle_t mem_handle) { - for (std::unordered_map::iterator it = _qnn_mem_set.begin(); it != _qnn_mem_set.end(); - it++) { - Qnn_MemHandle_t mem_handle = 
it->second; - if (it->second == mem_handle) { - return it->first; - } - } - QNN_LOG_WARN("can't find rpcmem from qnn mem handle %p", mem_handle); - return nullptr; - } - Qnn_MemHandle_t register_rpcmem(void *p_data, uint32_t rank, uint32_t *dimensions, Qnn_DataType_t data_type) { if (!p_data) { QNN_LOG_WARN("invalid param\n"); @@ -661,9 +647,9 @@ class qnn_instance { return nullptr; } - if (is_rpcmem_allocated(p_data)) { - QNN_LOG_WARN("rpc memory already allocated\n"); - return nullptr; + if (is_rpcmem_registered(p_data)) { + QNN_LOG_WARN("rpc memory already registered\n"); + return _qnn_rpc_buffer_to_handles[p_data]; } auto mem_fd = rpcmem_to_fd(p_data); @@ -683,8 +669,7 @@ class qnn_instance { return nullptr; } - _qnn_mem_set.insert((std::pair(p_data, handle))); - + _qnn_rpc_buffer_to_handles.insert({ p_data, handle }); QNN_LOG_INFO("successfully register shared memory handler: %p\n", handle); return handle; } @@ -695,14 +680,18 @@ class qnn_instance { QNN_LOG_WARN("failed to unregister shared memory, error %d\n", QNN_GET_ERROR_CODE(error)); } - auto it = std::find_if(_qnn_mem_set.begin(), _qnn_mem_set.end(), + auto it = std::find_if(_qnn_rpc_buffer_to_handles.begin(), _qnn_rpc_buffer_to_handles.end(), [mem_handle](const auto &kv) { return kv.second == mem_handle; }); - if (it != _qnn_mem_set.end()) { - _qnn_mem_set.erase(it); + if (it == _qnn_rpc_buffer_to_handles.end()) { + QNN_LOG_WARN("failed to find shared memory handler: %p\n", mem_handle); + return; } + + _qnn_rpc_buffer_to_handles.erase(it); } - bool is_rpcmem_allocated(void *buf) { return _qnn_mem_set.count(buf) != 0U; } + bool is_rpcmem_allocated(void *buf) { return _rpcmem_store_map.count(buf) != 0; } + bool is_rpcmem_registered(void *buf) { return _qnn_rpc_buffer_to_handles.count(buf) != 0U; } const qnn::qcom_socinfo &get_soc_info() { return _soc_info; } @@ -892,7 +881,7 @@ class qnn_instance { QnnHtpDevice_PerfInfrastructure_t *_qnn_htp_perfinfra = nullptr; uint32_t _qnn_power_configid = 1; - std::unordered_map _qnn_mem_set; + std::unordered_map _qnn_rpc_buffer_to_handles; std::mutex _init_mutex; std::unordered_map _loaded_lib_handle; From 867c91bfaff57ba20eca24228c9568fa33b6769e Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sat, 27 Jul 2024 11:56:21 +0800 Subject: [PATCH 106/166] feat: add error string for QnnOpPackage_Error_t --- ggml/src/ggml-qnn/utils.cpp | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-qnn/utils.cpp b/ggml/src/ggml-qnn/utils.cpp index 820b72b8969f8..e44d6dbccee42 100644 --- a/ggml/src/ggml-qnn/utils.cpp +++ b/ggml/src/ggml-qnn/utils.cpp @@ -158,12 +158,14 @@ const char *get_qnn_error_string(Qnn_ErrorHandle_t error) { switch (error) { case QNN_SUCCESS: return "QNN_SUCCESS"; + case QNN_COMMON_ERROR_GENERAL: + return "QNN_COMMON_ERROR_GENERAL"; + + // QnnGraph_Error_t case QNN_GRAPH_ERROR_UNSUPPORTED_FEATURE: return "QNN_GRAPH_ERROR_UNSUPPORTED_FEATURE"; case QNN_GRAPH_ERROR_MEM_ALLOC: return "QNN_GRAPH_ERROR_MEM_ALLOC"; - case QNN_GRAPH_ERROR_GENERAL: - return "QNN_GRAPH_ERROR_GENERAL"; case QNN_GRAPH_ERROR_INVALID_ARGUMENT: return "QNN_GRAPH_ERROR_INVALID_ARGUMENT"; case QNN_GRAPH_ERROR_INVALID_HANDLE: @@ -182,6 +184,22 @@ const char *get_qnn_error_string(Qnn_ErrorHandle_t error) { return "QNN_GRAPH_ERROR_UNCONNECTED_NODE"; case QNN_GRAPH_ERROR_CREATE_FAILED: return "QNN_GRAPH_ERROR_CREATE_FAILED"; + + // QnnOpPackage_Error_t + case QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED: + return "QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED"; + 
case QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED: + return "QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED"; + case QNN_OP_PACKAGE_ERROR_INVALID_HANDLE: + return "QNN_OP_PACKAGE_ERROR_INVALID_HANDLE"; + case QNN_OP_PACKAGE_ERROR_INVALID_INFRASTRUCTURE: + return "QNN_OP_PACKAGE_ERROR_INVALID_INFRASTRUCTURE"; + case QNN_OP_PACKAGE_ERROR_INVALID_INFO: + return "QNN_OP_PACKAGE_ERROR_INVALID_INFO"; + case QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE: + return "QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE"; + case QNN_OP_PACKAGE_ERROR_INVALID_ARGUMENT: + return "QNN_OP_PACKAGE_ERROR_INVALID_ARGUMENT"; default: return nullptr; } From 5da73f8085e9b3276ec7dba2c30c8ad775b7bcdd Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sat, 27 Jul 2024 12:52:59 +0800 Subject: [PATCH 107/166] refactoring: move forward and supports_op into ops file --- ggml/src/ggml-qnn.cpp | 55 +------------- ggml/src/ggml-qnn/backend-ops.cpp | 115 ++++++++++++++++++++++++------ ggml/src/ggml-qnn/backend-ops.hpp | 13 +--- 3 files changed, 97 insertions(+), 86 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index 6472d3e154367..22b57b1758a54 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -114,23 +114,7 @@ struct ggml_backend_qnn_buffer_type_context { // // ================================================================================================= static bool ggml_qnn_compute_forward(ggml_backend_qnn_context *ctx, struct ggml_tensor *tensor) { - size_t unary_op_idx = tensor->op; - if (tensor->op == GGML_OP_UNARY) { - unary_op_idx = qnn::kGgmlUnaryOpStart + ggml_get_unary_op(tensor); - } - - auto unary_op = qnn::ggml_qnn_unary_op_array()[unary_op_idx]; - if (unary_op) { - return unary_op(ctx, tensor->src[0], tensor); - } - - auto binary_op = qnn::ggml_qnn_binary_op_array()[tensor->op]; - if (binary_op) { - return binary_op(ctx, tensor->src[0], tensor->src[1], tensor); - } - - QNN_LOG_WARN("unsupported op %d", tensor->op); - return false; + return qnn::ggml_qnn_forward(ctx, tensor); } static const char *ggml_backend_qnn_buffer_get_name(ggml_backend_buffer_t buffer) { @@ -288,42 +272,7 @@ GGML_CALL static ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backe GGML_CALL static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const ggml_tensor *op) { GGML_UNUSED(backend); - - if (op->op == GGML_OP_UNARY) { - if (!qnn::ggml_qnn_unary_op_array()[qnn::kGgmlUnaryOpStart + ggml_get_unary_op(op)]) { - QNN_LOG_DEBUG("unsupported unary op %d", ggml_get_unary_op(op)); - return false; - } - - if (!op->src[0]) { - QNN_LOG_DEBUG("src0 is nullptr"); - return false; - } - } else if (op->op != GGML_OP_NONE) { - if (!qnn::ggml_qnn_unary_op_array()[op->op] && !qnn::ggml_qnn_binary_op_array()[op->op]) { - QNN_LOG_DEBUG("unsupported op %d", op->op); - return false; - } - - if (!op->src[0] || !op->src[1]) { - QNN_LOG_DEBUG("src0 or src1 is nullptr"); - return false; - } - } - - switch (op->type) { - case GGML_TYPE_F32: - case GGML_TYPE_F16: - case GGML_TYPE_I8: - case GGML_TYPE_Q8_0: - case GGML_TYPE_Q4_0: - break; - default: - QNN_LOG_DEBUG("unsupported src0 type %d", op->src[0]->type); - return false; - } - - return true; + return qnn::ggml_qnn_supports_op(op); } GGML_CALL static bool ggml_backend_qnn_offload_op(ggml_backend_t backend, const ggml_tensor *op) { diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 1f8b75e5e3e0e..20a4178fd2303 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -56,6 +56,15 @@ bool 
qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, namespace { +typedef bool (*ggml_qnn_unary_op_t)(ggml_backend_qnn_context *ctx, ggml_tensor *src, ggml_tensor *dst); +typedef bool (*ggml_qnn_binary_op_t)(ggml_backend_qnn_context *ctx, ggml_tensor *src0, ggml_tensor *src1, + ggml_tensor *dst); + +typedef const ggml_qnn_unary_op_t (&ggml_qnn_unary_op_array_t)[GGML_OP_COUNT + GGML_UNARY_OP_COUNT]; +typedef const ggml_qnn_binary_op_t (&ggml_qnn_binary_op_array_t)[GGML_OP_COUNT]; + +constexpr const size_t kGgmlUnaryOpStart = GGML_OP_COUNT; + void print_ggml_tensor(const ggml_tensor *tensor) { QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", tensor->name, tensor->type, ggml_type_name(tensor->type), tensor->ne[0], tensor->ne[1], tensor->ne[2], @@ -106,8 +115,8 @@ qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_context *ctx, siz GGML_ASSERT(op < (GGML_OP_COUNT + GGML_UNARY_OP_COUNT)); auto &graph_cache = ctx->qnn_graph_cache; - const auto *op_name = op < qnn::kGgmlUnaryOpStart ? ggml_op_name(ggml_op(op)) - : ggml_unary_op_name(ggml_unary_op(op - qnn::kGgmlUnaryOpStart)); + const auto *op_name = + op < kGgmlUnaryOpStart ? ggml_op_name(ggml_op(op)) : ggml_unary_op_name(ggml_unary_op(op - kGgmlUnaryOpStart)); auto graph_key = get_graph_key<_InputSize, _OutputSize>(op_name, inputs, outputs); auto it = graph_cache.find(graph_key); qnn::ggml_qnn_graph *graph_ptr = nullptr; @@ -237,7 +246,7 @@ constexpr const char *kGgmlOpToQnnOp[] = { static_assert(sizeof(kGgmlOpToQnnOp) / sizeof(kGgmlOpToQnnOp[0]) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), "GGML_OP_COUNT does not match the size of the kGgmlOpToQnnOp table"); -static_assert(kGgmlOpToQnnOp[GGML_UNARY_OP_GELU + qnn::kGgmlUnaryOpStart] != nullptr, +static_assert(kGgmlOpToQnnOp[GGML_UNARY_OP_GELU + kGgmlUnaryOpStart] != nullptr, "GGML_UNARY_OP_GELU does not correspond to QNN_OP_GELU"); template @@ -281,10 +290,8 @@ bool qnn_unary_op_impl(ggml_backend_qnn_context *ctx, ggml_tensor *src, ggml_ten return succeed; } -} // namespace - -qnn::ggml_qnn_unary_op_array_t qnn::ggml_qnn_unary_op_array() { - static constexpr const qnn::ggml_qnn_unary_op_t kQnnOpsTable[] = { +ggml_qnn_unary_op_array_t ggml_qnn_unary_op_array() { + static constexpr const ggml_qnn_unary_op_t kQnnOpsTable[] = { nullptr, // GGML_OP_NONE nullptr, // GGML_OP_DUP nullptr, // GGML_OP_ADD @@ -369,19 +376,19 @@ qnn::ggml_qnn_unary_op_array_t qnn::ggml_qnn_unary_op_array() { nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK // ggml_unary_op - nullptr, // GGML_UNARY_OP_ABS - nullptr, // GGML_UNARY_OP_SGN - nullptr, // GGML_UNARY_OP_NEG - nullptr, // GGML_UNARY_OP_STEP - nullptr, // GGML_UNARY_OP_TANH - nullptr, // GGML_UNARY_OP_ELU - nullptr, // GGML_UNARY_OP_RELU - nullptr, // GGML_UNARY_OP_SIGMOID - qnn_unary_op_impl, // GGML_UNARY_OP_GELU - nullptr, // GGML_UNARY_OP_GELU_QUICK - nullptr, // GGML_UNARY_OP_SILU - nullptr, // GGML_UNARY_OP_HARDSWISH - nullptr, // GGML_UNARY_OP_HARDSIGMOID + nullptr, // GGML_UNARY_OP_ABS + nullptr, // GGML_UNARY_OP_SGN + nullptr, // GGML_UNARY_OP_NEG + nullptr, // GGML_UNARY_OP_STEP + nullptr, // GGML_UNARY_OP_TANH + nullptr, // GGML_UNARY_OP_ELU + nullptr, // GGML_UNARY_OP_RELU + nullptr, // GGML_UNARY_OP_SIGMOID + qnn_unary_op_impl, // GGML_UNARY_OP_GELU + nullptr, // GGML_UNARY_OP_GELU_QUICK + nullptr, // GGML_UNARY_OP_SILU + nullptr, // GGML_UNARY_OP_HARDSWISH + nullptr, // GGML_UNARY_OP_HARDSIGMOID }; static_assert(sizeof(kQnnOpsTable) / sizeof(kQnnOpsTable[0]) == 
(GGML_OP_COUNT + GGML_UNARY_OP_COUNT), @@ -389,8 +396,8 @@ qnn::ggml_qnn_unary_op_array_t qnn::ggml_qnn_unary_op_array() { return kQnnOpsTable; } -qnn::ggml_qnn_binary_op_array_t qnn::ggml_qnn_binary_op_array() { - static constexpr const qnn::ggml_qnn_binary_op_t kQnnOpsTable[] = { +ggml_qnn_binary_op_array_t ggml_qnn_binary_op_array() { + static constexpr const ggml_qnn_binary_op_t kQnnOpsTable[] = { nullptr, // GGML_OP_NONE nullptr, // GGML_OP_DUP qnn_binary_op_impl, // GGML_OP_ADD @@ -479,3 +486,67 @@ qnn::ggml_qnn_binary_op_array_t qnn::ggml_qnn_binary_op_array() { "GGML_OP_COUNT does not match the size of the ops table"); return kQnnOpsTable; } + +} // namespace + +namespace qnn { + +bool ggml_qnn_supports_op(const ggml_tensor *op) { + if (op->op == GGML_OP_UNARY) { + if (!ggml_qnn_unary_op_array()[kGgmlUnaryOpStart + ggml_get_unary_op(op)]) { + QNN_LOG_DEBUG("unsupported unary op %d", ggml_get_unary_op(op)); + return false; + } + + if (!op->src[0]) { + QNN_LOG_DEBUG("src0 is nullptr"); + return false; + } + } else if (op->op != GGML_OP_NONE) { + if (!ggml_qnn_unary_op_array()[op->op] && !ggml_qnn_binary_op_array()[op->op]) { + QNN_LOG_DEBUG("unsupported op %d", op->op); + return false; + } + + if (!op->src[0] || !op->src[1]) { + QNN_LOG_DEBUG("src0 or src1 is nullptr"); + return false; + } + } + + switch (op->type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_I8: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q4_0: + break; + default: + QNN_LOG_DEBUG("unsupported src0 type %d", op->src[0]->type); + return false; + } + + return true; +} + +bool ggml_qnn_forward(ggml_backend_qnn_context *ctx, struct ggml_tensor *tensor) { + size_t unary_op_idx = tensor->op; + if (tensor->op == GGML_OP_UNARY) { + unary_op_idx = kGgmlUnaryOpStart + ggml_get_unary_op(tensor); + } + + auto unary_op = ggml_qnn_unary_op_array()[unary_op_idx]; + if (unary_op) { + return unary_op(ctx, tensor->src[0], tensor); + } + + auto binary_op = ggml_qnn_binary_op_array()[tensor->op]; + if (binary_op) { + return binary_op(ctx, tensor->src[0], tensor->src[1], tensor); + } + + QNN_LOG_WARN("unsupported op %d", tensor->op); + return false; +} + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/backend-ops.hpp b/ggml/src/ggml-qnn/backend-ops.hpp index 614bcf651b86b..ed4ce994f787b 100644 --- a/ggml/src/ggml-qnn/backend-ops.hpp +++ b/ggml/src/ggml-qnn/backend-ops.hpp @@ -6,16 +6,7 @@ namespace qnn { -typedef bool (*ggml_qnn_unary_op_t)(ggml_backend_qnn_context *ctx, ggml_tensor *src, ggml_tensor *dst); -typedef bool (*ggml_qnn_binary_op_t)(ggml_backend_qnn_context *ctx, ggml_tensor *src0, ggml_tensor *src1, - ggml_tensor *dst); - -typedef const ggml_qnn_unary_op_t (&ggml_qnn_unary_op_array_t)[GGML_OP_COUNT + GGML_UNARY_OP_COUNT]; -typedef const ggml_qnn_binary_op_t (&ggml_qnn_binary_op_array_t)[GGML_OP_COUNT]; - -constexpr const size_t kGgmlUnaryOpStart = GGML_OP_COUNT; - -ggml_qnn_unary_op_array_t ggml_qnn_unary_op_array(); -ggml_qnn_binary_op_array_t ggml_qnn_binary_op_array(); +bool ggml_qnn_supports_op(const ggml_tensor *op); +bool ggml_qnn_forward(ggml_backend_qnn_context *ctx, struct ggml_tensor *tensor); } // namespace qnn From e0c9b34016d949be19bd65d55fe81b11fc89d327 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sat, 27 Jul 2024 13:31:57 +0800 Subject: [PATCH 108/166] feat: check if dims equal for add looks qnn add can only applied to matrix with equal dimensions --- ggml/src/ggml-qnn/backend-ops.cpp | 32 +++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git 
a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 20a4178fd2303..fdba6cac24d82 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -43,6 +43,27 @@ bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, return true; } +bool is_tensor_dimensions_equal(const ggml_tensor *l, const ggml_tensor *r) { + const auto dim_l = ggml_n_dims(l); + if (dim_l != ggml_n_dims(r)) { + return false; + } + + for (int i = 0; i < dim_l; i++) { + if (l->ne[i] != r->ne[i]) { + return false; + } + } + + return true; +} + +void print_ggml_tensor(const ggml_tensor *tensor) { + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + tensor->name, tensor->type, ggml_type_name(tensor->type), tensor->ne[0], tensor->ne[1], tensor->ne[2], + tensor->nb[0], tensor->nb[1], tensor->nb[2]); +} + } // namespace #define CHECK_PARAMS(ctx, ...) \ @@ -65,12 +86,6 @@ typedef const ggml_qnn_binary_op_t (&ggml_qnn_binary_op_array_t)[GGML_OP_COUNT]; constexpr const size_t kGgmlUnaryOpStart = GGML_OP_COUNT; -void print_ggml_tensor(const ggml_tensor *tensor) { - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - tensor->name, tensor->type, ggml_type_name(tensor->type), tensor->ne[0], tensor->ne[1], tensor->ne[2], - tensor->nb[0], tensor->nb[1], tensor->nb[2]); -} - template qnn::ggml_tensor_array_t to_ggml_tensor_array(const std::array &array) { return qnn::ggml_tensor_array_t(array.data(), array.data() + _Size); @@ -512,6 +527,11 @@ bool ggml_qnn_supports_op(const ggml_tensor *op) { QNN_LOG_DEBUG("src0 or src1 is nullptr"); return false; } + + if (op->op == GGML_OP_ADD && !is_tensor_dimensions_equal(op->src[0], op->src[1])) { + QNN_LOG_DEBUG("src0 and src1 dimensions are not equal"); + return false; + } } switch (op->type) { From 8ab1f15fe396b6d46fe27cb5039855927cb0639b Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sat, 27 Jul 2024 13:43:07 +0800 Subject: [PATCH 109/166] refactoring: remove internal functions, use op table directly --- ggml/src/ggml-qnn/backend-ops.cpp | 388 +++++++++++++++--------------- 1 file changed, 191 insertions(+), 197 deletions(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index fdba6cac24d82..a560417438c28 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -304,203 +304,197 @@ bool qnn_unary_op_impl(ggml_backend_qnn_context *ctx, ggml_tensor *src, ggml_ten return succeed; } +constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = { -ggml_qnn_unary_op_array_t ggml_qnn_unary_op_array() { - static constexpr const ggml_qnn_unary_op_t kQnnOpsTable[] = { - nullptr, // GGML_OP_NONE - nullptr, // GGML_OP_DUP - nullptr, // GGML_OP_ADD - nullptr, // GGML_OP_ADD1 - nullptr, // GGML_OP_ACC - nullptr, // GGML_OP_SUB - nullptr, // GGML_OP_MUL - nullptr, // GGML_OP_DIV - nullptr, // GGML_OP_SQR - qnn_unary_op_impl, // GGML_OP_SQRT - qnn_unary_op_impl, // GGML_OP_LOG - nullptr, // GGML_OP_SUM - nullptr, // GGML_OP_SUM_ROWS - nullptr, // GGML_OP_MEAN - nullptr, // GGML_OP_ARGMAX - nullptr, // GGML_OP_REPEAT - nullptr, // GGML_OP_REPEAT_BACK - nullptr, // GGML_OP_CONCAT - nullptr, // GGML_OP_SILU_BACK - nullptr, // GGML_OP_NORM - nullptr, // GGML_OP_RMS_NORM - nullptr, // GGML_OP_RMS_NORM_BACK - nullptr, // GGML_OP_GROUP_NORM - - nullptr, // GGML_OP_MUL_MAT - nullptr, // GGML_OP_MUL_MAT_ID - nullptr, // GGML_OP_OUT_PROD - - nullptr, 
// GGML_OP_SCALE - nullptr, // GGML_OP_SET - nullptr, // GGML_OP_CPY - nullptr, // GGML_OP_CONT - nullptr, // GGML_OP_RESHAPE - nullptr, // GGML_OP_VIEW - nullptr, // GGML_OP_PERMUTE - nullptr, // GGML_OP_TRANSPOSE - nullptr, // GGML_OP_GET_ROWS - nullptr, // GGML_OP_GET_ROWS_BACK - nullptr, // GGML_OP_DIAG - nullptr, // GGML_OP_DIAG_MASK_INF - nullptr, // GGML_OP_DIAG_MASK_ZERO - nullptr, // GGML_OP_SOFT_MAX - nullptr, // GGML_OP_SOFT_MAX_BACK - nullptr, // GGML_OP_ROPE - nullptr, // GGML_OP_ROPE_BACK - nullptr, // GGML_OP_CLAMP - nullptr, // GGML_OP_CONV_TRANSPOSE_1D - nullptr, // GGML_OP_IM2COL - nullptr, // GGML_OP_CONV_TRANSPOSE_2D - nullptr, // GGML_OP_POOL_1D - nullptr, // GGML_OP_POOL_2D - nullptr, // GGML_OP_UPSCALE - nullptr, // GGML_OP_PAD - nullptr, // GGML_OP_ARANGE - nullptr, // GGML_OP_TIMESTEP_EMBEDDING - nullptr, // GGML_OP_ARGSORT - nullptr, // GGML_OP_LEAKY_RELU - - nullptr, // GGML_OP_FLASH_ATTN_EXT - nullptr, // GGML_OP_FLASH_ATTN_BACK - nullptr, // GGML_OP_SSM_CONV - nullptr, // GGML_OP_SSM_SCAN - nullptr, // GGML_OP_WIN_PART - nullptr, // GGML_OP_WIN_UNPART - nullptr, // GGML_OP_GET_REL_POS - nullptr, // GGML_OP_ADD_REL_POS - - nullptr, // GGML_OP_UNARY - - nullptr, // GGML_OP_MAP_UNARY - nullptr, // GGML_OP_MAP_BINARY - - nullptr, // GGML_OP_MAP_CUSTOM1_F32 - nullptr, // GGML_OP_MAP_CUSTOM2_F32 - nullptr, // GGML_OP_MAP_CUSTOM3_F32 - - nullptr, // GGML_OP_MAP_CUSTOM1 - nullptr, // GGML_OP_MAP_CUSTOM2 - nullptr, // GGML_OP_MAP_CUSTOM3 - - nullptr, // GGML_OP_CROSS_ENTROPY_LOSS - nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK - - // ggml_unary_op - nullptr, // GGML_UNARY_OP_ABS - nullptr, // GGML_UNARY_OP_SGN - nullptr, // GGML_UNARY_OP_NEG - nullptr, // GGML_UNARY_OP_STEP - nullptr, // GGML_UNARY_OP_TANH - nullptr, // GGML_UNARY_OP_ELU - nullptr, // GGML_UNARY_OP_RELU - nullptr, // GGML_UNARY_OP_SIGMOID - qnn_unary_op_impl, // GGML_UNARY_OP_GELU - nullptr, // GGML_UNARY_OP_GELU_QUICK - nullptr, // GGML_UNARY_OP_SILU - nullptr, // GGML_UNARY_OP_HARDSWISH - nullptr, // GGML_UNARY_OP_HARDSIGMOID - }; + nullptr, // GGML_OP_NONE + nullptr, // GGML_OP_DUP + nullptr, // GGML_OP_ADD + nullptr, // GGML_OP_ADD1 + nullptr, // GGML_OP_ACC + nullptr, // GGML_OP_SUB + nullptr, // GGML_OP_MUL + nullptr, // GGML_OP_DIV + nullptr, // GGML_OP_SQR + qnn_unary_op_impl, // GGML_OP_SQRT + qnn_unary_op_impl, // GGML_OP_LOG + nullptr, // GGML_OP_SUM + nullptr, // GGML_OP_SUM_ROWS + nullptr, // GGML_OP_MEAN + nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_REPEAT + nullptr, // GGML_OP_REPEAT_BACK + nullptr, // GGML_OP_CONCAT + nullptr, // GGML_OP_SILU_BACK + nullptr, // GGML_OP_NORM + nullptr, // GGML_OP_RMS_NORM + nullptr, // GGML_OP_RMS_NORM_BACK + nullptr, // GGML_OP_GROUP_NORM - static_assert(sizeof(kQnnOpsTable) / sizeof(kQnnOpsTable[0]) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), - "GGML_OP_COUNT does not match the size of the kQnnOpsTable table"); - return kQnnOpsTable; -} + nullptr, // GGML_OP_MUL_MAT + nullptr, // GGML_OP_MUL_MAT_ID + nullptr, // GGML_OP_OUT_PROD -ggml_qnn_binary_op_array_t ggml_qnn_binary_op_array() { - static constexpr const ggml_qnn_binary_op_t kQnnOpsTable[] = { - nullptr, // GGML_OP_NONE - nullptr, // GGML_OP_DUP - qnn_binary_op_impl, // GGML_OP_ADD - nullptr, // GGML_OP_ADD1 - nullptr, // GGML_OP_ACC - qnn_binary_op_impl, // GGML_OP_SUB - qnn_binary_op_impl, // GGML_OP_MUL - qnn_binary_op_impl, // GGML_OP_DIV - nullptr, // GGML_OP_SQR - nullptr, // GGML_OP_SQRT - nullptr, // GGML_OP_LOG - nullptr, // GGML_OP_SUM - nullptr, // GGML_OP_SUM_ROWS - nullptr, // 
GGML_OP_MEAN - nullptr, // GGML_OP_ARGMAX - nullptr, // GGML_OP_REPEAT - nullptr, // GGML_OP_REPEAT_BACK - nullptr, // GGML_OP_CONCAT - nullptr, // GGML_OP_SILU_BACK - nullptr, // GGML_OP_NORM - nullptr, // GGML_OP_RMS_NORM - nullptr, // GGML_OP_RMS_NORM_BACK - nullptr, // GGML_OP_GROUP_NORM - - qnn_binary_op_impl, // GGML_OP_MUL_MAT - nullptr, // GGML_OP_MUL_MAT_ID - nullptr, // GGML_OP_OUT_PROD - - nullptr, // GGML_OP_SCALE - nullptr, // GGML_OP_SET - nullptr, // GGML_OP_CPY - nullptr, // GGML_OP_CONT - nullptr, // GGML_OP_RESHAPE - nullptr, // GGML_OP_VIEW - nullptr, // GGML_OP_PERMUTE - nullptr, // GGML_OP_TRANSPOSE - nullptr, // GGML_OP_GET_ROWS - nullptr, // GGML_OP_GET_ROWS_BACK - nullptr, // GGML_OP_DIAG - nullptr, // GGML_OP_DIAG_MASK_INF - nullptr, // GGML_OP_DIAG_MASK_ZERO - nullptr, // GGML_OP_SOFT_MAX - nullptr, // GGML_OP_SOFT_MAX_BACK - nullptr, // GGML_OP_ROPE - nullptr, // GGML_OP_ROPE_BACK - nullptr, // GGML_OP_CLAMP - nullptr, // GGML_OP_CONV_TRANSPOSE_1D - nullptr, // GGML_OP_IM2COL - nullptr, // GGML_OP_CONV_TRANSPOSE_2D - nullptr, // GGML_OP_POOL_1D - nullptr, // GGML_OP_POOL_2D - nullptr, // GGML_OP_UPSCALE - nullptr, // GGML_OP_PAD - nullptr, // GGML_OP_ARANGE - nullptr, // GGML_OP_TIMESTEP_EMBEDDING - nullptr, // GGML_OP_ARGSORT - nullptr, // GGML_OP_LEAKY_RELU - - nullptr, // GGML_OP_FLASH_ATTN_EXT - nullptr, // GGML_OP_FLASH_ATTN_BACK - nullptr, // GGML_OP_SSM_CONV - nullptr, // GGML_OP_SSM_SCAN - nullptr, // GGML_OP_WIN_PART - nullptr, // GGML_OP_WIN_UNPART - nullptr, // GGML_OP_GET_REL_POS - nullptr, // GGML_OP_ADD_REL_POS - - nullptr, // GGML_OP_UNARY - - nullptr, // GGML_OP_MAP_UNARY - nullptr, // GGML_OP_MAP_BINARY - - nullptr, // GGML_OP_MAP_CUSTOM1_F32 - nullptr, // GGML_OP_MAP_CUSTOM2_F32 - nullptr, // GGML_OP_MAP_CUSTOM3_F32 - - nullptr, // GGML_OP_MAP_CUSTOM1 - nullptr, // GGML_OP_MAP_CUSTOM2 - nullptr, // GGML_OP_MAP_CUSTOM3 - - nullptr, // GGML_OP_CROSS_ENTROPY_LOSS - nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK - }; + nullptr, // GGML_OP_SCALE + nullptr, // GGML_OP_SET + nullptr, // GGML_OP_CPY + nullptr, // GGML_OP_CONT + nullptr, // GGML_OP_RESHAPE + nullptr, // GGML_OP_VIEW + nullptr, // GGML_OP_PERMUTE + nullptr, // GGML_OP_TRANSPOSE + nullptr, // GGML_OP_GET_ROWS + nullptr, // GGML_OP_GET_ROWS_BACK + nullptr, // GGML_OP_DIAG + nullptr, // GGML_OP_DIAG_MASK_INF + nullptr, // GGML_OP_DIAG_MASK_ZERO + nullptr, // GGML_OP_SOFT_MAX + nullptr, // GGML_OP_SOFT_MAX_BACK + nullptr, // GGML_OP_ROPE + nullptr, // GGML_OP_ROPE_BACK + nullptr, // GGML_OP_CLAMP + nullptr, // GGML_OP_CONV_TRANSPOSE_1D + nullptr, // GGML_OP_IM2COL + nullptr, // GGML_OP_CONV_TRANSPOSE_2D + nullptr, // GGML_OP_POOL_1D + nullptr, // GGML_OP_POOL_2D + nullptr, // GGML_OP_UPSCALE + nullptr, // GGML_OP_PAD + nullptr, // GGML_OP_ARANGE + nullptr, // GGML_OP_TIMESTEP_EMBEDDING + nullptr, // GGML_OP_ARGSORT + nullptr, // GGML_OP_LEAKY_RELU - static_assert(sizeof(kQnnOpsTable) / sizeof(kQnnOpsTable[0]) == GGML_OP_COUNT, - "GGML_OP_COUNT does not match the size of the ops table"); - return kQnnOpsTable; -} + nullptr, // GGML_OP_FLASH_ATTN_EXT + nullptr, // GGML_OP_FLASH_ATTN_BACK + nullptr, // GGML_OP_SSM_CONV + nullptr, // GGML_OP_SSM_SCAN + nullptr, // GGML_OP_WIN_PART + nullptr, // GGML_OP_WIN_UNPART + nullptr, // GGML_OP_GET_REL_POS + nullptr, // GGML_OP_ADD_REL_POS + + nullptr, // GGML_OP_UNARY + + nullptr, // GGML_OP_MAP_UNARY + nullptr, // GGML_OP_MAP_BINARY + + nullptr, // GGML_OP_MAP_CUSTOM1_F32 + nullptr, // GGML_OP_MAP_CUSTOM2_F32 + nullptr, // GGML_OP_MAP_CUSTOM3_F32 + + 
nullptr, // GGML_OP_MAP_CUSTOM1 + nullptr, // GGML_OP_MAP_CUSTOM2 + nullptr, // GGML_OP_MAP_CUSTOM3 + + nullptr, // GGML_OP_CROSS_ENTROPY_LOSS + nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK + + // ggml_unary_op + nullptr, // GGML_UNARY_OP_ABS + nullptr, // GGML_UNARY_OP_SGN + nullptr, // GGML_UNARY_OP_NEG + nullptr, // GGML_UNARY_OP_STEP + nullptr, // GGML_UNARY_OP_TANH + nullptr, // GGML_UNARY_OP_ELU + nullptr, // GGML_UNARY_OP_RELU + nullptr, // GGML_UNARY_OP_SIGMOID + qnn_unary_op_impl, // GGML_UNARY_OP_GELU + nullptr, // GGML_UNARY_OP_GELU_QUICK + nullptr, // GGML_UNARY_OP_SILU + nullptr, // GGML_UNARY_OP_HARDSWISH + nullptr, // GGML_UNARY_OP_HARDSIGMOID +}; + +static_assert(sizeof(kQnnUnaryOpsTable) / sizeof(kQnnUnaryOpsTable[0]) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), + "GGML_OP_COUNT does not match the size of the kQnnUnaryOpsTable table"); + +static constexpr const ggml_qnn_binary_op_t kQnnBinaryOpsTable[] = { + nullptr, // GGML_OP_NONE + nullptr, // GGML_OP_DUP + qnn_binary_op_impl, // GGML_OP_ADD + nullptr, // GGML_OP_ADD1 + nullptr, // GGML_OP_ACC + qnn_binary_op_impl, // GGML_OP_SUB + qnn_binary_op_impl, // GGML_OP_MUL + qnn_binary_op_impl, // GGML_OP_DIV + nullptr, // GGML_OP_SQR + nullptr, // GGML_OP_SQRT + nullptr, // GGML_OP_LOG + nullptr, // GGML_OP_SUM + nullptr, // GGML_OP_SUM_ROWS + nullptr, // GGML_OP_MEAN + nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_REPEAT + nullptr, // GGML_OP_REPEAT_BACK + nullptr, // GGML_OP_CONCAT + nullptr, // GGML_OP_SILU_BACK + nullptr, // GGML_OP_NORM + nullptr, // GGML_OP_RMS_NORM + nullptr, // GGML_OP_RMS_NORM_BACK + nullptr, // GGML_OP_GROUP_NORM + + qnn_binary_op_impl, // GGML_OP_MUL_MAT + nullptr, // GGML_OP_MUL_MAT_ID + nullptr, // GGML_OP_OUT_PROD + + nullptr, // GGML_OP_SCALE + nullptr, // GGML_OP_SET + nullptr, // GGML_OP_CPY + nullptr, // GGML_OP_CONT + nullptr, // GGML_OP_RESHAPE + nullptr, // GGML_OP_VIEW + nullptr, // GGML_OP_PERMUTE + nullptr, // GGML_OP_TRANSPOSE + nullptr, // GGML_OP_GET_ROWS + nullptr, // GGML_OP_GET_ROWS_BACK + nullptr, // GGML_OP_DIAG + nullptr, // GGML_OP_DIAG_MASK_INF + nullptr, // GGML_OP_DIAG_MASK_ZERO + nullptr, // GGML_OP_SOFT_MAX + nullptr, // GGML_OP_SOFT_MAX_BACK + nullptr, // GGML_OP_ROPE + nullptr, // GGML_OP_ROPE_BACK + nullptr, // GGML_OP_CLAMP + nullptr, // GGML_OP_CONV_TRANSPOSE_1D + nullptr, // GGML_OP_IM2COL + nullptr, // GGML_OP_CONV_TRANSPOSE_2D + nullptr, // GGML_OP_POOL_1D + nullptr, // GGML_OP_POOL_2D + nullptr, // GGML_OP_UPSCALE + nullptr, // GGML_OP_PAD + nullptr, // GGML_OP_ARANGE + nullptr, // GGML_OP_TIMESTEP_EMBEDDING + nullptr, // GGML_OP_ARGSORT + nullptr, // GGML_OP_LEAKY_RELU + + nullptr, // GGML_OP_FLASH_ATTN_EXT + nullptr, // GGML_OP_FLASH_ATTN_BACK + nullptr, // GGML_OP_SSM_CONV + nullptr, // GGML_OP_SSM_SCAN + nullptr, // GGML_OP_WIN_PART + nullptr, // GGML_OP_WIN_UNPART + nullptr, // GGML_OP_GET_REL_POS + nullptr, // GGML_OP_ADD_REL_POS + + nullptr, // GGML_OP_UNARY + + nullptr, // GGML_OP_MAP_UNARY + nullptr, // GGML_OP_MAP_BINARY + + nullptr, // GGML_OP_MAP_CUSTOM1_F32 + nullptr, // GGML_OP_MAP_CUSTOM2_F32 + nullptr, // GGML_OP_MAP_CUSTOM3_F32 + + nullptr, // GGML_OP_MAP_CUSTOM1 + nullptr, // GGML_OP_MAP_CUSTOM2 + nullptr, // GGML_OP_MAP_CUSTOM3 + + nullptr, // GGML_OP_CROSS_ENTROPY_LOSS + nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK +}; + +static_assert(sizeof(kQnnBinaryOpsTable) / sizeof(kQnnBinaryOpsTable[0]) == GGML_OP_COUNT, + "GGML_OP_COUNT does not match the size of the kQnnBinaryOpsTable table"); } // namespace @@ -508,7 +502,7 @@ namespace qnn { bool 
ggml_qnn_supports_op(const ggml_tensor *op) { if (op->op == GGML_OP_UNARY) { - if (!ggml_qnn_unary_op_array()[kGgmlUnaryOpStart + ggml_get_unary_op(op)]) { + if (!kQnnUnaryOpsTable[kGgmlUnaryOpStart + ggml_get_unary_op(op)]) { QNN_LOG_DEBUG("unsupported unary op %d", ggml_get_unary_op(op)); return false; } @@ -518,7 +512,7 @@ bool ggml_qnn_supports_op(const ggml_tensor *op) { return false; } } else if (op->op != GGML_OP_NONE) { - if (!ggml_qnn_unary_op_array()[op->op] && !ggml_qnn_binary_op_array()[op->op]) { + if (!kQnnUnaryOpsTable[op->op] && !kQnnBinaryOpsTable[op->op]) { QNN_LOG_DEBUG("unsupported op %d", op->op); return false; } @@ -555,12 +549,12 @@ bool ggml_qnn_forward(ggml_backend_qnn_context *ctx, struct ggml_tensor *tensor) unary_op_idx = kGgmlUnaryOpStart + ggml_get_unary_op(tensor); } - auto unary_op = ggml_qnn_unary_op_array()[unary_op_idx]; + auto unary_op = kQnnUnaryOpsTable[unary_op_idx]; if (unary_op) { return unary_op(ctx, tensor->src[0], tensor); } - auto binary_op = ggml_qnn_binary_op_array()[tensor->op]; + auto binary_op = kQnnBinaryOpsTable[tensor->op]; if (binary_op) { return binary_op(ctx, tensor->src[0], tensor->src[1], tensor); } From e33b5c983749287ed06818b1d966b354b2ba6dc9 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sat, 27 Jul 2024 13:49:49 +0800 Subject: [PATCH 110/166] refactoring: print the name of unsupport op --- ggml/src/ggml-qnn/backend-ops.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index a560417438c28..89def7ec636d8 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -559,7 +559,7 @@ bool ggml_qnn_forward(ggml_backend_qnn_context *ctx, struct ggml_tensor *tensor) return binary_op(ctx, tensor->src[0], tensor->src[1], tensor); } - QNN_LOG_WARN("unsupported op %d", tensor->op); + QNN_LOG_WARN("unsupported op %s", ggml_op_desc(tensor)); return false; } From 1f9d2a7e22e902ced1df6842c4e9b769435a5b98 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sun, 28 Jul 2024 22:05:51 +0800 Subject: [PATCH 111/166] refactoring: improve tensor print --- ggml/src/ggml-qnn/backend-ops.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 89def7ec636d8..4d83fd5d1a9c6 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -59,9 +59,9 @@ bool is_tensor_dimensions_equal(const ggml_tensor *l, const ggml_tensor *r) { } void print_ggml_tensor(const ggml_tensor *tensor) { - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - tensor->name, tensor->type, ggml_type_name(tensor->type), tensor->ne[0], tensor->ne[1], tensor->ne[2], - tensor->nb[0], tensor->nb[1], tensor->nb[2]); + QNN_LOG_DEBUG("%s: type:%s ne: %ldx%ldx%ldx%ld, nb: %ldx%ldx%ldx%ld\n", tensor->name, ggml_type_name(tensor->type), + (long)tensor->ne[0], (long)tensor->ne[1], (long)tensor->ne[2], (long)tensor->ne[3], + (long)tensor->nb[0], (long)tensor->nb[1], (long)tensor->nb[2], (long)tensor->nb[3]); } } // namespace From 6da82947df06f182f93829e849c0ac33f68a10ea Mon Sep 17 00:00:00 2001 From: hongruichen Date: Mon, 29 Jul 2024 15:51:54 +0800 Subject: [PATCH 112/166] refactoring: set the default qnn lib search path at CMakeLists.txt by GGML_QNN_DEFAULT_LIB_SEARCH_PATH --- ggml/include/ggml-qnn.h | 8 +++----- ggml/src/CMakeLists.txt | 2 ++ ggml/src/ggml-qnn.cpp | 35 
+++++++++++++---------------------- ggml/src/ggml-qnn/logger.cpp | 4 ++-- src/llama.cpp | 19 ++++++++----------- 5 files changed, 28 insertions(+), 40 deletions(-) diff --git a/ggml/include/ggml-qnn.h b/ggml/include/ggml-qnn.h index 2433af1668408..b8c7da8fbbf87 100644 --- a/ggml/include/ggml-qnn.h +++ b/ggml/include/ggml-qnn.h @@ -20,13 +20,11 @@ enum QNNBackend { /** * - * @param device 0: QNN_BACKEND_CPU 1: QNN_BACKEND_GPU 2: - * QNN_BACKEND_NPU - * @param qnn_lib_path qnn library path, such as "/data/local/tmp/" on - * Android or specified in JNI layer + * @param device 0: QNN_BACKEND_CPU 1: QNN_BACKEND_GPU 2:QNN_BACKEND_NPU + * @param extend_lib_search_path extened lib search path for searching QNN backend dynamic libs * @return */ -GGML_API ggml_backend_t ggml_backend_qnn_init(size_t dev_num, const char *qnn_lib_path); +GGML_API ggml_backend_t ggml_backend_qnn_init(size_t dev_num, const char *extend_lib_search_path); GGML_API bool ggml_backend_is_qnn(ggml_backend_t backend); diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index dffff42e7e530..59a7014dbeff3 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -889,10 +889,12 @@ if (GGML_QNN) find_library(LOG_LIB log) find_library(ANDROID_LIB android) set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${LOG_LIB} ${ANDROID_LIB}) + set(GGML_QNN_DEFAULT_LIB_SEARCH_PATH "\"/data/local/tmp/\"") else() message(FATAL_ERROR "QNN now only available on Android") endif() + add_compile_definitions(GGML_QNN_DEFAULT_LIB_SEARCH_PATH=${GGML_QNN_DEFAULT_LIB_SEARCH_PATH}) if (NOT DEFINED GGML_QNN_SDK_PATH) # try read from environment variable if (DEFINED ENV{QNN_SDK_PATH}) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index 22b57b1758a54..6ed5ecb2e03f2 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -319,15 +319,8 @@ static ggml_guid_t ggml_backend_qnn_guid() { return &guid; } -static ggml_backend_t ggml_backend_qnn_reg_init(const char *params, void *user_data) { - if (nullptr == params) { - // QNN library path - // can be hardcoded to "/data/local/tmp/" for Android command line application - // or specified in JNI layer for Android APK - params = "/data/local/tmp/"; - } - ggml_backend_t qnn_backend = ggml_backend_qnn_init((int)(intptr_t)user_data, params); - +static ggml_backend_t ggml_backend_qnn_reg_init(const char *extend_lib_search_path, void *user_data) { + ggml_backend_t qnn_backend = ggml_backend_qnn_init((int)(intptr_t)user_data, extend_lib_search_path); return qnn_backend; } @@ -390,28 +383,25 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device) { return &ggml_backend_qnn_buffer_types[device]; } -/** - * - * @param device 0: QNN_BACKEND_CPU 1: QNN_BACKEND_GPU 2: QNN_BACKEND_NPU - * @param qnn_lib_path qnn library path, such as "/data/local/tmp/" on Android or specified in JNI layer - * @return - */ -ggml_backend_t ggml_backend_qnn_init(size_t device, const char *qnn_lib_path) { +ggml_backend_t ggml_backend_qnn_init(size_t device, const char *extend_lib_search_path) { int result = 0; - if (nullptr == qnn_lib_path) { - QNN_LOG_ERROR("invalid qnn lib path\n"); - return nullptr; + if (!extend_lib_search_path) { + extend_lib_search_path = GGML_QNN_DEFAULT_LIB_SEARCH_PATH; + QNN_LOG_WARN("extend_lib_search_path is nullptr, will use " GGML_QNN_DEFAULT_LIB_SEARCH_PATH " as default"); } QNN_LOG_DEBUG("device %d", device); - QNN_LOG_DEBUG("qnn_lib_path %s", qnn_lib_path); + QNN_LOG_DEBUG("extend_lib_search_path %s", extend_lib_search_path); if (device >= GGML_QNN_MAX_DEVICES) { 
QNN_LOG_ERROR("invalid device %d", device); return nullptr; } - std::string path = qnn_lib_path; + std::string path = extend_lib_search_path; + +// TODO: Fix this for other platforms +#if defined(__ANDROID__) || defined(ANDROID) if (QNN_BACKEND_NPU == device) { if (0 == setenv("LD_LIBRARY_PATH", (path + ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/" @@ -438,8 +428,9 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char *qnn_lib_path) { QNN_LOG_ERROR("%s backend setenv failure\n", qnn::get_backend_name(device)); } } +#endif - auto instance = std::make_shared(qnn_lib_path, g_qnn_mgr[device].lib, ""); + auto instance = std::make_shared(extend_lib_search_path, g_qnn_mgr[device].lib, ""); result = instance->qnn_init(nullptr); if (result != 0) { QNN_LOG_WARN("init qnn subsystem failed with qnn backend %s, pls check why\n", qnn::get_backend_name(device)); diff --git a/ggml/src/ggml-qnn/logger.cpp b/ggml/src/ggml-qnn/logger.cpp index fc37161edba17..187e9088c779c 100644 --- a/ggml/src/ggml-qnn/logger.cpp +++ b/ggml/src/ggml-qnn/logger.cpp @@ -5,7 +5,7 @@ #include -#if (defined __ANDROID__) || (defined ANDROID) +#if defined(__ANDROID__) || defined(ANDROID) #include #endif @@ -22,7 +22,7 @@ void qnn::internal_log(ggml_log_level level, const char * /*file*/, const char * int len_prefix = snprintf(s_qnn_internal_log_buf, QNN_LOGBUF_LEN, "[%s, %d]: ", func, line); int len = vsnprintf(s_qnn_internal_log_buf + len_prefix, QNN_LOGBUF_LEN - len_prefix, format, args); if (len < (QNN_LOGBUF_LEN - len_prefix)) { -#if (defined __ANDROID__) || (defined ANDROID) +#if defined(__ANDROID__) || defined(ANDROID) // for Android APK __android_log_print(level, "ggml-qnn", "%s\n", s_qnn_internal_log_buf); #endif diff --git a/src/llama.cpp b/src/llama.cpp index 670c5c83707dd..82d52ca84ecfd 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -16706,18 +16706,15 @@ struct llama_context * llama_new_context_with_model( } } #elif defined(GGML_USE_QNN) - if (model->n_gpu_layers > 0) { - //the second param is data path of prebuit QNN libs provided by Qualcomm - //can be hardcoded to "/data/local/tmp/" for Android command line application - //or specified in JNI layer for Android APK application - ggml_backend_t backend = ggml_backend_qnn_init(model->main_gpu, "/data/local/tmp/"); - if (nullptr == backend) { - LLAMA_LOG_ERROR("%s: failed to initialize QNN backend\n", __func__); - llama_free(ctx); - return nullptr; - } - ctx->backends.push_back(backend); + if (model->n_gpu_layers > 0) { + ggml_backend_t backend = ggml_backend_qnn_init(model->main_gpu, nullptr); + if (nullptr == backend) { + LLAMA_LOG_ERROR("%s: failed to initialize QNN backend\n", __func__); + llama_free(ctx); + return nullptr; } + ctx->backends.push_back(backend); + } #endif #ifdef GGML_USE_BLAS From 9a5f802bb6763ef51866687c97c4b936e33242d6 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Mon, 29 Jul 2024 22:18:48 +0800 Subject: [PATCH 113/166] refactoring: add convient macro to disable copy and move of class --- ggml/src/ggml-qnn/graph.hpp | 6 ++---- ggml/src/ggml-qnn/qnn-types.hpp | 8 ++++++++ ggml/src/ggml-qnn/tensor.hpp | 6 ++---- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index c82b7d66ae1cf..01190e18346c7 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -225,10 +225,8 @@ class ggml_qnn_graph { std::vector _qnn_tensor_outputs; std::vector _param_types; - ggml_qnn_graph(const ggml_qnn_graph &) = delete; - void operator=(const ggml_qnn_graph &) = 
delete; - ggml_qnn_graph(ggml_qnn_graph &&) = delete; - void operator=(ggml_qnn_graph &&) = delete; + DISABLE_COPY(ggml_qnn_graph); + DISABLE_MOVE(ggml_qnn_graph); }; } // namespace qnn diff --git a/ggml/src/ggml-qnn/qnn-types.hpp b/ggml/src/ggml-qnn/qnn-types.hpp index 58ca8648b0b03..8fce790defb61 100644 --- a/ggml/src/ggml-qnn/qnn-types.hpp +++ b/ggml/src/ggml-qnn/qnn-types.hpp @@ -51,3 +51,11 @@ using pfn_qnnsysteminterface_getproviders = decltype(QnnSystemInterface_getProvi #define RPCMEM_DEFAULT_FLAGS 1 #define RPCMEM_HEAP_ID_SYSTEM 25 + +#define DISABLE_COPY(class_name) \ + class_name(const class_name &) = delete; \ + void operator=(const class_name &) = delete + +#define DISABLE_MOVE(class_name) \ + class_name(class_name &&) = delete; \ + void operator=(class_name &&) = delete diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index c4ea7a4095d5f..07fbfde7828a7 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -193,10 +193,8 @@ class ggml_qnn_tensor { Qnn_GraphHandle_t _graph_handle = nullptr; std::unique_ptr _qnn_rpc_buffer; - ggml_qnn_tensor(const ggml_qnn_tensor &) = delete; - void operator=(const ggml_qnn_tensor &) = delete; - ggml_qnn_tensor(ggml_qnn_tensor &&) = delete; - void operator=(ggml_qnn_tensor &&) = delete; + DISABLE_COPY(ggml_qnn_tensor); + DISABLE_MOVE(ggml_qnn_tensor); }; } // namespace qnn From 74eb05a13b1c624efa4ba24eaa46839461a3e62d Mon Sep 17 00:00:00 2001 From: hongruichen Date: Mon, 29 Jul 2024 23:12:51 +0800 Subject: [PATCH 114/166] feat: add ggml_qnn_op_config for handle different op --- ggml/src/ggml-qnn/backend-ops.cpp | 88 +++++++++++++++++++------------ ggml/src/ggml-qnn/graph.hpp | 48 +++++++---------- ggml/src/ggml-qnn/op-config.hpp | 73 +++++++++++++++++++++++++ ggml/src/ggml-qnn/tensor.hpp | 2 - 4 files changed, 146 insertions(+), 65 deletions(-) create mode 100644 ggml/src/ggml-qnn/op-config.hpp diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 4d83fd5d1a9c6..d264ec766f808 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -5,6 +5,7 @@ #include "graph.hpp" #include "logger.hpp" +#include "op-config.hpp" #include "tensor.hpp" #include "utils.hpp" @@ -123,40 +124,22 @@ std::string get_graph_key(const std::string &op_name, const std::array -qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_context *ctx, size_t op, const std::string &qnn_op, - const std::array &inputs, - const std::array &outputs) { - GGML_ASSERT(op < (GGML_OP_COUNT + GGML_UNARY_OP_COUNT)); - - auto &graph_cache = ctx->qnn_graph_cache; - const auto *op_name = - op < kGgmlUnaryOpStart ? 
ggml_op_name(ggml_op(op)) : ggml_unary_op_name(ggml_unary_op(op - kGgmlUnaryOpStart)); - auto graph_key = get_graph_key<_InputSize, _OutputSize>(op_name, inputs, outputs); - auto it = graph_cache.find(graph_key); - qnn::ggml_qnn_graph *graph_ptr = nullptr; - if (it != graph_cache.end()) { - QNN_LOG_DEBUG("found graph %s in cache\n", graph_key.c_str()); - graph_ptr = it->second.get(); - } else { - auto graph = std::make_unique(graph_key, (QNNBackend)(ctx->device), ctx->instance, - ctx->socinfo.vtcm_size_in_mb); - - if (!graph->is_valid()) { - return nullptr; - } - - if (!graph->build_graph(qnn_op, to_ggml_tensor_array<_InputSize>(inputs), - to_ggml_tensor_array<_OutputSize>(outputs))) { - QNN_LOG_ERROR("build_graph failed\n"); - return nullptr; - } - - graph_ptr = graph.get(); - graph_cache[graph_key] = std::move(graph); +qnn::ggml_op_constructor_t generate_common_op_constructor(const std::string &op_name) { + if (op_name == QNN_OP_MAT_MUL) { + // For QNN_OP_MAT_MUL, we need to transpose the input tensor + return [](const std::string &name) { + auto config = std::make_unique(name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL); + Qnn_Scalar_t scalar = QNN_SCALAR_INIT; + scalar.dataType = QNN_DATATYPE_BOOL_8; + scalar.bool8Value = true; + config->add_scalar_param(QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN0, scalar); + return config; + }; } - return graph_ptr; + return [op_name](const std::string &name) { + return std::make_unique(name, QNN_OP_PACKAGE_NAME_QTI_AISW, op_name); + }; } constexpr const char *kGgmlOpToQnnOp[] = { @@ -264,6 +247,42 @@ static_assert(sizeof(kGgmlOpToQnnOp) / sizeof(kGgmlOpToQnnOp[0]) == (GGML_OP_COU static_assert(kGgmlOpToQnnOp[GGML_UNARY_OP_GELU + kGgmlUnaryOpStart] != nullptr, "GGML_UNARY_OP_GELU does not correspond to QNN_OP_GELU"); +template +qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_context *ctx, size_t op, + const std::array &inputs, + const std::array &outputs) { + GGML_ASSERT(op < (GGML_OP_COUNT + GGML_UNARY_OP_COUNT)); + + auto &graph_cache = ctx->qnn_graph_cache; + const auto *op_name = + op < kGgmlUnaryOpStart ? 
ggml_op_name(ggml_op(op)) : ggml_unary_op_name(ggml_unary_op(op - kGgmlUnaryOpStart)); + auto graph_key = get_graph_key<_InputSize, _OutputSize>(op_name, inputs, outputs); + auto it = graph_cache.find(graph_key); + qnn::ggml_qnn_graph *graph_ptr = nullptr; + if (it != graph_cache.end()) { + QNN_LOG_DEBUG("found graph %s in cache\n", graph_key.c_str()); + graph_ptr = it->second.get(); + } else { + auto graph = std::make_unique(graph_key, (QNNBackend)(ctx->device), ctx->instance, + ctx->socinfo.vtcm_size_in_mb); + if (!graph->is_valid()) { + return nullptr; + } + + auto op_constructor = generate_common_op_constructor(kGgmlOpToQnnOp[op]); + if (!graph->build_graph(op_constructor, to_ggml_tensor_array<_InputSize>(inputs), + to_ggml_tensor_array<_OutputSize>(outputs))) { + QNN_LOG_ERROR("build_graph failed\n"); + return nullptr; + } + + graph_ptr = graph.get(); + graph_cache[graph_key] = std::move(graph); + } + + return graph_ptr; +} + template bool qnn_binary_op_impl(ggml_backend_qnn_context *ctx, ggml_tensor *src0, ggml_tensor *src1, ggml_tensor *dst) { static_assert(kGgmlOpToQnnOp[_GgmlOp] != nullptr, "GGML_OP does not have a corresponding QNN_OP"); @@ -271,7 +290,7 @@ bool qnn_binary_op_impl(ggml_backend_qnn_context *ctx, ggml_tensor *src0, ggml_t CHECK_PARAMS(ctx, src0, src1, dst); bool succeed = false; - auto *graph_ptr = get_qnn_graph_from_cache<2, 1>(ctx, _GgmlOp, kGgmlOpToQnnOp[_GgmlOp], { src0, src1 }, { dst }); + auto *graph_ptr = get_qnn_graph_from_cache<2, 1>(ctx, _GgmlOp, { src0, src1 }, { dst }); if (graph_ptr) { succeed = execute_graph<2, 1>(graph_ptr, { src0, src1 }, { dst }); } @@ -292,7 +311,7 @@ bool qnn_unary_op_impl(ggml_backend_qnn_context *ctx, ggml_tensor *src, ggml_ten CHECK_PARAMS(ctx, src, dst); bool succeed = false; - auto *graph_ptr = get_qnn_graph_from_cache<1, 1>(ctx, _GgmlOp, kGgmlOpToQnnOp[_GgmlOp], { src }, { dst }); + auto *graph_ptr = get_qnn_graph_from_cache<1, 1>(ctx, _GgmlOp, { src }, { dst }); if (graph_ptr) { succeed = execute_graph<1, 1>(graph_ptr, { src }, { dst }); } @@ -305,7 +324,6 @@ bool qnn_unary_op_impl(ggml_backend_qnn_context *ctx, ggml_tensor *src, ggml_ten return succeed; } constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = { - nullptr, // GGML_OP_NONE nullptr, // GGML_OP_DUP nullptr, // GGML_OP_ADD diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index 01190e18346c7..1beb4b31b0c77 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -2,18 +2,22 @@ #pragma once #include +#include #include +#include #include #include "ggml-qnn.h" #include "logger.hpp" +#include "op-config.hpp" #include "qnn-lib.hpp" #include "tensor.hpp" namespace qnn { using ggml_tensor_array_t = std::vector; +using ggml_op_constructor_t = std::function(const std::string &)>; class ggml_qnn_graph { public: @@ -79,15 +83,15 @@ class ggml_qnn_graph { ~ggml_qnn_graph() { QNN_LOG_DEBUG("graph name %s, destroy", _graph_name.c_str()); } - bool build_graph(const std::string &op_name, const ggml_tensor_array_t &tensor_inputs, + bool build_graph(ggml_op_constructor_t op_constructor, const ggml_tensor_array_t &tensor_inputs, const ggml_tensor_array_t &tensor_outputs) { + GGML_ASSERT(op_constructor); if (!is_valid()) { QNN_LOG_ERROR("Invalid graph\n"); return false; } QNN_LOG_DEBUG("graph name %s, build_graph start", _graph_name.c_str()); - _qnn_tensor_inputs.resize(tensor_inputs.size()); _tensor_inputs.resize(tensor_inputs.size()); for (size_t i = 0; i < tensor_inputs.size(); i++) { char buffer[GGML_MAX_NAME] = {}; @@ -100,11 
+104,9 @@ class ggml_qnn_graph { return false; } - _qnn_tensor_inputs[i] = qnn_tensor->get_qnn_tensor(); _tensor_inputs[i] = qnn_tensor; } - _qnn_tensor_outputs.resize(tensor_outputs.size()); _tensor_outputs.resize(tensor_outputs.size()); for (size_t i = 0; i < tensor_outputs.size(); i++) { char buffer[GGML_MAX_NAME] = {}; @@ -117,23 +119,13 @@ class ggml_qnn_graph { return false; } - _qnn_tensor_outputs[i] = qnn_tensor->get_qnn_tensor(); _tensor_outputs[i] = qnn_tensor; } - Qnn_OpConfig_t config = QNN_OPCONFIG_INIT; - config.version = QNN_OPCONFIG_VERSION_1; - auto &op_config = config.v1; - op_config.name = _graph_name.c_str(); - op_config.packageName = QNN_OP_PACKAGE_NAME_QTI_AISW; - op_config.typeName = op_name.c_str(); - op_config.numOfParams = (uint32_t)_param_types.size(); - op_config.params = _param_types.data(); - op_config.numOfInputs = (uint32_t)_qnn_tensor_inputs.size(); - op_config.inputTensors = _qnn_tensor_inputs.data(); - op_config.numOfOutputs = (uint32_t)_qnn_tensor_outputs.size(); - op_config.outputTensors = _qnn_tensor_outputs.data(); - auto error = _qnn_interface->qnn_graph_add_node(_graph_handle, config); + _op_config = op_constructor(_graph_name); + _op_config->set_input_tensors(_tensor_inputs); + _op_config->set_output_tensors(_tensor_outputs); + auto error = _qnn_interface->qnn_graph_add_node(_graph_handle, _op_config->get_op_config()); if (error != QNN_SUCCESS) { auto *error_str = get_qnn_error_string(error); if (error_str) { @@ -168,8 +160,6 @@ class ggml_qnn_graph { QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); return false; } - - _qnn_tensor_inputs[i] = _tensor_inputs[i]->get_qnn_tensor(); } for (size_t i = 0; i < tensor_outputs.size(); i++) { @@ -178,13 +168,16 @@ class ggml_qnn_graph { QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); return false; } - - _qnn_tensor_outputs[i] = _tensor_outputs[i]->get_qnn_tensor(); } + _op_config->set_input_tensors(_tensor_inputs); + _op_config->set_output_tensors(_tensor_outputs); + auto &qnn_tensor_inputs = _op_config->get_qnn_input_tensors(); + auto &qnn_tensor_outputs = _op_config->get_qnn_output_tensors(); + auto error = - _qnn_interface->qnn_graph_execute(_graph_handle, _qnn_tensor_inputs.data(), _qnn_tensor_inputs.size(), - _qnn_tensor_outputs.data(), _qnn_tensor_outputs.size(), nullptr, nullptr); + _qnn_interface->qnn_graph_execute(_graph_handle, qnn_tensor_inputs.data(), qnn_tensor_inputs.size(), + qnn_tensor_outputs.data(), qnn_tensor_outputs.size(), nullptr, nullptr); if (_device == QNN_BACKEND_NPU) { if (error == QNN_COMMON_ERROR_SYSTEM_COMMUNICATION) { QNN_LOG_WARN("NPU crashed. SSR detected. 
Caused QNN graph execute error\n"); @@ -219,10 +212,9 @@ class ggml_qnn_graph { Qnn_GraphHandle_t _graph_handle = nullptr; std::shared_ptr _qnn_instance; std::shared_ptr _qnn_interface; - std::vector> _tensor_inputs; - std::vector> _tensor_outputs; - std::vector _qnn_tensor_inputs; - std::vector _qnn_tensor_outputs; + std::vector> _tensor_inputs; + std::vector> _tensor_outputs; + std::unique_ptr _op_config; std::vector _param_types; DISABLE_COPY(ggml_qnn_graph); diff --git a/ggml/src/ggml-qnn/op-config.hpp b/ggml/src/ggml-qnn/op-config.hpp new file mode 100644 index 0000000000000..de75c93581168 --- /dev/null +++ b/ggml/src/ggml-qnn/op-config.hpp @@ -0,0 +1,73 @@ +#pragma once + +#include +#include + +#include "ggml-qnn.h" + +#include "logger.hpp" +#include "qnn-lib.hpp" +#include "qnn-types.hpp" +#include "tensor.hpp" + +namespace qnn { +class ggml_qnn_op_config { +public: + explicit ggml_qnn_op_config(const std::string &name, const std::string &package_name, const std::string &op_type) : + _name(name), _package_name(package_name), _op_type(op_type) {} + + void set_input_tensors(const std::vector> &tensor_inputs) { + _qnn_tensor_inputs.resize(tensor_inputs.size()); + for (size_t i = 0; i < tensor_inputs.size(); i++) { + _qnn_tensor_inputs[i] = tensor_inputs[i]->get_qnn_tensor(); + } + } + + void set_output_tensors(const std::vector> &tensor_outputs) { + _qnn_tensor_outputs.resize(tensor_outputs.size()); + for (size_t i = 0; i < tensor_outputs.size(); i++) { + _qnn_tensor_outputs[i] = tensor_outputs[i]->get_qnn_tensor(); + } + } + + void add_scalar_param(const std::string &name, const Qnn_Scalar_t scalar) { + _param_names.push_back(name); + Qnn_Param_t param = QNN_PARAM_INIT; + param.paramType = QNN_PARAMTYPE_SCALAR; + param.name = _param_names.back().c_str(); + param.scalarParam = scalar; + _param_types.push_back(param); + } + + std::vector &get_qnn_input_tensors() { return _qnn_tensor_inputs; } + std::vector &get_qnn_output_tensors() { return _qnn_tensor_outputs; } + + Qnn_OpConfig_t get_op_config() { + Qnn_OpConfig_t config = QNN_OPCONFIG_INIT; + config.version = QNN_OPCONFIG_VERSION_1; + auto &op_config = config.v1; + op_config.name = _name.c_str(); + op_config.packageName = _package_name.c_str(); + op_config.typeName = _op_type.c_str(); + op_config.numOfParams = (uint32_t)_param_types.size(); + op_config.params = _param_types.data(); + op_config.numOfInputs = (uint32_t)_qnn_tensor_inputs.size(); + op_config.inputTensors = _qnn_tensor_inputs.data(); + op_config.numOfOutputs = (uint32_t)_qnn_tensor_outputs.size(); + op_config.outputTensors = _qnn_tensor_outputs.data(); + return config; + } + +private: + std::string _name; + std::string _package_name; + std::string _op_type; + std::vector _qnn_tensor_inputs; + std::vector _qnn_tensor_outputs; + std::vector _param_types; + std::vector _param_names; + + DISABLE_COPY(ggml_qnn_op_config); + DISABLE_MOVE(ggml_qnn_op_config); +}; +} // namespace qnn diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index 07fbfde7828a7..b3181ed230e3d 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -8,8 +8,6 @@ #include "ggml-qnn.h" -#include "QnnTensor.h" -#include "System/QnnSystemInterface.h" #include "buffer.hpp" #include "logger.hpp" #include "qnn-lib.hpp" From 47f6e02eda0798f5f288f69dcb2956dd5703ff07 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 31 Jul 2024 22:44:21 +0800 Subject: [PATCH 115/166] fix: try fix the tensor rank of mul mat --- ggml/src/ggml-qnn/backend-ops.cpp | 1 + 
ggml/src/ggml-qnn/graph.hpp | 17 +++++++++++++---- ggml/src/ggml-qnn/op-config.hpp | 8 ++++---- ggml/src/ggml-qnn/tensor.hpp | 13 ++++++++----- 4 files changed, 26 insertions(+), 13 deletions(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index d264ec766f808..52f078a962ae8 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -133,6 +133,7 @@ qnn::ggml_op_constructor_t generate_common_op_constructor(const std::string &op_ scalar.dataType = QNN_DATATYPE_BOOL_8; scalar.bool8Value = true; config->add_scalar_param(QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN0, scalar); + QNN_LOG_DEBUG("add scalar param %s\n", QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN0); return config; }; } diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index 1beb4b31b0c77..3f1a0ef163208 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -91,6 +91,14 @@ class ggml_qnn_graph { return false; } + // get the max tensor rank + for (auto tensor : tensor_inputs) { + _tensor_rank = std::max(_tensor_rank, ggml_n_dims(tensor)); + } + for (auto tensor : tensor_outputs) { + _tensor_rank = std::max(_tensor_rank, ggml_n_dims(tensor)); + } + QNN_LOG_DEBUG("graph name %s, build_graph start", _graph_name.c_str()); _tensor_inputs.resize(tensor_inputs.size()); for (size_t i = 0; i < tensor_inputs.size(); i++) { @@ -99,7 +107,7 @@ class ggml_qnn_graph { auto qnn_tensor = std::make_shared(std::string(buffer), _device, _graph_handle, _qnn_instance); auto *ggml_tensor = tensor_inputs[i]; - if (!qnn_tensor->bind_ggml_tensor(ggml_tensor, true)) { + if (!qnn_tensor->bind_ggml_tensor(ggml_tensor, true, _tensor_rank)) { QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); return false; } @@ -114,7 +122,7 @@ class ggml_qnn_graph { auto qnn_tensor = std::make_shared(std::string(buffer), _device, _graph_handle, _qnn_instance); auto *ggml_tensor = tensor_outputs[i]; - if (!qnn_tensor->bind_ggml_tensor(ggml_tensor, false)) { + if (!qnn_tensor->bind_ggml_tensor(ggml_tensor, false, _tensor_rank)) { QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); return false; } @@ -156,7 +164,7 @@ class ggml_qnn_graph { GGML_ASSERT(tensor_outputs.size() == _tensor_outputs.size()); for (size_t i = 0; i < tensor_inputs.size(); i++) { auto *ggml_tensor = tensor_inputs[i]; - if (!_tensor_inputs[i]->bind_ggml_tensor(ggml_tensor, true)) { + if (!_tensor_inputs[i]->bind_ggml_tensor(ggml_tensor, true, _tensor_rank)) { QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); return false; } @@ -164,7 +172,7 @@ class ggml_qnn_graph { for (size_t i = 0; i < tensor_outputs.size(); i++) { auto *ggml_tensor = tensor_outputs[i]; - if (!_tensor_outputs[i]->bind_ggml_tensor(ggml_tensor, false)) { + if (!_tensor_outputs[i]->bind_ggml_tensor(ggml_tensor, false, _tensor_rank)) { QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); return false; } @@ -216,6 +224,7 @@ class ggml_qnn_graph { std::vector> _tensor_outputs; std::unique_ptr _op_config; std::vector _param_types; + int _tensor_rank = 0; DISABLE_COPY(ggml_qnn_graph); DISABLE_MOVE(ggml_qnn_graph); diff --git a/ggml/src/ggml-qnn/op-config.hpp b/ggml/src/ggml-qnn/op-config.hpp index de75c93581168..7852ee84dc12f 100644 --- a/ggml/src/ggml-qnn/op-config.hpp +++ b/ggml/src/ggml-qnn/op-config.hpp @@ -36,7 +36,7 @@ class ggml_qnn_op_config { param.paramType = QNN_PARAMTYPE_SCALAR; param.name = _param_names.back().c_str(); param.scalarParam = scalar; - 
_param_types.push_back(param); + _parameters.push_back(param); } std::vector &get_qnn_input_tensors() { return _qnn_tensor_inputs; } @@ -49,8 +49,8 @@ class ggml_qnn_op_config { op_config.name = _name.c_str(); op_config.packageName = _package_name.c_str(); op_config.typeName = _op_type.c_str(); - op_config.numOfParams = (uint32_t)_param_types.size(); - op_config.params = _param_types.data(); + op_config.numOfParams = (uint32_t)_parameters.size(); + op_config.params = _parameters.data(); op_config.numOfInputs = (uint32_t)_qnn_tensor_inputs.size(); op_config.inputTensors = _qnn_tensor_inputs.data(); op_config.numOfOutputs = (uint32_t)_qnn_tensor_outputs.size(); @@ -64,7 +64,7 @@ class ggml_qnn_op_config { std::string _op_type; std::vector _qnn_tensor_inputs; std::vector _qnn_tensor_outputs; - std::vector _param_types; + std::vector _parameters; std::vector _param_names; DISABLE_COPY(ggml_qnn_op_config); diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index b3181ed230e3d..0c724e2871d45 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -29,7 +29,7 @@ class ggml_qnn_tensor { ~ggml_qnn_tensor() { _qnn_rpc_buffer.reset(); } - bool bind_ggml_tensor(ggml_tensor *tensor, bool is_input) { + bool bind_ggml_tensor(ggml_tensor *tensor, bool is_input, int prev_max_rank) { if (_tensor) { if (_tensor != tensor) { QNN_LOG_WARN("tensor %s has been bound to another ggml tensor %s", _tensor_name.c_str(), @@ -41,7 +41,7 @@ class ggml_qnn_tensor { return true; } - update_params_from_ggml_tensor(tensor); + update_params_from_ggml_tensor(tensor, prev_max_rank); Qnn_TensorType_t new_tensor_type = is_input ? QNN_TENSOR_TYPE_APP_WRITE : QNN_TENSOR_TYPE_APP_READ; QNN_TENSOR_SET_TYPE(_qnn_tensor, new_tensor_type); QNN_LOG_INFO("tensor %s changed to type %d", _tensor_name.c_str(), new_tensor_type); @@ -54,8 +54,10 @@ class ggml_qnn_tensor { QNN_LOG_WARN("create graph tensor failed, tensor %s, error: %d\n", _tensor_name.c_str(), error); return false; } + QNN_TENSOR_SET_ID(_qnn_tensor, QNN_TENSOR_GET_ID(qnn_tensor)); - QNN_LOG_DEBUG("create graph tensor %s, id: %d", _tensor_name.c_str(), QNN_TENSOR_GET_ID(qnn_tensor)); + QNN_LOG_DEBUG("create graph tensor %s, id: %d, rank: %d", _tensor_name.c_str(), + QNN_TENSOR_GET_ID(qnn_tensor), QNN_TENSOR_GET_RANK(qnn_tensor)); } if (should_use_mem_handle()) { @@ -166,14 +168,15 @@ class ggml_qnn_tensor { return true; } - void update_params_from_ggml_tensor(ggml_tensor *tensor) { + void update_params_from_ggml_tensor(ggml_tensor *tensor, int prev_max_rank) { _dimensions[0] = (uint32_t)tensor->ne[0]; _dimensions[1] = (uint32_t)tensor->ne[1]; _dimensions[2] = (uint32_t)tensor->ne[2]; _dimensions[3] = (uint32_t)tensor->ne[3]; QNN_TENSOR_SET_DATA_TYPE(_qnn_tensor, device_datatype_from_ggml_datatype(tensor->type)); // TODO: set the quantizeParams base on the tensor type - QNN_TENSOR_SET_RANK(_qnn_tensor, (uint32_t)ggml_n_dims(tensor)); + + QNN_TENSOR_SET_RANK(_qnn_tensor, (uint32_t)std::max(prev_max_rank, ggml_n_dims(tensor))); QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); Qnn_ClientBuffer_t client_buf = {}; From dedadf2a20a30dfc0a8b4d195a21a13c53c3ce00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E3=81=BF=E3=82=83=E3=82=93?= Date: Tue, 20 Aug 2024 11:20:23 +0900 Subject: [PATCH 116/166] =?UTF-8?q?Fixed=20a=20bug=20where=20debug=20code?= =?UTF-8?q?=20was=20included=20in=20the=20release,=20resulting=20i?= =?UTF-8?q?=E2=80=A6=20(#1)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 
Fixed a bug where debug code was included in the release, resulting in an undefined function error. * Change the path of the QNN library when building in termux environment * Revert "Change the path of the QNN library when building in termux environment" This reverts commit c6e26a3679da2608940e2163e090adf75d667400. * Changed so that GGML_QNN_DEFAULT_LIB_SEARCH_PATH can be set from command line arguments --- ggml/src/CMakeLists.txt | 5 +++-- ggml/src/ggml-qnn/backend-ops.cpp | 6 ++++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index b3c287b7872db..0252499e3cb57 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -885,12 +885,13 @@ if (GGML_QNN) find_library(LOG_LIB log) find_library(ANDROID_LIB android) set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${LOG_LIB} ${ANDROID_LIB}) - set(GGML_QNN_DEFAULT_LIB_SEARCH_PATH "\"/data/local/tmp/\"") + set(GGML_QNN_DEFAULT_LIB_SEARCH_PATH "/data/local/tmp/" CACHE STRING "customized library search path for QNN backend") else() message(FATAL_ERROR "QNN now only available on Android") endif() - add_compile_definitions(GGML_QNN_DEFAULT_LIB_SEARCH_PATH=${GGML_QNN_DEFAULT_LIB_SEARCH_PATH}) + string(REGEX REPLACE "/$" "" GGML_QNN_DEFAULT_LIB_SEARCH_PATH "${GGML_QNN_DEFAULT_LIB_SEARCH_PATH}") + add_compile_definitions(GGML_QNN_DEFAULT_LIB_SEARCH_PATH="${GGML_QNN_DEFAULT_LIB_SEARCH_PATH}/") if (NOT DEFINED GGML_QNN_SDK_PATH) # try read from environment variable if (DEFINED ENV{QNN_SDK_PATH}) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 52f078a962ae8..d6d6dddf85b23 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -296,11 +296,13 @@ bool qnn_binary_op_impl(ggml_backend_qnn_context *ctx, ggml_tensor *src0, ggml_t succeed = execute_graph<2, 1>(graph_ptr, { src0, src1 }, { dst }); } +#ifndef NDEBUG if (!succeed) { print_ggml_tensor(src0); print_ggml_tensor(src1); print_ggml_tensor(dst); } +#endif return succeed; } @@ -317,10 +319,12 @@ bool qnn_unary_op_impl(ggml_backend_qnn_context *ctx, ggml_tensor *src, ggml_ten succeed = execute_graph<1, 1>(graph_ptr, { src }, { dst }); } +#ifndef NDEBUG if (!succeed) { print_ggml_tensor(src); print_ggml_tensor(dst); } +#endif return succeed; } @@ -541,10 +545,12 @@ bool ggml_qnn_supports_op(const ggml_tensor *op) { return false; } +#ifndef NDEBUG if (op->op == GGML_OP_ADD && !is_tensor_dimensions_equal(op->src[0], op->src[1])) { QNN_LOG_DEBUG("src0 and src1 dimensions are not equal"); return false; } +#endif } switch (op->type) { From 481cb3a0c507e7ee117ad2164d62272adcc3fef9 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sat, 7 Sep 2024 12:22:53 +0800 Subject: [PATCH 117/166] fix compiling error --- ggml/src/ggml-qnn/backend-ops.cpp | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index d6d6dddf85b23..5829e0fadbe92 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -155,6 +155,8 @@ constexpr const char *kGgmlOpToQnnOp[] = { nullptr, // GGML_OP_SQR QNN_OP_ELEMENT_WISE_SQUARE_ROOT, // GGML_OP_SQRT QNN_OP_ELEMENT_WISE_LOG, // GGML_OP_LOG + nullptr, // GGML_OP_SIN + nullptr, // GGML_OP_COS nullptr, // GGML_OP_SUM nullptr, // GGML_OP_SUM_ROWS nullptr, // GGML_OP_MEAN @@ -192,9 +194,11 @@ constexpr const char *kGgmlOpToQnnOp[] = { nullptr, // GGML_OP_CLAMP nullptr, // GGML_OP_CONV_TRANSPOSE_1D nullptr, // GGML_OP_IM2COL + nullptr, // 
GGML_OP_IM2COL_BACK nullptr, // GGML_OP_CONV_TRANSPOSE_2D nullptr, // GGML_OP_POOL_1D nullptr, // GGML_OP_POOL_2D + nullptr, // GGML_OP_POOL_2D_BACK nullptr, // GGML_OP_UPSCALE nullptr, // GGML_OP_PAD nullptr, // GGML_OP_ARANGE @@ -210,6 +214,7 @@ constexpr const char *kGgmlOpToQnnOp[] = { nullptr, // GGML_OP_WIN_UNPART nullptr, // GGML_OP_GET_REL_POS nullptr, // GGML_OP_ADD_REL_POS + nullptr, // GGML_OP_RWKV_WKV nullptr, // GGML_OP_UNARY @@ -241,6 +246,7 @@ constexpr const char *kGgmlOpToQnnOp[] = { nullptr, // GGML_UNARY_OP_SILU nullptr, // GGML_UNARY_OP_HARDSWISH nullptr, // GGML_UNARY_OP_HARDSIGMOID + nullptr, // GGML_UNARY_OP_EXP }; static_assert(sizeof(kGgmlOpToQnnOp) / sizeof(kGgmlOpToQnnOp[0]) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), @@ -340,6 +346,8 @@ constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = { nullptr, // GGML_OP_SQR qnn_unary_op_impl, // GGML_OP_SQRT qnn_unary_op_impl, // GGML_OP_LOG + nullptr, // GGML_OP_SIN + nullptr, // GGML_OP_COS nullptr, // GGML_OP_SUM nullptr, // GGML_OP_SUM_ROWS nullptr, // GGML_OP_MEAN @@ -377,9 +385,11 @@ constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = { nullptr, // GGML_OP_CLAMP nullptr, // GGML_OP_CONV_TRANSPOSE_1D nullptr, // GGML_OP_IM2COL + nullptr, // GGML_OP_IM2COL_BACK nullptr, // GGML_OP_CONV_TRANSPOSE_2D nullptr, // GGML_OP_POOL_1D nullptr, // GGML_OP_POOL_2D + nullptr, // GGML_OP_POOL_2D_BACK nullptr, // GGML_OP_UPSCALE nullptr, // GGML_OP_PAD nullptr, // GGML_OP_ARANGE @@ -395,6 +405,7 @@ constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = { nullptr, // GGML_OP_WIN_UNPART nullptr, // GGML_OP_GET_REL_POS nullptr, // GGML_OP_ADD_REL_POS + nullptr, // GGML_OP_RWKV_WKV nullptr, // GGML_OP_UNARY @@ -426,6 +437,7 @@ constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = { nullptr, // GGML_UNARY_OP_SILU nullptr, // GGML_UNARY_OP_HARDSWISH nullptr, // GGML_UNARY_OP_HARDSIGMOID + nullptr, // GGML_UNARY_OP_EXP }; static_assert(sizeof(kQnnUnaryOpsTable) / sizeof(kQnnUnaryOpsTable[0]) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), @@ -443,6 +455,8 @@ static constexpr const ggml_qnn_binary_op_t kQnnBinaryOpsTable[] = { nullptr, // GGML_OP_SQR nullptr, // GGML_OP_SQRT nullptr, // GGML_OP_LOG + nullptr, // GGML_OP_SIN + nullptr, // GGML_OP_COS nullptr, // GGML_OP_SUM nullptr, // GGML_OP_SUM_ROWS nullptr, // GGML_OP_MEAN @@ -480,9 +494,11 @@ static constexpr const ggml_qnn_binary_op_t kQnnBinaryOpsTable[] = { nullptr, // GGML_OP_CLAMP nullptr, // GGML_OP_CONV_TRANSPOSE_1D nullptr, // GGML_OP_IM2COL + nullptr, // GGML_OP_IM2COL_BACK nullptr, // GGML_OP_CONV_TRANSPOSE_2D nullptr, // GGML_OP_POOL_1D nullptr, // GGML_OP_POOL_2D + nullptr, // GGML_OP_POOL_2D_BACK nullptr, // GGML_OP_UPSCALE nullptr, // GGML_OP_PAD nullptr, // GGML_OP_ARANGE @@ -498,6 +514,7 @@ static constexpr const ggml_qnn_binary_op_t kQnnBinaryOpsTable[] = { nullptr, // GGML_OP_WIN_UNPART nullptr, // GGML_OP_GET_REL_POS nullptr, // GGML_OP_ADD_REL_POS + nullptr, // GGML_OP_RWKV_WKV nullptr, // GGML_OP_UNARY From b7aea0438e080a6d090196a3d84e3947c111c600 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 18 Sep 2024 12:56:47 +0800 Subject: [PATCH 118/166] fix compiling error --- ggml/src/CMakeLists.txt | 2 +- ggml/src/ggml-qnn.cpp | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 1edbd736d5e20..185f588be144c 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -910,7 +910,7 @@ if (GGML_QNN) if (CMAKE_SYSTEM_NAME STREQUAL "Android") find_library(LOG_LIB log) 
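    # Illustrative example (values are placeholders, not part of this patch): since
    # GGML_QNN_DEFAULT_LIB_SEARCH_PATH is a CACHE STRING (see the set(...) below), the QNN
    # library search path can be chosen at configure time instead of being hard-coded, e.g.:
    #   cmake -DGGML_QNN=ON -DGGML_QNN_SDK_PATH=$QNN_SDK_PATH \
    #         -DGGML_QNN_DEFAULT_LIB_SEARCH_PATH=/data/local/tmp/ ..
    # A trailing slash is optional: the REGEX REPLACE added in the earlier CMake change strips it
    # and the compile definition re-appends exactly one, so both forms behave the same.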
find_library(ANDROID_LIB android) - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${LOG_LIB} ${ANDROID_LIB}) + set(GGML_EXTRA_LIBS_PRIVATE ${GGML_EXTRA_LIBS_PRIVATE} ${LOG_LIB} ${ANDROID_LIB}) set(GGML_QNN_DEFAULT_LIB_SEARCH_PATH "/data/local/tmp/" CACHE STRING "customized library search path for QNN backend") else() message(FATAL_ERROR "QNN now only available on Android") diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index 6ed5ecb2e03f2..b5b18e04aa20c 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -29,6 +29,7 @@ #include #include "ggml-backend-impl.h" +#include "ggml-impl.h" #include "ggml-qnn/backend-ops.hpp" #include "ggml-qnn/backend.hpp" From a1ceaae4ad73ecfb6a1c76bb8ed4a8f452790e3c Mon Sep 17 00:00:00 2001 From: Hongrui Chen Date: Sat, 28 Sep 2024 23:06:17 +0800 Subject: [PATCH 119/166] fix compiling error at older ndk (r23c) --- ggml/src/ggml-qnn/qnn-lib.hpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-qnn/qnn-lib.hpp b/ggml/src/ggml-qnn/qnn-lib.hpp index da986e2e4c4ff..d55f730f80d84 100644 --- a/ggml/src/ggml-qnn/qnn-lib.hpp +++ b/ggml/src/ggml-qnn/qnn-lib.hpp @@ -6,6 +6,7 @@ #include #include #include +#include // header file of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK // https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct @@ -251,7 +252,7 @@ class qnn_instance { } qnn_status = QNN_SUCCESS; - if (_backend_name.find("Htp") != std::variant_npos) { + if (_backend_name.find("Htp") != _backend_name.npos) { const QnnDevice_PlatformInfo_t *p_info = nullptr; _qnn_interface->qnn_device_get_platform_info(nullptr, &p_info); QNN_LOG_INFO("device counts %d", p_info->v1.numHwDevices); @@ -358,7 +359,7 @@ class qnn_instance { QNN_LOG_DEBUG("initialize qnn context successfully\n"); } - if (_backend_name.find("Htp") != std::variant_npos) { + if (_backend_name.find("Htp") != _backend_name.npos) { // TODO: faster approach to probe the accurate capacity of rpc ion memory size_t candidate_size = 0; uint8_t *rpc_buffer = nullptr; @@ -409,7 +410,7 @@ class qnn_instance { QNN_LOG_DEBUG("succeed to close rpcmem lib\n"); } - if (_backend_name.find("Htp") != std::variant_npos) { + if (_backend_name.find("Htp") != _backend_name.npos) { _qnn_htp_perfinfra->destroyPowerConfigId(_qnn_power_configid); } From 1da8a3e67831edaf9ba5ad7a8fcfaf9a01d6090f Mon Sep 17 00:00:00 2001 From: hongruichen Date: Mon, 30 Sep 2024 10:37:23 +0800 Subject: [PATCH 120/166] fix compiling error after merge --- ggml/src/ggml-qnn.cpp | 1 + ggml/src/ggml-qnn/backend-ops.cpp | 3 +++ ggml/src/ggml-qnn/tensor.hpp | 1 + 3 files changed, 5 insertions(+) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index b5b18e04aa20c..3e3fb5778c883 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -180,6 +180,7 @@ static ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = { /* .free_buffer = */ ggml_backend_qnn_buffer_free_buffer, /* .get_base = */ ggml_backend_qnn_buffer_get_base, /* .init_tensor = */ ggml_backend_qnn_buffer_init_tensor, + /* .memset_tensor = */ nullptr, /* .set_tensor = */ ggml_backend_qnn_buffer_set_tensor, /* .get_tensor = */ ggml_backend_qnn_buffer_get_tensor, /* .cpy_tensor = */ ggml_backend_qnn_buffer_cpy_tensor, diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 5829e0fadbe92..6a83f4561807a 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -231,6 +231,7 @@ constexpr const char *kGgmlOpToQnnOp[] = { 
nullptr, // GGML_OP_CROSS_ENTROPY_LOSS nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK + nullptr, // GGML_OP_OPT_STEP_ADAMW // ggml_unary_op nullptr, // GGML_UNARY_OP_ABS @@ -422,6 +423,7 @@ constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = { nullptr, // GGML_OP_CROSS_ENTROPY_LOSS nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK + nullptr, // GGML_OP_OPT_STEP_ADAMW // ggml_unary_op nullptr, // GGML_UNARY_OP_ABS @@ -531,6 +533,7 @@ static constexpr const ggml_qnn_binary_op_t kQnnBinaryOpsTable[] = { nullptr, // GGML_OP_CROSS_ENTROPY_LOSS nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK + nullptr, // GGML_OP_OPT_STEP_ADAMW }; static_assert(sizeof(kQnnBinaryOpsTable) / sizeof(kQnnBinaryOpsTable[0]) == GGML_OP_COUNT, diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index 0c724e2871d45..c465d17f25506 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -1,6 +1,7 @@ #pragma once +#include #include #include #include From 181cf52888015417d812b50c76af9ee0032aec11 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Thu, 10 Oct 2024 10:29:51 +0800 Subject: [PATCH 121/166] adapt new register backend interface and fix missing ops --- ggml/include/ggml-qnn.h | 11 +- ggml/src/ggml-backend.cpp | 8 + ggml/src/ggml-qnn.cpp | 443 ++++++++++++++++++------------ ggml/src/ggml-qnn/backend-ops.cpp | 23 +- ggml/src/ggml-qnn/backend-ops.hpp | 2 +- ggml/src/ggml-qnn/backend.hpp | 25 +- src/llama.cpp | 4 - 7 files changed, 308 insertions(+), 208 deletions(-) diff --git a/ggml/include/ggml-qnn.h b/ggml/include/ggml-qnn.h index b8c7da8fbbf87..23835f23cb0ec 100644 --- a/ggml/include/ggml-qnn.h +++ b/ggml/include/ggml-qnn.h @@ -8,6 +8,7 @@ extern "C" { #endif +#define GGML_QNN_NAME "QNN" #define GGML_QNN_MAX_DEVICES 3 enum QNNBackend { @@ -20,21 +21,17 @@ enum QNNBackend { /** * - * @param device 0: QNN_BACKEND_CPU 1: QNN_BACKEND_GPU 2:QNN_BACKEND_NPU + * @param index 0: QNN_BACKEND_CPU 1: QNN_BACKEND_GPU 2:QNN_BACKEND_NPU * @param extend_lib_search_path extened lib search path for searching QNN backend dynamic libs * @return */ -GGML_API ggml_backend_t ggml_backend_qnn_init(size_t dev_num, const char *extend_lib_search_path); +GGML_API ggml_backend_t ggml_backend_qnn_init(size_t index, const char *extend_lib_search_path); GGML_API bool ggml_backend_is_qnn(ggml_backend_t backend); -GGML_API void ggml_backend_qnn_set_n_threads(ggml_backend_t backend, int thread_counts); - GGML_API int ggml_backend_qnn_get_device_count(void); -GGML_API void ggml_backend_qnn_get_device_description(size_t dev_num, char *description, size_t description_size); - -GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t dev_num); +GGML_API ggml_backend_reg_t ggml_backend_qnn_reg(void); #ifdef __cplusplus } diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 0551764fe3fb0..f70c9f6e42f09 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -525,6 +525,10 @@ void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * na #include "ggml-cuda.h" #endif +#ifdef GGML_USE_QNN +#include "ggml-qnn.h" +#endif + struct ggml_backend_registry { std::vector backends; std::vector devices; @@ -534,6 +538,10 @@ struct ggml_backend_registry { register_backend(ggml_backend_cuda_reg()); #endif +#ifdef GGML_USE_QNN + register_backend(ggml_backend_qnn_reg()); +#endif + register_backend(ggml_backend_cpu_reg()); // TODO: sycl, metal, vulkan, kompute, cann diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index 
3e3fb5778c883..2d2b4745d13f1 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -1,11 +1,5 @@ #include "ggml-qnn.h" -#include -#include -#include -#include -#include -#include #include #include @@ -50,23 +44,19 @@ #define QNN_BACKEND_NAME "qnn" -// according to the QNN SDK Reference Guide, -// CPU - Choose a non-quantized model.Quantized models are currently incompatible with the CPU backend -// GPU - Choose a non-quantized model.Quantized models are currently incompatible with the GPU backend -// HTP - Choose a quantized model. Quantized models are required when running on the HTP backend -// DSP - Choose a quantized model. Quantized models are required when running on the DSP backend -// HTA - Choose a quantized model. Quantized models are required when running on the HTA backend -// -// only focus on Qualcomm CPU/GPU/NPU backend in this implementation of QNN backend for ggml currently, -// CPU: Qualcomm Kryo CPU -// GPU: Qualcomm Adreno GPU -// NPU: Qualcomm NPU: aka HTP(Hexagon Tensor Processor), ~= cDSP(Compute DSP) + -// HMX(Hexagon Matrix eXtensions)/HTA(Hexagon Tensor Accelerator) - -static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { - ggml_backend_qnn_context(QNN_BACKEND_CPU, 1, "qnn-cpu", "libQnnCpu.so"), /* QNN_BACKEND_CPU */ - ggml_backend_qnn_context(QNN_BACKEND_GPU, 1, "qnn-gpu", "libQnnGpu.so"), /* QNN_BACKEND_GPU */ - ggml_backend_qnn_context(QNN_BACKEND_NPU, 1, "qnn-npu", "libQnnHtp.so"), /* QNN_BACKEND_NPU */ +namespace { + +struct qnn_device_caps { + const char *name; + const char *description; + const char *lib_name; + enum ggml_backend_dev_type type; +}; + +const qnn_device_caps kDeviceCaps[GGML_QNN_MAX_DEVICES]{ + { "qnn-cpu", "Qualcomm Kryo CPU", "libQnnCpu.so", GGML_BACKEND_DEVICE_TYPE_CPU }, /* QNN_BACKEND_CPU */ + { "qnn-gpu", "Qualcomm Adreno GPU", "libQnnGpu.so", GGML_BACKEND_DEVICE_TYPE_GPU }, /* QNN_BACKEND_GPU */ + { "qnn-npu", "Qualcomm NPU", "libQnnHtp.so", GGML_BACKEND_DEVICE_TYPE_GPU }, /* QNN_BACKEND_NPU */ }; class ggml_backend_qnn_buffer_context { @@ -74,6 +64,7 @@ class ggml_backend_qnn_buffer_context { ggml_backend_qnn_buffer_context(QNNBackend device, std::shared_ptr instance, size_t size) : _instance(instance), _name(QNN_BACKEND_NAME + std::to_string(device)) { + // TODO: fix this for other platforms size_t size_page = sysconf(_SC_PAGESIZE); // TODO: for qnn npu, a better way here is to reuse the buffer allocated by qnn rpc, will save an extra copy @@ -105,61 +96,60 @@ class ggml_backend_qnn_buffer_context { }; struct ggml_backend_qnn_buffer_type_context { - size_t device; std::string name; }; -// ================================================================================================= -// -// implementation of QNN backend for GGML -// -// ================================================================================================= -static bool ggml_qnn_compute_forward(ggml_backend_qnn_context *ctx, struct ggml_tensor *tensor) { - return qnn::ggml_qnn_forward(ctx, tensor); +ggml_backend_qnn_device_context *get_device_context(ggml_backend_dev_t dev) { + return reinterpret_cast(dev->context); } -static const char *ggml_backend_qnn_buffer_get_name(ggml_backend_buffer_t buffer) { +/* + * ----------------------------------------------------------------------------------------------- + * qnn backend buffer object + * ----------------------------------------------------------------------------------------------- + */ +const char *ggml_backend_qnn_buffer_get_name(ggml_backend_buffer_t buffer) { 
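    // Note: ggml_backend_buffer_is_qnn() below identifies QNN buffers by comparing
    // buffer->iface.get_name against this function's address, so every QNN buffer must be
    // created with ggml_backend_qnn_buffer_interface, which installs this exact callback.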
GGML_UNUSED(buffer); - return "QNN"; + return GGML_QNN_NAME; } -GGML_CALL static bool ggml_backend_buffer_is_qnn(ggml_backend_buffer_t buffer) { +bool ggml_backend_buffer_is_qnn(ggml_backend_buffer_t buffer) { return buffer->iface.get_name == ggml_backend_qnn_buffer_get_name; } -GGML_CALL static void ggml_backend_qnn_buffer_free_buffer(ggml_backend_buffer_t buffer) { +void ggml_backend_qnn_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_qnn_buffer_context *ctx = (ggml_backend_qnn_buffer_context *)buffer->context; delete ctx; } -GGML_CALL static void *ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t buffer) { +void *ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t buffer) { ggml_backend_qnn_buffer_context *ctx = (ggml_backend_qnn_buffer_context *)buffer->context; return ctx->get_buffer(); } -GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor) { +void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor) { // Do nothing here, the qnn tensor will be create along with the graph. GGML_UNUSED(buffer); GGML_UNUSED(tensor); } -GGML_CALL static void ggml_backend_qnn_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor, - const void *data, size_t offset, size_t size) { +void ggml_backend_qnn_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor, const void *data, + size_t offset, size_t size) { GGML_UNUSED(buffer); memcpy((char *)tensor->data + offset, data, size); } -GGML_CALL static void ggml_backend_qnn_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor *tensor, - void *data, size_t offset, size_t size) { +void ggml_backend_qnn_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor *tensor, void *data, + size_t offset, size_t size) { GGML_UNUSED(buffer); memcpy(data, (const char *)tensor->data + offset, size); } -GGML_CALL static bool ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor *src, - struct ggml_tensor *dst) { +bool ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor *src, + struct ggml_tensor *dst) { GGML_UNUSED(buffer); if (ggml_backend_buffer_is_host(src->buffer)) { memcpy(dst->data, src->data, ggml_nbytes(src)); @@ -169,13 +159,13 @@ GGML_CALL static bool ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t b return false; } -GGML_CALL static void ggml_backend_qnn_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { +void ggml_backend_qnn_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { ggml_backend_qnn_buffer_context *ctx = (ggml_backend_qnn_buffer_context *)buffer->context; memset(ctx->get_buffer(), value, ctx->get_buffer_size()); } -static ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = { +ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = { /* .get_name = */ ggml_backend_qnn_buffer_get_name, /* .free_buffer = */ ggml_backend_qnn_buffer_free_buffer, /* .get_base = */ ggml_backend_qnn_buffer_get_base, @@ -188,16 +178,20 @@ static ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = { /* .reset = */ nullptr, }; -GGML_CALL static const char *ggml_backend_qnn_buffer_type_name(ggml_backend_buffer_type_t buft) { +/* + * ----------------------------------------------------------------------------------------------- + * qnn backend object + * ----------------------------------------------------------------------------------------------- + */ +const char 
*ggml_backend_qnn_buffer_type_name(ggml_backend_buffer_type_t buft) { GGML_UNUSED(buft); - return "QNN"; + return GGML_QNN_NAME; } -GGML_CALL static ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, - size_t size) { - ggml_backend_qnn_buffer_type_context *buft_ctx = (ggml_backend_qnn_buffer_type_context *)buft->context; +ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + auto *dev_ctx = get_device_context(buft->device); ggml_backend_qnn_buffer_context *ctx = - new ggml_backend_qnn_buffer_context((QNNBackend)buft_ctx->device, g_qnn_mgr[buft_ctx->device].instance, size); + new ggml_backend_qnn_buffer_context((QNNBackend)dev_ctx->device, dev_ctx->instance, size); if (!ctx->is_valid()) { return nullptr; } @@ -205,65 +199,84 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer return ggml_backend_buffer_init(buft, ggml_backend_qnn_buffer_interface, ctx, size); } -GGML_CALL static size_t ggml_backend_qnn_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { +size_t ggml_backend_qnn_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { GGML_UNUSED(buft); return 32; } // TODO: this value is an experimental value, works fine with whisper/llm/minicpm-v inference on Android -GGML_CALL static size_t ggml_backend_qnn_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { +size_t ggml_backend_qnn_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { GGML_UNUSED(buft); return (96 * 1024 * 1024); } -GGML_CALL static bool ggml_backend_qnn_buffer_is_host(ggml_backend_buffer_type_t buft) { +bool ggml_backend_qnn_buffer_is_host(ggml_backend_buffer_type_t buft) { + // TODO: fix this GGML_UNUSED(buft); return true; } -GGML_CALL static const char *ggml_backend_qnn_name(ggml_backend_t backend) { - ggml_backend_qnn_context *ctx = (ggml_backend_qnn_context *)backend->context; - return g_qnn_mgr[ctx->device].name; +const char *ggml_backend_qnn_name(ggml_backend_t backend) { + auto *device_ctx = get_device_context(backend->device); + return device_ctx->name.c_str(); } -GGML_CALL static void ggml_backend_qnn_free(ggml_backend_t backend) { - QNN_LOG_INFO("enter %s", __func__); - ggml_backend_qnn_context *ctx = (ggml_backend_qnn_context *)backend->context; - QNN_LOG_INFO("idx %d, name:%s", ctx->device, g_qnn_mgr[ctx->device].name); +void ggml_backend_qnn_free(ggml_backend_t backend) { + auto *device_ctx = get_device_context(backend->device); + QNN_LOG_INFO("idx %d, name:%s", device_ctx->device, device_ctx->name.c_str()); - auto instance = g_qnn_mgr[ctx->device].instance; + auto &instance = device_ctx->instance; if (instance) { - ctx->qnn_graph_cache.clear(); + device_ctx->qnn_graph_cache.clear(); + device_ctx->qnn_interface.reset(); instance->qnn_finalize(); - g_qnn_mgr[ctx->device].instance.reset(); + instance.reset(); } +} - if (g_qnn_mgr[ctx->device].backend != nullptr) { - delete backend; - g_qnn_mgr[ctx->device].backend = nullptr; +ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(ggml_backend_dev_t dev) { + static ggml_backend_qnn_buffer_type_context ggml_backend_qnn_buffer_type_contexts[GGML_QNN_MAX_DEVICES]; + static ggml_backend_buffer_type ggml_backend_qnn_buffer_types[GGML_QNN_MAX_DEVICES]; + static bool ggml_backend_qnn_buffer_type_initialized = false; + auto *dev_ctx = get_device_context(dev); + if (!ggml_backend_qnn_buffer_type_initialized) { + for (size_t i = 0; i < GGML_QNN_MAX_DEVICES; i++) { + auto &context = 
ggml_backend_qnn_buffer_type_contexts[i]; + context = { std::string(QNN_BACKEND_NAME) + std::to_string(i) }; + ggml_backend_qnn_buffer_types[i] = { + /* .iface = */ { + /* .get_name = */ ggml_backend_qnn_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_qnn_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_qnn_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_qnn_buffer_type_get_max_size, + /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes + /* .is_host = */ ggml_backend_qnn_buffer_is_host, + }, + /* .device */ dev, + /* .context = */ &context, + }; + } + ggml_backend_qnn_buffer_type_initialized = true; } - QNN_LOG_INFO("leave %s", __func__); -} -GGML_CALL static ggml_backend_buffer_type_t ggml_backend_qnn_get_default_buffer_type(ggml_backend_t backend) { - ggml_backend_qnn_context *ctx = (ggml_backend_qnn_context *)backend->context; + return &ggml_backend_qnn_buffer_types[dev_ctx->device]; +} - return ggml_backend_qnn_buffer_type(ctx->device); +ggml_backend_buffer_type_t ggml_backend_qnn_get_default_buffer_type(ggml_backend_t backend) { + return ggml_backend_qnn_buffer_type(backend->device); } -GGML_CALL static ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, ggml_cgraph *cgraph) { +ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, ggml_cgraph *cgraph) { enum ggml_status result = GGML_STATUS_SUCCESS; - ggml_backend_qnn_context *ctx = (ggml_backend_qnn_context *)backend->context; - GGML_UNUSED(ctx); - + auto *device_ctx = get_device_context(backend->device); for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor *node = cgraph->nodes[i]; if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { continue; } - bool ok = ggml_qnn_compute_forward(ctx, node); + bool ok = qnn::ggml_qnn_forward(device_ctx, node); if (!ok) { QNN_LOG_DEBUG("error: op not supported %s (%s)\n", node->name, ggml_op_name(node->op)); } @@ -272,12 +285,12 @@ GGML_CALL static ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backe return result; } -GGML_CALL static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const ggml_tensor *op) { +bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const ggml_tensor *op) { GGML_UNUSED(backend); return qnn::ggml_qnn_supports_op(op); } -GGML_CALL static bool ggml_backend_qnn_offload_op(ggml_backend_t backend, const ggml_tensor *op) { +bool ggml_backend_qnn_offload_op(ggml_backend_t backend, const ggml_tensor *op) { GGML_UNUSED(backend); size_t dims = ggml_n_dims(op); @@ -292,7 +305,7 @@ GGML_CALL static bool ggml_backend_qnn_offload_op(ggml_backend_t backend, const return can_offload; } -static ggml_backend_i ggml_backend_qnn_interface = { +ggml_backend_i ggml_backend_qnn_interface = { /* .get_name = */ ggml_backend_qnn_name, /* .free = */ ggml_backend_qnn_free, /* .get_default_buffer_type = */ ggml_backend_qnn_get_default_buffer_type, @@ -305,106 +318,75 @@ static ggml_backend_i ggml_backend_qnn_interface = { /* .graph_plan_update = */ nullptr, /* .graph_plan_compute = */ nullptr, /* .graph_compute = */ ggml_backend_qnn_graph_compute, - /* .supports_op = */ ggml_backend_qnn_supports_op, - /* .supports_buft = */ nullptr, - /* .offload_op = */ ggml_backend_qnn_offload_op, - /* .event_new = */ nullptr, - /* .event_free = */ nullptr, + /* .supports_op = */ nullptr, // moved to device + /* .supports_buft = */ nullptr, // moved to device + /* .offload_op = */ nullptr, 
// moved to device /* .event_record = */ nullptr, /* .event_wait = */ nullptr, - /* .event_synchronize = */ nullptr, }; -static ggml_guid_t ggml_backend_qnn_guid() { - static ggml_guid guid = { 0x1a, 0x2b, 0x3c, 0x4d, 0x5e, 0x6f, 0x70, 0x81, - 0x92, 0xa3, 0xb4, 0xc5, 0xd6, 0xe7, 0xf8, 0x09 }; - return &guid; -} - -static ggml_backend_t ggml_backend_qnn_reg_init(const char *extend_lib_search_path, void *user_data) { - ggml_backend_t qnn_backend = ggml_backend_qnn_init((int)(intptr_t)user_data, extend_lib_search_path); - return qnn_backend; +/* + * ----------------------------------------------------------------------------------------------- + * qnn backend device object + * ----------------------------------------------------------------------------------------------- + */ +const char *ggml_backend_qnn_device_get_name(ggml_backend_dev_t dev) { + const auto &caps = kDeviceCaps[get_device_context(dev)->device]; + return caps.name; } -bool ggml_backend_is_qnn(ggml_backend_t backend) { - return backend != nullptr && ggml_guid_matches(backend->guid, ggml_backend_qnn_guid()); +const char *ggml_backend_qnn_device_get_description(ggml_backend_dev_t dev) { + const auto &caps = kDeviceCaps[get_device_context(dev)->device]; + return caps.description; } -void ggml_backend_qnn_set_n_threads(ggml_backend_t backend, int n_threads) { - GGML_ASSERT(ggml_backend_is_qnn(backend)); +void ggml_backend_qnn_device_get_memory(ggml_backend_dev_t dev, size_t *free, size_t *total) { + // TODO: get memory info + *free = 0; + *total = 0; - auto *ctx = (ggml_backend_qnn_context *)backend->context; - ctx->threads = n_threads; + GGML_UNUSED(dev); } -int ggml_backend_qnn_get_device_count() { return GGML_QNN_MAX_DEVICES; } - -void ggml_backend_qnn_get_device_description(size_t dev_num, char *description, size_t description_size) { - if (nullptr == description || 0 == description_size) { - QNN_LOG_WARN("invalid param"); - return; - } - - if (dev_num >= GGML_QNN_MAX_DEVICES) { - QNN_LOG_WARN("invalid param"); - return; - } - - snprintf(description, description_size, "%s", g_qnn_mgr[dev_num].name); +enum ggml_backend_dev_type ggml_backend_qnn_device_get_type(ggml_backend_dev_t dev) { + // TODO: for cpu backend, we should return GGML_BACKEND_DEVICE_TYPE_CPU + GGML_UNUSED(dev); + return GGML_BACKEND_DEVICE_TYPE_GPU; } -ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device) { - if (device >= GGML_QNN_MAX_DEVICES) { - QNN_LOG_DEBUG( - "ggml_backend_qnn_buffer_type error: device_index:%d is " - "out of range [0, %d]\n", - device, GGML_QNN_MAX_DEVICES - 1); - return nullptr; - } - - static ggml_backend_qnn_buffer_type_context ggml_backend_qnn_buffer_type_contexts[GGML_QNN_MAX_DEVICES]; - static ggml_backend_buffer_type ggml_backend_qnn_buffer_types[GGML_QNN_MAX_DEVICES]; - static bool ggml_backend_qnn_buffer_type_initialized = false; - if (!ggml_backend_qnn_buffer_type_initialized) { - for (size_t i = 0; i < GGML_QNN_MAX_DEVICES; i++) { - auto &context = ggml_backend_qnn_buffer_type_contexts[i]; - context = { i, std::string(QNN_BACKEND_NAME) + std::to_string(i) }; - ggml_backend_qnn_buffer_types[i] = { - /* .iface = */ { /* .get_name = */ ggml_backend_qnn_buffer_type_name, - /* .alloc_buffer = */ ggml_backend_qnn_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_qnn_buffer_type_get_alignment, - /* .get_max_size = */ ggml_backend_qnn_buffer_type_get_max_size, - /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes - /* .is_host = */ ggml_backend_qnn_buffer_is_host }, - /* .context = */ &context, - 
}; - } - ggml_backend_qnn_buffer_type_initialized = true; - } - - return &ggml_backend_qnn_buffer_types[device]; +void ggml_backend_qnn_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props *props) { + props->name = ggml_backend_qnn_device_get_name(dev); + props->description = ggml_backend_qnn_device_get_description(dev); + props->type = ggml_backend_qnn_device_get_type(dev); + ggml_backend_qnn_device_get_memory(dev, &props->memory_free, &props->memory_total); + props->caps = { + /* async */ false, + /* host_buffer */ false, + /* events */ false, + }; } -ggml_backend_t ggml_backend_qnn_init(size_t device, const char *extend_lib_search_path) { - int result = 0; +ggml_guid_t ggml_backend_qnn_guid() { + static ggml_guid guid = { 0x1a, 0x2b, 0x3c, 0x4d, 0x5e, 0x6f, 0x70, 0x81, + 0x92, 0xa3, 0xb4, 0xc5, 0xd6, 0xe7, 0xf8, 0x09 }; + return &guid; +} +ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, const char *extend_lib_search_path) { if (!extend_lib_search_path) { extend_lib_search_path = GGML_QNN_DEFAULT_LIB_SEARCH_PATH; QNN_LOG_WARN("extend_lib_search_path is nullptr, will use " GGML_QNN_DEFAULT_LIB_SEARCH_PATH " as default"); } - QNN_LOG_DEBUG("device %d", device); + auto *dev_ctx = get_device_context(dev); + auto device_index = dev_ctx->device; + QNN_LOG_DEBUG("device %d", device_index); QNN_LOG_DEBUG("extend_lib_search_path %s", extend_lib_search_path); - if (device >= GGML_QNN_MAX_DEVICES) { - QNN_LOG_ERROR("invalid device %d", device); - return nullptr; - } - std::string path = extend_lib_search_path; // TODO: Fix this for other platforms #if defined(__ANDROID__) || defined(ANDROID) - if (QNN_BACKEND_NPU == device) { + if (QNN_BACKEND_NPU == device_index) { if (0 == setenv("LD_LIBRARY_PATH", (path + ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/" "dsp:/vendor/dsp/images") @@ -425,17 +407,18 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char *extend_lib_searc } } else { if (0 == setenv("LD_LIBRARY_PATH", path.c_str(), 1)) { - QNN_LOG_INFO("%s backend setenv successfully\n", qnn::get_backend_name(device)); + QNN_LOG_INFO("%s backend setenv successfully\n", qnn::get_backend_name(device_index)); } else { - QNN_LOG_ERROR("%s backend setenv failure\n", qnn::get_backend_name(device)); + QNN_LOG_ERROR("%s backend setenv failure\n", qnn::get_backend_name(device_index)); } } #endif - auto instance = std::make_shared(extend_lib_search_path, g_qnn_mgr[device].lib, ""); - result = instance->qnn_init(nullptr); + auto instance = std::make_shared(path, dev_ctx->lib_name, "ggml"); + auto result = instance->qnn_init(nullptr); if (result != 0) { - QNN_LOG_WARN("init qnn subsystem failed with qnn backend %s, pls check why\n", qnn::get_backend_name(device)); + QNN_LOG_WARN("init qnn subsystem failed with qnn backend %s, pls check why\n", + qnn::get_backend_name(device_index)); return nullptr; } auto qnn_interface = instance->get_qnn_interface(); @@ -444,28 +427,138 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char *extend_lib_searc return nullptr; } - std::string device_name = qnn::get_backend_name(device); + std::string device_name = qnn::get_backend_name(device_index); QNN_LOG_INFO("qnn device name %s", device_name.c_str()); - auto &qnn_device = g_qnn_mgr[device]; - qnn_device.instance = instance; - qnn_device.qnn_interface = qnn_interface; - qnn_device.socinfo = instance->get_soc_info(); + dev_ctx->instance = instance; + dev_ctx->qnn_interface = qnn_interface; + dev_ctx->socinfo = instance->get_soc_info(); - ggml_backend_t 
qnn_backend = new ggml_backend{ /* .guid = */ ggml_backend_qnn_guid(), - /* .iface = */ ggml_backend_qnn_interface, - /* .context = */ &g_qnn_mgr[device] }; - g_qnn_mgr[device].backend = qnn_backend; + ggml_backend_t qnn_backend = new ggml_backend{ + /* .guid = */ ggml_backend_qnn_guid(), + /* .iface = */ ggml_backend_qnn_interface, + /* .device = */ dev, + /* .context = */ nullptr, + }; return qnn_backend; } -extern "C" GGML_CALL void ggml_backend_qnn_reg_devices(); +ggml_backend_t ggml_backend_qnn_device_init(ggml_backend_dev_t dev, const char *params) { + return ggml_backend_qnn_init_with_device_context(dev, params); +} + +ggml_backend_buffer_type_t ggml_backend_qnn_device_get_buffer_type(ggml_backend_dev_t dev) { + return ggml_backend_qnn_buffer_type(dev); +} + +ggml_backend_buffer_t ggml_backend_qnn_device_buffer_from_ptr(ggml_backend_dev_t dev, void *ptr, size_t size, + size_t max_tensor_size) { + // TODO + GGML_UNUSED(dev); + GGML_UNUSED(max_tensor_size); + return ggml_backend_cpu_buffer_from_ptr(ptr, size); +} + +bool ggml_backend_qnn_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor *op) { + GGML_UNUSED(dev); + return qnn::ggml_qnn_supports_op(op); +} + +bool ggml_backend_qnn_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { + GGML_UNUSED(dev); + return ggml_backend_buft_is_host(buft); +} + +const struct ggml_backend_device_i ggml_backend_qnn_device_interface = { + /* .get_name = */ ggml_backend_qnn_device_get_name, + /* .get_description = */ ggml_backend_qnn_device_get_description, + /* .get_memory = */ ggml_backend_qnn_device_get_memory, + /* .get_type = */ ggml_backend_qnn_device_get_type, + /* .get_props = */ ggml_backend_qnn_device_get_props, + /* .init_backend = */ ggml_backend_qnn_device_init, + /* .get_buffer_type = */ ggml_backend_qnn_device_get_buffer_type, + /* .get_host_buffer_type = */ nullptr, + /* .buffer_from_host_ptr = */ ggml_backend_qnn_device_buffer_from_ptr, + /* .supports_op = */ ggml_backend_qnn_device_supports_op, + /* .supports_buft = */ ggml_backend_qnn_device_supports_buft, + /* .offload_op = */ nullptr, + /* .event_new = */ nullptr, + /* .event_free = */ nullptr, + /* .event_synchronize = */ nullptr, +}; + +/* + * ----------------------------------------------------------------------------------------------- + * qnn backend registry object + * ----------------------------------------------------------------------------------------------- + */ + +struct ggml_backend_qnn_reg_impl : ggml_backend_reg { + std::array, GGML_QNN_MAX_DEVICES> device_contexts; + std::array devices; + + ggml_backend_qnn_reg_impl(ggml_backend_reg_i interface) { + context = this; + iface = interface; + } +}; + +const char *ggml_backend_qnn_reg_get_name(ggml_backend_reg_t reg) { + GGML_UNUSED(reg); + return GGML_QNN_NAME; +} + +size_t ggml_backend_qnn_reg_get_device_count(ggml_backend_reg_t reg) { + auto *ctx = (ggml_backend_qnn_reg_impl *)reg->context; + return ctx->devices.size(); +} + +ggml_backend_dev_t ggml_backend_qnn_reg_get_device(ggml_backend_reg_t reg, size_t index) { + auto *ctx = (ggml_backend_qnn_reg_impl *)reg->context; + GGML_ASSERT(index < ctx->devices.size()); + return &(ctx->devices[index]); +} -GGML_CALL void ggml_backend_qnn_reg_devices() { - for (size_t idx = 0; idx < GGML_QNN_MAX_DEVICES; idx++) { - char name[GGML_MAX_NAME]; - ggml_backend_qnn_get_device_description(idx, name, GGML_MAX_NAME); - ggml_backend_register(name, ggml_backend_qnn_reg_init, ggml_backend_qnn_buffer_type(idx), - (void *)(intptr_t)idx); +const 
ggml_backend_reg_i ggml_backend_qnn_reg_interface = { + /* .get_name = */ ggml_backend_qnn_reg_get_name, + /* .get_device_count = */ ggml_backend_qnn_reg_get_device_count, + /* .get_device_get = */ ggml_backend_qnn_reg_get_device, + /* .get_proc_address = */ nullptr, +}; + +} // namespace + +ggml_backend_reg_t ggml_backend_qnn_reg() { + static ggml_backend_qnn_reg_impl reg{ ggml_backend_qnn_reg_interface }; + static bool initialized = false; + static std::mutex mutex; + + { + std::lock_guard lock(mutex); + if (!initialized) { + for (int i = 0; i < GGML_QNN_MAX_DEVICES; i++) { + reg.device_contexts[i] = std::make_unique( + /* .device = */ (QNNBackend)i, + /* .threads = */ 1, + /* .name = */ qnn::get_backend_name(i), + /* .lib_name = */ kDeviceCaps[i].lib_name); + + auto &device = reg.devices[i]; + device.iface = ggml_backend_qnn_device_interface; + device.reg = ® + device.context = reg.device_contexts[i].get(); + } + initialized = true; + } } + + return ® +} + +int ggml_backend_qnn_get_device_count() { return GGML_QNN_MAX_DEVICES; } + +ggml_backend_t ggml_backend_qnn_init(size_t index, const char *extend_lib_search_path) { + auto *reg = ggml_backend_qnn_reg(); + auto *device = ggml_backend_qnn_reg_get_device(reg, index); + return ggml_backend_qnn_device_init(device, extend_lib_search_path); } diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 6a83f4561807a..9c6e5709c8189 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -13,7 +13,7 @@ namespace { -bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src, ggml_tensor *dst) { +bool qnn_is_valid_params(ggml_backend_qnn_device_context *ctx, const ggml_tensor *src, ggml_tensor *dst) { if (!ctx || !src || !dst) { QNN_LOG_WARN("invalid params\n"); return false; @@ -28,7 +28,7 @@ bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src, return true; } -bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, +bool qnn_is_valid_params(ggml_backend_qnn_device_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst) { if (!ctx || !src0 || !src1 || !dst) { QNN_LOG_WARN("invalid params\n"); @@ -78,8 +78,8 @@ void print_ggml_tensor(const ggml_tensor *tensor) { namespace { -typedef bool (*ggml_qnn_unary_op_t)(ggml_backend_qnn_context *ctx, ggml_tensor *src, ggml_tensor *dst); -typedef bool (*ggml_qnn_binary_op_t)(ggml_backend_qnn_context *ctx, ggml_tensor *src0, ggml_tensor *src1, +typedef bool (*ggml_qnn_unary_op_t)(ggml_backend_qnn_device_context *ctx, ggml_tensor *src, ggml_tensor *dst); +typedef bool (*ggml_qnn_binary_op_t)(ggml_backend_qnn_device_context *ctx, ggml_tensor *src0, ggml_tensor *src1, ggml_tensor *dst); typedef const ggml_qnn_unary_op_t (&ggml_qnn_unary_op_array_t)[GGML_OP_COUNT + GGML_UNARY_OP_COUNT]; @@ -161,6 +161,7 @@ constexpr const char *kGgmlOpToQnnOp[] = { nullptr, // GGML_OP_SUM_ROWS nullptr, // GGML_OP_MEAN nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_COUNT_EQUAL nullptr, // GGML_OP_REPEAT nullptr, // GGML_OP_REPEAT_BACK nullptr, // GGML_OP_CONCAT @@ -256,7 +257,7 @@ static_assert(kGgmlOpToQnnOp[GGML_UNARY_OP_GELU + kGgmlUnaryOpStart] != nullptr, "GGML_UNARY_OP_GELU does not correspond to QNN_OP_GELU"); template -qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_context *ctx, size_t op, +qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *ctx, size_t op, const std::array &inputs, const 
std::array &outputs) { GGML_ASSERT(op < (GGML_OP_COUNT + GGML_UNARY_OP_COUNT)); @@ -271,8 +272,8 @@ qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_context *ctx, siz QNN_LOG_DEBUG("found graph %s in cache\n", graph_key.c_str()); graph_ptr = it->second.get(); } else { - auto graph = std::make_unique(graph_key, (QNNBackend)(ctx->device), ctx->instance, - ctx->socinfo.vtcm_size_in_mb); + auto graph = + std::make_unique(graph_key, ctx->device, ctx->instance, ctx->socinfo.vtcm_size_in_mb); if (!graph->is_valid()) { return nullptr; } @@ -292,7 +293,7 @@ qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_context *ctx, siz } template -bool qnn_binary_op_impl(ggml_backend_qnn_context *ctx, ggml_tensor *src0, ggml_tensor *src1, ggml_tensor *dst) { +bool qnn_binary_op_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src0, ggml_tensor *src1, ggml_tensor *dst) { static_assert(kGgmlOpToQnnOp[_GgmlOp] != nullptr, "GGML_OP does not have a corresponding QNN_OP"); CHECK_PARAMS(ctx, src0, src1, dst); @@ -315,7 +316,7 @@ bool qnn_binary_op_impl(ggml_backend_qnn_context *ctx, ggml_tensor *src0, ggml_t } template -bool qnn_unary_op_impl(ggml_backend_qnn_context *ctx, ggml_tensor *src, ggml_tensor *dst) { +bool qnn_unary_op_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src, ggml_tensor *dst) { static_assert(kGgmlOpToQnnOp[_GgmlOp] != nullptr, "GGML_OP does not have a corresponding QNN_OP"); CHECK_PARAMS(ctx, src, dst); @@ -353,6 +354,7 @@ constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = { nullptr, // GGML_OP_SUM_ROWS nullptr, // GGML_OP_MEAN nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_COUNT_EQUAL nullptr, // GGML_OP_REPEAT nullptr, // GGML_OP_REPEAT_BACK nullptr, // GGML_OP_CONCAT @@ -463,6 +465,7 @@ static constexpr const ggml_qnn_binary_op_t kQnnBinaryOpsTable[] = { nullptr, // GGML_OP_SUM_ROWS nullptr, // GGML_OP_MEAN nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_COUNT_EQUAL nullptr, // GGML_OP_REPEAT nullptr, // GGML_OP_REPEAT_BACK nullptr, // GGML_OP_CONCAT @@ -588,7 +591,7 @@ bool ggml_qnn_supports_op(const ggml_tensor *op) { return true; } -bool ggml_qnn_forward(ggml_backend_qnn_context *ctx, struct ggml_tensor *tensor) { +bool ggml_qnn_forward(ggml_backend_qnn_device_context *ctx, struct ggml_tensor *tensor) { size_t unary_op_idx = tensor->op; if (tensor->op == GGML_OP_UNARY) { unary_op_idx = kGgmlUnaryOpStart + ggml_get_unary_op(tensor); diff --git a/ggml/src/ggml-qnn/backend-ops.hpp b/ggml/src/ggml-qnn/backend-ops.hpp index ed4ce994f787b..86658da118f8b 100644 --- a/ggml/src/ggml-qnn/backend-ops.hpp +++ b/ggml/src/ggml-qnn/backend-ops.hpp @@ -7,6 +7,6 @@ namespace qnn { bool ggml_qnn_supports_op(const ggml_tensor *op); -bool ggml_qnn_forward(ggml_backend_qnn_context *ctx, struct ggml_tensor *tensor); +bool ggml_qnn_forward(ggml_backend_qnn_device_context *ctx, struct ggml_tensor *tensor); } // namespace qnn diff --git a/ggml/src/ggml-qnn/backend.hpp b/ggml/src/ggml-qnn/backend.hpp index b2f93a8f7a9e5..696a883480e9f 100644 --- a/ggml/src/ggml-qnn/backend.hpp +++ b/ggml/src/ggml-qnn/backend.hpp @@ -2,11 +2,13 @@ #pragma once #include +#include #include #include "ggml.h" #include "ggml-backend.h" +#include "ggml-qnn.h" #include "graph.hpp" #include "qnn-lib.hpp" @@ -15,20 +17,21 @@ namespace qnn { typedef std::unordered_map> ggml_qnn_graph_cache_t; } // namespace qnn -struct ggml_backend_qnn_context { - int device; - int threads; - char name[GGML_MAX_NAME]; - char lib[GGML_MAX_NAME]; - ggml_backend *backend = nullptr; +struct 
ggml_backend_qnn_device_context { + // initialize in constructor + QNNBackend device; + size_t threads; + std::string name; + std::string lib_name; + + // initialize in init qnn::qcom_socinfo socinfo = {}; std::shared_ptr instance; std::shared_ptr qnn_interface; + qnn::ggml_qnn_graph_cache_t qnn_graph_cache; - explicit ggml_backend_qnn_context(int device, int threads, const char *name, const char *lib) : - device(device), threads(threads) { - strncpy(this->name, name, GGML_MAX_NAME); - strncpy(this->lib, lib, GGML_MAX_NAME); - } + explicit ggml_backend_qnn_device_context(QNNBackend device, size_t threads, const char *name, + const char *lib_name) : + device(device), threads(threads), name(name), lib_name(lib_name) {} }; diff --git a/src/llama.cpp b/src/llama.cpp index 44fef53b31295..d929d74e567e8 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3430,8 +3430,6 @@ static int llama_get_device_count(const llama_model & model) { count += ggml_backend_vk_get_device_count(); #elif defined(GGML_USE_CANN) count += ggml_backend_cann_get_device_count(); -#elif defined(GGML_USE_QNN) - count = ggml_backend_qnn_get_device_count(); #endif return count; @@ -3465,8 +3463,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(const llama_mode if (host_buffer) { buft = ggml_backend_vk_host_buffer_type(); } -#elif defined(GGML_USE_QNN) - buft = ggml_backend_qnn_buffer_type(gpu); #endif if (buft == nullptr) { From f2604982136b43944902366d2b41f5ebb65cc49a Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 11 Oct 2024 12:11:31 +0800 Subject: [PATCH 122/166] remove unused function --- ggml/src/ggml-qnn.cpp | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index 2d2b4745d13f1..bc88ba0f4de45 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -285,26 +285,6 @@ ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, ggml_cgraph * return result; } -bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const ggml_tensor *op) { - GGML_UNUSED(backend); - return qnn::ggml_qnn_supports_op(op); -} - -bool ggml_backend_qnn_offload_op(ggml_backend_t backend, const ggml_tensor *op) { - GGML_UNUSED(backend); - - size_t dims = ggml_n_dims(op); - bool can_offload = false; - for (size_t i = 0; i < dims; i++) { - if (op->ne[i] > 1) { - can_offload = true; - break; - } - } - - return can_offload; -} - ggml_backend_i ggml_backend_qnn_interface = { /* .get_name = */ ggml_backend_qnn_name, /* .free = */ ggml_backend_qnn_free, From 4abaf7d87ed55876448394d340e475e5426f29a9 Mon Sep 17 00:00:00 2001 From: nullname Date: Mon, 28 Oct 2024 12:48:16 +0800 Subject: [PATCH 123/166] feat: fix mulmat (#2) * ggml_qnn_op_config now manager the construction of ggml_qnn_tensor * wip * add interface ggml_qnn_op_config * add ggml_qnn_list_op_config * add create_tensor and move tensor bind to execute * wip * rename: ggml_qnn_list_op_config -> ggml_qnn_matmul_op_config * add tensortype to allow native tensor * remove ggml_tensor param at ggml_qnn_tensor::create_tensor * postpone the tensor id allocation to add_node * add ggml_qnn_op_config_base * trival change to reduct the param of function * split bind_tensors into bind_input_tensors and bind_output_tensors * implement ggml_qnn_single_op_config::create_tensors next will set the prameter of transpose * tensor: add bind buffer * add parameter tensor type * implement add_tensor_param * set qnn_instance only at constructor * set transpose tensor param * move create_op_constructor into 
op-config module * create QNN_OP_MAT_MUL from ggml_qnn_matmul_op_config * try fix crash * fix compiling error at older ndk (r23c) * fix crash * fix parameter tensor name * update tensor dimension assignment and add TODO * fix mat_mul graph creating * fix MUL_MAT_256x16x10x1_256x1x10x1_16x1x10x1 * append type to graph cache key * wip * fix supported op * update comment * disable op other than add and mat_mul * add convert op to adapt multi input/output format * disable f16 for cpu backend according to official doc https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/cpu_backend.html#supported-operations * add supported data types flags in each backend * remove unused functions * append output type to graph key * fix gpu backend by disable the different data type op * fix cpu backend support ops * fix duplicated tensor name * append op name * suppress warning * remove unused code --- ggml/src/ggml-qnn.cpp | 36 ++- ggml/src/ggml-qnn/backend-ops.cpp | 114 +++++--- ggml/src/ggml-qnn/backend-ops.hpp | 2 +- ggml/src/ggml-qnn/backend.hpp | 2 + ggml/src/ggml-qnn/buffer.hpp | 4 +- ggml/src/ggml-qnn/graph.hpp | 117 ++------ ggml/src/ggml-qnn/op-config.cpp | 471 ++++++++++++++++++++++++++++++ ggml/src/ggml-qnn/op-config.hpp | 147 ++++++---- ggml/src/ggml-qnn/qnn-lib.hpp | 2 +- ggml/src/ggml-qnn/tensor.hpp | 154 +++++++--- ggml/src/ggml-qnn/utils.cpp | 114 +++++++- ggml/src/ggml-qnn/utils.hpp | 15 +- 12 files changed, 920 insertions(+), 258 deletions(-) create mode 100644 ggml/src/ggml-qnn/op-config.cpp diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index bc88ba0f4de45..21a7dee1c99ef 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -51,12 +51,30 @@ struct qnn_device_caps { const char *description; const char *lib_name; enum ggml_backend_dev_type type; + + // TODO: should get this caps from device + std::unordered_set supported_types; }; const qnn_device_caps kDeviceCaps[GGML_QNN_MAX_DEVICES]{ - { "qnn-cpu", "Qualcomm Kryo CPU", "libQnnCpu.so", GGML_BACKEND_DEVICE_TYPE_CPU }, /* QNN_BACKEND_CPU */ - { "qnn-gpu", "Qualcomm Adreno GPU", "libQnnGpu.so", GGML_BACKEND_DEVICE_TYPE_GPU }, /* QNN_BACKEND_GPU */ - { "qnn-npu", "Qualcomm NPU", "libQnnHtp.so", GGML_BACKEND_DEVICE_TYPE_GPU }, /* QNN_BACKEND_NPU */ + { "qnn-cpu", + "Qualcomm Kryo CPU", + "libQnnCpu.so", + GGML_BACKEND_DEVICE_TYPE_CPU, + { GGML_TYPE_F32, + GGML_TYPE_I8 } }, // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/CpuOpDefSupplement.html#matmul + { "qnn-gpu", + "Qualcomm Adreno GPU", + "libQnnGpu.so", + GGML_BACKEND_DEVICE_TYPE_GPU, + { GGML_TYPE_F32, + GGML_TYPE_F16 } }, // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/GpuOpDefSupplement.html#matmul + { "qnn-npu", + "Qualcomm NPU", + "libQnnHtp.so", + GGML_BACKEND_DEVICE_TYPE_GPU, + { GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_I16, + GGML_TYPE_I8 } }, // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/HtpOpDefSupplement.html#matmul }; class ggml_backend_qnn_buffer_context { @@ -340,9 +358,10 @@ void ggml_backend_qnn_device_get_props(ggml_backend_dev_t dev, struct ggml_backe props->type = ggml_backend_qnn_device_get_type(dev); ggml_backend_qnn_device_get_memory(dev, &props->memory_free, &props->memory_total); props->caps = { - /* async */ false, - /* host_buffer */ false, - /* events */ false, + /* async */ false, + /* host_buffer */ false, + /* buffer_from_host_ptr */ false, + /* events */ false, }; } @@ -412,6 +431,7 @@ ggml_backend_t 
ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, dev_ctx->instance = instance; dev_ctx->qnn_interface = qnn_interface; dev_ctx->socinfo = instance->get_soc_info(); + dev_ctx->supported_types = kDeviceCaps[device_index].supported_types; ggml_backend_t qnn_backend = new ggml_backend{ /* .guid = */ ggml_backend_qnn_guid(), @@ -440,8 +460,8 @@ ggml_backend_buffer_t ggml_backend_qnn_device_buffer_from_ptr(ggml_backend_dev_t } bool ggml_backend_qnn_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor *op) { - GGML_UNUSED(dev); - return qnn::ggml_qnn_supports_op(op); + auto *device_ctx = get_device_context(dev); + return qnn::ggml_qnn_supports_op(device_ctx, op); } bool ggml_backend_qnn_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 9c6e5709c8189..d20069874a7c3 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -108,8 +108,8 @@ std::string get_graph_key(const std::string &op_name, const std::array &outputs) { constexpr static const auto append_dimensions = [](std::string &key, const ggml_tensor *tensor) { char buffer[256] = {}; - snprintf(buffer, sizeof(buffer), "_%ldx%ldx%ldx%ld", (long)tensor->ne[0], (long)tensor->ne[1], - (long)tensor->ne[2], (long)tensor->ne[3]); + snprintf(buffer, sizeof(buffer), "_%ldx%ldx%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1], + (long)tensor->ne[2], (long)tensor->ne[3], qnn::get_ggml_type_name(tensor->type)); key += buffer; }; @@ -117,32 +117,11 @@ std::string get_graph_key(const std::string &op_name, const std::arraytype); return graph_key; } -qnn::ggml_op_constructor_t generate_common_op_constructor(const std::string &op_name) { - if (op_name == QNN_OP_MAT_MUL) { - // For QNN_OP_MAT_MUL, we need to transpose the input tensor - return [](const std::string &name) { - auto config = std::make_unique(name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL); - Qnn_Scalar_t scalar = QNN_SCALAR_INIT; - scalar.dataType = QNN_DATATYPE_BOOL_8; - scalar.bool8Value = true; - config->add_scalar_param(QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN0, scalar); - QNN_LOG_DEBUG("add scalar param %s\n", QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN0); - return config; - }; - } - - return [op_name](const std::string &name) { - return std::make_unique(name, QNN_OP_PACKAGE_NAME_QTI_AISW, op_name); - }; -} - constexpr const char *kGgmlOpToQnnOp[] = { nullptr, // GGML_OP_NONE nullptr, // GGML_OP_DUP @@ -278,7 +257,7 @@ qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *c return nullptr; } - auto op_constructor = generate_common_op_constructor(kGgmlOpToQnnOp[op]); + auto op_constructor = qnn::create_op_constructor(kGgmlOpToQnnOp[op]); if (!graph->build_graph(op_constructor, to_ggml_tensor_array<_InputSize>(inputs), to_ggml_tensor_array<_OutputSize>(outputs))) { QNN_LOG_ERROR("build_graph failed\n"); @@ -542,11 +521,57 @@ static constexpr const ggml_qnn_binary_op_t kQnnBinaryOpsTable[] = { static_assert(sizeof(kQnnBinaryOpsTable) / sizeof(kQnnBinaryOpsTable[0]) == GGML_OP_COUNT, "GGML_OP_COUNT does not match the size of the kQnnBinaryOpsTable table"); +bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_tensor *tensor) { + switch (tensor->type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q4_0: + if (ctx->supported_types.find(tensor->type) == ctx->supported_types.end()) { + QNN_LOG_DEBUG("unsupported data type GGML_TYPE_F16 for 
cpu backend"); + return false; + } + break; + default: + QNN_LOG_DEBUG("unsupported data type %d", tensor->type); + return false; + } + + return true; +} + +bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op) { + GGML_UNUSED(ctx); + + auto *src0 = op->src[0]; + auto *src1 = op->src[1]; + if (src0->type != src1->type || src0->type != op->type) { + // current qnn implementation only supports the same type for src0 and src1 + QNN_LOG_DEBUG("src0 type %d and src1 type %d and op type %d are not equal", src0->type, src1->type, op->type); + return false; + } + + if (src0->ne[2] != src1->ne[2] || src0->ne[3] != src1->ne[3]) { + /* + * TODO: remove the blocker here when qnn backend supports mul_mat like this: + * [ne03, ne02, n, k] * [ne03 * x, ne02 * y, m, k] -> [ne03 * x, ne02 * y, m, n] + */ + QNN_LOG_DEBUG("src0 and src1 dimensions are not equal"); + return false; + } + + return true; +} + } // namespace namespace qnn { -bool ggml_qnn_supports_op(const ggml_tensor *op) { +bool ggml_qnn_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op) { + if (op->op == GGML_OP_NONE) { + return true; + } + if (op->op == GGML_OP_UNARY) { if (!kQnnUnaryOpsTable[kGgmlUnaryOpStart + ggml_get_unary_op(op)]) { QNN_LOG_DEBUG("unsupported unary op %d", ggml_get_unary_op(op)); @@ -557,35 +582,38 @@ bool ggml_qnn_supports_op(const ggml_tensor *op) { QNN_LOG_DEBUG("src0 is nullptr"); return false; } - } else if (op->op != GGML_OP_NONE) { + } else { if (!kQnnUnaryOpsTable[op->op] && !kQnnBinaryOpsTable[op->op]) { QNN_LOG_DEBUG("unsupported op %d", op->op); return false; } - if (!op->src[0] || !op->src[1]) { + auto *src0 = op->src[0]; + auto *src1 = op->src[1]; + if (!src0 || !src1) { QNN_LOG_DEBUG("src0 or src1 is nullptr"); return false; } -#ifndef NDEBUG - if (op->op == GGML_OP_ADD && !is_tensor_dimensions_equal(op->src[0], op->src[1])) { - QNN_LOG_DEBUG("src0 and src1 dimensions are not equal"); + if (!ggml_qnn_supports_tensor(ctx, src0) || !ggml_qnn_supports_tensor(ctx, src1) || + !ggml_qnn_supports_tensor(ctx, op)) { return false; } -#endif - } - switch (op->type) { - case GGML_TYPE_F32: - case GGML_TYPE_F16: - case GGML_TYPE_I8: - case GGML_TYPE_Q8_0: - case GGML_TYPE_Q4_0: - break; - default: - QNN_LOG_DEBUG("unsupported src0 type %d", op->src[0]->type); - return false; + switch (op->op) { + case GGML_OP_ADD: + if (!is_tensor_dimensions_equal(src0, src1)) { + QNN_LOG_DEBUG("src0 and src1 dimensions are not equal"); + return false; + } + break; + + case GGML_OP_MUL_MAT: + return ggml_qnn_supports_matmul_op(ctx, op); + + default: + return false; + } } return true; diff --git a/ggml/src/ggml-qnn/backend-ops.hpp b/ggml/src/ggml-qnn/backend-ops.hpp index 86658da118f8b..3df7f4a98a146 100644 --- a/ggml/src/ggml-qnn/backend-ops.hpp +++ b/ggml/src/ggml-qnn/backend-ops.hpp @@ -6,7 +6,7 @@ namespace qnn { -bool ggml_qnn_supports_op(const ggml_tensor *op); +bool ggml_qnn_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op); bool ggml_qnn_forward(ggml_backend_qnn_device_context *ctx, struct ggml_tensor *tensor); } // namespace qnn diff --git a/ggml/src/ggml-qnn/backend.hpp b/ggml/src/ggml-qnn/backend.hpp index 696a883480e9f..eb292e89bfd21 100644 --- a/ggml/src/ggml-qnn/backend.hpp +++ b/ggml/src/ggml-qnn/backend.hpp @@ -4,6 +4,7 @@ #include #include #include +#include #include "ggml.h" @@ -26,6 +27,7 @@ struct ggml_backend_qnn_device_context { // initialize in init qnn::qcom_socinfo socinfo = {}; + std::unordered_set supported_types; 
std::shared_ptr instance; std::shared_ptr qnn_interface; diff --git a/ggml/src/ggml-qnn/buffer.hpp b/ggml/src/ggml-qnn/buffer.hpp index 4b4b2daaa75b4..676e88c0454be 100644 --- a/ggml/src/ggml-qnn/buffer.hpp +++ b/ggml/src/ggml-qnn/buffer.hpp @@ -8,8 +8,8 @@ namespace qnn { class ggml_qnn_rpc_buffer { public: - ggml_qnn_rpc_buffer(std::shared_ptr qnn_instance, size_t size, uint32_t rank, uint32_t *dimensions, - Qnn_DataType_t data_type) : + ggml_qnn_rpc_buffer(std::shared_ptr qnn_instance, const size_t size, const uint32_t rank, + uint32_t *dimensions, Qnn_DataType_t data_type) : _qnn_instance(qnn_instance), _size(size) { _qnn_rpc_buffer = static_cast(qnn_instance->alloc_rpcmem(size, alignof(void *))); diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index 3f1a0ef163208..858a7d3af29a2 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -2,7 +2,6 @@ #pragma once #include -#include #include #include #include @@ -12,19 +11,15 @@ #include "logger.hpp" #include "op-config.hpp" #include "qnn-lib.hpp" -#include "tensor.hpp" namespace qnn { -using ggml_tensor_array_t = std::vector; -using ggml_op_constructor_t = std::function(const std::string &)>; - class ggml_qnn_graph { public: explicit ggml_qnn_graph(const std::string &graph_name, QNNBackend device, std::shared_ptr qnn_instance, size_t vtcm_size_in_mb) : _graph_name(graph_name), _device(device), _qnn_instance(qnn_instance) { - QNN_LOG_INFO("graph name %s", graph_name.c_str()); + QNN_LOG_INFO("[%s]create", graph_name.c_str()); auto qnn_interface = qnn_instance->get_qnn_interface(); auto qnn_context = qnn_instance->get_qnn_context_handle(); @@ -69,19 +64,16 @@ class ggml_qnn_graph { } if (error != QNN_SUCCESS) { - QNN_LOG_INFO( - "can't create qnn graph handle with graph name %s, " - "error = %d\n", - graph_name.c_str(), error); + QNN_LOG_INFO("[%s]can't create qnn graph handle, error = %d\n", graph_name.c_str(), error); return; } - QNN_LOG_INFO("create qnn graph handle with graph name %s ok\n", graph_name.c_str()); + QNN_LOG_INFO("[%s]create succeed\n", graph_name.c_str()); _graph_handle = graph_handle; _qnn_interface = qnn_interface; } - ~ggml_qnn_graph() { QNN_LOG_DEBUG("graph name %s, destroy", _graph_name.c_str()); } + ~ggml_qnn_graph() { QNN_LOG_DEBUG("[%s]destroy", _graph_name.c_str()); } bool build_graph(ggml_op_constructor_t op_constructor, const ggml_tensor_array_t &tensor_inputs, const ggml_tensor_array_t &tensor_outputs) { @@ -91,95 +83,44 @@ class ggml_qnn_graph { return false; } - // get the max tensor rank - for (auto tensor : tensor_inputs) { - _tensor_rank = std::max(_tensor_rank, ggml_n_dims(tensor)); - } - for (auto tensor : tensor_outputs) { - _tensor_rank = std::max(_tensor_rank, ggml_n_dims(tensor)); - } - - QNN_LOG_DEBUG("graph name %s, build_graph start", _graph_name.c_str()); - _tensor_inputs.resize(tensor_inputs.size()); - for (size_t i = 0; i < tensor_inputs.size(); i++) { - char buffer[GGML_MAX_NAME] = {}; - snprintf(buffer, GGML_MAX_NAME, "src%d", (int)i); - auto qnn_tensor = - std::make_shared(std::string(buffer), _device, _graph_handle, _qnn_instance); - auto *ggml_tensor = tensor_inputs[i]; - if (!qnn_tensor->bind_ggml_tensor(ggml_tensor, true, _tensor_rank)) { - QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); - return false; - } - - _tensor_inputs[i] = qnn_tensor; - } - - _tensor_outputs.resize(tensor_outputs.size()); - for (size_t i = 0; i < tensor_outputs.size(); i++) { - char buffer[GGML_MAX_NAME] = {}; - snprintf(buffer, GGML_MAX_NAME, 
"dst%d", (int)i); - auto qnn_tensor = - std::make_shared(std::string(buffer), _device, _graph_handle, _qnn_instance); - auto *ggml_tensor = tensor_outputs[i]; - if (!qnn_tensor->bind_ggml_tensor(ggml_tensor, false, _tensor_rank)) { - QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); - return false; - } - - _tensor_outputs[i] = qnn_tensor; + QNN_LOG_DEBUG("[%s]build_graph start", _graph_name.c_str()); + _op_config = op_constructor(_graph_name, _qnn_instance); + if (!_op_config->create_tensors(_device, _graph_handle, tensor_inputs, tensor_outputs)) { + QNN_LOG_ERROR("[%s]create_tensors failed\n", _graph_name.c_str()); + return false; } - _op_config = op_constructor(_graph_name); - _op_config->set_input_tensors(_tensor_inputs); - _op_config->set_output_tensors(_tensor_outputs); - auto error = _qnn_interface->qnn_graph_add_node(_graph_handle, _op_config->get_op_config()); - if (error != QNN_SUCCESS) { - auto *error_str = get_qnn_error_string(error); - if (error_str) { - QNN_LOG_ERROR("qnn_graph_add_node.error: %s\n", error_str); - } else { - QNN_LOG_ERROR("qnn_graph_add_node.error: %d\n", error); - } + if (!_op_config->add_op_to_graph(_graph_handle)) { + QNN_LOG_ERROR("[%s]add nodes failed\n", _graph_name.c_str()); return false; } - error = _qnn_interface->qnn_graph_finalize(_graph_handle, nullptr, nullptr); + auto error = _qnn_interface->qnn_graph_finalize(_graph_handle, nullptr, nullptr); if (error != QNN_SUCCESS) { auto *error_str = get_qnn_error_string(error); if (error_str) { - QNN_LOG_ERROR("qnn_graph_finalize.error: %s\n", error_str); + QNN_LOG_ERROR("[%s]qnn_graph_finalize.error: %s\n", _graph_name.c_str(), error_str); } else { - QNN_LOG_ERROR("qnn_graph_finalize.error: %d\n", error); + QNN_LOG_ERROR("[%s]qnn_graph_finalize.error: %d\n", _graph_name.c_str(), error); } return false; } - QNN_LOG_DEBUG("graph name %s, build_graph succeed", _graph_name.c_str()); + QNN_LOG_DEBUG("[%s]build_graph succeed", _graph_name.c_str()); return true; } bool execute(const ggml_tensor_array_t &tensor_inputs, const ggml_tensor_array_t &tensor_outputs) { - GGML_ASSERT(tensor_inputs.size() == _tensor_inputs.size()); - GGML_ASSERT(tensor_outputs.size() == _tensor_outputs.size()); - for (size_t i = 0; i < tensor_inputs.size(); i++) { - auto *ggml_tensor = tensor_inputs[i]; - if (!_tensor_inputs[i]->bind_ggml_tensor(ggml_tensor, true, _tensor_rank)) { - QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); - return false; - } + if (!_op_config->bind_input_tensors(tensor_inputs)) { + QNN_LOG_ERROR("[%s]bind input tensors failed\n", _graph_name.c_str()); + return false; } - for (size_t i = 0; i < tensor_outputs.size(); i++) { - auto *ggml_tensor = tensor_outputs[i]; - if (!_tensor_outputs[i]->bind_ggml_tensor(ggml_tensor, false, _tensor_rank)) { - QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); - return false; - } + if (!_op_config->bind_output_tensors(tensor_outputs)) { + QNN_LOG_ERROR("[%s]bind output tensors failed\n", _graph_name.c_str()); + return false; } - _op_config->set_input_tensors(_tensor_inputs); - _op_config->set_output_tensors(_tensor_outputs); auto &qnn_tensor_inputs = _op_config->get_qnn_input_tensors(); auto &qnn_tensor_outputs = _op_config->get_qnn_output_tensors(); @@ -188,20 +129,15 @@ class ggml_qnn_graph { qnn_tensor_outputs.data(), qnn_tensor_outputs.size(), nullptr, nullptr); if (_device == QNN_BACKEND_NPU) { if (error == QNN_COMMON_ERROR_SYSTEM_COMMUNICATION) { - QNN_LOG_WARN("NPU crashed. SSR detected. 
Caused QNN graph execute error\n"); + QNN_LOG_WARN("[%s]NPU crashed. SSR detected. Caused QNN graph execute error\n", _graph_name.c_str()); } } - for (auto tensor : _tensor_inputs) { - tensor->unbind_ggml_tensor(); - } - - for (auto tensor : _tensor_outputs) { - tensor->unbind_ggml_tensor(); - } + _op_config->unbind_input_tensors(); + _op_config->unbind_output_tensors(); if (error != QNN_SUCCESS) { - QNN_LOG_INFO("error = %d\n", error); + QNN_LOG_INFO("[%s]error = %d\n", _graph_name.c_str(), error); return false; } @@ -220,11 +156,8 @@ class ggml_qnn_graph { Qnn_GraphHandle_t _graph_handle = nullptr; std::shared_ptr _qnn_instance; std::shared_ptr _qnn_interface; - std::vector> _tensor_inputs; - std::vector> _tensor_outputs; std::unique_ptr _op_config; std::vector _param_types; - int _tensor_rank = 0; DISABLE_COPY(ggml_qnn_graph); DISABLE_MOVE(ggml_qnn_graph); diff --git a/ggml/src/ggml-qnn/op-config.cpp b/ggml/src/ggml-qnn/op-config.cpp new file mode 100644 index 0000000000000..07dcba156471b --- /dev/null +++ b/ggml/src/ggml-qnn/op-config.cpp @@ -0,0 +1,471 @@ +#include "op-config.hpp" + +#include + +#include "logger.hpp" + +namespace { + +constexpr const qnn::qnn_dimension_array_t kTransposeParamData[GGML_MAX_DIMS] = { + { 0 }, + { 1, 0 }, + { 0, 2, 1 }, + { 0, 1, 3, 2 }, +}; + +qnn::qnn_dimension_array_t get_transposed_dimensions(const qnn::qnn_dimension_array_t &dimensions, int rank) { + qnn::qnn_dimension_array_t transposed_dims = dimensions; + if (rank >= 2) { + transposed_dims[rank - 1] = dimensions[rank - 2]; + transposed_dims[rank - 2] = dimensions[rank - 1]; + } + + return transposed_dims; +} + +int get_rank(const qnn::ggml_tensor_array_t &tensor_inputs, const qnn::ggml_tensor_array_t &tensor_outputs) { + int tensor_rank = 0; + // get the max tensor rank + for (auto tensor : tensor_inputs) { + tensor_rank = std::max(tensor_rank, ggml_n_dims(tensor)); + } + for (auto tensor : tensor_outputs) { + tensor_rank = std::max(tensor_rank, ggml_n_dims(tensor)); + } + + return tensor_rank; +} + +Qnn_DataType_t get_tensor_type(const qnn::ggml_qnn_tensor_array_t &tensors) { + Qnn_DataType_t type = QNN_DATATYPE_UNDEFINED; + for (auto tensor : tensors) { + auto tensor_type_size = qnn::qnn_datatype_size(tensor->get_data_type()); + GGML_ASSERT(tensor_type_size > 0); + if (tensor_type_size > qnn::qnn_datatype_size(type)) { + type = tensor->get_data_type(); + } + } + + return type; +} + +struct tensor_common_params { + const char *name_prefix; + int tensor_rank; + bool is_input; + QNNBackend device; + Qnn_GraphHandle_t graph_handle; + std::shared_ptr qnn_instance; +}; + +void create_tensors_from_ggml_tensor(const tensor_common_params ¶ms, const qnn::ggml_tensor_array_t &ggml_tensors, + qnn::ggml_qnn_tensor_array_t *tensor_wrappers, + std::vector *qnn_tensors) { + using namespace qnn; + + tensor_wrappers->resize(ggml_tensors.size()); + if (qnn_tensors) { + qnn_tensors->resize(ggml_tensors.size()); + } + char buffer[GGML_MAX_NAME] = {}; + auto tensor_type = params.is_input ? 
ggml_qnn_tensor::INPUT : ggml_qnn_tensor::OUTPUT; + for (size_t i = 0; i < ggml_tensors.size(); i++) { + snprintf(buffer, GGML_MAX_NAME, "%s%d", params.name_prefix, (int)i); + auto *ggml_tensor = ggml_tensors[i]; + (*tensor_wrappers)[i] = std::make_shared(tensor_type, std::string(buffer), ggml_tensor->ne, + ggml_tensor->type, params.tensor_rank, params.device, + params.graph_handle, params.qnn_instance); + } +} + +bool bind_tensors(const qnn::ggml_tensor_array_t &ggml_tensors, qnn::ggml_qnn_tensor_array_t &tensor_wrappers, + std::vector &qnn_tensors) { + for (size_t i = 0; i < ggml_tensors.size(); i++) { + auto *ggml_tensor = ggml_tensors[i]; + if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor)) { + QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); + return false; + } + + qnn_tensors[i] = tensor_wrappers[i]->get_qnn_tensor(); + } + + return true; +} + +class ggml_qnn_connectable_op_config : public qnn::ggml_qnn_op_config_base { +public: + explicit ggml_qnn_connectable_op_config(const std::string &name, const std::string &package_name, + const std::string &op_type, + std::shared_ptr qnn_instance) : + ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {} + + bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, + const qnn::ggml_tensor_array_t &tensor_inputs, + const qnn::ggml_tensor_array_t &tensor_outputs) override { + GGML_UNUSED(device); + GGML_UNUSED(graph_handle); + GGML_UNUSED(tensor_inputs); + GGML_UNUSED(tensor_outputs); + return true; + } + + void set_input_tensors(qnn::ggml_qnn_tensor_array_t &tensor_inputs) { + _tensor_inputs = tensor_inputs; + _qnn_tensor_inputs.resize(_tensor_inputs.size()); + } + + void set_input_tensors(qnn::ggml_qnn_tensor_array_t &&tensor_inputs) { + _tensor_inputs = std::move(tensor_inputs); + _qnn_tensor_inputs.resize(_tensor_inputs.size()); + } + + void set_output_tensors(qnn::ggml_qnn_tensor_array_t &tensor_outputs) { + _tensor_outputs = tensor_outputs; + _qnn_tensor_outputs.resize(_tensor_outputs.size()); + } + + void set_output_tensors(qnn::ggml_qnn_tensor_array_t &&tensor_outputs) { + _tensor_outputs = std::move(tensor_outputs); + _qnn_tensor_outputs.resize(_tensor_outputs.size()); + } + + qnn::ggml_qnn_tensor_array_t &get_input_tensors() { return _tensor_inputs; } + qnn::ggml_qnn_tensor_array_t &get_output_tensors() { return _tensor_outputs; } + +private: + DISABLE_COPY(ggml_qnn_connectable_op_config); + DISABLE_MOVE(ggml_qnn_connectable_op_config); +}; + +} // namespace + +namespace qnn { + +void ggml_qnn_op_config_base::add_scalar_param(const std::string &name, const Qnn_Scalar_t scalar) { + _param_names.push_back(name); + Qnn_Param_t param = QNN_PARAM_INIT; + param.paramType = QNN_PARAMTYPE_SCALAR; + param.name = _param_names.back().c_str(); + param.scalarParam = scalar; + _qnn_parameters.push_back(param); +} + +bool ggml_qnn_op_config_base::add_tensor_param(const std::string &name, const qnn_dimension_array_t &dimensions, + int rank, const uint8_t *data, const Qnn_DataType_t data_type, + QNNBackend device, Qnn_GraphHandle_t graph_handle) { + std::string tensor_name = _name + name + std::to_string(_tensor_parameters.size()); + auto param_tensor = std::make_shared(ggml_qnn_tensor::PARAMETER, tensor_name, dimensions, + data_type, rank, device, graph_handle, _qnn_instance); + size_t data_size = ggml_type_size(ggml_datatype_from_qnn_datatype(data_type)); + for (int i = 0; i < rank; i++) { + data_size *= dimensions[i]; + } + + GGML_ASSERT(data_size > 0); + if 
(!param_tensor->bind_buffer(const_cast(data), data_size)) { + QNN_LOG_ERROR("parameter tensor bind_buffer failed\n"); + return false; + } + + if (!param_tensor->alloc_qnn_tensor_id()) { + QNN_LOG_ERROR("parameter tensor alloc_qnn_tensor_id failed\n"); + return false; + } + + _tensor_parameters.push_back(param_tensor); + _param_names.push_back(name); + Qnn_Param_t param = QNN_PARAM_INIT; + param.paramType = QNN_PARAMTYPE_TENSOR; + param.name = _param_names.back().c_str(); + param.tensorParam = param_tensor->get_qnn_tensor(); + _qnn_parameters.push_back(param); + return true; +} + +bool ggml_qnn_op_config_base::add_op_to_graph(Qnn_GraphHandle_t graph_handle) { + GGML_ASSERT(_qnn_tensor_inputs.size() == _tensor_inputs.size()); + GGML_ASSERT(_qnn_tensor_outputs.size() == _tensor_outputs.size()); + + auto qnn_interface = _qnn_instance->get_qnn_interface(); + for (size_t i = 0; i < _tensor_inputs.size(); i++) { + auto tensor = _tensor_inputs[i]; + if (!tensor->alloc_qnn_tensor_id()) { + QNN_LOG_ERROR("[%s]input tensor alloc_qnn_tensor_id failed\n", _name.c_str()); + return false; + } + + _qnn_tensor_inputs[i] = tensor->get_qnn_tensor(); + } + + for (size_t i = 0; i < _tensor_outputs.size(); i++) { + auto tensor = _tensor_outputs[i]; + if (!tensor->alloc_qnn_tensor_id()) { + QNN_LOG_ERROR("[%s]output tensor alloc_qnn_tensor_id failed\n", _name.c_str()); + return false; + } + _qnn_tensor_outputs[i] = _tensor_outputs[i]->get_qnn_tensor(); + } + + auto error = qnn_interface->qnn_graph_add_node(graph_handle, get_op_config()); + if (error != QNN_SUCCESS) { + auto *error_str = get_qnn_error_string(error); + if (error_str) { + QNN_LOG_ERROR("[%s]qnn_graph_add_node.error: %s\n", _name.c_str(), error_str); + } else { + QNN_LOG_ERROR("[%s]qnn_graph_add_node.error: %d\n", _name.c_str(), error); + } + return false; + } + + QNN_LOG_DEBUG("[%s]added to graph\n", _name.c_str()); + return true; +} + +bool ggml_qnn_op_config_base::bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) { + GGML_ASSERT(tensor_inputs.size() == _tensor_inputs.size()); + return bind_tensors(tensor_inputs, _tensor_inputs, _qnn_tensor_inputs); +} + +bool ggml_qnn_op_config_base::bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) { + GGML_ASSERT(tensor_outputs.size() == _tensor_outputs.size()); + return bind_tensors(tensor_outputs, _tensor_outputs, _qnn_tensor_outputs); +} + +void ggml_qnn_op_config_base::unbind_input_tensors() { + for (auto &tensor : _tensor_inputs) { + tensor->unbind(); + } +} + +void ggml_qnn_op_config_base::unbind_output_tensors() { + for (auto &tensor : _tensor_outputs) { + tensor->unbind(); + } +} + +Qnn_OpConfig_t ggml_qnn_op_config_base::get_op_config() { + Qnn_OpConfig_t config = QNN_OPCONFIG_INIT; + config.version = QNN_OPCONFIG_VERSION_1; + auto &op_config = config.v1; + op_config.name = _name.c_str(); + op_config.packageName = _package_name.c_str(); + op_config.typeName = _op_type.c_str(); + op_config.numOfParams = (uint32_t)_qnn_parameters.size(); + op_config.params = _qnn_parameters.data(); + op_config.numOfInputs = (uint32_t)_qnn_tensor_inputs.size(); + op_config.inputTensors = _qnn_tensor_inputs.data(); + op_config.numOfOutputs = (uint32_t)_qnn_tensor_outputs.size(); + op_config.outputTensors = _qnn_tensor_outputs.data(); + return config; +} + +bool ggml_qnn_single_op_config::create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, + const ggml_tensor_array_t &tensor_inputs, + const ggml_tensor_array_t &tensor_outputs) { + const auto tensor_rank = get_rank(tensor_inputs, 
tensor_outputs); + tensor_common_params params = { "src", tensor_rank, true, device, graph_handle, _qnn_instance }; + create_tensors_from_ggml_tensor(params, tensor_inputs, &_tensor_inputs, &_qnn_tensor_inputs); + params.name_prefix = "dst"; + params.is_input = false; + create_tensors_from_ggml_tensor(params, tensor_outputs, &_tensor_outputs, &_qnn_tensor_outputs); + return true; +} + +bool ggml_qnn_matmul_op_config::create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, + const ggml_tensor_array_t &tensor_inputs, + const ggml_tensor_array_t &tensor_outputs) { + GGML_ASSERT(tensor_inputs.size() == 2); + GGML_ASSERT(tensor_outputs.size() == 1); + const auto tensor_rank = get_rank(tensor_inputs, tensor_outputs); + GGML_ASSERT(tensor_rank >= 2); + + // create input tensors + tensor_common_params params = { "src", tensor_rank, true, device, graph_handle, _qnn_instance }; + create_tensors_from_ggml_tensor(params, tensor_inputs, &_tensor_inputs, &_qnn_tensor_inputs); + + // create output tensor + ggml_qnn_tensor_array_t mat_mul_tensor_outputs; + params.name_prefix = "dst"; + params.is_input = false; + create_tensors_from_ggml_tensor(params, tensor_outputs, &mat_mul_tensor_outputs, nullptr); + + // create mat_mul nodes + return create_mat_mul_nodes(device, graph_handle, tensor_rank, _tensor_inputs, mat_mul_tensor_outputs); +} + +bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, + ggml_qnn_tensor_array_t &tensor_inputs, + ggml_qnn_tensor_array_t &tensor_outputs) { + + /* + * First, both the ggml and qnn tensor in memory are stored as row-major format. (For more details, please also: + * https://pytorch.org/blog/tensor-memory-format-matters/#:~:text=Column%20Major%20Order:%20In%20this%20format,%20the%20matrix) + * But the dimensions of the tensor are stored in different order. + * For example, a 2x3 matrix: + * [ + * [1, 2, 3], + * [4, 5, 6], + * ] + * The ggml tensor will have dimensions [3, 2], while the qnn tensor will have dimensions [2, 3]. + * + * Second, from the ggml introduction here: https://github.com/huggingface/blog/blob/main/introduction-to-ggml.md + * Given 2 matrices A and B, the matrix multiplication C = A * B is defined as: + * ```python + * import torch + * # Create two matrices + * A = torch.tensor([ + * [2, 8], + * [5, 1], + * [4, 2], + * [8, 6], + * ]) + * B = torch.tensor([ + * [10, 5], + * [9, 9], + * [5, 4], + * ]) + * # Perform matrix multiplication + * result = torch.matmul(A, B.T) + * print(result.T) + * ``` + * Here, the B.T is the transpose of B. 
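+ *
+ * As a concrete example of the mapping above: a ggml mul_mat with src0->ne = [k, m] and src1->ne = [k, n]
+ * produces dst->ne = [m, n]. After reversing the dimensions for QNN (see get_internal_dimension), src0 becomes
+ * [m, k], src1 becomes [n, k] and dst becomes [n, m]; the graph below therefore computes
+ * transpose(src1) -> [k, n], then MatMul([m, k], [k, n]) -> [m, n], and finally transposes back to [n, m].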
+ * + * So here we need to create graph like: + * ```mermaid + * graph TD; + * i1>ggml_tensor_in0] --src0--> mat_mul0; + * i2>ggml_tensor_in1] --src1--> transpose0; + * transpose0 --src0_trans--> mat_mul0; + * mat_mul0 --dst_trans--> transpose1; + * transpose1 --dst0--> o1>ggml_tensor_out]; + * ``` + */ + + // create src0_trans tensor + auto src1 = tensor_inputs.back(); + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS does not match the expected value"); + + qnn_dimension_array_t dimensions = get_transposed_dimensions(src1->get_dimensions(), rank); + auto src0_trans = + std::make_shared(ggml_qnn_tensor::INTERMEDIATE, "src0_trans", dimensions, + src1->get_data_type(), rank, device, graph_handle, _qnn_instance); + + // create dst_trans tensor + auto dst = tensor_outputs.front(); + dimensions = get_transposed_dimensions(dst->get_dimensions(), rank); + auto dst_trans = std::make_shared(ggml_qnn_tensor::INTERMEDIATE, "dst_trans", dimensions, + dst->get_data_type(), rank, device, graph_handle, _qnn_instance); + + // create transpose0 + auto transpose0 = std::make_shared(_name + "_trans0", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_TRANSPOSE, _qnn_instance); + + // create transpose1 + auto transpose1 = std::make_shared(_name + "_trans1", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_TRANSPOSE, _qnn_instance); + + // create mat_mul + auto mat_mul = std::make_shared(_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL, + _qnn_instance); + + // set transpose0 parameters + auto *params_data = reinterpret_cast(kTransposeParamData[rank - 1].data()); + const qnn_dimension_array_t param_dims = { (uint32_t)rank, 1, 1, 1 }; + transpose0->add_tensor_param(QNN_OP_TRANSPOSE_PARAM_PERM, param_dims, 1, params_data, QNN_DATATYPE_UINT_32, device, + graph_handle); + + // set transpose1 parameters + transpose1->add_tensor_param(QNN_OP_TRANSPOSE_PARAM_PERM, param_dims, 1, params_data, QNN_DATATYPE_UINT_32, device, + graph_handle); + + // set tensor to transpose0 + ggml_qnn_tensor_array_t tensors = { tensor_inputs.back() }; + transpose0->set_input_tensors(tensors); + tensors = { src0_trans }; + transpose0->set_output_tensors(tensors); + + // set tensor to mat_mul + tensors = { tensor_inputs.front(), src0_trans }; + mat_mul->set_input_tensors(tensors); + tensors = { dst_trans }; + mat_mul->set_output_tensors(tensors); + + // set tensor to transpose1 + tensors = { dst_trans }; + transpose1->set_input_tensors(tensors); + transpose1->set_output_tensors(tensor_outputs); + + _mat_mul = mat_mul; + _transpose0 = transpose0; + _transpose1 = transpose1; + return true; +} + +bool ggml_qnn_matmul_op_config::add_op_to_graph(Qnn_GraphHandle_t graph_handle) { + for (auto &convert : _input_converts) { + if (convert && !convert->add_op_to_graph(graph_handle)) { + return false; + } + } + + return _transpose0->add_op_to_graph(graph_handle) && _mat_mul->add_op_to_graph(graph_handle) && + _transpose1->add_op_to_graph(graph_handle) && + (!_output_convert || _output_convert->add_op_to_graph(graph_handle)); +} + +bool ggml_qnn_matmul_op_config::bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) { + return bind_tensors(tensor_inputs, _tensor_inputs, _qnn_tensor_inputs); +} + +bool ggml_qnn_matmul_op_config::bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) { + if (_output_convert) { + return _output_convert->bind_output_tensors(tensor_outputs); + } else { + return _transpose1->bind_output_tensors(tensor_outputs); + } +} + +void ggml_qnn_matmul_op_config::unbind_input_tensors() { + _mat_mul->unbind_input_tensors(); + 
_transpose0->unbind_input_tensors(); + for (auto &convert : _input_converts) { + if (convert) { + convert->unbind_input_tensors(); + } + } +} + +void ggml_qnn_matmul_op_config::unbind_output_tensors() { + _transpose1->unbind_output_tensors(); + if (_output_convert) { + _output_convert->unbind_output_tensors(); + } +} + +std::vector &ggml_qnn_matmul_op_config::get_qnn_output_tensors() { + if (_output_convert) { + return _output_convert->get_qnn_output_tensors(); + } else { + return _transpose1->get_qnn_output_tensors(); + } +} + +ggml_op_constructor_t create_op_constructor(const std::string &op_name) { + if (op_name == QNN_OP_MAT_MUL) { + // For QNN_OP_MAT_MUL, we need to transpose the input tensor + return [](const std::string &instance_name, + std::shared_ptr qnn_instance) -> std::unique_ptr { + QNN_LOG_DEBUG("create QNN_OP_MAT_MUL, name %s\n", instance_name.c_str()); + return std::make_unique(instance_name, qnn_instance); + }; + } + + return [op_name](const std::string &instance_name, + std::shared_ptr qnn_instance) -> std::unique_ptr { + return std::make_unique(instance_name, QNN_OP_PACKAGE_NAME_QTI_AISW, op_name, + qnn_instance); + }; +} + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/op-config.hpp b/ggml/src/ggml-qnn/op-config.hpp index 7852ee84dc12f..2016cb4ac994d 100644 --- a/ggml/src/ggml-qnn/op-config.hpp +++ b/ggml/src/ggml-qnn/op-config.hpp @@ -1,73 +1,122 @@ #pragma once +#include +#include #include #include #include "ggml-qnn.h" -#include "logger.hpp" #include "qnn-lib.hpp" #include "qnn-types.hpp" #include "tensor.hpp" namespace qnn { + +using ggml_tensor_array_t = std::vector; + class ggml_qnn_op_config { public: - explicit ggml_qnn_op_config(const std::string &name, const std::string &package_name, const std::string &op_type) : - _name(name), _package_name(package_name), _op_type(op_type) {} - - void set_input_tensors(const std::vector> &tensor_inputs) { - _qnn_tensor_inputs.resize(tensor_inputs.size()); - for (size_t i = 0; i < tensor_inputs.size(); i++) { - _qnn_tensor_inputs[i] = tensor_inputs[i]->get_qnn_tensor(); - } - } - - void set_output_tensors(const std::vector> &tensor_outputs) { - _qnn_tensor_outputs.resize(tensor_outputs.size()); - for (size_t i = 0; i < tensor_outputs.size(); i++) { - _qnn_tensor_outputs[i] = tensor_outputs[i]->get_qnn_tensor(); - } - } - - void add_scalar_param(const std::string &name, const Qnn_Scalar_t scalar) { - _param_names.push_back(name); - Qnn_Param_t param = QNN_PARAM_INIT; - param.paramType = QNN_PARAMTYPE_SCALAR; - param.name = _param_names.back().c_str(); - param.scalarParam = scalar; - _parameters.push_back(param); - } - - std::vector &get_qnn_input_tensors() { return _qnn_tensor_inputs; } - std::vector &get_qnn_output_tensors() { return _qnn_tensor_outputs; } - - Qnn_OpConfig_t get_op_config() { - Qnn_OpConfig_t config = QNN_OPCONFIG_INIT; - config.version = QNN_OPCONFIG_VERSION_1; - auto &op_config = config.v1; - op_config.name = _name.c_str(); - op_config.packageName = _package_name.c_str(); - op_config.typeName = _op_type.c_str(); - op_config.numOfParams = (uint32_t)_parameters.size(); - op_config.params = _parameters.data(); - op_config.numOfInputs = (uint32_t)_qnn_tensor_inputs.size(); - op_config.inputTensors = _qnn_tensor_inputs.data(); - op_config.numOfOutputs = (uint32_t)_qnn_tensor_outputs.size(); - op_config.outputTensors = _qnn_tensor_outputs.data(); - return config; - } + virtual ~ggml_qnn_op_config() {} + virtual bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, + const ggml_tensor_array_t 
&tensor_inputs, + const ggml_tensor_array_t &tensor_outputs) = 0; + virtual std::vector &get_qnn_input_tensors() = 0; + virtual std::vector &get_qnn_output_tensors() = 0; + virtual bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) = 0; + virtual bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) = 0; + virtual bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) = 0; + virtual void unbind_input_tensors() = 0; + virtual void unbind_output_tensors() = 0; +}; + +class ggml_qnn_op_config_base : public ggml_qnn_op_config { +public: + explicit ggml_qnn_op_config_base(const std::string &name, const std::string &package_name, + const std::string &op_type, std::shared_ptr qnn_instance) : + _name(name), _package_name(package_name), _op_type(op_type), _qnn_instance(qnn_instance) {} + + void add_scalar_param(const std::string &name, const Qnn_Scalar_t scalar); + bool add_tensor_param(const std::string &name, const qnn_dimension_array_t &dimensions, int rank, + const uint8_t *data, const Qnn_DataType_t data_type, QNNBackend device, + Qnn_GraphHandle_t graph_handle); + bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) override; + bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) override; + bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) override; + void unbind_input_tensors() override; + void unbind_output_tensors() override; + std::vector &get_qnn_input_tensors() override { return _qnn_tensor_inputs; } + std::vector &get_qnn_output_tensors() override { return _qnn_tensor_outputs; } + +protected: + Qnn_OpConfig_t get_op_config(); -private: std::string _name; std::string _package_name; std::string _op_type; + std::shared_ptr _qnn_instance; + ggml_qnn_tensor_array_t _tensor_inputs; + ggml_qnn_tensor_array_t _tensor_outputs; + ggml_qnn_tensor_array_t _tensor_parameters; std::vector _qnn_tensor_inputs; std::vector _qnn_tensor_outputs; - std::vector _parameters; + std::vector _qnn_parameters; std::vector _param_names; - DISABLE_COPY(ggml_qnn_op_config); - DISABLE_MOVE(ggml_qnn_op_config); + DISABLE_COPY(ggml_qnn_op_config_base); + DISABLE_MOVE(ggml_qnn_op_config_base); +}; + +class ggml_qnn_single_op_config : public ggml_qnn_op_config_base { +public: + explicit ggml_qnn_single_op_config(const std::string &name, const std::string &package_name, + const std::string &op_type, std::shared_ptr qnn_instance) : + ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {} + + bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, const ggml_tensor_array_t &tensor_inputs, + const ggml_tensor_array_t &tensor_outputs) override; + +private: + DISABLE_COPY(ggml_qnn_single_op_config); + DISABLE_MOVE(ggml_qnn_single_op_config); }; + +class ggml_qnn_matmul_op_config : public ggml_qnn_op_config { +public: + ggml_qnn_matmul_op_config(const std::string &name, std::shared_ptr qnn_instance) : + _name(name), _qnn_instance(qnn_instance) {} + + bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, const ggml_tensor_array_t &tensor_inputs, + const ggml_tensor_array_t &tensor_outputs) override; + bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) override; + bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) override; + bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) override; + void unbind_input_tensors() override; + void unbind_output_tensors() override; + std::vector &get_qnn_input_tensors() override { return _qnn_tensor_inputs; } + std::vector &get_qnn_output_tensors() override; 
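+
+    // Note: internally this builds a small sub-graph (transpose0 -> MatMul -> transpose1, plus optional
+    // input/output convert nodes) in create_mat_mul_nodes() to adapt ggml's mul_mat layout to QNN's MatMul.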
+ +private: + bool create_mat_mul_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, + ggml_qnn_tensor_array_t &tensor_inputs, ggml_qnn_tensor_array_t &tensor_outputs); + + std::string _name; + std::shared_ptr _qnn_instance; + std::shared_ptr _transpose0; + std::shared_ptr _transpose1; + std::shared_ptr _mat_mul; + std::vector> _input_converts; + std::shared_ptr _output_convert; + ggml_qnn_tensor_array_t _tensor_inputs; + std::vector _qnn_tensor_inputs; + + DISABLE_COPY(ggml_qnn_matmul_op_config); + DISABLE_MOVE(ggml_qnn_matmul_op_config); +}; + +using ggml_op_constructor_t = + std::function(const std::string &, std::shared_ptr)>; + +ggml_op_constructor_t create_op_constructor(const std::string &op_name); + } // namespace qnn diff --git a/ggml/src/ggml-qnn/qnn-lib.hpp b/ggml/src/ggml-qnn/qnn-lib.hpp index d55f730f80d84..74bc2b3f95f6b 100644 --- a/ggml/src/ggml-qnn/qnn-lib.hpp +++ b/ggml/src/ggml-qnn/qnn-lib.hpp @@ -637,7 +637,7 @@ class qnn_instance { return mem_fd; } - Qnn_MemHandle_t register_rpcmem(void *p_data, uint32_t rank, uint32_t *dimensions, Qnn_DataType_t data_type) { + Qnn_MemHandle_t register_rpcmem(void *p_data, const uint32_t rank, uint32_t *dimensions, Qnn_DataType_t data_type) { if (!p_data) { QNN_LOG_WARN("invalid param\n"); return nullptr; diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index c465d17f25506..faf5b0df5f4e1 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -1,8 +1,10 @@ #pragma once +#include #include #include +#include #include #include #include @@ -16,55 +18,81 @@ namespace qnn { +static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS should be 4"); + class ggml_qnn_tensor { public: - explicit ggml_qnn_tensor(const std::string &name, QNNBackend device, Qnn_GraphHandle_t graph_handle, + typedef enum _tensor_type { INPUT, OUTPUT, INTERMEDIATE, PARAMETER } tensor_type_t; + + explicit ggml_qnn_tensor(tensor_type_t tensor_type, const std::string &name, + const qnn_dimension_array_t &dimensions, Qnn_DataType_t data_type, int rank, + QNNBackend device, Qnn_GraphHandle_t graph_handle, std::shared_ptr qnn_instance) : _tensor_name(name), _device(device), _qnn_instance(qnn_instance), _graph_handle(graph_handle) { - QNN_TENSOR_SET_NAME(_qnn_tensor, _tensor_name.c_str()); + if (!_tensor_name.empty()) { + QNN_TENSOR_SET_NAME(_qnn_tensor, _tensor_name.c_str()); + } QNN_TENSOR_SET_DIMENSIONS(_qnn_tensor, _dimensions.data()); - QNN_TENSOR_SET_TYPE(_qnn_tensor, QNN_TENSOR_TYPE_NATIVE); QNN_TENSOR_SET_DATA_FORMAT(_qnn_tensor, QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER); - QNN_LOG_DEBUG("create tensor %s, device: %d", _tensor_name.c_str(), device); + + _dimensions = dimensions; + update_params_from_ggml_tensor(tensor_type, data_type, rank); + QNN_LOG_DEBUG("create tensor %s, rank: %d, dims: [%d, %d, %d, %d], data_type: %d, device: %d", + _tensor_name.c_str(), rank, (int)_dimensions[0], (int)_dimensions[1], (int)_dimensions[2], + (int)_dimensions[3], (int)data_type, (int)device); } + explicit ggml_qnn_tensor(tensor_type_t tensor_type, const std::string &name, + const ggml_dimension_array_t &dimensions, ggml_type data_type, int rank, QNNBackend device, + Qnn_GraphHandle_t graph_handle, std::shared_ptr qnn_instance) : + ggml_qnn_tensor(tensor_type, name, get_internal_dimension(dimensions, rank), + qnn_datatype_from_ggml_datatype(data_type), rank, device, graph_handle, qnn_instance) {} + ~ggml_qnn_tensor() { _qnn_rpc_buffer.reset(); } - bool bind_ggml_tensor(ggml_tensor *tensor, bool is_input, int prev_max_rank) { - 
if (_tensor) { - if (_tensor != tensor) { - QNN_LOG_WARN("tensor %s has been bound to another ggml tensor %s", _tensor_name.c_str(), - ggml_get_name(_tensor)); - return false; - } - QNN_LOG_INFO("tensor %s already bound to same ggml tensor %s", _tensor_name.c_str(), - ggml_get_name(_tensor)); + bool alloc_qnn_tensor_id() { + if (QNN_TENSOR_GET_ID(_qnn_tensor)) { + QNN_LOG_WARN("graph tensor %s already created, id %d", _tensor_name.c_str(), + QNN_TENSOR_GET_ID(_qnn_tensor)); return true; } - update_params_from_ggml_tensor(tensor, prev_max_rank); - Qnn_TensorType_t new_tensor_type = is_input ? QNN_TENSOR_TYPE_APP_WRITE : QNN_TENSOR_TYPE_APP_READ; - QNN_TENSOR_SET_TYPE(_qnn_tensor, new_tensor_type); - QNN_LOG_INFO("tensor %s changed to type %d", _tensor_name.c_str(), new_tensor_type); + Qnn_Tensor_t qnn_tensor = _qnn_tensor; + auto qnn_interface = _qnn_instance->get_qnn_interface(); + auto error = qnn_interface->qnn_tensor_create_graph_tensor(_graph_handle, &qnn_tensor); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("create graph tensor failed, tensor %s, error: %d\n", _tensor_name.c_str(), error); + return false; + } + + QNN_TENSOR_SET_ID(_qnn_tensor, QNN_TENSOR_GET_ID(qnn_tensor)); + QNN_LOG_DEBUG("create graph tensor %s, id: %d, rank: %d", _tensor_name.c_str(), QNN_TENSOR_GET_ID(qnn_tensor), + QNN_TENSOR_GET_RANK(qnn_tensor)); - if (!QNN_TENSOR_GET_ID(_qnn_tensor)) { - Qnn_Tensor_t qnn_tensor = _qnn_tensor; - auto qnn_interface = _qnn_instance->get_qnn_interface(); - auto error = qnn_interface->qnn_tensor_create_graph_tensor(_graph_handle, &qnn_tensor); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("create graph tensor failed, tensor %s, error: %d\n", _tensor_name.c_str(), error); + return true; + } + + bool bind_buffer(uint8_t *buffer, const size_t buffer_size) { + if (_buffer) { + if (_buffer != buffer) { + QNN_LOG_WARN("tensor %s has been bound to another buffer %p", _tensor_name.c_str(), _buffer); return false; } - QNN_TENSOR_SET_ID(_qnn_tensor, QNN_TENSOR_GET_ID(qnn_tensor)); - QNN_LOG_DEBUG("create graph tensor %s, id: %d, rank: %d", _tensor_name.c_str(), - QNN_TENSOR_GET_ID(qnn_tensor), QNN_TENSOR_GET_RANK(qnn_tensor)); + QNN_LOG_INFO("tensor %s already bound to same ggml tensor %p", _tensor_name.c_str(), _buffer); + return true; + } + + if (QNN_TENSOR_GET_TYPE(_qnn_tensor) == QNN_TENSOR_TYPE_NATIVE) { + QNN_LOG_DEBUG("tensor %s type(%d) not READ/WRITE, skipping", _tensor_name.c_str(), + (int)QNN_TENSOR_TYPE_NATIVE); + return true; } if (should_use_mem_handle()) { if (!_qnn_rpc_buffer) { auto qnn_rpc_buffer = std::make_unique( - _qnn_instance, ggml_nbytes(tensor), QNN_TENSOR_GET_RANK(_qnn_tensor), + _qnn_instance, buffer_size, QNN_TENSOR_GET_RANK(_qnn_tensor), QNN_TENSOR_GET_DIMENSIONS(_qnn_tensor), QNN_TENSOR_GET_DATA_TYPE(_qnn_tensor)); if (!qnn_rpc_buffer->is_valid()) { QNN_LOG_WARN("alloc rpc mem failed, tensor %s", _tensor_name.c_str()); @@ -79,30 +107,41 @@ class ggml_qnn_tensor { QNN_LOG_DEBUG("tensor %s, use mem handle %p", _tensor_name.c_str(), QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor)); } else { QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); - Qnn_ClientBuffer_t client_buf = { tensor->data, get_ggml_tensor_data_size(tensor) }; + Qnn_ClientBuffer_t client_buf = { buffer, (uint32_t)buffer_size }; QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); QNN_LOG_DEBUG("tensor %s, use client buffer %p size %d", _tensor_name.c_str(), client_buf.data, (int)client_buf.dataSize); } - _tensor = tensor; + _buffer = buffer; + _buffer_size = buffer_size; if (!write_to_qnn_tensor()) 
{ QNN_LOG_WARN("write to qnn tensor failed, tensor %s", _tensor_name.c_str()); return false; } - QNN_LOG_DEBUG("bind tensor %s to ggml tensor %s", _tensor_name.c_str(), ggml_get_name(tensor)); + QNN_LOG_DEBUG("bind tensor %s to buffer: %p, size: %d", _tensor_name.c_str(), buffer, (int)buffer_size); + return true; + } + + bool bind_ggml_tensor(ggml_tensor *tensor) { + if (!bind_buffer(reinterpret_cast(tensor->data), ggml_nbytes(tensor))) { + QNN_LOG_WARN("Failed to bind tensor: %s to ggml tensor: %s", _tensor_name.c_str(), ggml_get_name(tensor)); + return false; + } + + QNN_LOG_DEBUG("Bind tensor %s to ggml tensor %s", _tensor_name.c_str(), ggml_get_name(tensor)); return true; } - bool unbind_ggml_tensor() { + bool unbind() { if (!_graph_handle) { QNN_LOG_WARN("tensor %s not bound to any graph", _tensor_name.c_str()); return false; } - if (!_tensor) { + if (!_buffer) { QNN_LOG_DEBUG("tensor %s not bound to ggml tensor", _tensor_name.c_str()); return true; } @@ -119,12 +158,15 @@ class ggml_qnn_tensor { QNN_LOG_DEBUG("tensor %s, clear client buffer", _tensor_name.c_str()); } - QNN_LOG_DEBUG("unbind tensor: %s from ggml tensor: %s", _tensor_name.c_str(), ggml_get_name(_tensor)); - _tensor = nullptr; + QNN_LOG_DEBUG("unbind tensor: %s from buffer: %p, size: %d", _tensor_name.c_str(), _buffer, (int)_buffer_size); + _buffer = nullptr; + _buffer_size = 0; return true; } const Qnn_Tensor_t &get_qnn_tensor() const { return _qnn_tensor; } + Qnn_DataType_t get_data_type() const { return QNN_TENSOR_GET_DATA_TYPE(_qnn_tensor); } + const qnn_dimension_array_t &get_dimensions() const { return _dimensions; } private: bool write_to_qnn_tensor() { @@ -136,7 +178,7 @@ class ggml_qnn_tensor { if (should_use_mem_handle()) { if (_qnn_rpc_buffer) { - memcpy(_qnn_rpc_buffer->get_buffer(), _tensor->data, ggml_nbytes(_tensor)); + memcpy(_qnn_rpc_buffer->get_buffer(), _buffer, _buffer_size); } else { QNN_LOG_WARN("tensor %s: can't find rpcmem from qnn mem handle\n", _tensor_name.c_str()); return false; @@ -157,7 +199,7 @@ class ggml_qnn_tensor { if (should_use_mem_handle()) { if (_qnn_rpc_buffer) { - memcpy(_tensor->data, _qnn_rpc_buffer->get_buffer(), ggml_nbytes(_tensor)); + memcpy(_buffer, _qnn_rpc_buffer->get_buffer(), _buffer_size); } else { QNN_LOG_WARN("can't find rpcmem from qnn mem handle\n"); return false; @@ -169,29 +211,45 @@ class ggml_qnn_tensor { return true; } - void update_params_from_ggml_tensor(ggml_tensor *tensor, int prev_max_rank) { - _dimensions[0] = (uint32_t)tensor->ne[0]; - _dimensions[1] = (uint32_t)tensor->ne[1]; - _dimensions[2] = (uint32_t)tensor->ne[2]; - _dimensions[3] = (uint32_t)tensor->ne[3]; - QNN_TENSOR_SET_DATA_TYPE(_qnn_tensor, device_datatype_from_ggml_datatype(tensor->type)); + void update_params_from_ggml_tensor(tensor_type_t tensor_type, Qnn_DataType_t data_type, int rank) { + QNN_TENSOR_SET_DATA_TYPE(_qnn_tensor, data_type); // TODO: set the quantizeParams base on the tensor type - QNN_TENSOR_SET_RANK(_qnn_tensor, (uint32_t)std::max(prev_max_rank, ggml_n_dims(tensor))); - + QNN_TENSOR_SET_RANK(_qnn_tensor, (uint32_t)rank); QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); Qnn_ClientBuffer_t client_buf = {}; QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); + + Qnn_TensorType_t new_tensor_type; + switch (tensor_type) { + case INPUT: + new_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; + break; + case OUTPUT: + new_tensor_type = QNN_TENSOR_TYPE_APP_READ; + break; + case PARAMETER: + new_tensor_type = QNN_TENSOR_TYPE_STATIC; + break; + default: + new_tensor_type = 
QNN_TENSOR_TYPE_NATIVE; + break; + } + QNN_TENSOR_SET_TYPE(_qnn_tensor, new_tensor_type); + QNN_LOG_INFO("tensor %s changed to type %d", _tensor_name.c_str(), new_tensor_type); } - bool should_use_mem_handle() const { return _device == QNN_BACKEND_NPU; } + bool should_use_mem_handle() const { + return _device == QNN_BACKEND_NPU && QNN_TENSOR_GET_TYPE(_qnn_tensor) != QNN_TENSOR_TYPE_STATIC; + } std::string _tensor_name; - const ggml_tensor *_tensor; + uint8_t *_buffer = nullptr; + size_t _buffer_size = 0; QNNBackend _device; std::shared_ptr _qnn_instance; Qnn_Tensor_t _qnn_tensor = qnn_tensor_init(kDefaultQnnTensorVersion); - std::array _dimensions = {}; + qnn_dimension_array_t _dimensions = {}; Qnn_GraphHandle_t _graph_handle = nullptr; std::unique_ptr _qnn_rpc_buffer; @@ -199,4 +257,6 @@ class ggml_qnn_tensor { DISABLE_MOVE(ggml_qnn_tensor); }; +using ggml_qnn_tensor_array_t = std::vector>; + } // namespace qnn diff --git a/ggml/src/ggml-qnn/utils.cpp b/ggml/src/ggml-qnn/utils.cpp index e44d6dbccee42..0de9d203ebee9 100644 --- a/ggml/src/ggml-qnn/utils.cpp +++ b/ggml/src/ggml-qnn/utils.cpp @@ -9,14 +9,40 @@ namespace qnn { +qnn_dimension_array_t get_internal_dimension(const ggml_dimension_array_t &dims, uint32_t rank) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS should be 4"); + GGML_ASSERT(rank <= GGML_MAX_DIMS && rank > 0); + + qnn_dimension_array_t internal_dims = {}; + /* + * Both the ggml and qnn tensor in memory are stored as row-major format. + * But the dimensions of the tensor are stored in different order. + * For example, a 2x3 matrix: + * [ + * [1, 2, 3], + * [4, 5, 6], + * ] + * The ggml tensor will have dimensions [3, 2], while the qnn tensor will have dimensions [2, 3]. + */ + for (uint32_t i = 0; i < rank; i++) { + internal_dims[i] = std::max(dims[rank - 1 - i], 1); + } + + return internal_dims; +} + // TODO: mapping more ggml data type to QNN data type // ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684 -Qnn_DataType_t device_datatype_from_ggml_datatype(ggml_type ggml_type) { +Qnn_DataType_t qnn_datatype_from_ggml_datatype(ggml_type ggml_type) { switch (ggml_type) { - case GGML_TYPE_F16: - return QNN_DATATYPE_FLOAT_16; case GGML_TYPE_F32: return QNN_DATATYPE_FLOAT_32; + case GGML_TYPE_F16: + return QNN_DATATYPE_FLOAT_16; + case GGML_TYPE_I32: + return QNN_DATATYPE_INT_32; + case GGML_TYPE_I16: + return QNN_DATATYPE_INT_16; case GGML_TYPE_I8: return QNN_DATATYPE_INT_8; case GGML_TYPE_Q8_0: @@ -29,16 +55,75 @@ Qnn_DataType_t device_datatype_from_ggml_datatype(ggml_type ggml_type) { return QNN_DATATYPE_UNDEFINED; } -Qnn_TensorType_t device_tensortype_from_ggml_tensor(ggml_tensor *ggml_tensor) { - Qnn_TensorType_t qnn_tensor_type = QNN_TENSOR_TYPE_NATIVE; +ggml_type ggml_datatype_from_qnn_datatype(Qnn_DataType_t qnn_type) { + switch (qnn_type) { + case QNN_DATATYPE_FLOAT_32: + return GGML_TYPE_F32; + case QNN_DATATYPE_FLOAT_16: + return GGML_TYPE_F16; + case QNN_DATATYPE_UINT_32: + case QNN_DATATYPE_INT_32: + return GGML_TYPE_I32; + case QNN_DATATYPE_INT_16: + return GGML_TYPE_I16; + case QNN_DATATYPE_INT_8: + return GGML_TYPE_I8; + case QNN_DATATYPE_SFIXED_POINT_8: + return GGML_TYPE_Q8_0; + case QNN_DATATYPE_SFIXED_POINT_4: + return GGML_TYPE_Q4_0; + default: + break; + } + return GGML_TYPE_COUNT; +} - if (ggml_tensor->flags & GGML_TENSOR_FLAG_INPUT) { - qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; - } else if (ggml_tensor->flags & GGML_TENSOR_FLAG_OUTPUT) { - qnn_tensor_type = QNN_TENSOR_TYPE_APP_READ; +size_t 
qnn_datatype_size(Qnn_DataType_t qnn_type) { + switch (qnn_type) { + case QNN_DATATYPE_FLOAT_32: + return sizeof(float); + case QNN_DATATYPE_FLOAT_16: + return sizeof(uint16_t); + case QNN_DATATYPE_UINT_32: + case QNN_DATATYPE_INT_32: + return sizeof(int32_t); + case QNN_DATATYPE_INT_16: + return sizeof(int16_t); + case QNN_DATATYPE_INT_8: + return sizeof(int8_t); + case QNN_DATATYPE_SFIXED_POINT_8: + return sizeof(int8_t); + case QNN_DATATYPE_SFIXED_POINT_4: + return sizeof(int8_t); + default: + break; } + return 0; +} - return qnn_tensor_type; +const char *qnn_datatype_to_string(Qnn_DataType_t qnn_type) { + switch (qnn_type) { + case QNN_DATATYPE_FLOAT_32: + return "QNN_DATATYPE_FLOAT_32"; + case QNN_DATATYPE_FLOAT_16: + return "QNN_DATATYPE_FLOAT_16"; + case QNN_DATATYPE_UINT_32: + return "QNN_DATATYPE_UINT_32"; + case QNN_DATATYPE_INT_32: + return "QNN_DATATYPE_INT_32"; + case QNN_DATATYPE_INT_16: + return "QNN_DATATYPE_INT_16"; + case QNN_DATATYPE_INT_8: + return "QNN_DATATYPE_INT_8"; + case QNN_DATATYPE_SFIXED_POINT_8: + return "QNN_DATATYPE_SFIXED_POINT_8"; + case QNN_DATATYPE_SFIXED_POINT_4: + return "QNN_DATATYPE_SFIXED_POINT_4"; + default: + break; + } + + return "QNN_DATATYPE_UNDEFINED"; } uint32_t get_ggml_tensor_rank(const ggml_tensor *tensor) { @@ -51,8 +136,13 @@ uint32_t get_ggml_tensor_rank(const ggml_tensor *tensor) { return rank; } -const char *get_backend_name(int n_backend_type) { - switch (n_backend_type) { +const char *get_ggml_type_name(ggml_type type) { + const auto *traits = ggml_get_type_traits(type); + return traits->type_name; +} + +const char *get_backend_name(size_t device_index) { + switch (device_index) { case QNN_BACKEND_CPU: return "QNN-CPU"; case QNN_BACKEND_GPU: diff --git a/ggml/src/ggml-qnn/utils.hpp b/ggml/src/ggml-qnn/utils.hpp index b7f29bdaa5663..2c58d037982f6 100644 --- a/ggml/src/ggml-qnn/utils.hpp +++ b/ggml/src/ggml-qnn/utils.hpp @@ -6,6 +6,7 @@ #include #include +#include #include #include "ggml.h" @@ -17,8 +18,14 @@ namespace qnn { +using ggml_dimension_array_t = int64_t[GGML_MAX_DIMS]; +using qnn_dimension_array_t = std::array; + +qnn_dimension_array_t get_internal_dimension(const ggml_dimension_array_t &dims, uint32_t rank); + uint32_t get_ggml_tensor_rank(const ggml_tensor *tensor); -const char *get_backend_name(int n_backend_type); +const char *get_ggml_type_name(ggml_type type); +const char *get_backend_name(size_t device_index); const char *get_chipset_desc(uint32_t chipset_id); const char *get_htparch_desc(size_t htp_arch); intptr_t align_to(size_t alignment, intptr_t offset); @@ -187,8 +194,10 @@ inline void set_qnn_tensor_dyn_dimensions(Qnn_Tensor_t &tensor, uint8_t *isDynam } } -Qnn_DataType_t device_datatype_from_ggml_datatype(ggml_type ggml_type); -Qnn_TensorType_t device_tensortype_from_ggml_tensor(ggml_tensor *ggml_tensor); +Qnn_DataType_t qnn_datatype_from_ggml_datatype(ggml_type ggml_type); +ggml_type ggml_datatype_from_qnn_datatype(Qnn_DataType_t qnn_type); +size_t qnn_datatype_size(Qnn_DataType_t qnn_type); +const char *qnn_datatype_to_string(Qnn_DataType_t qnn_type); #if ENABLE_QNNBACKEND_PERF class qnn_perf { From 5c1e6d4905c4e7e6023caa5c1ca12fd9aafcd70c Mon Sep 17 00:00:00 2001 From: hongruichen Date: Tue, 29 Oct 2024 00:54:08 +0800 Subject: [PATCH 124/166] disable gelu in NPU --- ggml/src/ggml-qnn/backend-ops.cpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index d20069874a7c3..c4207e62a36f7 100644 --- 
a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -573,8 +573,15 @@ bool ggml_qnn_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tenso } if (op->op == GGML_OP_UNARY) { - if (!kQnnUnaryOpsTable[kGgmlUnaryOpStart + ggml_get_unary_op(op)]) { - QNN_LOG_DEBUG("unsupported unary op %d", ggml_get_unary_op(op)); + const auto unary_op = ggml_get_unary_op(op); + if (unary_op == GGML_UNARY_OP_GELU && ctx->device == QNN_BACKEND_NPU) { + // TODO: fix this when NPU supports GELU + QNN_LOG_DEBUG("unsupported unary op GGML_UNARY_OP_GELU for NPU"); + return false; + } + + if (!kQnnUnaryOpsTable[kGgmlUnaryOpStart + unary_op]) { + QNN_LOG_DEBUG("unsupported unary op %d", unary_op); return false; } From fe565cfd9f7b81e1afd5f8d1c8a82ea72b4ea69b Mon Sep 17 00:00:00 2001 From: nullname Date: Tue, 29 Oct 2024 15:47:07 +0800 Subject: [PATCH 125/166] fix compiling error in release --- ggml/src/ggml-qnn/backend-ops.cpp | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index c4207e62a36f7..3e24ca32ed35f 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -44,21 +44,6 @@ bool qnn_is_valid_params(ggml_backend_qnn_device_context *ctx, const ggml_tensor return true; } -bool is_tensor_dimensions_equal(const ggml_tensor *l, const ggml_tensor *r) { - const auto dim_l = ggml_n_dims(l); - if (dim_l != ggml_n_dims(r)) { - return false; - } - - for (int i = 0; i < dim_l; i++) { - if (l->ne[i] != r->ne[i]) { - return false; - } - } - - return true; -} - void print_ggml_tensor(const ggml_tensor *tensor) { QNN_LOG_DEBUG("%s: type:%s ne: %ldx%ldx%ldx%ld, nb: %ldx%ldx%ldx%ld\n", tensor->name, ggml_type_name(tensor->type), (long)tensor->ne[0], (long)tensor->ne[1], (long)tensor->ne[2], (long)tensor->ne[3], @@ -78,6 +63,21 @@ void print_ggml_tensor(const ggml_tensor *tensor) { namespace { +bool is_tensor_dimensions_equal(const ggml_tensor *l, const ggml_tensor *r) { + const auto dim_l = ggml_n_dims(l); + if (dim_l != ggml_n_dims(r)) { + return false; + } + + for (int i = 0; i < dim_l; i++) { + if (l->ne[i] != r->ne[i]) { + return false; + } + } + + return true; +} + typedef bool (*ggml_qnn_unary_op_t)(ggml_backend_qnn_device_context *ctx, ggml_tensor *src, ggml_tensor *dst); typedef bool (*ggml_qnn_binary_op_t)(ggml_backend_qnn_device_context *ctx, ggml_tensor *src0, ggml_tensor *src1, ggml_tensor *dst); From 0fec56fd57f3051534defd40b19372498ffd5c68 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Mon, 4 Nov 2024 22:44:04 +0800 Subject: [PATCH 126/166] fix compiling error --- ggml/src/ggml-qnn.cpp | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index 21a7dee1c99ef..4da991916c0b7 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -126,15 +126,6 @@ ggml_backend_qnn_device_context *get_device_context(ggml_backend_dev_t dev) { * qnn backend buffer object * ----------------------------------------------------------------------------------------------- */ -const char *ggml_backend_qnn_buffer_get_name(ggml_backend_buffer_t buffer) { - GGML_UNUSED(buffer); - return GGML_QNN_NAME; -} - -bool ggml_backend_buffer_is_qnn(ggml_backend_buffer_t buffer) { - return buffer->iface.get_name == ggml_backend_qnn_buffer_get_name; -} - void ggml_backend_qnn_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_qnn_buffer_context *ctx = (ggml_backend_qnn_buffer_context 
*)buffer->context; @@ -184,7 +175,6 @@ void ggml_backend_qnn_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) } ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = { - /* .get_name = */ ggml_backend_qnn_buffer_get_name, /* .free_buffer = */ ggml_backend_qnn_buffer_free_buffer, /* .get_base = */ ggml_backend_qnn_buffer_get_base, /* .init_tensor = */ ggml_backend_qnn_buffer_init_tensor, @@ -281,10 +271,6 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(ggml_backend_dev_t dev) return &ggml_backend_qnn_buffer_types[dev_ctx->device]; } -ggml_backend_buffer_type_t ggml_backend_qnn_get_default_buffer_type(ggml_backend_t backend) { - return ggml_backend_qnn_buffer_type(backend->device); -} - ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, ggml_cgraph *cgraph) { enum ggml_status result = GGML_STATUS_SUCCESS; auto *device_ctx = get_device_context(backend->device); @@ -306,7 +292,6 @@ ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, ggml_cgraph * ggml_backend_i ggml_backend_qnn_interface = { /* .get_name = */ ggml_backend_qnn_name, /* .free = */ ggml_backend_qnn_free, - /* .get_default_buffer_type = */ ggml_backend_qnn_get_default_buffer_type, /* .set_tensor_async = */ nullptr, /* .get_tensor_async = */ nullptr, /* .cpy_tensor_async = */ nullptr, @@ -316,9 +301,6 @@ ggml_backend_i ggml_backend_qnn_interface = { /* .graph_plan_update = */ nullptr, /* .graph_plan_compute = */ nullptr, /* .graph_compute = */ ggml_backend_qnn_graph_compute, - /* .supports_op = */ nullptr, // moved to device - /* .supports_buft = */ nullptr, // moved to device - /* .offload_op = */ nullptr, // moved to device /* .event_record = */ nullptr, /* .event_wait = */ nullptr, }; From 8ad86dc703fd091d943b0ea1e07932947b1e2e66 Mon Sep 17 00:00:00 2001 From: nullname Date: Mon, 4 Nov 2024 23:12:03 +0800 Subject: [PATCH 127/166] feat: add QNN_OP_TRANSPOSE (#6) * redo: add convert nodes This reverts commit 8448acd5ebf8fe86ab9d25313b64a15c811ef96e. * align clang format with cann * rename binary_op -> general_op casue there're some op that will only tak 1 param * Revert "rename binary_op -> general_op" This reverts commit 5be63b1a0dc4614457785367dade62158fe46214. * wip * add GGML_OP_PERMUTE * add GGML_OP_VIEW and GGML_OP_GET_ROWS * wip * Revert "wip" This reverts commit 772462ca6cfa01ea31bde725c2da60076ad9385f. 
--- ggml/src/ggml-qnn.cpp | 48 ++++---- ggml/src/ggml-qnn/.clang-format | 46 +++++++- ggml/src/ggml-qnn/backend-ops.cpp | 186 ++++++++++++++++-------------- ggml/src/ggml-qnn/op-config.cpp | 90 ++++++++++++--- ggml/src/ggml-qnn/op-config.hpp | 35 ++++-- ggml/src/ggml-qnn/tensor.hpp | 1 + 6 files changed, 265 insertions(+), 141 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index 4da991916c0b7..d28163dce44bc 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -57,30 +57,30 @@ struct qnn_device_caps { }; const qnn_device_caps kDeviceCaps[GGML_QNN_MAX_DEVICES]{ - { "qnn-cpu", - "Qualcomm Kryo CPU", - "libQnnCpu.so", - GGML_BACKEND_DEVICE_TYPE_CPU, - { GGML_TYPE_F32, - GGML_TYPE_I8 } }, // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/CpuOpDefSupplement.html#matmul - { "qnn-gpu", - "Qualcomm Adreno GPU", - "libQnnGpu.so", - GGML_BACKEND_DEVICE_TYPE_GPU, - { GGML_TYPE_F32, - GGML_TYPE_F16 } }, // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/GpuOpDefSupplement.html#matmul - { "qnn-npu", - "Qualcomm NPU", - "libQnnHtp.so", - GGML_BACKEND_DEVICE_TYPE_GPU, - { GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_I16, - GGML_TYPE_I8 } }, // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/HtpOpDefSupplement.html#matmul + {// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/CpuOpDefSupplement.html#matmul + "qnn-cpu", + "Qualcomm Kryo CPU", + "libQnnCpu.so", + GGML_BACKEND_DEVICE_TYPE_CPU, + {GGML_TYPE_F32, GGML_TYPE_I8}}, + {// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/GpuOpDefSupplement.html#matmul + "qnn-gpu", + "Qualcomm Adreno GPU", + "libQnnGpu.so", + GGML_BACKEND_DEVICE_TYPE_GPU, + {GGML_TYPE_F32, GGML_TYPE_F16}}, + {// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/HtpOpDefSupplement.html#matmul + "qnn-npu", + "Qualcomm NPU", + "libQnnHtp.so", + GGML_BACKEND_DEVICE_TYPE_GPU, + {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_I16, GGML_TYPE_I8}}, }; class ggml_backend_qnn_buffer_context { public: - ggml_backend_qnn_buffer_context(QNNBackend device, std::shared_ptr instance, size_t size) : - _instance(instance), _name(QNN_BACKEND_NAME + std::to_string(device)) { + ggml_backend_qnn_buffer_context(QNNBackend device, std::shared_ptr instance, size_t size) + : _instance(instance), _name(QNN_BACKEND_NAME + std::to_string(device)) { // TODO: fix this for other platforms size_t size_page = sysconf(_SC_PAGESIZE); @@ -251,7 +251,7 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(ggml_backend_dev_t dev) if (!ggml_backend_qnn_buffer_type_initialized) { for (size_t i = 0; i < GGML_QNN_MAX_DEVICES; i++) { auto &context = ggml_backend_qnn_buffer_type_contexts[i]; - context = { std::string(QNN_BACKEND_NAME) + std::to_string(i) }; + context = {std::string(QNN_BACKEND_NAME) + std::to_string(i)}; ggml_backend_qnn_buffer_types[i] = { /* .iface = */ { /* .get_name = */ ggml_backend_qnn_buffer_type_name, @@ -348,8 +348,8 @@ void ggml_backend_qnn_device_get_props(ggml_backend_dev_t dev, struct ggml_backe } ggml_guid_t ggml_backend_qnn_guid() { - static ggml_guid guid = { 0x1a, 0x2b, 0x3c, 0x4d, 0x5e, 0x6f, 0x70, 0x81, - 0x92, 0xa3, 0xb4, 0xc5, 0xd6, 0xe7, 0xf8, 0x09 }; + static ggml_guid guid = {0x1a, 0x2b, 0x3c, 0x4d, 0x5e, 0x6f, 0x70, 0x81, + 0x92, 0xa3, 0xb4, 0xc5, 0xd6, 0xe7, 0xf8, 0x09}; return &guid; } @@ -511,7 +511,7 @@ const ggml_backend_reg_i ggml_backend_qnn_reg_interface = { } // namespace ggml_backend_reg_t ggml_backend_qnn_reg() { - static 
ggml_backend_qnn_reg_impl reg{ ggml_backend_qnn_reg_interface }; + static ggml_backend_qnn_reg_impl reg{ggml_backend_qnn_reg_interface}; static bool initialized = false; static std::mutex mutex; diff --git a/ggml/src/ggml-qnn/.clang-format b/ggml/src/ggml-qnn/.clang-format index 3b933ff10db42..0c67c54239623 100644 --- a/ggml/src/ggml-qnn/.clang-format +++ b/ggml/src/ggml-qnn/.clang-format @@ -3,16 +3,50 @@ BasedOnStyle: Google IndentWidth: 4 AccessModifierOffset: -4 AlignAfterOpenBracket: Align -AlignOperands: true +AlignConsecutiveMacros: false +AlignConsecutiveAssignments: false +AlignConsecutiveDeclarations: false +AlignEscapedNewlines: Left +AlignOperands: true AlignTrailingComments: true +AllowAllArgumentsOnNextLine: true +AllowAllConstructorInitializersOnNextLine: true +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortBlocksOnASingleLine: Never +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: All +AllowShortLambdasOnASingleLine: All +AllowShortIfStatementsOnASingleLine: WithoutElse +AllowShortLoopsOnASingleLine: true +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: true +AlwaysBreakTemplateDeclarations: Yes BinPackArguments: true BinPackParameters: true -BreakBeforeBraces: Custom -BreakConstructorInitializers: AfterColon +BraceWrapping: + AfterCaseLabel: false + AfterClass: false + AfterControlStatement: false + AfterEnum: false + AfterFunction: false + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + AfterExternBlock: false + BeforeCatch: false + BeforeElse: false + IndentBraces: false + SplitEmptyFunction: true + SplitEmptyRecord: true + SplitEmptyNamespace: true ColumnLimit: 120 -Cpp11BracedListStyle: false +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true DerivePointerAlignment: false -IncludeCategories: +IncludeCategories: - Regex: '^<.*\.h>' Priority: 1 - Regex: '^<.*' @@ -28,4 +62,4 @@ MaxEmptyLinesToKeep: 1 PointerAlignment: Right SortIncludes: true SpacesBeforeTrailingComments: 1 -UseTab: Never \ No newline at end of file +UseTab: Never diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 3e24ca32ed35f..c0e263a640eea 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -92,10 +92,10 @@ qnn::ggml_tensor_array_t to_ggml_tensor_array(const std::array +template bool execute_graph(qnn::ggml_qnn_graph *graph, const std::array &inputs, - const std::array &outputs) { - if (!graph->execute(to_ggml_tensor_array<_InputSize>(inputs), to_ggml_tensor_array<_OutputSize>(outputs))) { + ggml_tensor *output) { + if (!graph->execute(to_ggml_tensor_array<_InputSize>(inputs), to_ggml_tensor_array<1>({output}))) { QNN_LOG_WARN("execute failed\n"); return false; } @@ -154,37 +154,37 @@ constexpr const char *kGgmlOpToQnnOp[] = { nullptr, // GGML_OP_MUL_MAT_ID nullptr, // GGML_OP_OUT_PROD - nullptr, // GGML_OP_SCALE - nullptr, // GGML_OP_SET - nullptr, // GGML_OP_CPY - nullptr, // GGML_OP_CONT - nullptr, // GGML_OP_RESHAPE - nullptr, // GGML_OP_VIEW - nullptr, // GGML_OP_PERMUTE - nullptr, // GGML_OP_TRANSPOSE - nullptr, // GGML_OP_GET_ROWS - nullptr, // GGML_OP_GET_ROWS_BACK - nullptr, // GGML_OP_DIAG - nullptr, // GGML_OP_DIAG_MASK_INF - nullptr, // GGML_OP_DIAG_MASK_ZERO - nullptr, // GGML_OP_SOFT_MAX - nullptr, // GGML_OP_SOFT_MAX_BACK - nullptr, // GGML_OP_ROPE - nullptr, // GGML_OP_ROPE_BACK - nullptr, // GGML_OP_CLAMP - nullptr, // 
GGML_OP_CONV_TRANSPOSE_1D - nullptr, // GGML_OP_IM2COL - nullptr, // GGML_OP_IM2COL_BACK - nullptr, // GGML_OP_CONV_TRANSPOSE_2D - nullptr, // GGML_OP_POOL_1D - nullptr, // GGML_OP_POOL_2D - nullptr, // GGML_OP_POOL_2D_BACK - nullptr, // GGML_OP_UPSCALE - nullptr, // GGML_OP_PAD - nullptr, // GGML_OP_ARANGE - nullptr, // GGML_OP_TIMESTEP_EMBEDDING - nullptr, // GGML_OP_ARGSORT - nullptr, // GGML_OP_LEAKY_RELU + nullptr, // GGML_OP_SCALE + nullptr, // GGML_OP_SET + nullptr, // GGML_OP_CPY + nullptr, // GGML_OP_CONT + nullptr, // GGML_OP_RESHAPE + nullptr, // GGML_OP_VIEW + QNN_OP_TRANSPOSE, // GGML_OP_PERMUTE + nullptr, // GGML_OP_TRANSPOSE + nullptr, // GGML_OP_GET_ROWS + nullptr, // GGML_OP_GET_ROWS_BACK + nullptr, // GGML_OP_DIAG + nullptr, // GGML_OP_DIAG_MASK_INF + nullptr, // GGML_OP_DIAG_MASK_ZERO + nullptr, // GGML_OP_SOFT_MAX + nullptr, // GGML_OP_SOFT_MAX_BACK + nullptr, // GGML_OP_ROPE + nullptr, // GGML_OP_ROPE_BACK + nullptr, // GGML_OP_CLAMP + nullptr, // GGML_OP_CONV_TRANSPOSE_1D + nullptr, // GGML_OP_IM2COL + nullptr, // GGML_OP_IM2COL_BACK + nullptr, // GGML_OP_CONV_TRANSPOSE_2D + nullptr, // GGML_OP_POOL_1D + nullptr, // GGML_OP_POOL_2D + nullptr, // GGML_OP_POOL_2D_BACK + nullptr, // GGML_OP_UPSCALE + nullptr, // GGML_OP_PAD + nullptr, // GGML_OP_ARANGE + nullptr, // GGML_OP_TIMESTEP_EMBEDDING + nullptr, // GGML_OP_ARGSORT + nullptr, // GGML_OP_LEAKY_RELU nullptr, // GGML_OP_FLASH_ATTN_EXT nullptr, // GGML_OP_FLASH_ATTN_BACK @@ -235,16 +235,16 @@ static_assert(sizeof(kGgmlOpToQnnOp) / sizeof(kGgmlOpToQnnOp[0]) == (GGML_OP_COU static_assert(kGgmlOpToQnnOp[GGML_UNARY_OP_GELU + kGgmlUnaryOpStart] != nullptr, "GGML_UNARY_OP_GELU does not correspond to QNN_OP_GELU"); -template +template qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *ctx, size_t op, const std::array &inputs, - const std::array &outputs) { + ggml_tensor *output) { GGML_ASSERT(op < (GGML_OP_COUNT + GGML_UNARY_OP_COUNT)); auto &graph_cache = ctx->qnn_graph_cache; const auto *op_name = op < kGgmlUnaryOpStart ? 
ggml_op_name(ggml_op(op)) : ggml_unary_op_name(ggml_unary_op(op - kGgmlUnaryOpStart)); - auto graph_key = get_graph_key<_InputSize, _OutputSize>(op_name, inputs, outputs); + auto graph_key = get_graph_key<_InputSize, 1>(op_name, inputs, {output}); auto it = graph_cache.find(graph_key); qnn::ggml_qnn_graph *graph_ptr = nullptr; if (it != graph_cache.end()) { @@ -259,7 +259,7 @@ qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *c auto op_constructor = qnn::create_op_constructor(kGgmlOpToQnnOp[op]); if (!graph->build_graph(op_constructor, to_ggml_tensor_array<_InputSize>(inputs), - to_ggml_tensor_array<_OutputSize>(outputs))) { + to_ggml_tensor_array<1>({output}))) { QNN_LOG_ERROR("build_graph failed\n"); return nullptr; } @@ -278,9 +278,9 @@ bool qnn_binary_op_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src0, CHECK_PARAMS(ctx, src0, src1, dst); bool succeed = false; - auto *graph_ptr = get_qnn_graph_from_cache<2, 1>(ctx, _GgmlOp, { src0, src1 }, { dst }); + auto *graph_ptr = get_qnn_graph_from_cache<2>(ctx, _GgmlOp, {src0, src1}, dst); if (graph_ptr) { - succeed = execute_graph<2, 1>(graph_ptr, { src0, src1 }, { dst }); + succeed = execute_graph<2>(graph_ptr, {src0, src1}, dst); } #ifndef NDEBUG @@ -301,9 +301,9 @@ bool qnn_unary_op_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src, g CHECK_PARAMS(ctx, src, dst); bool succeed = false; - auto *graph_ptr = get_qnn_graph_from_cache<1, 1>(ctx, _GgmlOp, { src }, { dst }); + auto *graph_ptr = get_qnn_graph_from_cache<1>(ctx, _GgmlOp, {src}, dst); if (graph_ptr) { - succeed = execute_graph<1, 1>(graph_ptr, { src }, { dst }); + succeed = execute_graph<1>(graph_ptr, {src}, dst); } #ifndef NDEBUG @@ -315,6 +315,22 @@ bool qnn_unary_op_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src, g return succeed; } + +bool qnn_unary_nop_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src, ggml_tensor *dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(src); + GGML_UNUSED(dst); + return true; +} + +bool qnn_binary_nop_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src0, ggml_tensor *src1, ggml_tensor *dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(src0); + GGML_UNUSED(src1); + GGML_UNUSED(dst); + return true; +} + constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = { nullptr, // GGML_OP_NONE nullptr, // GGML_OP_DUP @@ -347,37 +363,37 @@ constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = { nullptr, // GGML_OP_MUL_MAT_ID nullptr, // GGML_OP_OUT_PROD - nullptr, // GGML_OP_SCALE - nullptr, // GGML_OP_SET - nullptr, // GGML_OP_CPY - nullptr, // GGML_OP_CONT - nullptr, // GGML_OP_RESHAPE - nullptr, // GGML_OP_VIEW - nullptr, // GGML_OP_PERMUTE - nullptr, // GGML_OP_TRANSPOSE - nullptr, // GGML_OP_GET_ROWS - nullptr, // GGML_OP_GET_ROWS_BACK - nullptr, // GGML_OP_DIAG - nullptr, // GGML_OP_DIAG_MASK_INF - nullptr, // GGML_OP_DIAG_MASK_ZERO - nullptr, // GGML_OP_SOFT_MAX - nullptr, // GGML_OP_SOFT_MAX_BACK - nullptr, // GGML_OP_ROPE - nullptr, // GGML_OP_ROPE_BACK - nullptr, // GGML_OP_CLAMP - nullptr, // GGML_OP_CONV_TRANSPOSE_1D - nullptr, // GGML_OP_IM2COL - nullptr, // GGML_OP_IM2COL_BACK - nullptr, // GGML_OP_CONV_TRANSPOSE_2D - nullptr, // GGML_OP_POOL_1D - nullptr, // GGML_OP_POOL_2D - nullptr, // GGML_OP_POOL_2D_BACK - nullptr, // GGML_OP_UPSCALE - nullptr, // GGML_OP_PAD - nullptr, // GGML_OP_ARANGE - nullptr, // GGML_OP_TIMESTEP_EMBEDDING - nullptr, // GGML_OP_ARGSORT - nullptr, // GGML_OP_LEAKY_RELU + nullptr, // GGML_OP_SCALE + nullptr, // GGML_OP_SET + nullptr, // GGML_OP_CPY + 
nullptr, // GGML_OP_CONT + nullptr, // GGML_OP_RESHAPE + qnn_unary_nop_impl, // GGML_OP_VIEW + qnn_unary_op_impl, // GGML_OP_PERMUTE + nullptr, // GGML_OP_TRANSPOSE + qnn_unary_nop_impl, // GGML_OP_GET_ROWS + nullptr, // GGML_OP_GET_ROWS_BACK + nullptr, // GGML_OP_DIAG + nullptr, // GGML_OP_DIAG_MASK_INF + nullptr, // GGML_OP_DIAG_MASK_ZERO + nullptr, // GGML_OP_SOFT_MAX + nullptr, // GGML_OP_SOFT_MAX_BACK + nullptr, // GGML_OP_ROPE + nullptr, // GGML_OP_ROPE_BACK + nullptr, // GGML_OP_CLAMP + nullptr, // GGML_OP_CONV_TRANSPOSE_1D + nullptr, // GGML_OP_IM2COL + nullptr, // GGML_OP_IM2COL_BACK + nullptr, // GGML_OP_CONV_TRANSPOSE_2D + nullptr, // GGML_OP_POOL_1D + nullptr, // GGML_OP_POOL_2D + nullptr, // GGML_OP_POOL_2D_BACK + nullptr, // GGML_OP_UPSCALE + nullptr, // GGML_OP_PAD + nullptr, // GGML_OP_ARANGE + nullptr, // GGML_OP_TIMESTEP_EMBEDDING + nullptr, // GGML_OP_ARGSORT + nullptr, // GGML_OP_LEAKY_RELU nullptr, // GGML_OP_FLASH_ATTN_EXT nullptr, // GGML_OP_FLASH_ATTN_BACK @@ -522,18 +538,24 @@ static_assert(sizeof(kQnnBinaryOpsTable) / sizeof(kQnnBinaryOpsTable[0]) == GGML "GGML_OP_COUNT does not match the size of the kQnnBinaryOpsTable table"); bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_tensor *tensor) { + if (!tensor) { + QNN_LOG_DEBUG("tensor is nullptr"); + return false; + } + + auto *type_name = ggml_get_type_traits(tensor->type)->type_name; switch (tensor->type) { case GGML_TYPE_F32: case GGML_TYPE_F16: case GGML_TYPE_Q8_0: case GGML_TYPE_Q4_0: if (ctx->supported_types.find(tensor->type) == ctx->supported_types.end()) { - QNN_LOG_DEBUG("unsupported data type GGML_TYPE_F16 for cpu backend"); + QNN_LOG_DEBUG("unsupported data type %s for backend %d", type_name, (int)ctx->device); return false; } break; default: - QNN_LOG_DEBUG("unsupported data type %d", tensor->type); + QNN_LOG_DEBUG("unsupported data type %s", type_name); return false; } @@ -591,19 +613,15 @@ bool ggml_qnn_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tenso } } else { if (!kQnnUnaryOpsTable[op->op] && !kQnnBinaryOpsTable[op->op]) { - QNN_LOG_DEBUG("unsupported op %d", op->op); + QNN_LOG_DEBUG("[%s] unsupported op", ggml_op_name(op->op)); return false; } auto *src0 = op->src[0]; auto *src1 = op->src[1]; - if (!src0 || !src1) { - QNN_LOG_DEBUG("src0 or src1 is nullptr"); - return false; - } - - if (!ggml_qnn_supports_tensor(ctx, src0) || !ggml_qnn_supports_tensor(ctx, src1) || - !ggml_qnn_supports_tensor(ctx, op)) { + if (!ggml_qnn_supports_tensor(ctx, src0) || !ggml_qnn_supports_tensor(ctx, op) || + (kQnnBinaryOpsTable[op->op] && !ggml_qnn_supports_tensor(ctx, src1))) { + QNN_LOG_DEBUG("[%s] unsupported tensor", ggml_op_name(op->op)); return false; } @@ -642,7 +660,7 @@ bool ggml_qnn_forward(ggml_backend_qnn_device_context *ctx, struct ggml_tensor * return binary_op(ctx, tensor->src[0], tensor->src[1], tensor); } - QNN_LOG_WARN("unsupported op %s", ggml_op_desc(tensor)); + QNN_LOG_WARN("[forward]unsupported op %s", ggml_op_desc(tensor)); return false; } diff --git a/ggml/src/ggml-qnn/op-config.cpp b/ggml/src/ggml-qnn/op-config.cpp index 07dcba156471b..9b98051adfc8e 100644 --- a/ggml/src/ggml-qnn/op-config.cpp +++ b/ggml/src/ggml-qnn/op-config.cpp @@ -7,10 +7,10 @@ namespace { constexpr const qnn::qnn_dimension_array_t kTransposeParamData[GGML_MAX_DIMS] = { - { 0 }, - { 1, 0 }, - { 0, 2, 1 }, - { 0, 1, 3, 2 }, + {0}, + {1, 0}, + {0, 2, 1}, + {0, 1, 3, 2}, }; qnn::qnn_dimension_array_t get_transposed_dimensions(const qnn::qnn_dimension_array_t &dimensions, int 
rank) { @@ -96,9 +96,8 @@ bool bind_tensors(const qnn::ggml_tensor_array_t &ggml_tensors, qnn::ggml_qnn_te class ggml_qnn_connectable_op_config : public qnn::ggml_qnn_op_config_base { public: explicit ggml_qnn_connectable_op_config(const std::string &name, const std::string &package_name, - const std::string &op_type, - std::shared_ptr qnn_instance) : - ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {} + const std::string &op_type, std::shared_ptr qnn_instance) + : ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {} bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, const qnn::ggml_tensor_array_t &tensor_inputs, @@ -264,11 +263,22 @@ bool ggml_qnn_single_op_config::create_tensors(QNNBackend device, Qnn_GraphHandl const ggml_tensor_array_t &tensor_inputs, const ggml_tensor_array_t &tensor_outputs) { const auto tensor_rank = get_rank(tensor_inputs, tensor_outputs); - tensor_common_params params = { "src", tensor_rank, true, device, graph_handle, _qnn_instance }; + tensor_common_params params = {"src", tensor_rank, true, device, graph_handle, _qnn_instance}; create_tensors_from_ggml_tensor(params, tensor_inputs, &_tensor_inputs, &_qnn_tensor_inputs); params.name_prefix = "dst"; params.is_input = false; create_tensors_from_ggml_tensor(params, tensor_outputs, &_tensor_outputs, &_qnn_tensor_outputs); + + if (_param_buffer.size() > 0) { + // handle parameters in output tensor + auto *params = tensor_outputs.front()->op_params; + memcpy(_param_buffer.data(), params, _param_buffer.size()); + + const uint32_t count = uint32_t(_param_buffer.size() / qnn_datatype_size(_param_type)); + const qnn_dimension_array_t param_dims = {count, 1, 1, 1}; + add_tensor_param(_param_name, param_dims, 1, _param_buffer.data(), _param_type, device, graph_handle); + } + return true; } @@ -281,7 +291,7 @@ bool ggml_qnn_matmul_op_config::create_tensors(QNNBackend device, Qnn_GraphHandl GGML_ASSERT(tensor_rank >= 2); // create input tensors - tensor_common_params params = { "src", tensor_rank, true, device, graph_handle, _qnn_instance }; + tensor_common_params params = {"src", tensor_rank, true, device, graph_handle, _qnn_instance}; create_tensors_from_ggml_tensor(params, tensor_inputs, &_tensor_inputs, &_qnn_tensor_inputs); // create output tensor @@ -290,8 +300,49 @@ bool ggml_qnn_matmul_op_config::create_tensors(QNNBackend device, Qnn_GraphHandl params.is_input = false; create_tensors_from_ggml_tensor(params, tensor_outputs, &mat_mul_tensor_outputs, nullptr); + if (device == QNN_BACKEND_GPU) { + // there's no convert op for GPU, so we should create matmul nodes directl. 
+ return create_mat_mul_nodes(device, graph_handle, tensor_rank, _tensor_inputs, mat_mul_tensor_outputs); + } + + // create tensors for convert node + ggml_qnn_tensor_array_t mat_mul_tensor_inputs = _tensor_inputs; + auto input_tensor_type = get_tensor_type(mat_mul_tensor_inputs); + QNN_LOG_DEBUG("matmul input tensor type: %s\n", qnn_datatype_to_string(input_tensor_type)); + + _input_converts.resize(mat_mul_tensor_inputs.size()); + for (size_t i = 0; i < mat_mul_tensor_inputs.size(); ++i) { + // create input convert nodes + std::string convert_name("convert_src" + std::to_string(i)); + auto convert_in = mat_mul_tensor_inputs[i]; + auto convert_out = std::make_shared(ggml_qnn_tensor::INTERMEDIATE, convert_name + "_out", + convert_in->get_dimensions(), input_tensor_type, + tensor_rank, device, graph_handle, _qnn_instance); + auto convert = std::make_shared(convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_CONVERT, _qnn_instance); + convert->set_input_tensors({convert_in}); + convert->set_output_tensors({convert_out}); + mat_mul_tensor_inputs[i] = convert_out; + _input_converts[i] = convert; + } + + { + // create output convert node + std::string convert_name("convert_dst"); + auto convert_out = mat_mul_tensor_outputs.front(); + auto convert_in = std::make_shared(ggml_qnn_tensor::INTERMEDIATE, convert_name + "_in", + convert_out->get_dimensions(), input_tensor_type, + tensor_rank, device, graph_handle, _qnn_instance); + auto output_convert = std::make_shared( + convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_CONVERT, _qnn_instance); + output_convert->set_input_tensors({convert_in}); + output_convert->set_output_tensors({convert_out}); + mat_mul_tensor_outputs[0] = convert_in; + _output_convert = output_convert; + } + // create mat_mul nodes - return create_mat_mul_nodes(device, graph_handle, tensor_rank, _tensor_inputs, mat_mul_tensor_outputs); + return create_mat_mul_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs, mat_mul_tensor_outputs); } bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, @@ -371,7 +422,7 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap // set transpose0 parameters auto *params_data = reinterpret_cast(kTransposeParamData[rank - 1].data()); - const qnn_dimension_array_t param_dims = { (uint32_t)rank, 1, 1, 1 }; + const qnn_dimension_array_t param_dims = {(uint32_t)rank, 1, 1, 1}; transpose0->add_tensor_param(QNN_OP_TRANSPOSE_PARAM_PERM, param_dims, 1, params_data, QNN_DATATYPE_UINT_32, device, graph_handle); @@ -380,19 +431,19 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap graph_handle); // set tensor to transpose0 - ggml_qnn_tensor_array_t tensors = { tensor_inputs.back() }; + ggml_qnn_tensor_array_t tensors = {tensor_inputs.back()}; transpose0->set_input_tensors(tensors); - tensors = { src0_trans }; + tensors = {src0_trans}; transpose0->set_output_tensors(tensors); // set tensor to mat_mul - tensors = { tensor_inputs.front(), src0_trans }; + tensors = {tensor_inputs.front(), src0_trans}; mat_mul->set_input_tensors(tensors); - tensors = { dst_trans }; + tensors = {dst_trans}; mat_mul->set_output_tensors(tensors); // set tensor to transpose1 - tensors = { dst_trans }; + tensors = {dst_trans}; transpose1->set_input_tensors(tensors); transpose1->set_output_tensors(tensor_outputs); @@ -459,6 +510,13 @@ ggml_op_constructor_t create_op_constructor(const std::string &op_name) { QNN_LOG_DEBUG("create QNN_OP_MAT_MUL, name 
%s\n", instance_name.c_str()); return std::make_unique(instance_name, qnn_instance); }; + } else if (op_name == QNN_OP_TRANSPOSE) { + return [](const std::string &instance_name, + std::shared_ptr qnn_instance) -> std::unique_ptr { + return std::make_unique(instance_name, QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_TRANSPOSE, QNN_OP_TRANSPOSE_PARAM_PERM, + QNN_DATATYPE_UINT_32, 4 * sizeof(uint32_t), qnn_instance); + }; } return [op_name](const std::string &instance_name, diff --git a/ggml/src/ggml-qnn/op-config.hpp b/ggml/src/ggml-qnn/op-config.hpp index 2016cb4ac994d..4ec7aac9b256e 100644 --- a/ggml/src/ggml-qnn/op-config.hpp +++ b/ggml/src/ggml-qnn/op-config.hpp @@ -30,11 +30,16 @@ class ggml_qnn_op_config { virtual void unbind_output_tensors() = 0; }; +using ggml_op_constructor_t = + std::function(const std::string &, std::shared_ptr)>; + +ggml_op_constructor_t create_op_constructor(const std::string &op_name); + class ggml_qnn_op_config_base : public ggml_qnn_op_config { public: explicit ggml_qnn_op_config_base(const std::string &name, const std::string &package_name, - const std::string &op_type, std::shared_ptr qnn_instance) : - _name(name), _package_name(package_name), _op_type(op_type), _qnn_instance(qnn_instance) {} + const std::string &op_type, std::shared_ptr qnn_instance) + : _name(name), _package_name(package_name), _op_type(op_type), _qnn_instance(qnn_instance) {} void add_scalar_param(const std::string &name, const Qnn_Scalar_t scalar); bool add_tensor_param(const std::string &name, const qnn_dimension_array_t &dimensions, int rank, @@ -70,21 +75,34 @@ class ggml_qnn_op_config_base : public ggml_qnn_op_config { class ggml_qnn_single_op_config : public ggml_qnn_op_config_base { public: explicit ggml_qnn_single_op_config(const std::string &name, const std::string &package_name, - const std::string &op_type, std::shared_ptr qnn_instance) : - ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {} + const std::string &op_type, std::shared_ptr qnn_instance) + : ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {} + + explicit ggml_qnn_single_op_config(const std::string &name, const std::string &package_name, + const std::string &op_type, const std::string ¶m_name, + const Qnn_DataType_t param_type, const size_t param_size, + std::shared_ptr qnn_instance) + : ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance), + _param_name(param_name), + _param_type(param_type), + _param_buffer(param_size) {} bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, const ggml_tensor_array_t &tensor_inputs, const ggml_tensor_array_t &tensor_outputs) override; private: + const std::string _param_name; + const Qnn_DataType_t _param_type = QNN_DATATYPE_UINT_32; + std::vector _param_buffer; + DISABLE_COPY(ggml_qnn_single_op_config); DISABLE_MOVE(ggml_qnn_single_op_config); }; class ggml_qnn_matmul_op_config : public ggml_qnn_op_config { public: - ggml_qnn_matmul_op_config(const std::string &name, std::shared_ptr qnn_instance) : - _name(name), _qnn_instance(qnn_instance) {} + ggml_qnn_matmul_op_config(const std::string &name, std::shared_ptr qnn_instance) + : _name(name), _qnn_instance(qnn_instance) {} bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, const ggml_tensor_array_t &tensor_inputs, const ggml_tensor_array_t &tensor_outputs) override; @@ -114,9 +132,4 @@ class ggml_qnn_matmul_op_config : public ggml_qnn_op_config { DISABLE_MOVE(ggml_qnn_matmul_op_config); }; -using ggml_op_constructor_t = - std::function(const 
std::string &, std::shared_ptr)>; - -ggml_op_constructor_t create_op_constructor(const std::string &op_name); - } // namespace qnn diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index faf5b0df5f4e1..f28fc8e2ca1e2 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -257,6 +257,7 @@ class ggml_qnn_tensor { DISABLE_MOVE(ggml_qnn_tensor); }; +using ggml_qnn_tensor_ptr_t = std::shared_ptr; using ggml_qnn_tensor_array_t = std::vector>; } // namespace qnn From e6dbdacc3287ab89c0a21a9bd5972caa2c5338a1 Mon Sep 17 00:00:00 2001 From: nullname Date: Wed, 13 Nov 2024 17:06:46 +0800 Subject: [PATCH 128/166] feat: fix llama-bench (#7) * remove unused functions * wip * init from last devices * move init into constructor * wip * add static assert to device table * make kDeviceCaps as constexpr * get free memory and total memory * add optimize flag for qnn backend --- ggml/include/ggml-qnn.h | 15 +-- ggml/src/CMakeLists.txt | 1 + ggml/src/ggml-qnn.cpp | 172 +++++++++++++++--------------- ggml/src/ggml-qnn/backend-ops.cpp | 2 +- ggml/src/ggml-qnn/backend.hpp | 2 +- ggml/src/ggml-qnn/utils.cpp | 25 ++++- ggml/src/ggml-qnn/utils.hpp | 6 +- 7 files changed, 116 insertions(+), 107 deletions(-) diff --git a/ggml/include/ggml-qnn.h b/ggml/include/ggml-qnn.h index 23835f23cb0ec..2b25ce40d79e5 100644 --- a/ggml/include/ggml-qnn.h +++ b/ggml/include/ggml-qnn.h @@ -9,28 +9,17 @@ extern "C" { #endif #define GGML_QNN_NAME "QNN" -#define GGML_QNN_MAX_DEVICES 3 +#define GGML_QNN_MAX_DEVICES QNN_BACKEND_COUNT enum QNNBackend { QNN_BACKEND_CPU = 0, QNN_BACKEND_GPU, QNN_BACKEND_NPU, - QNN_BACKEND_GGML, //"fake" QNN backend, used for compare performance between - // QNN and original GGML + QNN_BACKEND_COUNT, }; -/** - * - * @param index 0: QNN_BACKEND_CPU 1: QNN_BACKEND_GPU 2:QNN_BACKEND_NPU - * @param extend_lib_search_path extened lib search path for searching QNN backend dynamic libs - * @return - */ -GGML_API ggml_backend_t ggml_backend_qnn_init(size_t index, const char *extend_lib_search_path); - GGML_API bool ggml_backend_is_qnn(ggml_backend_t backend); -GGML_API int ggml_backend_qnn_get_device_count(void); - GGML_API ggml_backend_reg_t ggml_backend_qnn_reg(void); #ifdef __cplusplus diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index b827f9c8f0aba..0d4b388f324f3 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -969,6 +969,7 @@ if (GGML_QNN) message("QNN_SDK_PATH: ${GGML_QNN_SDK_PATH}") file(GLOB GGML_SOURCES_QNN "ggml-qnn/*.cpp") list(APPEND GGML_SOURCES_QNN "ggml-qnn.cpp") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3") set(GGML_HEADERS_QNN ../include/ggml-qnn.h) set(QNN_INC_PATH ${GGML_QNN_SDK_PATH}/include/QNN) set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${QNN_INC_PATH} "ggml-qnn") diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index d28163dce44bc..a41fae6bbb368 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -53,39 +53,50 @@ struct qnn_device_caps { enum ggml_backend_dev_type type; // TODO: should get this caps from device - std::unordered_set supported_types; + uint64_t supported_types; }; -const qnn_device_caps kDeviceCaps[GGML_QNN_MAX_DEVICES]{ - {// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/CpuOpDefSupplement.html#matmul - "qnn-cpu", - "Qualcomm Kryo CPU", - "libQnnCpu.so", - GGML_BACKEND_DEVICE_TYPE_CPU, - {GGML_TYPE_F32, GGML_TYPE_I8}}, - {// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/GpuOpDefSupplement.html#matmul - 
"qnn-gpu", - "Qualcomm Adreno GPU", - "libQnnGpu.so", - GGML_BACKEND_DEVICE_TYPE_GPU, - {GGML_TYPE_F32, GGML_TYPE_F16}}, - {// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/HtpOpDefSupplement.html#matmul - "qnn-npu", - "Qualcomm NPU", - "libQnnHtp.so", - GGML_BACKEND_DEVICE_TYPE_GPU, - {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_I16, GGML_TYPE_I8}}, +constexpr const qnn_device_caps kDeviceCaps[] = { + { + // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/CpuOpDefSupplement.html#matmul + "qnn-cpu", + "Qualcomm Kryo CPU", + "libQnnCpu.so", + GGML_BACKEND_DEVICE_TYPE_CPU, + (1 << GGML_TYPE_I8) | (1 << GGML_TYPE_F32), + }, + { + // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/GpuOpDefSupplement.html#matmul + "qnn-gpu", + "Qualcomm Adreno GPU", + "libQnnGpu.so", + GGML_BACKEND_DEVICE_TYPE_GPU, + (1 << GGML_TYPE_F32) | (1 << GGML_TYPE_F16), + }, + { + // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/HtpOpDefSupplement.html#matmul + "qnn-npu", + "Qualcomm NPU", + "libQnnHtp.so", + GGML_BACKEND_DEVICE_TYPE_ACCEL, + (1 << GGML_TYPE_F32) | (1 << GGML_TYPE_F16) | (1 << GGML_TYPE_I16) | (1 << GGML_TYPE_I8), + }, }; +static_assert(sizeof(kDeviceCaps) / sizeof(kDeviceCaps[0]) == GGML_QNN_MAX_DEVICES, + "The number of qnn devices should be equal to GGML_QNN_MAX_DEVICES"); +static_assert(kDeviceCaps[QNN_BACKEND_NPU].type == GGML_BACKEND_DEVICE_TYPE_ACCEL, + "The NPU device should be an accelerator device"); + class ggml_backend_qnn_buffer_context { public: ggml_backend_qnn_buffer_context(QNNBackend device, std::shared_ptr instance, size_t size) : _instance(instance), _name(QNN_BACKEND_NAME + std::to_string(device)) { - // TODO: fix this for other platforms size_t size_page = sysconf(_SC_PAGESIZE); - // TODO: for qnn npu, a better way here is to reuse the buffer allocated by qnn rpc, will save an extra copy + // TODO: for qnn npu, a better way here is to reuse the buffer allocated by + // qnn rpc, will save an extra copy _buffer = qnn::align_alloc(size_page, size); if (!_buffer) { @@ -192,8 +203,8 @@ ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = { * ----------------------------------------------------------------------------------------------- */ const char *ggml_backend_qnn_buffer_type_name(ggml_backend_buffer_type_t buft) { - GGML_UNUSED(buft); - return GGML_QNN_NAME; + auto *dev_ctx = get_device_context(buft->device); + return qnn::get_backend_name(dev_ctx->device); } ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { @@ -209,13 +220,14 @@ ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer(ggml_backend_buf size_t ggml_backend_qnn_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { GGML_UNUSED(buft); + // TODO: fix this return 32; } -// TODO: this value is an experimental value, works fine with whisper/llm/minicpm-v inference on Android size_t ggml_backend_qnn_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { GGML_UNUSED(buft); - + // TODO: this value is an experimental value, works fine with + // whisper/llm/minicpm-v inference on Android return (96 * 1024 * 1024); } @@ -255,9 +267,12 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(ggml_backend_dev_t dev) ggml_backend_qnn_buffer_types[i] = { /* .iface = */ { /* .get_name = */ ggml_backend_qnn_buffer_type_name, - /* .alloc_buffer = */ ggml_backend_qnn_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_qnn_buffer_type_get_alignment, - /* 
.get_max_size = */ ggml_backend_qnn_buffer_type_get_max_size, + /* .alloc_buffer = */ + ggml_backend_qnn_buffer_type_alloc_buffer, + /* .get_alignment = */ + ggml_backend_qnn_buffer_type_get_alignment, + /* .get_max_size = */ + ggml_backend_qnn_buffer_type_get_max_size, /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes /* .is_host = */ ggml_backend_qnn_buffer_is_host, }, @@ -321,17 +336,13 @@ const char *ggml_backend_qnn_device_get_description(ggml_backend_dev_t dev) { } void ggml_backend_qnn_device_get_memory(ggml_backend_dev_t dev, size_t *free, size_t *total) { - // TODO: get memory info - *free = 0; - *total = 0; - GGML_UNUSED(dev); + *free = qnn::get_system_free_memory_in_bytes(); + *total = qnn::get_system_total_memory_in_bytes(); } enum ggml_backend_dev_type ggml_backend_qnn_device_get_type(ggml_backend_dev_t dev) { - // TODO: for cpu backend, we should return GGML_BACKEND_DEVICE_TYPE_CPU - GGML_UNUSED(dev); - return GGML_BACKEND_DEVICE_TYPE_GPU; + return kDeviceCaps[get_device_context(dev)->device].type; } void ggml_backend_qnn_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props *props) { @@ -356,41 +367,43 @@ ggml_guid_t ggml_backend_qnn_guid() { ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, const char *extend_lib_search_path) { if (!extend_lib_search_path) { extend_lib_search_path = GGML_QNN_DEFAULT_LIB_SEARCH_PATH; - QNN_LOG_WARN("extend_lib_search_path is nullptr, will use " GGML_QNN_DEFAULT_LIB_SEARCH_PATH " as default"); + QNN_LOG_WARN( + "extend_lib_search_path is nullptr, will " + "use " GGML_QNN_DEFAULT_LIB_SEARCH_PATH " as default"); } auto *dev_ctx = get_device_context(dev); - auto device_index = dev_ctx->device; - QNN_LOG_DEBUG("device %d", device_index); + const auto device = dev_ctx->device; + QNN_LOG_DEBUG("device %d", device); QNN_LOG_DEBUG("extend_lib_search_path %s", extend_lib_search_path); std::string path = extend_lib_search_path; // TODO: Fix this for other platforms #if defined(__ANDROID__) || defined(ANDROID) - if (QNN_BACKEND_NPU == device_index) { - if (0 == setenv("LD_LIBRARY_PATH", - (path + ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/" - "dsp:/vendor/dsp/images") - .c_str(), - 1)) { + if (device == QNN_BACKEND_NPU) { + if (setenv("LD_LIBRARY_PATH", + (path + ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/" + "dsp:/vendor/dsp/images") + .c_str(), + 1) == 0) { QNN_LOG_INFO("QNN NPU backend setenv successfully"); } else { QNN_LOG_ERROR("QNN NPU backend setenv failure"); } - if (0 == setenv("ADSP_LIBRARY_PATH", - (path + ";/vendor/dsp/cdsp;/vendor/lib/rfsa/adsp;/system/lib/" - "rfsa/adsp;/vendor/dsp/dsp;/vendor/dsp/images;/dsp") - .c_str(), - 1)) { + if (setenv("ADSP_LIBRARY_PATH", + (path + ";/vendor/dsp/cdsp;/vendor/lib/rfsa/adsp;/system/lib/" + "rfsa/adsp;/vendor/dsp/dsp;/vendor/dsp/images;/dsp") + .c_str(), + 1) == 0) { QNN_LOG_INFO("QNN NPU backend setenv successfully"); } else { QNN_LOG_ERROR("QNN NPU backend setenv failure"); } } else { - if (0 == setenv("LD_LIBRARY_PATH", path.c_str(), 1)) { - QNN_LOG_INFO("%s backend setenv successfully\n", qnn::get_backend_name(device_index)); + if (setenv("LD_LIBRARY_PATH", path.c_str(), 1) == 0) { + QNN_LOG_INFO("%s backend setenv successfully\n", qnn::get_backend_name(device)); } else { - QNN_LOG_ERROR("%s backend setenv failure\n", qnn::get_backend_name(device_index)); + QNN_LOG_ERROR("%s backend setenv failure\n", qnn::get_backend_name(device)); } } #endif @@ -398,8 +411,7 @@ ggml_backend_t 
ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, auto instance = std::make_shared(path, dev_ctx->lib_name, "ggml"); auto result = instance->qnn_init(nullptr); if (result != 0) { - QNN_LOG_WARN("init qnn subsystem failed with qnn backend %s, pls check why\n", - qnn::get_backend_name(device_index)); + QNN_LOG_WARN("init qnn subsystem failed with qnn backend %s, pls check why\n", qnn::get_backend_name(device)); return nullptr; } auto qnn_interface = instance->get_qnn_interface(); @@ -408,12 +420,12 @@ ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, return nullptr; } - std::string device_name = qnn::get_backend_name(device_index); + std::string device_name = qnn::get_backend_name(device); QNN_LOG_INFO("qnn device name %s", device_name.c_str()); dev_ctx->instance = instance; dev_ctx->qnn_interface = qnn_interface; dev_ctx->socinfo = instance->get_soc_info(); - dev_ctx->supported_types = kDeviceCaps[device_index].supported_types; + dev_ctx->supported_types = kDeviceCaps[device].supported_types; ggml_backend_t qnn_backend = new ggml_backend{ /* .guid = */ ggml_backend_qnn_guid(), @@ -479,9 +491,23 @@ struct ggml_backend_qnn_reg_impl : ggml_backend_reg { std::array, GGML_QNN_MAX_DEVICES> device_contexts; std::array devices; - ggml_backend_qnn_reg_impl(ggml_backend_reg_i interface) { + explicit ggml_backend_qnn_reg_impl(ggml_backend_reg_i interface) { context = this; iface = interface; + + for (int i = 0; i < GGML_QNN_MAX_DEVICES; i++) { + const auto device_enum = (QNNBackend)(GGML_QNN_MAX_DEVICES - 1 - i); // init from the last device, i.e. NPU + device_contexts[i] = std::make_unique( + /* .device = */ device_enum, // init from the last device, i.e. NPU + /* .threads = */ 1, + /* .name = */ qnn::get_backend_name(device_enum), + /* .lib_name = */ kDeviceCaps[device_enum].lib_name); + + auto &device = devices[i]; + device.iface = ggml_backend_qnn_device_interface; + device.reg = this; + device.context = device_contexts[i].get(); + } } }; @@ -512,35 +538,5 @@ const ggml_backend_reg_i ggml_backend_qnn_reg_interface = { ggml_backend_reg_t ggml_backend_qnn_reg() { static ggml_backend_qnn_reg_impl reg{ggml_backend_qnn_reg_interface}; - static bool initialized = false; - static std::mutex mutex; - - { - std::lock_guard lock(mutex); - if (!initialized) { - for (int i = 0; i < GGML_QNN_MAX_DEVICES; i++) { - reg.device_contexts[i] = std::make_unique( - /* .device = */ (QNNBackend)i, - /* .threads = */ 1, - /* .name = */ qnn::get_backend_name(i), - /* .lib_name = */ kDeviceCaps[i].lib_name); - - auto &device = reg.devices[i]; - device.iface = ggml_backend_qnn_device_interface; - device.reg = ® - device.context = reg.device_contexts[i].get(); - } - initialized = true; - } - } - return ® } - -int ggml_backend_qnn_get_device_count() { return GGML_QNN_MAX_DEVICES; } - -ggml_backend_t ggml_backend_qnn_init(size_t index, const char *extend_lib_search_path) { - auto *reg = ggml_backend_qnn_reg(); - auto *device = ggml_backend_qnn_reg_get_device(reg, index); - return ggml_backend_qnn_device_init(device, extend_lib_search_path); -} diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index c0e263a640eea..5643a746313d3 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -549,7 +549,7 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_t case GGML_TYPE_F16: case GGML_TYPE_Q8_0: case GGML_TYPE_Q4_0: - if (ctx->supported_types.find(tensor->type) == ctx->supported_types.end()) 
{ + if (!(ctx->supported_types & (1 << tensor->type))) { QNN_LOG_DEBUG("unsupported data type %s for backend %d", type_name, (int)ctx->device); return false; } diff --git a/ggml/src/ggml-qnn/backend.hpp b/ggml/src/ggml-qnn/backend.hpp index eb292e89bfd21..aaced227275c8 100644 --- a/ggml/src/ggml-qnn/backend.hpp +++ b/ggml/src/ggml-qnn/backend.hpp @@ -27,7 +27,7 @@ struct ggml_backend_qnn_device_context { // initialize in init qnn::qcom_socinfo socinfo = {}; - std::unordered_set supported_types; + uint64_t supported_types; std::shared_ptr instance; std::shared_ptr qnn_interface; diff --git a/ggml/src/ggml-qnn/utils.cpp b/ggml/src/ggml-qnn/utils.cpp index 0de9d203ebee9..8ae375ffc8afc 100644 --- a/ggml/src/ggml-qnn/utils.cpp +++ b/ggml/src/ggml-qnn/utils.cpp @@ -7,6 +7,10 @@ #include "qnn-types.hpp" +#ifdef __linux__ +#include +#endif + namespace qnn { qnn_dimension_array_t get_internal_dimension(const ggml_dimension_array_t &dims, uint32_t rank) { @@ -141,7 +145,7 @@ const char *get_ggml_type_name(ggml_type type) { return traits->type_name; } -const char *get_backend_name(size_t device_index) { +const char *get_backend_name(QNNBackend device_index) { switch (device_index) { case QNN_BACKEND_CPU: return "QNN-CPU"; @@ -149,8 +153,7 @@ const char *get_backend_name(size_t device_index) { return "QNN-GPU"; case QNN_BACKEND_NPU: return "QNN-NPU"; - case QNN_BACKEND_GGML: - return "ggml"; //"fake" QNN backend, used for compare performance between QNN backend and original GGML + case QNN_BACKEND_COUNT: default: return "unknown"; } @@ -295,4 +298,20 @@ const char *get_qnn_error_string(Qnn_ErrorHandle_t error) { } } +#ifdef __linux__ + +size_t get_system_total_memory_in_bytes() { + auto pages = (size_t)sysconf(_SC_PHYS_PAGES); + auto page_size = (size_t)sysconf(_SC_PAGE_SIZE); + return pages * page_size; +} + +size_t get_system_free_memory_in_bytes() { + auto avail_pages = (size_t)sysconf(_SC_AVPHYS_PAGES); + auto page_size = (size_t)sysconf(_SC_PAGE_SIZE); + return avail_pages * page_size; +} + +#endif + } // namespace qnn diff --git a/ggml/src/ggml-qnn/utils.hpp b/ggml/src/ggml-qnn/utils.hpp index 2c58d037982f6..40dff321b970e 100644 --- a/ggml/src/ggml-qnn/utils.hpp +++ b/ggml/src/ggml-qnn/utils.hpp @@ -11,6 +11,8 @@ #include "ggml.h" +#include "ggml-qnn.h" + #include "QnnTypes.h" #include "logger.hpp" @@ -25,7 +27,7 @@ qnn_dimension_array_t get_internal_dimension(const ggml_dimension_array_t &dims, uint32_t get_ggml_tensor_rank(const ggml_tensor *tensor); const char *get_ggml_type_name(ggml_type type); -const char *get_backend_name(size_t device_index); +const char *get_backend_name(QNNBackend device_index); const char *get_chipset_desc(uint32_t chipset_id); const char *get_htparch_desc(size_t htp_arch); intptr_t align_to(size_t alignment, intptr_t offset); @@ -198,6 +200,8 @@ Qnn_DataType_t qnn_datatype_from_ggml_datatype(ggml_type ggml_type); ggml_type ggml_datatype_from_qnn_datatype(Qnn_DataType_t qnn_type); size_t qnn_datatype_size(Qnn_DataType_t qnn_type); const char *qnn_datatype_to_string(Qnn_DataType_t qnn_type); +size_t get_system_total_memory_in_bytes(); +size_t get_system_free_memory_in_bytes(); #if ENABLE_QNNBACKEND_PERF class qnn_perf { From a2df09b6afa37cd97df10e3207bdeb10c8c042b9 Mon Sep 17 00:00:00 2001 From: nullname Date: Fri, 29 Nov 2024 00:03:23 +0800 Subject: [PATCH 129/166] [WIP] feat: perf opt (#10) * reduce log * wip * add function to create concat nodes * opt * insert concat node before mulmat * use resize op * wip * add bind_buffer and remov ggml prefix in tensor types * 
use gather node instead * fix tensor type, now succeed in gpu and cpu, failed in npu * add comment * wip * add comment * wip * in destructor, clear internal buffer before unbind * disable gather for npu * wip * count swap memory as free memory * wip * fix supported_types ggml_backend_device_i.supports_op will be invoked before ggml_backend_device_i.init_backend * rename create_tensors -> initialize_op_nodes * move ggml_qnn_op_config to deparated file * wip * add create_convert_nodes * add comment * enable different type in/out for npu and cpu backend * fix npu convert op * enlarge max buffer size * add more error code * check tensor type before create convert node * add log * add log * remove transpose0 and use buildin transpose flag * rename transpose1 -> transpose_out * disable convert for npu * add more logs --- ggml/src/ggml-qnn.cpp | 19 +- ggml/src/ggml-qnn/backend-ops.cpp | 47 +++-- ggml/src/ggml-qnn/backend.hpp | 17 +- ggml/src/ggml-qnn/graph.hpp | 52 +++--- ggml/src/ggml-qnn/logger.cpp | 5 +- ggml/src/ggml-qnn/op-config-base.hpp | 129 ++++++++++++++ ggml/src/ggml-qnn/op-config.cpp | 257 +++++++++++++++++---------- ggml/src/ggml-qnn/op-config.hpp | 55 +++--- ggml/src/ggml-qnn/qnn-lib.hpp | 64 +++---- ggml/src/ggml-qnn/qnn-types.hpp | 2 +- ggml/src/ggml-qnn/tensor.hpp | 184 +++++++++++-------- ggml/src/ggml-qnn/utils.cpp | 69 +++++-- 12 files changed, 590 insertions(+), 310 deletions(-) create mode 100644 ggml/src/ggml-qnn/op-config-base.hpp diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index a41fae6bbb368..a4dace7078d3b 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -226,9 +226,8 @@ size_t ggml_backend_qnn_buffer_type_get_alignment(ggml_backend_buffer_type_t buf size_t ggml_backend_qnn_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { GGML_UNUSED(buft); - // TODO: this value is an experimental value, works fine with - // whisper/llm/minicpm-v inference on Android - return (96 * 1024 * 1024); + // TODO: get the max size from device + return (1024 * 1024 * 1024); } bool ggml_backend_qnn_buffer_is_host(ggml_backend_buffer_type_t buft) { @@ -339,6 +338,7 @@ void ggml_backend_qnn_device_get_memory(ggml_backend_dev_t dev, size_t *free, si GGML_UNUSED(dev); *free = qnn::get_system_free_memory_in_bytes(); *total = qnn::get_system_total_memory_in_bytes(); + QNN_LOG_DEBUG("free memory: %ldMB, total memory: %ldMB", (*free / 1048576), (*total) / 1048576); } enum ggml_backend_dev_type ggml_backend_qnn_device_get_type(ggml_backend_dev_t dev) { @@ -374,7 +374,7 @@ ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, auto *dev_ctx = get_device_context(dev); const auto device = dev_ctx->device; - QNN_LOG_DEBUG("device %d", device); + QNN_LOG_DEBUG("device %s", qnn::get_backend_name(device)); QNN_LOG_DEBUG("extend_lib_search_path %s", extend_lib_search_path); std::string path = extend_lib_search_path; @@ -386,7 +386,7 @@ ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, "dsp:/vendor/dsp/images") .c_str(), 1) == 0) { - QNN_LOG_INFO("QNN NPU backend setenv successfully"); + QNN_LOG_DEBUG("QNN NPU backend setenv successfully"); } else { QNN_LOG_ERROR("QNN NPU backend setenv failure"); } @@ -395,13 +395,13 @@ ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, "rfsa/adsp;/vendor/dsp/dsp;/vendor/dsp/images;/dsp") .c_str(), 1) == 0) { - QNN_LOG_INFO("QNN NPU backend setenv successfully"); + QNN_LOG_DEBUG("QNN NPU backend setenv successfully"); } else { QNN_LOG_ERROR("QNN NPU 
backend setenv failure"); } } else { if (setenv("LD_LIBRARY_PATH", path.c_str(), 1) == 0) { - QNN_LOG_INFO("%s backend setenv successfully\n", qnn::get_backend_name(device)); + QNN_LOG_DEBUG("%s backend setenv successfully\n", qnn::get_backend_name(device)); } else { QNN_LOG_ERROR("%s backend setenv failure\n", qnn::get_backend_name(device)); } @@ -454,6 +454,7 @@ ggml_backend_buffer_t ggml_backend_qnn_device_buffer_from_ptr(ggml_backend_dev_t } bool ggml_backend_qnn_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor *op) { + // Note that this function could be called before the device context is initialized auto *device_ctx = get_device_context(dev); return qnn::ggml_qnn_supports_op(device_ctx, op); } @@ -495,13 +496,15 @@ struct ggml_backend_qnn_reg_impl : ggml_backend_reg { context = this; iface = interface; + QNN_LOG_DEBUG("qnn backend registry init"); for (int i = 0; i < GGML_QNN_MAX_DEVICES; i++) { const auto device_enum = (QNNBackend)(GGML_QNN_MAX_DEVICES - 1 - i); // init from the last device, i.e. NPU device_contexts[i] = std::make_unique( /* .device = */ device_enum, // init from the last device, i.e. NPU /* .threads = */ 1, /* .name = */ qnn::get_backend_name(device_enum), - /* .lib_name = */ kDeviceCaps[device_enum].lib_name); + /* .lib_name = */ kDeviceCaps[device_enum].lib_name, + /* .supported_types = */ kDeviceCaps[device_enum].supported_types); auto &device = devices[i]; device.iface = ggml_backend_qnn_device_interface; diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 5643a746313d3..da0480df7fd9f 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -543,14 +543,17 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_t return false; } +#ifndef NDEBUG auto *type_name = ggml_get_type_traits(tensor->type)->type_name; +#endif switch (tensor->type) { case GGML_TYPE_F32: case GGML_TYPE_F16: case GGML_TYPE_Q8_0: case GGML_TYPE_Q4_0: if (!(ctx->supported_types & (1 << tensor->type))) { - QNN_LOG_DEBUG("unsupported data type %s for backend %d", type_name, (int)ctx->device); + QNN_LOG_DEBUG("unsupported data type %s for backend %s, supported_types: 0x%x", type_name, + qnn::get_backend_name(ctx->device), ctx->supported_types); return false; } break; @@ -563,25 +566,42 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_t } bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op) { - GGML_UNUSED(ctx); - auto *src0 = op->src[0]; auto *src1 = op->src[1]; - if (src0->type != src1->type || src0->type != op->type) { - // current qnn implementation only supports the same type for src0 and src1 - QNN_LOG_DEBUG("src0 type %d and src1 type %d and op type %d are not equal", src0->type, src1->type, op->type); - return false; + switch (ctx->device) { + case QNN_BACKEND_NPU: + if (src1->ne[2] != src0->ne[2] || src1->ne[3] != src0->ne[3]) { + /* + * TODO: remove the blocker here when NPU backend supports mul_mat like this: + * [ne03, ne02, n, k] * [ne03 * x, ne02 * y, m, k] -> [ne03 * x, ne02 * y, m, n] + */ + QNN_LOG_DEBUG("[qnn-npu] src0 and src1 dimensions are not equal, support/unsupported: %d/%d", + ctx->support_op_count.load(), ++(ctx->unsupported_op_count)); + return false; + } + // fall through, from test here, the convert op is super slow on NPU: + // https://github.com/usefulsensors/qc_npu_benchmark + case QNN_BACKEND_GPU: + if (src0->type != src1->type || src0->type != op->type) { + // there's no 
convert op for GPU. + QNN_LOG_DEBUG("[qnn-gpu]type src0(%d), src1(%d) and op(%d) are not equal, support/unsupported: %d/%d", + src0->type, src1->type, op->type, ctx->support_op_count.load(), + ++(ctx->unsupported_op_count)); + return false; + } + break; + default: + break; } - if (src0->ne[2] != src1->ne[2] || src0->ne[3] != src1->ne[3]) { - /* - * TODO: remove the blocker here when qnn backend supports mul_mat like this: - * [ne03, ne02, n, k] * [ne03 * x, ne02 * y, m, k] -> [ne03 * x, ne02 * y, m, n] - */ - QNN_LOG_DEBUG("src0 and src1 dimensions are not equal"); + if ((src1->ne[2] % src0->ne[2]) != 0 || (src1->ne[3] % src0->ne[3]) != 0) { + QNN_LOG_DEBUG("[%s] src0 and src1 dimensions are not equal, support/unsupported: %d/%d", + qnn::get_backend_name(ctx->device), ctx->support_op_count.load(), ++(ctx->unsupported_op_count)); return false; } + QNN_LOG_DEBUG("[%s] supported matmul op, support/unsupported: %d/%d", qnn::get_backend_name(ctx->device), + ++(ctx->support_op_count), ctx->unsupported_op_count.load()); return true; } @@ -590,6 +610,7 @@ bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggm namespace qnn { bool ggml_qnn_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op) { + // Note that this function could be called before the device context is initialized if (op->op == GGML_OP_NONE) { return true; } diff --git a/ggml/src/ggml-qnn/backend.hpp b/ggml/src/ggml-qnn/backend.hpp index aaced227275c8..17823ed577aaa 100644 --- a/ggml/src/ggml-qnn/backend.hpp +++ b/ggml/src/ggml-qnn/backend.hpp @@ -1,6 +1,10 @@ #pragma once +#ifndef NDEBUG +#include +#endif + #include #include #include @@ -25,7 +29,7 @@ struct ggml_backend_qnn_device_context { std::string name; std::string lib_name; - // initialize in init + // initialize in qnn init qnn::qcom_socinfo socinfo = {}; uint64_t supported_types; std::shared_ptr instance; @@ -33,7 +37,12 @@ struct ggml_backend_qnn_device_context { qnn::ggml_qnn_graph_cache_t qnn_graph_cache; - explicit ggml_backend_qnn_device_context(QNNBackend device, size_t threads, const char *name, - const char *lib_name) : - device(device), threads(threads), name(name), lib_name(lib_name) {} +#ifndef NDEBUG + std::atomic_uint32_t support_op_count = 0; + std::atomic_uint32_t unsupported_op_count = 0; +#endif + + explicit ggml_backend_qnn_device_context(QNNBackend device, size_t threads, const char *name, const char *lib_name, + uint64_t supported_types) + : device(device), threads(threads), name(name), lib_name(lib_name), supported_types(supported_types) {} }; diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index 858a7d3af29a2..1b0dcd78faa17 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -17,9 +17,9 @@ namespace qnn { class ggml_qnn_graph { public: explicit ggml_qnn_graph(const std::string &graph_name, QNNBackend device, - std::shared_ptr qnn_instance, size_t vtcm_size_in_mb) : - _graph_name(graph_name), _device(device), _qnn_instance(qnn_instance) { - QNN_LOG_INFO("[%s]create", graph_name.c_str()); + std::shared_ptr qnn_instance, size_t vtcm_size_in_mb) + : _graph_name(graph_name), _device(device), _qnn_instance(qnn_instance) { + QNN_LOG_DEBUG("[%s][%s]created", get_backend_name(device), graph_name.c_str()); auto qnn_interface = qnn_instance->get_qnn_interface(); auto qnn_context = qnn_instance->get_qnn_context_handle(); @@ -56,24 +56,25 @@ class ggml_qnn_graph { graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; graph_vtcm_config.customConfig = &vtcm_config; - 
const QnnGraph_Config_t *graph_configs[] = { &graph_hvx_config, &graph_dlbc_config, &graph_vtcm_config, - &graph_opt_config, nullptr }; + const QnnGraph_Config_t *graph_configs[] = {&graph_hvx_config, &graph_dlbc_config, &graph_vtcm_config, + &graph_opt_config, nullptr}; error = qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), graph_configs, &graph_handle); } else { error = qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), nullptr, &graph_handle); } if (error != QNN_SUCCESS) { - QNN_LOG_INFO("[%s]can't create qnn graph handle, error = %d\n", graph_name.c_str(), error); + QNN_LOG_ERROR("[%s][%s]failed to create qnn graph, error: %s\n", get_backend_name(device), + graph_name.c_str(), get_qnn_error_string(error)); return; } - QNN_LOG_INFO("[%s]create succeed\n", graph_name.c_str()); + QNN_LOG_INFO("[%s][%s]create succeed\n", get_backend_name(device), graph_name.c_str()); _graph_handle = graph_handle; _qnn_interface = qnn_interface; } - ~ggml_qnn_graph() { QNN_LOG_DEBUG("[%s]destroy", _graph_name.c_str()); } + ~ggml_qnn_graph() { QNN_LOG_DEBUG("[%s][%s]destroy", get_backend_name(_device), _graph_name.c_str()); } bool build_graph(ggml_op_constructor_t op_constructor, const ggml_tensor_array_t &tensor_inputs, const ggml_tensor_array_t &tensor_outputs) { @@ -83,10 +84,10 @@ class ggml_qnn_graph { return false; } - QNN_LOG_DEBUG("[%s]build_graph start", _graph_name.c_str()); + QNN_LOG_DEBUG("[%s][%s]build_graph start", get_backend_name(_device), _graph_name.c_str()); _op_config = op_constructor(_graph_name, _qnn_instance); - if (!_op_config->create_tensors(_device, _graph_handle, tensor_inputs, tensor_outputs)) { - QNN_LOG_ERROR("[%s]create_tensors failed\n", _graph_name.c_str()); + if (!_op_config->initialize_op_nodes(_device, _graph_handle, tensor_inputs, tensor_outputs)) { + QNN_LOG_ERROR("[%s][%s]initialize_op_nodes failed", get_backend_name(_device), _graph_name.c_str()); return false; } @@ -97,27 +98,23 @@ class ggml_qnn_graph { auto error = _qnn_interface->qnn_graph_finalize(_graph_handle, nullptr, nullptr); if (error != QNN_SUCCESS) { - auto *error_str = get_qnn_error_string(error); - if (error_str) { - QNN_LOG_ERROR("[%s]qnn_graph_finalize.error: %s\n", _graph_name.c_str(), error_str); - } else { - QNN_LOG_ERROR("[%s]qnn_graph_finalize.error: %d\n", _graph_name.c_str(), error); - } + QNN_LOG_ERROR("[%s][%s]qnn_graph_finalize.error: %s", get_backend_name(_device), _graph_name.c_str(), + get_qnn_error_string(error)); return false; } - QNN_LOG_DEBUG("[%s]build_graph succeed", _graph_name.c_str()); + QNN_LOG_DEBUG("[%s][%s]build_graph succeed", get_backend_name(_device), _graph_name.c_str()); return true; } bool execute(const ggml_tensor_array_t &tensor_inputs, const ggml_tensor_array_t &tensor_outputs) { if (!_op_config->bind_input_tensors(tensor_inputs)) { - QNN_LOG_ERROR("[%s]bind input tensors failed\n", _graph_name.c_str()); + QNN_LOG_ERROR("[%s][%s]bind input tensors failed\n", get_backend_name(_device), _graph_name.c_str()); return false; } if (!_op_config->bind_output_tensors(tensor_outputs)) { - QNN_LOG_ERROR("[%s]bind output tensors failed\n", _graph_name.c_str()); + QNN_LOG_ERROR("[%s][%s]bind output tensors failed\n", get_backend_name(_device), _graph_name.c_str()); return false; } @@ -127,20 +124,21 @@ class ggml_qnn_graph { auto error = _qnn_interface->qnn_graph_execute(_graph_handle, qnn_tensor_inputs.data(), qnn_tensor_inputs.size(), qnn_tensor_outputs.data(), qnn_tensor_outputs.size(), nullptr, nullptr); - if (_device == QNN_BACKEND_NPU) { - if 
(error == QNN_COMMON_ERROR_SYSTEM_COMMUNICATION) { - QNN_LOG_WARN("[%s]NPU crashed. SSR detected. Caused QNN graph execute error\n", _graph_name.c_str()); - } - } - _op_config->unbind_input_tensors(); _op_config->unbind_output_tensors(); if (error != QNN_SUCCESS) { - QNN_LOG_INFO("[%s]error = %d\n", _graph_name.c_str(), error); + if (_device == QNN_BACKEND_NPU && error == QNN_COMMON_ERROR_SYSTEM_COMMUNICATION) { + QNN_LOG_WARN("[%s][%s]NPU crashed. SSR detected. Caused QNN graph execute error.", + get_backend_name(_device), _graph_name.c_str()); + } else { + QNN_LOG_ERROR("[%s][%s]error: %s", get_backend_name(_device), _graph_name.c_str(), + get_qnn_error_string(error)); + } return false; } + QNN_LOG_DEBUG("[%s][%s]execute succeed", get_backend_name(_device), _graph_name.c_str()); return true; } diff --git a/ggml/src/ggml-qnn/logger.cpp b/ggml/src/ggml-qnn/logger.cpp index 187e9088c779c..1e781721d629c 100644 --- a/ggml/src/ggml-qnn/logger.cpp +++ b/ggml/src/ggml-qnn/logger.cpp @@ -34,7 +34,7 @@ void qnn::internal_log(ggml_log_level level, const char * /*file*/, const char * } #if ENABLE_QNNSDK_LOG -void qnn::sdk_logcallback(const char *fmt, QnnLog_Level_t level, uint64_t timestamp, va_list argp) { +void qnn::sdk_logcallback(const char *fmt, QnnLog_Level_t level, uint64_t /*timestamp*/, va_list argp) { static std::mutex log_mutex; static unsigned char s_ggml_qnn_logbuf[QNN_LOGBUF_LEN]; @@ -60,13 +60,12 @@ void qnn::sdk_logcallback(const char *fmt, QnnLog_Level_t level, uint64_t timest break; } - double ms = (double)timestamp / 1000000.0; { std::lock_guard lock(log_mutex); memset(s_ggml_qnn_logbuf, 0, QNN_LOGBUF_LEN); vsnprintf(reinterpret_cast(s_ggml_qnn_logbuf), QNN_LOGBUF_LEN, fmt, argp); - QNN_LOG_INFO("%8.1fms [%-7s] %s", ms, log_level_desc, s_ggml_qnn_logbuf); + QNN_LOG_INFO("[%s]%s", log_level_desc, s_ggml_qnn_logbuf); } } #else diff --git a/ggml/src/ggml-qnn/op-config-base.hpp b/ggml/src/ggml-qnn/op-config-base.hpp new file mode 100644 index 0000000000000..159944a7d7f60 --- /dev/null +++ b/ggml/src/ggml-qnn/op-config-base.hpp @@ -0,0 +1,129 @@ +#pragma once + +#include +#include + +#include "ggml-qnn.h" + +#include "qnn-types.hpp" +#include "tensor.hpp" + +namespace qnn { + +using ggml_tensor_array_t = std::vector; + +/** + * @class ggml_qnn_op_config + * @brief Abstract base class for configuring QNN operations. + * + * This class provides an interface for creating and managing tensors, + * adding operations to a graph, and binding/unbinding input and output tensors. + */ +class ggml_qnn_op_config { +public: + virtual ~ggml_qnn_op_config() {} + + /** + * @brief Creates tensors and internal nodes for constructing the calculation graph. + * + * This pure virtual function is responsible for creating tensors on the given + * backend device, associating them with the provided graph handle, and creating + * the internal nodes necessary for constructing the calculation graph. It takes + * input and output tensor arrays as parameters. + * + * @param device The backend device where tensors will be created. + * @param graph_handle The handle to the graph where tensors and nodes will be associated. + * @param tensor_inputs An array of input tensors. + * @param tensor_outputs An array of output tensors. + * @return true if tensors and nodes are successfully created, false otherwise. 
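+     *
+     * @note As used by `ggml_qnn_graph`, this is called from `build_graph()` before `add_op_to_graph()`;
+     *       `bind_input_tensors()`/`bind_output_tensors()` are then called for each `execute()`, and the
+     *       tensors are unbound again once the graph run completes.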
+     */
+    virtual bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle,
+                                     const ggml_tensor_array_t &tensor_inputs,
+                                     const ggml_tensor_array_t &tensor_outputs) = 0;
+
+    /**
+     * @brief Pure virtual function to retrieve the input tensors for QNN (Qualcomm Neural Network).
+     *
+     * This function must be overridden by derived classes to provide the specific implementation
+     * for retrieving the input tensors used in QNN operations.
+     *
+     * @return A reference to a vector of Qnn_Tensor_t objects representing the input tensors.
+     */
+    virtual std::vector<Qnn_Tensor_t> &get_qnn_input_tensors() = 0;
+
+    /**
+     * @brief Pure virtual function to retrieve the output tensors of a QNN (Qualcomm Neural Network).
+     *
+     * This function must be overridden by any derived class to provide access to the
+     * output tensors of the QNN. The function returns a reference to a vector of
+     * Qnn_Tensor_t objects, which represent the output tensors.
+     *
+     * @return std::vector<Qnn_Tensor_t>& Reference to a vector of Qnn_Tensor_t objects.
+     */
+    virtual std::vector<Qnn_Tensor_t> &get_qnn_output_tensors() = 0;
+
+    /**
+     * @brief Adds an operation to the given graph.
+     *
+     * This pure virtual function must be implemented by derived classes to add
+     * a specific operation to the provided graph handle.
+     *
+     * This function will be called after `initialize_op_nodes` during initialization.
+     *
+     * @param graph_handle The handle to the graph where the operation will be added.
+     * @return true if the operation was successfully added to the graph, false otherwise.
+     */
+    virtual bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) = 0;
+
+    /**
+     * @brief Binds the input tensors to the operation.
+     *
+     * This pure virtual function must be implemented by derived classes to bind
+     * the provided input tensors to the operation. The function takes a constant
+     * reference to a ggml_tensor_array_t object, which contains the input tensors
+     * to be bound.
+     *
+     * @param tensor_inputs A constant reference to a ggml_tensor_array_t object
+     *                      containing the input tensors.
+     * @return true if the input tensors were successfully bound, false otherwise.
+     */
+    virtual bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) = 0;
+
+    /**
+     * @brief Binds the output tensors to the given tensor array.
+     *
+     * This pure virtual function must be implemented by derived classes to bind
+     * the output tensors to the provided array of tensors. The function is expected
+     * to establish the necessary connections or mappings between the output tensors
+     * and the elements of the given tensor array.
+     *
+     * @param tensor_outputs A constant reference to an array of ggml tensors that
+     *                       represent the output tensors to be bound.
+     * @return true if the binding is successful, false otherwise.
+     */
+    virtual bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) = 0;
+
+    /**
+     * @brief Unbinds the input tensors from the operation.
+     *
+     * This pure virtual function is intended to be overridden by derived classes
+     * to implement the logic for unbinding or detaching input tensors that were
+     * previously bound to the operation. This is typically used to release resources
+     * or reset the state of the operation.
+     */
+    virtual void unbind_input_tensors() = 0;
+
+    /**
+     * @brief Unbinds the output tensors.
+     *
+     * This pure virtual function is responsible for unbinding or detaching
+     * the output tensors from their current bindings.
Implementations of this + * function should ensure that any resources or references held by the + * output tensors are properly released or reset. + */ + virtual void unbind_output_tensors() = 0; +}; + +using qnn_op_config_ptr_t = std::shared_ptr; + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/op-config.cpp b/ggml/src/ggml-qnn/op-config.cpp index 9b98051adfc8e..df70d548a44e0 100644 --- a/ggml/src/ggml-qnn/op-config.cpp +++ b/ggml/src/ggml-qnn/op-config.cpp @@ -36,7 +36,7 @@ int get_rank(const qnn::ggml_tensor_array_t &tensor_inputs, const qnn::ggml_tens return tensor_rank; } -Qnn_DataType_t get_tensor_type(const qnn::ggml_qnn_tensor_array_t &tensors) { +Qnn_DataType_t get_tensor_type(const qnn::qnn_tensor_array_t &tensors) { Qnn_DataType_t type = QNN_DATATYPE_UNDEFINED; for (auto tensor : tensors) { auto tensor_type_size = qnn::qnn_datatype_size(tensor->get_data_type()); @@ -59,8 +59,7 @@ struct tensor_common_params { }; void create_tensors_from_ggml_tensor(const tensor_common_params ¶ms, const qnn::ggml_tensor_array_t &ggml_tensors, - qnn::ggml_qnn_tensor_array_t *tensor_wrappers, - std::vector *qnn_tensors) { + qnn::qnn_tensor_array_t *tensor_wrappers, std::vector *qnn_tensors) { using namespace qnn; tensor_wrappers->resize(ggml_tensors.size()); @@ -78,7 +77,7 @@ void create_tensors_from_ggml_tensor(const tensor_common_params ¶ms, const q } } -bool bind_tensors(const qnn::ggml_tensor_array_t &ggml_tensors, qnn::ggml_qnn_tensor_array_t &tensor_wrappers, +bool bind_tensors(const qnn::ggml_tensor_array_t &ggml_tensors, qnn::qnn_tensor_array_t &tensor_wrappers, std::vector &qnn_tensors) { for (size_t i = 0; i < ggml_tensors.size(); i++) { auto *ggml_tensor = ggml_tensors[i]; @@ -99,9 +98,9 @@ class ggml_qnn_connectable_op_config : public qnn::ggml_qnn_op_config_base { const std::string &op_type, std::shared_ptr qnn_instance) : ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {} - bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, - const qnn::ggml_tensor_array_t &tensor_inputs, - const qnn::ggml_tensor_array_t &tensor_outputs) override { + bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, + const qnn::ggml_tensor_array_t &tensor_inputs, + const qnn::ggml_tensor_array_t &tensor_outputs) override { GGML_UNUSED(device); GGML_UNUSED(graph_handle); GGML_UNUSED(tensor_inputs); @@ -109,28 +108,28 @@ class ggml_qnn_connectable_op_config : public qnn::ggml_qnn_op_config_base { return true; } - void set_input_tensors(qnn::ggml_qnn_tensor_array_t &tensor_inputs) { + void set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) { _tensor_inputs = tensor_inputs; _qnn_tensor_inputs.resize(_tensor_inputs.size()); } - void set_input_tensors(qnn::ggml_qnn_tensor_array_t &&tensor_inputs) { + void set_input_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) { _tensor_inputs = std::move(tensor_inputs); _qnn_tensor_inputs.resize(_tensor_inputs.size()); } - void set_output_tensors(qnn::ggml_qnn_tensor_array_t &tensor_outputs) { + void set_output_tensors(qnn::qnn_tensor_array_t &tensor_outputs) { _tensor_outputs = tensor_outputs; _qnn_tensor_outputs.resize(_tensor_outputs.size()); } - void set_output_tensors(qnn::ggml_qnn_tensor_array_t &&tensor_outputs) { + void set_output_tensors(qnn::qnn_tensor_array_t &&tensor_outputs) { _tensor_outputs = std::move(tensor_outputs); _qnn_tensor_outputs.resize(_tensor_outputs.size()); } - qnn::ggml_qnn_tensor_array_t &get_input_tensors() { return _tensor_inputs; } - qnn::ggml_qnn_tensor_array_t 
&get_output_tensors() { return _tensor_outputs; } + qnn::qnn_tensor_array_t &get_input_tensors() { return _tensor_inputs; } + qnn::qnn_tensor_array_t &get_output_tensors() { return _tensor_outputs; } private: DISABLE_COPY(ggml_qnn_connectable_op_config); @@ -186,7 +185,7 @@ bool ggml_qnn_op_config_base::add_op_to_graph(Qnn_GraphHandle_t graph_handle) { GGML_ASSERT(_qnn_tensor_inputs.size() == _tensor_inputs.size()); GGML_ASSERT(_qnn_tensor_outputs.size() == _tensor_outputs.size()); - auto qnn_interface = _qnn_instance->get_qnn_interface(); + QNN_LOG_DEBUG("[%s]add to graph start\n", _name.c_str()); for (size_t i = 0; i < _tensor_inputs.size(); i++) { auto tensor = _tensor_inputs[i]; if (!tensor->alloc_qnn_tensor_id()) { @@ -194,6 +193,7 @@ bool ggml_qnn_op_config_base::add_op_to_graph(Qnn_GraphHandle_t graph_handle) { return false; } + QNN_LOG_DEBUG("[%s]input tensor id: %d\n", _name.c_str(), tensor->get_qnn_tensor_id()); _qnn_tensor_inputs[i] = tensor->get_qnn_tensor(); } @@ -203,21 +203,19 @@ bool ggml_qnn_op_config_base::add_op_to_graph(Qnn_GraphHandle_t graph_handle) { QNN_LOG_ERROR("[%s]output tensor alloc_qnn_tensor_id failed\n", _name.c_str()); return false; } - _qnn_tensor_outputs[i] = _tensor_outputs[i]->get_qnn_tensor(); + + QNN_LOG_DEBUG("[%s]output tensor id: %d\n", _name.c_str(), tensor->get_qnn_tensor_id()); + _qnn_tensor_outputs[i] = tensor->get_qnn_tensor(); } + auto qnn_interface = _qnn_instance->get_qnn_interface(); auto error = qnn_interface->qnn_graph_add_node(graph_handle, get_op_config()); if (error != QNN_SUCCESS) { - auto *error_str = get_qnn_error_string(error); - if (error_str) { - QNN_LOG_ERROR("[%s]qnn_graph_add_node.error: %s\n", _name.c_str(), error_str); - } else { - QNN_LOG_ERROR("[%s]qnn_graph_add_node.error: %d\n", _name.c_str(), error); - } + QNN_LOG_ERROR("[%s]qnn_graph_add_node.error: %s", _name.c_str(), get_qnn_error_string(error)); return false; } - QNN_LOG_DEBUG("[%s]added to graph\n", _name.c_str()); + QNN_LOG_DEBUG("[%s]added to graph succeed\n", _name.c_str()); return true; } @@ -259,9 +257,9 @@ Qnn_OpConfig_t ggml_qnn_op_config_base::get_op_config() { return config; } -bool ggml_qnn_single_op_config::create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, - const ggml_tensor_array_t &tensor_inputs, - const ggml_tensor_array_t &tensor_outputs) { +bool ggml_qnn_single_op_config::initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, + const ggml_tensor_array_t &tensor_inputs, + const ggml_tensor_array_t &tensor_outputs) { const auto tensor_rank = get_rank(tensor_inputs, tensor_outputs); tensor_common_params params = {"src", tensor_rank, true, device, graph_handle, _qnn_instance}; create_tensors_from_ggml_tensor(params, tensor_inputs, &_tensor_inputs, &_qnn_tensor_inputs); @@ -282,9 +280,9 @@ bool ggml_qnn_single_op_config::create_tensors(QNNBackend device, Qnn_GraphHandl return true; } -bool ggml_qnn_matmul_op_config::create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, - const ggml_tensor_array_t &tensor_inputs, - const ggml_tensor_array_t &tensor_outputs) { +bool ggml_qnn_matmul_op_config::initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, + const ggml_tensor_array_t &tensor_inputs, + const ggml_tensor_array_t &tensor_outputs) { GGML_ASSERT(tensor_inputs.size() == 2); GGML_ASSERT(tensor_outputs.size() == 1); const auto tensor_rank = get_rank(tensor_inputs, tensor_outputs); @@ -295,59 +293,143 @@ bool ggml_qnn_matmul_op_config::create_tensors(QNNBackend device, Qnn_GraphHandl 
     create_tensors_from_ggml_tensor(params, tensor_inputs, &_tensor_inputs, &_qnn_tensor_inputs);
     // create output tensor
-    ggml_qnn_tensor_array_t mat_mul_tensor_outputs;
+    qnn_tensor_array_t mat_mul_tensor_outputs;
     params.name_prefix = "dst";
     params.is_input = false;
     create_tensors_from_ggml_tensor(params, tensor_outputs, &mat_mul_tensor_outputs, nullptr);
+    // create convert nodes
+    qnn_tensor_array_t mat_mul_tensor_inputs = _tensor_inputs;
+    if (!create_convert_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs, mat_mul_tensor_outputs)) {
+        QNN_LOG_ERROR("create convert nodes failed\n");
+        return false;
+    }
+
+    mat_mul_tensor_inputs.front() = create_gather_nodes(device, graph_handle, tensor_rank, _tensor_inputs.front(),
+                                                        _tensor_inputs.back()->get_dimensions());
+    return create_mat_mul_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs, mat_mul_tensor_outputs);
+}
+
+qnn_tensor_ptr_t ggml_qnn_matmul_op_config::create_gather_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle,
+                                                                const int rank, qnn_tensor_ptr_t tensor_input,
+                                                                qnn_dimension_array_t output_dimensions) {
+    if (rank <= 2) {
+        return tensor_input;
+    }
+
+    const auto &input_dimensions = tensor_input->get_dimensions();
+    output_dimensions[rank - 1] = input_dimensions[rank - 1];
+    output_dimensions[rank - 2] = input_dimensions[rank - 2];
+
+    const auto y = output_dimensions[rank - 3] / input_dimensions[rank - 3];
+    if (y == 1 && (rank == 3 || (rank == 4 && output_dimensions[rank - 4] == input_dimensions[rank - 4]))) {
+        return tensor_input;
+    }
+
+    // create gather nodes to convert tensor shape from [ne03, ne02, n, k] to [ne03 * x, ne02 * y, n, k]
+    constexpr const auto create_node =
+        [](const std::string &name, const int rank, const int axis, const qnn_dimension_array_t &dimensions,
+           qnn_tensor_ptr_t tensor_input, QNNBackend device, Qnn_GraphHandle_t graph_handle,
+           std::shared_ptr<qnn_instance> qnn_instance, qnn_tensor_ptr_t &tensor_output) -> qnn_op_config_ptr_t {
+        auto gather_out =
+            std::make_shared<ggml_qnn_tensor>(ggml_qnn_tensor::INTERMEDIATE, name + "_out", dimensions,
+                                              tensor_input->get_data_type(), rank, device, graph_handle, qnn_instance);
+        auto gather_op = std::make_shared<ggml_qnn_connectable_op_config>(name, QNN_OP_PACKAGE_NAME_QTI_AISW,
+                                                                          QNN_OP_GATHER, qnn_instance);
+
+        Qnn_Scalar_t scalar = QNN_SCALAR_INIT;
+        scalar.dataType = QNN_DATATYPE_INT_32;
+        scalar.int32Value = axis;
+        gather_op->add_scalar_param(QNN_OP_GATHER_PARAM_AXIS, scalar);
+        gather_op->set_output_tensors({gather_out});
+
+        // here we calculate the index mapping, which will generate a 1d tensor like [0, 0, 0, 1, 1, 1, 2, 2, 2, ...],
+        // by repeating each index [scale] times.
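+        // (e.g. with dimensions[axis] == 6 and an input extent of 2 along that axis, scale == 3 and the
+        // generated index tensor is [0, 0, 0, 1, 1, 1], i.e. each input slice is repeated 3 times)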
+ const auto scale = dimensions[axis] / tensor_input->get_dimensions()[axis]; + std::vector index_buffer(dimensions[axis] * sizeof(uint32_t)); + for (uint32_t *curr = reinterpret_cast(index_buffer.data()), *end = curr + dimensions[axis]; + curr < end; curr++) { + *curr = (curr - reinterpret_cast(index_buffer.data())) / scale; + } + + auto gather_index = std::make_shared( + ggml_qnn_tensor::PARAMETER, name + "_index", qnn_dimension_array_t{dimensions[axis]}, QNN_DATATYPE_UINT_32, + 1, device, graph_handle, qnn_instance); + gather_index->set_data_buffer(std::move(index_buffer)); + gather_op->set_input_tensors({tensor_input, gather_index}); + + tensor_output = gather_out; + return gather_op; + }; + + qnn_dimension_array_t intermediate_dimensions = input_dimensions; + intermediate_dimensions[rank - 3] = output_dimensions[rank - 3]; + qnn_tensor_ptr_t gather0_out; + _gather0 = create_node(_name + "_gather0", rank, rank - 3, intermediate_dimensions, tensor_input, device, + graph_handle, _qnn_instance, gather0_out); + if (rank == 3) { + return gather0_out; + } + + qnn_tensor_ptr_t gather1_out; + _gather1 = create_node(_name + "_gather1", rank, rank - 4, output_dimensions, gather0_out, device, graph_handle, + _qnn_instance, gather1_out); + return gather1_out; +} + +bool ggml_qnn_matmul_op_config::create_convert_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, + qnn_tensor_array_t &tensor_inputs, + qnn_tensor_array_t &tensor_outputs) { if (device == QNN_BACKEND_GPU) { - // there's no convert op for GPU, so we should create matmul nodes directl. - return create_mat_mul_nodes(device, graph_handle, tensor_rank, _tensor_inputs, mat_mul_tensor_outputs); + // there's no convert op for GPU, so we should create matmul nodes directly. + return true; } // create tensors for convert node - ggml_qnn_tensor_array_t mat_mul_tensor_inputs = _tensor_inputs; - auto input_tensor_type = get_tensor_type(mat_mul_tensor_inputs); - QNN_LOG_DEBUG("matmul input tensor type: %s\n", qnn_datatype_to_string(input_tensor_type)); + auto tensor_type = get_tensor_type(tensor_inputs); + QNN_LOG_DEBUG("input tensor type: %s\n", qnn_datatype_to_string(tensor_type)); - _input_converts.resize(mat_mul_tensor_inputs.size()); - for (size_t i = 0; i < mat_mul_tensor_inputs.size(); ++i) { + _input_converts.resize(tensor_inputs.size()); + for (size_t i = 0; i < tensor_inputs.size(); ++i) { // create input convert nodes + auto convert_in = tensor_inputs[i]; + if (convert_in->get_data_type() == tensor_type) { + continue; + } + std::string convert_name("convert_src" + std::to_string(i)); - auto convert_in = mat_mul_tensor_inputs[i]; auto convert_out = std::make_shared(ggml_qnn_tensor::INTERMEDIATE, convert_name + "_out", - convert_in->get_dimensions(), input_tensor_type, - tensor_rank, device, graph_handle, _qnn_instance); + convert_in->get_dimensions(), tensor_type, rank, device, + graph_handle, _qnn_instance); auto convert = std::make_shared(convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_CONVERT, _qnn_instance); convert->set_input_tensors({convert_in}); convert->set_output_tensors({convert_out}); - mat_mul_tensor_inputs[i] = convert_out; + tensor_inputs[i] = convert_out; _input_converts[i] = convert; } - { + if (tensor_outputs.front()->get_data_type() != tensor_type) { // create output convert node std::string convert_name("convert_dst"); - auto convert_out = mat_mul_tensor_outputs.front(); + auto convert_out = tensor_outputs.front(); auto convert_in = std::make_shared(ggml_qnn_tensor::INTERMEDIATE, convert_name + 
"_in", - convert_out->get_dimensions(), input_tensor_type, - tensor_rank, device, graph_handle, _qnn_instance); + convert_out->get_dimensions(), tensor_type, rank, device, + graph_handle, _qnn_instance); auto output_convert = std::make_shared( convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_CONVERT, _qnn_instance); output_convert->set_input_tensors({convert_in}); output_convert->set_output_tensors({convert_out}); - mat_mul_tensor_outputs[0] = convert_in; + tensor_outputs.front() = convert_in; _output_convert = output_convert; } - // create mat_mul nodes - return create_mat_mul_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs, mat_mul_tensor_outputs); + return true; } bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, - ggml_qnn_tensor_array_t &tensor_inputs, - ggml_qnn_tensor_array_t &tensor_outputs) { + qnn_tensor_array_t &tensor_inputs, + qnn_tensor_array_t &tensor_outputs) { /* * First, both the ggml and qnn tensor in memory are stored as row-major format. (For more details, please also: @@ -386,9 +468,8 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap * ```mermaid * graph TD; * i1>ggml_tensor_in0] --src0--> mat_mul0; - * i2>ggml_tensor_in1] --src1--> transpose0; - * transpose0 --src0_trans--> mat_mul0; - * mat_mul0 --dst_trans--> transpose1; + * i2>ggml_tensor_in1] --src1--> mat_mul0; + * mat_mul0 --dst_trans--> transpose_out; * transpose1 --dst0--> o1>ggml_tensor_out]; * ``` */ @@ -398,9 +479,6 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS does not match the expected value"); qnn_dimension_array_t dimensions = get_transposed_dimensions(src1->get_dimensions(), rank); - auto src0_trans = - std::make_shared(ggml_qnn_tensor::INTERMEDIATE, "src0_trans", dimensions, - src1->get_data_type(), rank, device, graph_handle, _qnn_instance); // create dst_trans tensor auto dst = tensor_outputs.front(); @@ -408,48 +486,37 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap auto dst_trans = std::make_shared(ggml_qnn_tensor::INTERMEDIATE, "dst_trans", dimensions, dst->get_data_type(), rank, device, graph_handle, _qnn_instance); - // create transpose0 - auto transpose0 = std::make_shared(_name + "_trans0", QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_TRANSPOSE, _qnn_instance); - - // create transpose1 - auto transpose1 = std::make_shared(_name + "_trans1", QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_TRANSPOSE, _qnn_instance); + // create transpose_out + auto transpose_out = std::make_shared( + _name + "_trans1", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_TRANSPOSE, _qnn_instance); // create mat_mul auto mat_mul = std::make_shared(_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL, _qnn_instance); - // set transpose0 parameters + Qnn_Scalar_t scalar = QNN_SCALAR_INIT; + scalar.dataType = QNN_DATATYPE_BOOL_8; + scalar.bool8Value = 1; + mat_mul->add_scalar_param(QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN1, scalar); + + // set transpose_out parameters auto *params_data = reinterpret_cast(kTransposeParamData[rank - 1].data()); const qnn_dimension_array_t param_dims = {(uint32_t)rank, 1, 1, 1}; - transpose0->add_tensor_param(QNN_OP_TRANSPOSE_PARAM_PERM, param_dims, 1, params_data, QNN_DATATYPE_UINT_32, device, - graph_handle); - - // set transpose1 parameters - transpose1->add_tensor_param(QNN_OP_TRANSPOSE_PARAM_PERM, param_dims, 1, params_data, QNN_DATATYPE_UINT_32, device, - graph_handle); - - 
// set tensor to transpose0 - ggml_qnn_tensor_array_t tensors = {tensor_inputs.back()}; - transpose0->set_input_tensors(tensors); - tensors = {src0_trans}; - transpose0->set_output_tensors(tensors); + transpose_out->add_tensor_param(QNN_OP_TRANSPOSE_PARAM_PERM, param_dims, 1, params_data, QNN_DATATYPE_UINT_32, + device, graph_handle); // set tensor to mat_mul - tensors = {tensor_inputs.front(), src0_trans}; - mat_mul->set_input_tensors(tensors); - tensors = {dst_trans}; + mat_mul->set_input_tensors(tensor_inputs); + qnn_tensor_array_t tensors = {dst_trans}; mat_mul->set_output_tensors(tensors); - // set tensor to transpose1 + // set tensor to transpose_out tensors = {dst_trans}; - transpose1->set_input_tensors(tensors); - transpose1->set_output_tensors(tensor_outputs); + transpose_out->set_input_tensors(tensors); + transpose_out->set_output_tensors(tensor_outputs); _mat_mul = mat_mul; - _transpose0 = transpose0; - _transpose1 = transpose1; + _transpose_out = transpose_out; return true; } @@ -460,8 +527,15 @@ bool ggml_qnn_matmul_op_config::add_op_to_graph(Qnn_GraphHandle_t graph_handle) } } - return _transpose0->add_op_to_graph(graph_handle) && _mat_mul->add_op_to_graph(graph_handle) && - _transpose1->add_op_to_graph(graph_handle) && + if (_gather0 && !_gather0->add_op_to_graph(graph_handle)) { + return false; + } + + if (_gather1 && !_gather1->add_op_to_graph(graph_handle)) { + return false; + } + + return _mat_mul->add_op_to_graph(graph_handle) && _transpose_out->add_op_to_graph(graph_handle) && (!_output_convert || _output_convert->add_op_to_graph(graph_handle)); } @@ -473,13 +547,12 @@ bool ggml_qnn_matmul_op_config::bind_output_tensors(const ggml_tensor_array_t &t if (_output_convert) { return _output_convert->bind_output_tensors(tensor_outputs); } else { - return _transpose1->bind_output_tensors(tensor_outputs); + return _transpose_out->bind_output_tensors(tensor_outputs); } } void ggml_qnn_matmul_op_config::unbind_input_tensors() { _mat_mul->unbind_input_tensors(); - _transpose0->unbind_input_tensors(); for (auto &convert : _input_converts) { if (convert) { convert->unbind_input_tensors(); @@ -488,7 +561,7 @@ void ggml_qnn_matmul_op_config::unbind_input_tensors() { } void ggml_qnn_matmul_op_config::unbind_output_tensors() { - _transpose1->unbind_output_tensors(); + _transpose_out->unbind_output_tensors(); if (_output_convert) { _output_convert->unbind_output_tensors(); } @@ -498,7 +571,7 @@ std::vector &ggml_qnn_matmul_op_config::get_qnn_output_tensors() { if (_output_convert) { return _output_convert->get_qnn_output_tensors(); } else { - return _transpose1->get_qnn_output_tensors(); + return _transpose_out->get_qnn_output_tensors(); } } @@ -513,9 +586,9 @@ ggml_op_constructor_t create_op_constructor(const std::string &op_name) { } else if (op_name == QNN_OP_TRANSPOSE) { return [](const std::string &instance_name, std::shared_ptr qnn_instance) -> std::unique_ptr { - return std::make_unique(instance_name, QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_TRANSPOSE, QNN_OP_TRANSPOSE_PARAM_PERM, - QNN_DATATYPE_UINT_32, 4 * sizeof(uint32_t), qnn_instance); + return std::make_unique( + instance_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_TRANSPOSE, QNN_OP_TRANSPOSE_PARAM_PERM, + QNN_DATATYPE_UINT_32, 4 * sizeof(uint32_t), qnn_instance); }; } diff --git a/ggml/src/ggml-qnn/op-config.hpp b/ggml/src/ggml-qnn/op-config.hpp index 4ec7aac9b256e..27571563309a8 100644 --- a/ggml/src/ggml-qnn/op-config.hpp +++ b/ggml/src/ggml-qnn/op-config.hpp @@ -5,31 +5,13 @@ #include #include -#include "ggml-qnn.h" - 
+#include "op-config-base.hpp" #include "qnn-lib.hpp" #include "qnn-types.hpp" #include "tensor.hpp" namespace qnn { -using ggml_tensor_array_t = std::vector; - -class ggml_qnn_op_config { -public: - virtual ~ggml_qnn_op_config() {} - virtual bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, - const ggml_tensor_array_t &tensor_inputs, - const ggml_tensor_array_t &tensor_outputs) = 0; - virtual std::vector &get_qnn_input_tensors() = 0; - virtual std::vector &get_qnn_output_tensors() = 0; - virtual bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) = 0; - virtual bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) = 0; - virtual bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) = 0; - virtual void unbind_input_tensors() = 0; - virtual void unbind_output_tensors() = 0; -}; - using ggml_op_constructor_t = std::function(const std::string &, std::shared_ptr)>; @@ -60,9 +42,9 @@ class ggml_qnn_op_config_base : public ggml_qnn_op_config { std::string _package_name; std::string _op_type; std::shared_ptr _qnn_instance; - ggml_qnn_tensor_array_t _tensor_inputs; - ggml_qnn_tensor_array_t _tensor_outputs; - ggml_qnn_tensor_array_t _tensor_parameters; + qnn_tensor_array_t _tensor_inputs; + qnn_tensor_array_t _tensor_outputs; + qnn_tensor_array_t _tensor_parameters; std::vector _qnn_tensor_inputs; std::vector _qnn_tensor_outputs; std::vector _qnn_parameters; @@ -87,8 +69,9 @@ class ggml_qnn_single_op_config : public ggml_qnn_op_config_base { _param_type(param_type), _param_buffer(param_size) {} - bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, const ggml_tensor_array_t &tensor_inputs, - const ggml_tensor_array_t &tensor_outputs) override; + bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, + const ggml_tensor_array_t &tensor_inputs, + const ggml_tensor_array_t &tensor_outputs) override; private: const std::string _param_name; @@ -104,8 +87,9 @@ class ggml_qnn_matmul_op_config : public ggml_qnn_op_config { ggml_qnn_matmul_op_config(const std::string &name, std::shared_ptr qnn_instance) : _name(name), _qnn_instance(qnn_instance) {} - bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, const ggml_tensor_array_t &tensor_inputs, - const ggml_tensor_array_t &tensor_outputs) override; + bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, + const ggml_tensor_array_t &tensor_inputs, + const ggml_tensor_array_t &tensor_outputs) override; bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) override; bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) override; bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) override; @@ -115,17 +99,22 @@ class ggml_qnn_matmul_op_config : public ggml_qnn_op_config { std::vector &get_qnn_output_tensors() override; private: + qnn_tensor_ptr_t create_gather_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, + qnn_tensor_ptr_t tensor_input, qnn_dimension_array_t output_dimensions); + bool create_convert_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, + qnn_tensor_array_t &tensor_inputs, qnn_tensor_array_t &tensor_outputs); bool create_mat_mul_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, - ggml_qnn_tensor_array_t &tensor_inputs, ggml_qnn_tensor_array_t &tensor_outputs); + qnn_tensor_array_t &tensor_inputs, qnn_tensor_array_t &tensor_outputs); std::string _name; std::shared_ptr _qnn_instance; - std::shared_ptr _transpose0; - 
std::shared_ptr _transpose1; - std::shared_ptr _mat_mul; - std::vector> _input_converts; - std::shared_ptr _output_convert; - ggml_qnn_tensor_array_t _tensor_inputs; + qnn_op_config_ptr_t _transpose_out; + qnn_op_config_ptr_t _mat_mul; + qnn_op_config_ptr_t _gather0; + qnn_op_config_ptr_t _gather1; + std::vector _input_converts; + qnn_op_config_ptr_t _output_convert; + qnn_tensor_array_t _tensor_inputs; std::vector _qnn_tensor_inputs; DISABLE_COPY(ggml_qnn_matmul_op_config); diff --git a/ggml/src/ggml-qnn/qnn-lib.hpp b/ggml/src/ggml-qnn/qnn-lib.hpp index 74bc2b3f95f6b..c6801b7771ee9 100644 --- a/ggml/src/ggml-qnn/qnn-lib.hpp +++ b/ggml/src/ggml-qnn/qnn-lib.hpp @@ -188,8 +188,8 @@ class qnn_instance { public: using BackendIdType = decltype(QnnInterface_t{}.backendId); - explicit qnn_instance(const std::string &lib_path, const std::string &backend_name, const std::string &model_name) : - _lib_path(std::move(lib_path)), _backend_name(std::move(backend_name)), _model_name(std::move(model_name)) {} + explicit qnn_instance(const std::string &lib_path, const std::string &backend_name, const std::string &model_name) + : _lib_path(std::move(lib_path)), _backend_name(std::move(backend_name)), _model_name(std::move(model_name)) {} ~qnn_instance() {} @@ -269,7 +269,7 @@ class qnn_instance { QNN_LOG_INFO("qualcomm soc_model:%d(%s), htp_arch:%d(%s), vtcm_size:%d MB", chipinfo.socModel, qnn::get_chipset_desc(chipinfo.socModel), htp_arch, qnn::get_htparch_desc(htp_arch), chipinfo.vtcmSize); - _soc_info = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize }; + _soc_info = {chipinfo.socModel, htp_arch, chipinfo.vtcmSize}; } _qnn_interface->qnn_device_free_platform_info(nullptr, p_info); @@ -288,7 +288,7 @@ class qnn_instance { arch_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; arch_devconfig.customConfig = &arch_customconfig; - const QnnDevice_Config_t *p_deviceconfig[] = { &soc_devconfig, &arch_devconfig, nullptr }; + const QnnDevice_Config_t *p_deviceconfig[] = {&soc_devconfig, &arch_devconfig, nullptr}; qnn_status = _qnn_interface->qnn_device_create(_qnn_log_handle, p_deviceconfig, &_qnn_device_handle); } else { qnn_status = _qnn_interface->qnn_device_create(_qnn_log_handle, nullptr, &_qnn_device_handle); @@ -299,27 +299,17 @@ class qnn_instance { QNN_LOG_INFO("create QNN device successfully\n"); } - if (qnn::sdk_profile_level::profile_off != _profile_level) { + if (_profile_level != sdk_profile_level::profile_off) { QNN_LOG_INFO("profiling turned on; level = %d", _profile_level); - if (qnn::sdk_profile_level::profile_basic == _profile_level) { - QNN_LOG_INFO("basic profiling requested. creating Qnn Profile object\n"); - if (QNN_PROFILE_NO_ERROR != _qnn_interface->qnn_profile_create( - _qnn_backend_handle, QNN_PROFILE_LEVEL_BASIC, &_qnn_profile_handle)) { - QNN_LOG_WARN("unable to create profile handle in the backend\n"); - return 6; - } else { - QNN_LOG_DEBUG("initialize qnn profile successfully\n"); - } - } else if (qnn::sdk_profile_level::profile_detail == _profile_level) { - QNN_LOG_INFO("detailed profiling requested. Creating Qnn Profile object\n"); - if (QNN_PROFILE_NO_ERROR != _qnn_interface->qnn_profile_create(_qnn_backend_handle, - QNN_PROFILE_LEVEL_DETAILED, - &_qnn_profile_handle)) { - QNN_LOG_WARN("unable to create profile handle in the backend\n"); - return 7; - } else { - QNN_LOG_DEBUG("initialize qnn profile successfully\n"); - } + auto profile_level = _profile_level == sdk_profile_level::profile_detail ? 
QNN_PROFILE_LEVEL_DETAILED + : QNN_PROFILE_LEVEL_BASIC; + + if (QNN_PROFILE_NO_ERROR != + _qnn_interface->qnn_profile_create(_qnn_backend_handle, profile_level, &_qnn_profile_handle)) { + QNN_LOG_WARN("unable to create profile handle in the backend\n"); + return 6; + } else { + QNN_LOG_DEBUG("initialize qnn profile successfully\n"); } } @@ -364,7 +354,7 @@ class qnn_instance { size_t candidate_size = 0; uint8_t *rpc_buffer = nullptr; const int size_in_mb = (1 << 20); - size_t probe_slots[] = { 1024, 1536, 2048 - 48, 2048 }; + size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048}; size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); for (size_t idx = 0; idx < probe_counts; idx++) { rpc_buffer = static_cast(alloc_rpcmem(probe_slots[idx] * size_in_mb, sizeof(void *))); @@ -526,13 +516,13 @@ class qnn_instance { // use rpc control latency recommended 100 us, refer hexagon sdk rpc_control_latency.rpcControlLatencyConfig = 100; - const QnnHtpPerfInfrastructure_PowerConfig_t *power_configs[] = { &rpc_polling_time, &rpc_control_latency, - nullptr }; + const QnnHtpPerfInfrastructure_PowerConfig_t *power_configs[] = {&rpc_polling_time, &rpc_control_latency, + nullptr}; Qnn_ErrorHandle_t qnn_status = _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs); if (qnn_status != QNN_SUCCESS) { QNN_LOG_WARN("set htp perf failed\n"); } else { - QNN_LOG_INFO("set htp perf ok\n"); + QNN_LOG_DEBUG("set htp perf ok\n"); } } else { QNN_LOG_WARN("can't set htp perf\n"); @@ -572,13 +562,13 @@ class qnn_instance { power_config.dcvsV3Config.coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; // set power config with different performance parameters - const QnnHtpPerfInfrastructure_PowerConfig_t *power_configs[] = { &power_config, nullptr }; + const QnnHtpPerfInfrastructure_PowerConfig_t *power_configs[] = {&power_config, nullptr}; Qnn_ErrorHandle_t qnn_status = QNN_SUCCESS; qnn_status = _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs); if (qnn_status != QNN_SUCCESS) { QNN_LOG_WARN("set htp high performance mode failed\n"); } else { - QNN_LOG_INFO("set htp high performance mode ok\n"); + QNN_LOG_DEBUG("set htp high performance mode ok\n"); } return 0; @@ -659,8 +649,8 @@ class qnn_instance { return nullptr; } - QNN_LOG_INFO("mem_fd %d\n", mem_fd); - Qnn_MemDescriptor_t descriptor = { { rank, dimensions, nullptr }, data_type, QNN_MEM_TYPE_ION, { { mem_fd } } }; + QNN_LOG_DEBUG("mem_fd %d\n", mem_fd); + Qnn_MemDescriptor_t descriptor = {{rank, dimensions, nullptr}, data_type, QNN_MEM_TYPE_ION, {{mem_fd}}}; Qnn_MemHandle_t handle = nullptr; auto error = _qnn_interface->qnn_mem_register(_qnn_context_handle, &descriptor, /*numDescriptors=*/1, &handle); @@ -670,8 +660,8 @@ class qnn_instance { return nullptr; } - _qnn_rpc_buffer_to_handles.insert({ p_data, handle }); - QNN_LOG_INFO("successfully register shared memory handler: %p\n", handle); + _qnn_rpc_buffer_to_handles.insert({p_data, handle}); + QNN_LOG_DEBUG("successfully register shared memory handler: %p\n", handle); return handle; } @@ -748,7 +738,7 @@ class qnn_instance { QNN_LOG_WARN("unable to find a valid qnn system interface\n"); return 6; } else { - QNN_LOG_INFO("find a valid qnn system interface\n"); + QNN_LOG_DEBUG("find a valid qnn system interface\n"); } auto qnn_sys_interface = std::make_shared(*provider_list[0], system_lib_handle); @@ -810,7 +800,7 @@ class qnn_instance { QNN_LOG_WARN("unable to find a valid qnn interface\n"); return 6; } else { - QNN_LOG_INFO("find a valid qnn interface\n"); + 
QNN_LOG_DEBUG("find a valid qnn interface\n"); } BackendIdType backend_id = provider_list[0]->backendId; @@ -890,7 +880,7 @@ class qnn_instance { std::unordered_map _loaded_backend; dl_handler_t _rpc_lib_handle = nullptr; - std::atomic_bool _rpcmem_initialized{ false }; + std::atomic_bool _rpcmem_initialized{false}; qnn::pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; qnn::pfn_rpc_mem_free _pfn_rpc_mem_free; qnn::pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd; diff --git a/ggml/src/ggml-qnn/qnn-types.hpp b/ggml/src/ggml-qnn/qnn-types.hpp index 8fce790defb61..7461ac3012755 100644 --- a/ggml/src/ggml-qnn/qnn-types.hpp +++ b/ggml/src/ggml-qnn/qnn-types.hpp @@ -14,7 +14,7 @@ namespace qnn { // Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK // ref:https://github.com/pytorch/executorch/tree/main/backends/qualcomm // ================================================================================================= -enum sdk_profile_level { profile_off = 0, profile_basic = 1, profile_detail = 2 }; +enum sdk_profile_level { profile_off = 0, profile_basic, profile_detail }; enum qcom_htp_arch { NONE = 0, diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index f28fc8e2ca1e2..0a9a367015127 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -27,8 +27,8 @@ class ggml_qnn_tensor { explicit ggml_qnn_tensor(tensor_type_t tensor_type, const std::string &name, const qnn_dimension_array_t &dimensions, Qnn_DataType_t data_type, int rank, QNNBackend device, Qnn_GraphHandle_t graph_handle, - std::shared_ptr qnn_instance) : - _tensor_name(name), _device(device), _qnn_instance(qnn_instance), _graph_handle(graph_handle) { + std::shared_ptr qnn_instance) + : _tensor_name(name), _device(device), _qnn_instance(qnn_instance), _graph_handle(graph_handle) { if (!_tensor_name.empty()) { QNN_TENSOR_SET_NAME(_qnn_tensor, _tensor_name.c_str()); } @@ -37,23 +37,35 @@ class ggml_qnn_tensor { _dimensions = dimensions; update_params_from_ggml_tensor(tensor_type, data_type, rank); - QNN_LOG_DEBUG("create tensor %s, rank: %d, dims: [%d, %d, %d, %d], data_type: %d, device: %d", + QNN_LOG_DEBUG("[%s][%s]created, rank: %d, dims: [%d, %d, %d, %d], type: %s", get_backend_name(device), _tensor_name.c_str(), rank, (int)_dimensions[0], (int)_dimensions[1], (int)_dimensions[2], - (int)_dimensions[3], (int)data_type, (int)device); + (int)_dimensions[3], qnn_datatype_to_string(data_type)); } explicit ggml_qnn_tensor(tensor_type_t tensor_type, const std::string &name, const ggml_dimension_array_t &dimensions, ggml_type data_type, int rank, QNNBackend device, - Qnn_GraphHandle_t graph_handle, std::shared_ptr qnn_instance) : - ggml_qnn_tensor(tensor_type, name, get_internal_dimension(dimensions, rank), - qnn_datatype_from_ggml_datatype(data_type), rank, device, graph_handle, qnn_instance) {} + Qnn_GraphHandle_t graph_handle, std::shared_ptr qnn_instance) + : ggml_qnn_tensor(tensor_type, name, get_internal_dimension(dimensions, rank), + qnn_datatype_from_ggml_datatype(data_type), rank, device, graph_handle, qnn_instance) {} + + ~ggml_qnn_tensor() { + _buffer_storage.clear(); + unbind(); + _qnn_rpc_buffer.reset(); + } + + bool set_data_buffer(std::vector &&buffer) { + if (!bind_buffer_impl(buffer.data(), buffer.size())) { + return false; + } - ~ggml_qnn_tensor() { _qnn_rpc_buffer.reset(); } + _buffer_storage = std::move(buffer); + return true; + } bool alloc_qnn_tensor_id() { if (QNN_TENSOR_GET_ID(_qnn_tensor)) { - QNN_LOG_WARN("graph tensor %s already created, id %d", _tensor_name.c_str(), 
- QNN_TENSOR_GET_ID(_qnn_tensor)); + QNN_LOG_DEBUG("[%s]tensor already has a id: %d", _tensor_name.c_str(), QNN_TENSOR_GET_ID(_qnn_tensor)); return true; } @@ -61,30 +73,90 @@ class ggml_qnn_tensor { auto qnn_interface = _qnn_instance->get_qnn_interface(); auto error = qnn_interface->qnn_tensor_create_graph_tensor(_graph_handle, &qnn_tensor); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("create graph tensor failed, tensor %s, error: %d\n", _tensor_name.c_str(), error); + QNN_LOG_WARN("[%s]allocate id failed , error: %d\n", _tensor_name.c_str(), error); return false; } QNN_TENSOR_SET_ID(_qnn_tensor, QNN_TENSOR_GET_ID(qnn_tensor)); - QNN_LOG_DEBUG("create graph tensor %s, id: %d, rank: %d", _tensor_name.c_str(), QNN_TENSOR_GET_ID(qnn_tensor), - QNN_TENSOR_GET_RANK(qnn_tensor)); - + QNN_LOG_DEBUG("[%s][%s]allocated id: %d, rank: %d", get_backend_name(_device), _tensor_name.c_str(), + QNN_TENSOR_GET_ID(qnn_tensor), QNN_TENSOR_GET_RANK(qnn_tensor)); return true; } bool bind_buffer(uint8_t *buffer, const size_t buffer_size) { + if (!_buffer_storage.empty()) { + QNN_LOG_DEBUG("[%s]already has buffer storage, skip bind", _tensor_name.c_str()); + return true; + } + + return bind_buffer_impl(buffer, buffer_size); + } + + bool bind_ggml_tensor(ggml_tensor *tensor) { + if (!bind_buffer(reinterpret_cast(tensor->data), ggml_nbytes(tensor))) { + QNN_LOG_WARN("[%s]failed to bind ggml tensor(%s)", _tensor_name.c_str(), ggml_get_name(tensor)); + return false; + } + + QNN_LOG_DEBUG("[%s][%s]bind to ggml tensor(%s)", get_backend_name(_device), _tensor_name.c_str(), + ggml_get_name(tensor)); + return true; + } + + bool unbind() { + if (!_graph_handle) { + QNN_LOG_WARN("[%s]not bound to any graph", _tensor_name.c_str()); + return false; + } + + if (!_buffer) { + QNN_LOG_DEBUG("[%s]bound to ggml tensor", _tensor_name.c_str()); + return true; + } + + if (!read_from_qnn_tensor()) { + QNN_LOG_WARN("[%s]read from qnn tensor failed", _tensor_name.c_str()); + return false; + } + + if (!_buffer_storage.empty()) { + QNN_LOG_DEBUG("[%s]already has buffer storage, stop unbind", _tensor_name.c_str()); + return true; + } + + if (!should_use_mem_handle()) { + QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); + Qnn_ClientBuffer_t client_buf = {}; + QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); + QNN_LOG_DEBUG("[%s]clear client buffer", _tensor_name.c_str()); + } + + QNN_LOG_DEBUG("[%s][%s]unbind from buffer: %p, size: %d", get_backend_name(_device), _tensor_name.c_str(), + _buffer, (int)_buffer_size); + _buffer = nullptr; + _buffer_size = 0; + return true; + } + + const Qnn_Tensor_t &get_qnn_tensor() const { return _qnn_tensor; } + Qnn_DataType_t get_data_type() const { return QNN_TENSOR_GET_DATA_TYPE(_qnn_tensor); } + const qnn_dimension_array_t &get_dimensions() const { return _dimensions; } + uint32_t get_qnn_tensor_id() const { return QNN_TENSOR_GET_ID(_qnn_tensor); } + +private: + bool bind_buffer_impl(uint8_t *buffer, const size_t buffer_size) { if (_buffer) { if (_buffer != buffer) { - QNN_LOG_WARN("tensor %s has been bound to another buffer %p", _tensor_name.c_str(), _buffer); + QNN_LOG_WARN("[%s]has been bound to another buffer %p", _tensor_name.c_str(), _buffer); return false; } - QNN_LOG_INFO("tensor %s already bound to same ggml tensor %p", _tensor_name.c_str(), _buffer); + QNN_LOG_DEBUG("[%s]already bound to same ggml tensor %p", _tensor_name.c_str(), _buffer); return true; } if (QNN_TENSOR_GET_TYPE(_qnn_tensor) == QNN_TENSOR_TYPE_NATIVE) { - QNN_LOG_DEBUG("tensor %s type(%d) not READ/WRITE, 
skipping", _tensor_name.c_str(), + QNN_LOG_DEBUG("[%s]tensor type(%d) not READ/WRITE, skipping", _tensor_name.c_str(), (int)QNN_TENSOR_TYPE_NATIVE); return true; } @@ -95,7 +167,7 @@ class ggml_qnn_tensor { _qnn_instance, buffer_size, QNN_TENSOR_GET_RANK(_qnn_tensor), QNN_TENSOR_GET_DIMENSIONS(_qnn_tensor), QNN_TENSOR_GET_DATA_TYPE(_qnn_tensor)); if (!qnn_rpc_buffer->is_valid()) { - QNN_LOG_WARN("alloc rpc mem failed, tensor %s", _tensor_name.c_str()); + QNN_LOG_WARN("[%s]alloc rpc mem failed", _tensor_name.c_str()); return false; } @@ -104,12 +176,12 @@ class ggml_qnn_tensor { QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_MEMHANDLE); QNN_TENSOR_SET_MEM_HANDLE(_qnn_tensor, _qnn_rpc_buffer->get_mem_handle()); - QNN_LOG_DEBUG("tensor %s, use mem handle %p", _tensor_name.c_str(), QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor)); + QNN_LOG_DEBUG("[%s]use mem handle %p", _tensor_name.c_str(), QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor)); } else { QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); - Qnn_ClientBuffer_t client_buf = { buffer, (uint32_t)buffer_size }; + Qnn_ClientBuffer_t client_buf = {buffer, (uint32_t)buffer_size}; QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); - QNN_LOG_DEBUG("tensor %s, use client buffer %p size %d", _tensor_name.c_str(), client_buf.data, + QNN_LOG_DEBUG("[%s]use client buffer %p size %d", _tensor_name.c_str(), client_buf.data, (int)client_buf.dataSize); } @@ -117,62 +189,19 @@ class ggml_qnn_tensor { _buffer_size = buffer_size; if (!write_to_qnn_tensor()) { - QNN_LOG_WARN("write to qnn tensor failed, tensor %s", _tensor_name.c_str()); - return false; - } - - QNN_LOG_DEBUG("bind tensor %s to buffer: %p, size: %d", _tensor_name.c_str(), buffer, (int)buffer_size); - return true; - } - - bool bind_ggml_tensor(ggml_tensor *tensor) { - if (!bind_buffer(reinterpret_cast(tensor->data), ggml_nbytes(tensor))) { - QNN_LOG_WARN("Failed to bind tensor: %s to ggml tensor: %s", _tensor_name.c_str(), ggml_get_name(tensor)); - return false; - } - - QNN_LOG_DEBUG("Bind tensor %s to ggml tensor %s", _tensor_name.c_str(), ggml_get_name(tensor)); - return true; - } - - bool unbind() { - if (!_graph_handle) { - QNN_LOG_WARN("tensor %s not bound to any graph", _tensor_name.c_str()); + QNN_LOG_WARN("[%s]write to qnn tensor failed", _tensor_name.c_str()); return false; } - if (!_buffer) { - QNN_LOG_DEBUG("tensor %s not bound to ggml tensor", _tensor_name.c_str()); - return true; - } - - if (!read_from_qnn_tensor()) { - QNN_LOG_WARN("read from qnn tensor failed, tensor %s", _tensor_name.c_str()); - return false; - } - - if (!should_use_mem_handle()) { - QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); - Qnn_ClientBuffer_t client_buf = {}; - QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); - QNN_LOG_DEBUG("tensor %s, clear client buffer", _tensor_name.c_str()); - } - - QNN_LOG_DEBUG("unbind tensor: %s from buffer: %p, size: %d", _tensor_name.c_str(), _buffer, (int)_buffer_size); - _buffer = nullptr; - _buffer_size = 0; + QNN_LOG_DEBUG("[%s][%s]bind to buffer: %p, size: %d", get_backend_name(_device), _tensor_name.c_str(), buffer, + (int)buffer_size); return true; } - const Qnn_Tensor_t &get_qnn_tensor() const { return _qnn_tensor; } - Qnn_DataType_t get_data_type() const { return QNN_TENSOR_GET_DATA_TYPE(_qnn_tensor); } - const qnn_dimension_array_t &get_dimensions() const { return _dimensions; } - -private: bool write_to_qnn_tensor() { auto tensor_type = QNN_TENSOR_GET_TYPE(_qnn_tensor); if (tensor_type != QNN_TENSOR_TYPE_APP_WRITE && tensor_type != 
QNN_TENSOR_TYPE_APP_READWRITE) { - QNN_LOG_DEBUG("tensor %s type(%d) not WRITE", _tensor_name.c_str(), (int)tensor_type); + QNN_LOG_DEBUG("[%s]tensor type(%d) not WRITE", _tensor_name.c_str(), (int)tensor_type); return true; } @@ -180,20 +209,20 @@ class ggml_qnn_tensor { if (_qnn_rpc_buffer) { memcpy(_qnn_rpc_buffer->get_buffer(), _buffer, _buffer_size); } else { - QNN_LOG_WARN("tensor %s: can't find rpcmem from qnn mem handle\n", _tensor_name.c_str()); + QNN_LOG_WARN("[%s]can't find rpcmem from qnn mem handle\n", _tensor_name.c_str()); return false; } } // For CPU and GPU, the data is already in the tensor. - QNN_LOG_DEBUG("write tensor %s to qnn", _tensor_name.c_str()); + QNN_LOG_DEBUG("[%s][%s]write tensor to qnn", get_backend_name(_device), _tensor_name.c_str()); return true; } bool read_from_qnn_tensor() { auto tensor_type = QNN_TENSOR_GET_TYPE(_qnn_tensor); if (tensor_type != QNN_TENSOR_TYPE_APP_READ && tensor_type != QNN_TENSOR_TYPE_APP_READWRITE) { - QNN_LOG_DEBUG("tensor %s type(%d) not READ", _tensor_name.c_str(), (int)tensor_type); + QNN_LOG_DEBUG("[%s]tensor type(%d) not READ", _tensor_name.c_str(), (int)tensor_type); return true; } @@ -201,13 +230,13 @@ class ggml_qnn_tensor { if (_qnn_rpc_buffer) { memcpy(_buffer, _qnn_rpc_buffer->get_buffer(), _buffer_size); } else { - QNN_LOG_WARN("can't find rpcmem from qnn mem handle\n"); + QNN_LOG_WARN("[%s]can't find rpcmem from qnn mem handle", _tensor_name.c_str()); return false; } } // For CPU and GPU, the data is already in the tensor. - QNN_LOG_DEBUG("read tensor %s from qnn", _tensor_name.c_str()); + QNN_LOG_DEBUG("[%s][%s]read tensor from qnn", get_backend_name(_device), _tensor_name.c_str()); return true; } @@ -231,12 +260,14 @@ class ggml_qnn_tensor { case PARAMETER: new_tensor_type = QNN_TENSOR_TYPE_STATIC; break; + case INTERMEDIATE: default: new_tensor_type = QNN_TENSOR_TYPE_NATIVE; break; } QNN_TENSOR_SET_TYPE(_qnn_tensor, new_tensor_type); - QNN_LOG_INFO("tensor %s changed to type %d", _tensor_name.c_str(), new_tensor_type); + QNN_LOG_DEBUG("[%s][%s]tensor changed to type %d", get_backend_name(_device), _tensor_name.c_str(), + new_tensor_type); } bool should_use_mem_handle() const { @@ -246,6 +277,7 @@ class ggml_qnn_tensor { std::string _tensor_name; uint8_t *_buffer = nullptr; size_t _buffer_size = 0; + std::vector _buffer_storage; QNNBackend _device; std::shared_ptr _qnn_instance; Qnn_Tensor_t _qnn_tensor = qnn_tensor_init(kDefaultQnnTensorVersion); @@ -257,7 +289,7 @@ class ggml_qnn_tensor { DISABLE_MOVE(ggml_qnn_tensor); }; -using ggml_qnn_tensor_ptr_t = std::shared_ptr; -using ggml_qnn_tensor_array_t = std::vector>; +using qnn_tensor_ptr_t = std::shared_ptr; +using qnn_tensor_array_t = std::vector; } // namespace qnn diff --git a/ggml/src/ggml-qnn/utils.cpp b/ggml/src/ggml-qnn/utils.cpp index 8ae375ffc8afc..ebfc0372375fd 100644 --- a/ggml/src/ggml-qnn/utils.cpp +++ b/ggml/src/ggml-qnn/utils.cpp @@ -5,9 +5,11 @@ #include "ggml-qnn.h" +#include "QnnGraph.h" #include "qnn-types.hpp" #ifdef __linux__ +#include #include #endif @@ -148,11 +150,11 @@ const char *get_ggml_type_name(ggml_type type) { const char *get_backend_name(QNNBackend device_index) { switch (device_index) { case QNN_BACKEND_CPU: - return "QNN-CPU"; + return "qnn-cpu"; case QNN_BACKEND_GPU: - return "QNN-GPU"; + return "qnn-gpu"; case QNN_BACKEND_NPU: - return "QNN-NPU"; + return "qnn-npu"; case QNN_BACKEND_COUNT: default: return "unknown"; @@ -195,18 +197,7 @@ intptr_t align_to(size_t alignment, intptr_t offset) { : offset + (static_cast(alignment) - 
(offset % static_cast(alignment))); } -uint32_t get_ggml_tensor_data_size(const ggml_tensor *tensor) { - /* - size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); - size_t n_dims = qnn_get_ggml_tensor_rank(tensor); - for (int i = 1; i < n_dims; i++) { - data_size *= tensor->ne[i]; - } - - return data_size; - */ - return ggml_nbytes(tensor); -} +uint32_t get_ggml_tensor_data_size(const ggml_tensor *tensor) { return ggml_nbytes(tensor); } void *align_alloc(size_t alignment, size_t size) { size_t size_aligned = size; @@ -248,6 +239,7 @@ const char *opname_from_ggmlop(enum ggml_op ggmlop) { const char *get_qnn_error_string(Qnn_ErrorHandle_t error) { // A complete list of error codes can be found at here: // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/api_error_codes.html + thread_local static char error_code[128] = {}; switch (error) { case QNN_SUCCESS: return "QNN_SUCCESS"; @@ -277,6 +269,36 @@ const char *get_qnn_error_string(Qnn_ErrorHandle_t error) { return "QNN_GRAPH_ERROR_UNCONNECTED_NODE"; case QNN_GRAPH_ERROR_CREATE_FAILED: return "QNN_GRAPH_ERROR_CREATE_FAILED"; + case QNN_GRAPH_ERROR_OPTIMIZATION_FAILED: + return "QNN_GRAPH_ERROR_OPTIMIZATION_FAILED"; + case QNN_GRAPH_ERROR_FINALIZE_FAILED: + return "QNN_GRAPH_ERROR_FINALIZE_FAILED"; + case QNN_GRAPH_ERROR_GRAPH_NOT_FINALIZED: + return "QNN_GRAPH_ERROR_GRAPH_NOT_FINALIZED"; + case QNN_GRAPH_ERROR_GRAPH_FINALIZED: + return "QNN_GRAPH_ERROR_GRAPH_FINALIZED"; + case QNN_GRAPH_ERROR_EXECUTION_ASYNC_FIFO_FULL: + return "QNN_GRAPH_ERROR_EXECUTION_ASYNC_FIFO_FULL"; + case QNN_GRAPH_ERROR_SIGNAL_IN_USE: + return "QNN_GRAPH_ERROR_SIGNAL_IN_USE"; + case QNN_GRAPH_ERROR_ABORTED: + return "QNN_GRAPH_ERROR_ABORTED"; + case QNN_GRAPH_ERROR_PROFILE_IN_USE: + return "QNN_GRAPH_ERROR_PROFILE_IN_USE"; + case QNN_GRAPH_ERROR_TIMED_OUT: + return "QNN_GRAPH_ERROR_TIMED_OUT"; + case QNN_GRAPH_ERROR_SUBGRAPH: + return "QNN_GRAPH_ERROR_SUBGRAPH"; + case QNN_GRAPH_ERROR_DISABLED: + return "QNN_GRAPH_ERROR_DISABLED"; + case QNN_GRAPH_ERROR_DYNAMIC_TENSOR_SHAPE: + return "QNN_GRAPH_ERROR_DYNAMIC_TENSOR_SHAPE"; + case QNN_GRAPH_ERROR_TENSOR_SPARSITY: + return "QNN_GRAPH_ERROR_TENSOR_SPARSITY"; + case QNN_GRAPH_ERROR_EARLY_TERMINATION: + return "QNN_GRAPH_ERROR_EARLY_TERMINATION"; + case QNN_GRAPH_ERROR_INVALID_CONTEXT: + return "QNN_GRAPH_ERROR_INVALID_CONTEXT"; // QnnOpPackage_Error_t case QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED: @@ -294,19 +316,34 @@ const char *get_qnn_error_string(Qnn_ErrorHandle_t error) { case QNN_OP_PACKAGE_ERROR_INVALID_ARGUMENT: return "QNN_OP_PACKAGE_ERROR_INVALID_ARGUMENT"; default: - return nullptr; + if (error >= QNN_GRAPH_MIN_ERROR && error < QNN_GRAPH_MAX_ERROR) { + snprintf(error_code, sizeof(error_code), "UNKNOWN_GRAPH_ERROR_%d", int(error - QNN_GRAPH_MIN_ERROR)); + } else { + snprintf(error_code, sizeof(error_code), "%d", int(error)); + } + return error_code; } } #ifdef __linux__ size_t get_system_total_memory_in_bytes() { + struct sysinfo info = {}; + if (sysinfo(&info) == 0) { + return (info.totalram + info.totalswap) * info.mem_unit; + } + auto pages = (size_t)sysconf(_SC_PHYS_PAGES); auto page_size = (size_t)sysconf(_SC_PAGE_SIZE); return pages * page_size; } size_t get_system_free_memory_in_bytes() { + struct sysinfo info = {}; + if (sysinfo(&info) == 0) { + return (info.freeram + info.freeswap) * info.mem_unit; + } + auto avail_pages = (size_t)sysconf(_SC_AVPHYS_PAGES); auto page_size = (size_t)sysconf(_SC_PAGE_SIZE); return avail_pages * page_size; From 
5103b166badb497fbddaf9de8b07cf1bacd83ff7 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 29 Nov 2024 14:19:34 +0800 Subject: [PATCH 130/166] bugfix: block large tensor calc in npu --- ggml/src/ggml-qnn/backend-ops.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index da0480df7fd9f..30930be422496 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -566,6 +566,10 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_t } bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op) { + constexpr const auto get_tensor_size = [](const ggml_tensor *tensor) -> size_t { + return tensor->ne[0] * tensor->ne[1] * tensor->ne[2] * tensor->ne[3]; + }; + auto *src0 = op->src[0]; auto *src1 = op->src[1]; switch (ctx->device) { @@ -578,6 +582,11 @@ bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggm QNN_LOG_DEBUG("[qnn-npu] src0 and src1 dimensions are not equal, support/unsupported: %d/%d", ctx->support_op_count.load(), ++(ctx->unsupported_op_count)); return false; + } else if (get_tensor_size(src0) + get_tensor_size(src1) + get_tensor_size(op) >= + (8192 * 2048 + 8192 * 512 + 2048 * 512)) { + QNN_LOG_DEBUG("[qnn-npu] tensor size is too large, support/unsupported: %d/%d", + ctx->support_op_count.load(), ++(ctx->unsupported_op_count)); + return false; } // fall through, from test here, the convert op is super slow on NPU: // https://github.com/usefulsensors/qc_npu_benchmark From 6d4feae5791038d9415a8538bc8083f11e72875e Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 29 Nov 2024 16:51:06 +0800 Subject: [PATCH 131/166] redo conflict changes --- ggml/CMakeLists.txt | 3 ++- ggml/src/CMakeLists.txt | 1 + ggml/src/ggml-backend-reg.cpp | 11 +++++++++ ggml/src/ggml-qnn/CMakeLists.txt | 34 ++++++++++++++++++++++++++++ ggml/src/{ => ggml-qnn}/ggml-qnn.cpp | 0 5 files changed, 48 insertions(+), 1 deletion(-) create mode 100644 ggml/src/ggml-qnn/CMakeLists.txt rename ggml/src/{ => ggml-qnn}/ggml-qnn.cpp (100%) diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index d2c377dcb4fd5..b0dca348f7ed6 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -232,7 +232,8 @@ set(GGML_PUBLIC_HEADERS include/ggml-metal.h include/ggml-rpc.h include/ggml-sycl.h - include/ggml-vulkan.h) + include/ggml-vulkan.h + include/ggml-qnn.h) set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}") #if (GGML_METAL) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 9022aa3ae197d..f6db35571bb0b 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -286,6 +286,7 @@ ggml_add_backend(MUSA) ggml_add_backend(RPC) ggml_add_backend(SYCL) ggml_add_backend(Vulkan) +ggml_add_backend(QNN) foreach (target ggml-base ggml) target_include_directories(${target} PUBLIC $ $) diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index a0e0e2c5852f7..8dc267c2dbaad 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -61,6 +61,14 @@ #include "ggml-kompute.h" #endif +#ifdef GGML_USE_KOMPUTE +#include "ggml-kompute.h" +#endif + +#ifdef GGML_USE_QNN +#include "ggml-qnn.h" +#endif + struct ggml_backend_reg_entry { ggml_backend_reg_t reg; void * handle; @@ -98,6 +106,9 @@ struct ggml_backend_registry { #ifdef GGML_USE_KOMPUTE register_backend(ggml_backend_kompute_reg()); #endif +#ifdef GGML_USE_QNN + 
register_backend(ggml_backend_qnn_reg()); +#endif #ifdef GGML_USE_CPU register_backend(ggml_backend_cpu_reg()); #endif diff --git a/ggml/src/ggml-qnn/CMakeLists.txt b/ggml/src/ggml-qnn/CMakeLists.txt new file mode 100644 index 0000000000000..af60de67d38e0 --- /dev/null +++ b/ggml/src/ggml-qnn/CMakeLists.txt @@ -0,0 +1,34 @@ +message(STATUS "Using QNN backend") + +if(CMAKE_SYSTEM_NAME STREQUAL "Android") + find_library(LOG_LIB log) + find_library(ANDROID_LIB android) + set(QNN_LINK_LIBRARIES ${LOG_LIB} ${ANDROID_LIB}) + set(QNN_DEFAULT_LIB_SEARCH_PATH "/data/local/tmp/" CACHE STRING "customized library search path for QNN backend") +else() + message(FATAL_ERROR "QNN now only available on Android") +endif() + +if(NOT DEFINED GGML_QNN_SDK_PATH) + # try read from environment variable + if(DEFINED ENV{QNN_SDK_PATH}) + set(GGML_QNN_SDK_PATH $ENV{QNN_SDK_PATH}) + else() + message(FATAL_ERROR "GGML_QNN_SDK_PATH not defined") + endif() +endif() + +message("QNN_SDK_PATH: ${GGML_QNN_SDK_PATH}") + +string(REGEX REPLACE "/$" "" GGML_QNN_DEFAULT_LIB_SEARCH_PATH "${QNN_DEFAULT_LIB_SEARCH_PATH}") +add_compile_definitions(GGML_QNN_DEFAULT_LIB_SEARCH_PATH="${QNN_DEFAULT_LIB_SEARCH_PATH}/") + +set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3") + +file(GLOB QNN_SOURCES "${CMAKE_CURRENT_LIST_DIR}/*.cpp") +ggml_add_backend_library(ggml-qnn + ${QNN_SOURCES} +) + +target_include_directories(ggml-qnn PRIVATE ${GGML_QNN_SDK_PATH}/include/QNN ${CMAKE_CURRENT_LIST_DIR}) +target_link_libraries(ggml-qnn PRIVATE ${QNN_LINK_LIBRARIES}) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp similarity index 100% rename from ggml/src/ggml-qnn.cpp rename to ggml/src/ggml-qnn/ggml-qnn.cpp From 09efaa389e3525e4a972b4390c7f2c5ec36ae5e2 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 29 Nov 2024 17:24:05 +0800 Subject: [PATCH 132/166] define compile flag as module private --- ggml/src/ggml-qnn/CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-qnn/CMakeLists.txt b/ggml/src/ggml-qnn/CMakeLists.txt index af60de67d38e0..b8d84d078e082 100644 --- a/ggml/src/ggml-qnn/CMakeLists.txt +++ b/ggml/src/ggml-qnn/CMakeLists.txt @@ -20,9 +20,6 @@ endif() message("QNN_SDK_PATH: ${GGML_QNN_SDK_PATH}") -string(REGEX REPLACE "/$" "" GGML_QNN_DEFAULT_LIB_SEARCH_PATH "${QNN_DEFAULT_LIB_SEARCH_PATH}") -add_compile_definitions(GGML_QNN_DEFAULT_LIB_SEARCH_PATH="${QNN_DEFAULT_LIB_SEARCH_PATH}/") - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3") file(GLOB QNN_SOURCES "${CMAKE_CURRENT_LIST_DIR}/*.cpp") @@ -32,3 +29,6 @@ ggml_add_backend_library(ggml-qnn target_include_directories(ggml-qnn PRIVATE ${GGML_QNN_SDK_PATH}/include/QNN ${CMAKE_CURRENT_LIST_DIR}) target_link_libraries(ggml-qnn PRIVATE ${QNN_LINK_LIBRARIES}) + +string(REGEX REPLACE "/$" "" GGML_QNN_DEFAULT_LIB_SEARCH_PATH "${QNN_DEFAULT_LIB_SEARCH_PATH}") +target_compile_definitions(ggml-qnn PRIVATE GGML_QNN_DEFAULT_LIB_SEARCH_PATH="${QNN_DEFAULT_LIB_SEARCH_PATH}/") From c5e65493318cdcbe03b726723fa9b4cd86c74d35 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 29 Nov 2024 23:37:52 +0800 Subject: [PATCH 133/166] fix: fix assertion --- ggml/src/ggml-qnn/backend-ops.cpp | 2 ++ ggml/src/ggml-qnn/ggml-qnn.cpp | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 30930be422496..0e73cce668e83 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -666,6 +666,8 @@ bool 
ggml_qnn_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tenso case GGML_OP_MUL_MAT: return ggml_qnn_supports_matmul_op(ctx, op); + case GGML_OP_VIEW: + return true; default: return false; } diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index a4dace7078d3b..3bc91a061212f 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -227,7 +227,7 @@ size_t ggml_backend_qnn_buffer_type_get_alignment(ggml_backend_buffer_type_t buf size_t ggml_backend_qnn_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { GGML_UNUSED(buft); // TODO: get the max size from device - return (1024 * 1024 * 1024); + return (2 * 1024 * 1024 * 1024); } bool ggml_backend_qnn_buffer_is_host(ggml_backend_buffer_type_t buft) { From 0d02ee09edafacbffa6630b67d68f1f27664e37d Mon Sep 17 00:00:00 2001 From: hongruichen Date: Tue, 3 Dec 2024 10:52:49 +0800 Subject: [PATCH 134/166] fix int overflow and remove view op to pass unit test --- ggml/src/ggml-qnn/backend-ops.cpp | 2 -- ggml/src/ggml-qnn/ggml-qnn.cpp | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 0e73cce668e83..30930be422496 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -666,8 +666,6 @@ bool ggml_qnn_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tenso case GGML_OP_MUL_MAT: return ggml_qnn_supports_matmul_op(ctx, op); - case GGML_OP_VIEW: - return true; default: return false; } diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index 3bc91a061212f..c57692b867bc7 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -227,7 +227,7 @@ size_t ggml_backend_qnn_buffer_type_get_alignment(ggml_backend_buffer_type_t buf size_t ggml_backend_qnn_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { GGML_UNUSED(buft); // TODO: get the max size from device - return (2 * 1024 * 1024 * 1024); + return 1024 * 1024 * 1024; } bool ggml_backend_qnn_buffer_is_host(ggml_backend_buffer_type_t buft) { From e36ad89528a0276331e3c22f153d6837c353c5cf Mon Sep 17 00:00:00 2001 From: nullname Date: Wed, 11 Dec 2024 10:42:00 +0800 Subject: [PATCH 135/166] bugfix: error pre-allocated tensor (k_cache_view-0) (#12) * fix device binding at ggml_backend_qnn_buffer_type * merge ggml_backend_qnn_buffer_context and qnn_mem_buffer * wip * add log * wip * add qnn_buffer_ptr * remove tailing `\n` at log * add log * enable GGML_OP_NONE * wip * wip * disable tensor with view * wip * wip * more log for view tensor * re-enable view * wip * remove link android lib * set dimension at bind function * move graph traversal to backend-ops * wip * add get_view_internal_dimension to obtain the tensor view source dimension * use _view_source_dimensions to allocate qnn tensor * add place holder function ggml_backend_qnn_cpy_tensor_async * add ggml_qnn_aggregate_op_config * make matmul based on ggml_qnn_aggregate_op_config * wip * manually specify the order of op destruct * skip register qnn-cpu backend * disable view op again * remove _view_source_dimensions * add nop for reshape and view ops * add log * add comment --- ggml/src/ggml-qnn/CMakeLists.txt | 3 +- ggml/src/ggml-qnn/backend-ops.cpp | 151 ++++++++++++---------- ggml/src/ggml-qnn/backend-ops.hpp | 4 +- ggml/src/ggml-qnn/buffer.hpp | 84 ++++++++++--- ggml/src/ggml-qnn/ggml-qnn.cpp | 202 +++++++++++++----------------- ggml/src/ggml-qnn/graph.hpp | 14 +-- 
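One clarification on the "fix int overflow" commit above: the constant 2 * 1024 * 1024 * 1024 is evaluated in plain int arithmetic and overflows before it is ever converted to size_t, which is why the limit was dropped back to 1 GiB (and later written with an L suffix). A minimal illustration of the pitfall, not part of the patch:

    #include <cstddef>
    #include <cstdio>

    int main() {
        // All operands are int, so the multiplication itself overflows (2^31 does not fit in int);
        // the conversion to size_t happens only after the damage is done.
        // size_t bad = 2 * 1024 * 1024 * 1024;     // signed overflow: undefined behaviour
        size_t ok1 = 2LL * 1024 * 1024 * 1024;      // promote to a 64-bit type before multiplying
        size_t ok2 = (size_t)1024 * 1024 * 1024;    // the 1 GiB variant, forced into size_t
        printf("%zu %zu\n", ok1, ok2);
        return 0;
    }
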
ggml/src/ggml-qnn/op-config.cpp | 122 +++++------------- ggml/src/ggml-qnn/op-config.hpp | 78 +++++++++--- ggml/src/ggml-qnn/qnn-lib.hpp | 145 +++++++++++---------- ggml/src/ggml-qnn/tensor.hpp | 50 ++++---- ggml/src/ggml-qnn/utils.cpp | 30 +++++ ggml/src/ggml-qnn/utils.hpp | 3 + 12 files changed, 469 insertions(+), 417 deletions(-) diff --git a/ggml/src/ggml-qnn/CMakeLists.txt b/ggml/src/ggml-qnn/CMakeLists.txt index b8d84d078e082..7bbb9be76b4f6 100644 --- a/ggml/src/ggml-qnn/CMakeLists.txt +++ b/ggml/src/ggml-qnn/CMakeLists.txt @@ -2,8 +2,7 @@ message(STATUS "Using QNN backend") if(CMAKE_SYSTEM_NAME STREQUAL "Android") find_library(LOG_LIB log) - find_library(ANDROID_LIB android) - set(QNN_LINK_LIBRARIES ${LOG_LIB} ${ANDROID_LIB}) + set(QNN_LINK_LIBRARIES ${LOG_LIB}) set(QNN_DEFAULT_LIB_SEARCH_PATH "/data/local/tmp/" CACHE STRING "customized library search path for QNN backend") else() message(FATAL_ERROR "QNN now only available on Android") diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 30930be422496..990338c953524 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -3,6 +3,8 @@ #include +#include "ggml-impl.h" + #include "graph.hpp" #include "logger.hpp" #include "op-config.hpp" @@ -15,13 +17,13 @@ namespace { bool qnn_is_valid_params(ggml_backend_qnn_device_context *ctx, const ggml_tensor *src, ggml_tensor *dst) { if (!ctx || !src || !dst) { - QNN_LOG_WARN("invalid params\n"); + QNN_LOG_WARN("invalid params"); return false; } auto instance = ctx->instance; if (!instance) { - QNN_LOG_WARN("invalid instance\n"); + QNN_LOG_WARN("invalid instance"); return false; } @@ -31,13 +33,13 @@ bool qnn_is_valid_params(ggml_backend_qnn_device_context *ctx, const ggml_tensor bool qnn_is_valid_params(ggml_backend_qnn_device_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst) { if (!ctx || !src0 || !src1 || !dst) { - QNN_LOG_WARN("invalid params\n"); + QNN_LOG_WARN("invalid params"); return false; } auto instance = ctx->instance; if (!instance) { - QNN_LOG_WARN("invalid instance\n"); + QNN_LOG_WARN("invalid instance"); return false; } @@ -45,7 +47,7 @@ bool qnn_is_valid_params(ggml_backend_qnn_device_context *ctx, const ggml_tensor } void print_ggml_tensor(const ggml_tensor *tensor) { - QNN_LOG_DEBUG("%s: type:%s ne: %ldx%ldx%ldx%ld, nb: %ldx%ldx%ldx%ld\n", tensor->name, ggml_type_name(tensor->type), + QNN_LOG_DEBUG("%s: type:%s ne: %ldx%ldx%ldx%ld, nb: %ldx%ldx%ldx%ld", tensor->name, ggml_type_name(tensor->type), (long)tensor->ne[0], (long)tensor->ne[1], (long)tensor->ne[2], (long)tensor->ne[3], (long)tensor->nb[0], (long)tensor->nb[1], (long)tensor->nb[2], (long)tensor->nb[3]); } @@ -96,7 +98,7 @@ template bool execute_graph(qnn::ggml_qnn_graph *graph, const std::array &inputs, ggml_tensor *output) { if (!graph->execute(to_ggml_tensor_array<_InputSize>(inputs), to_ggml_tensor_array<1>({output}))) { - QNN_LOG_WARN("execute failed\n"); + QNN_LOG_WARN("execute failed"); return false; } @@ -248,7 +250,7 @@ qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *c auto it = graph_cache.find(graph_key); qnn::ggml_qnn_graph *graph_ptr = nullptr; if (it != graph_cache.end()) { - QNN_LOG_DEBUG("found graph %s in cache\n", graph_key.c_str()); + QNN_LOG_DEBUG("[%s]found graph %s in cache", qnn::get_backend_name(ctx->device), graph_key.c_str()); graph_ptr = it->second.get(); } else { auto graph = @@ -260,7 +262,7 @@ qnn::ggml_qnn_graph 
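The graph cache touched just above is worth a short sketch: QNN graphs are expensive to build and finalize, so they are created once per op/shape key and reused on later calls. A simplified stand-in (graph_t and the build step are placeholders, not the patch's real types):

    #include <memory>
    #include <string>
    #include <unordered_map>

    struct graph_t { /* stands in for a finalized qnn::ggml_qnn_graph */ };

    static graph_t *get_or_build(std::unordered_map<std::string, std::unique_ptr<graph_t>> &cache,
                                 const std::string &key) {
        auto it = cache.find(key);
        if (it != cache.end()) {
            return it->second.get();               // cache hit: reuse the finalized graph
        }
        auto graph = std::make_unique<graph_t>();  // cache miss: build, finalize, then remember it
        graph_t *ptr = graph.get();
        cache.emplace(key, std::move(graph));
        return ptr;
    }
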
*get_qnn_graph_from_cache(ggml_backend_qnn_device_context *c auto op_constructor = qnn::create_op_constructor(kGgmlOpToQnnOp[op]); if (!graph->build_graph(op_constructor, to_ggml_tensor_array<_InputSize>(inputs), to_ggml_tensor_array<1>({output}))) { - QNN_LOG_ERROR("build_graph failed\n"); + QNN_LOG_ERROR("[%s]build_graph failed", qnn::get_backend_name(ctx->device)); return nullptr; } @@ -332,7 +334,7 @@ bool qnn_binary_nop_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src0 } constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = { - nullptr, // GGML_OP_NONE + qnn_unary_nop_impl, // GGML_OP_NONE nullptr, // GGML_OP_DUP nullptr, // GGML_OP_ADD nullptr, // GGML_OP_ADD1 @@ -363,37 +365,37 @@ constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = { nullptr, // GGML_OP_MUL_MAT_ID nullptr, // GGML_OP_OUT_PROD - nullptr, // GGML_OP_SCALE - nullptr, // GGML_OP_SET - nullptr, // GGML_OP_CPY - nullptr, // GGML_OP_CONT - nullptr, // GGML_OP_RESHAPE - qnn_unary_nop_impl, // GGML_OP_VIEW - qnn_unary_op_impl, // GGML_OP_PERMUTE - nullptr, // GGML_OP_TRANSPOSE - qnn_unary_nop_impl, // GGML_OP_GET_ROWS - nullptr, // GGML_OP_GET_ROWS_BACK - nullptr, // GGML_OP_DIAG - nullptr, // GGML_OP_DIAG_MASK_INF - nullptr, // GGML_OP_DIAG_MASK_ZERO - nullptr, // GGML_OP_SOFT_MAX - nullptr, // GGML_OP_SOFT_MAX_BACK - nullptr, // GGML_OP_ROPE - nullptr, // GGML_OP_ROPE_BACK - nullptr, // GGML_OP_CLAMP - nullptr, // GGML_OP_CONV_TRANSPOSE_1D - nullptr, // GGML_OP_IM2COL - nullptr, // GGML_OP_IM2COL_BACK - nullptr, // GGML_OP_CONV_TRANSPOSE_2D - nullptr, // GGML_OP_POOL_1D - nullptr, // GGML_OP_POOL_2D - nullptr, // GGML_OP_POOL_2D_BACK - nullptr, // GGML_OP_UPSCALE - nullptr, // GGML_OP_PAD - nullptr, // GGML_OP_ARANGE - nullptr, // GGML_OP_TIMESTEP_EMBEDDING - nullptr, // GGML_OP_ARGSORT - nullptr, // GGML_OP_LEAKY_RELU + nullptr, // GGML_OP_SCALE + nullptr, // GGML_OP_SET + nullptr, // GGML_OP_CPY + nullptr, // GGML_OP_CONT + qnn_unary_nop_impl, // GGML_OP_RESHAPE + qnn_unary_nop_impl, // GGML_OP_VIEW + qnn_unary_nop_impl, // GGML_OP_PERMUTE + qnn_unary_nop_impl, // GGML_OP_TRANSPOSE + qnn_unary_nop_impl, // GGML_OP_GET_ROWS + nullptr, // GGML_OP_GET_ROWS_BACK + nullptr, // GGML_OP_DIAG + nullptr, // GGML_OP_DIAG_MASK_INF + nullptr, // GGML_OP_DIAG_MASK_ZERO + nullptr, // GGML_OP_SOFT_MAX + nullptr, // GGML_OP_SOFT_MAX_BACK + nullptr, // GGML_OP_ROPE + nullptr, // GGML_OP_ROPE_BACK + nullptr, // GGML_OP_CLAMP + nullptr, // GGML_OP_CONV_TRANSPOSE_1D + nullptr, // GGML_OP_IM2COL + nullptr, // GGML_OP_IM2COL_BACK + nullptr, // GGML_OP_CONV_TRANSPOSE_2D + nullptr, // GGML_OP_POOL_1D + nullptr, // GGML_OP_POOL_2D + nullptr, // GGML_OP_POOL_2D_BACK + nullptr, // GGML_OP_UPSCALE + nullptr, // GGML_OP_PAD + nullptr, // GGML_OP_ARANGE + nullptr, // GGML_OP_TIMESTEP_EMBEDDING + nullptr, // GGML_OP_ARGSORT + nullptr, // GGML_OP_LEAKY_RELU nullptr, // GGML_OP_FLASH_ATTN_EXT nullptr, // GGML_OP_FLASH_ATTN_BACK @@ -442,7 +444,7 @@ constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = { static_assert(sizeof(kQnnUnaryOpsTable) / sizeof(kQnnUnaryOpsTable[0]) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), "GGML_OP_COUNT does not match the size of the kQnnUnaryOpsTable table"); -static constexpr const ggml_qnn_binary_op_t kQnnBinaryOpsTable[] = { +constexpr const ggml_qnn_binary_op_t kQnnBinaryOpsTable[] = { nullptr, // GGML_OP_NONE nullptr, // GGML_OP_DUP qnn_binary_op_impl, // GGML_OP_ADD @@ -543,22 +545,28 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_t return false; } -#ifndef 
NDEBUG - auto *type_name = ggml_get_type_traits(tensor->type)->type_name; -#endif + if (tensor->view_src) { + auto *src_tensor = tensor->view_src; + QNN_LOG_DEBUG("[%s]tensor(%s_%dx%dx%dx%d) is a view, src: %s_%dx%dx%dx%d", qnn::get_backend_name(ctx->device), + ggml_get_name(tensor), tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], + ggml_get_name(src_tensor), src_tensor->ne[0], src_tensor->ne[1], src_tensor->ne[2], + src_tensor->ne[3]); + } + switch (tensor->type) { case GGML_TYPE_F32: case GGML_TYPE_F16: case GGML_TYPE_Q8_0: case GGML_TYPE_Q4_0: if (!(ctx->supported_types & (1 << tensor->type))) { - QNN_LOG_DEBUG("unsupported data type %s for backend %s, supported_types: 0x%x", type_name, - qnn::get_backend_name(ctx->device), ctx->supported_types); + QNN_LOG_DEBUG("[%s]unsupported data type %s, supported_types: 0x%x", qnn::get_backend_name(ctx->device), + ggml_type_name(tensor->type), ctx->supported_types); return false; } break; default: - QNN_LOG_DEBUG("unsupported data type %s", type_name); + QNN_LOG_DEBUG("[%s]unsupported data type %s", qnn::get_backend_name(ctx->device), + ggml_type_name(tensor->type)); return false; } @@ -566,6 +574,7 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_t } bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op) { + constexpr const size_t kMaxNpuTensorSize = 8192L * 2048 + 8192 * 512 + 2048 * 512; constexpr const auto get_tensor_size = [](const ggml_tensor *tensor) -> size_t { return tensor->ne[0] * tensor->ne[1] * tensor->ne[2] * tensor->ne[3]; }; @@ -582,8 +591,7 @@ bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggm QNN_LOG_DEBUG("[qnn-npu] src0 and src1 dimensions are not equal, support/unsupported: %d/%d", ctx->support_op_count.load(), ++(ctx->unsupported_op_count)); return false; - } else if (get_tensor_size(src0) + get_tensor_size(src1) + get_tensor_size(op) >= - (8192 * 2048 + 8192 * 512 + 2048 * 512)) { + } else if (get_tensor_size(src0) + get_tensor_size(src1) + get_tensor_size(op) >= kMaxNpuTensorSize) { QNN_LOG_DEBUG("[qnn-npu] tensor size is too large, support/unsupported: %d/%d", ctx->support_op_count.load(), ++(ctx->unsupported_op_count)); return false; @@ -618,12 +626,13 @@ bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggm namespace qnn { -bool ggml_qnn_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op) { +bool device_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op) { // Note that this function could be called before the device context is initialized if (op->op == GGML_OP_NONE) { return true; } + auto *src0 = op->src[0]; if (op->op == GGML_OP_UNARY) { const auto unary_op = ggml_get_unary_op(op); if (unary_op == GGML_UNARY_OP_GELU && ctx->device == QNN_BACKEND_NPU) { @@ -637,7 +646,7 @@ bool ggml_qnn_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tenso return false; } - if (!op->src[0]) { + if (!ggml_qnn_supports_tensor(ctx, src0) || !ggml_qnn_supports_tensor(ctx, op)) { QNN_LOG_DEBUG("src0 is nullptr"); return false; } @@ -647,7 +656,6 @@ bool ggml_qnn_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tenso return false; } - auto *src0 = op->src[0]; auto *src1 = op->src[1]; if (!ggml_qnn_supports_tensor(ctx, src0) || !ggml_qnn_supports_tensor(ctx, op) || (kQnnBinaryOpsTable[op->op] && !ggml_qnn_supports_tensor(ctx, src1))) { @@ -674,24 +682,35 @@ bool ggml_qnn_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tenso 
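The NPU mat-mul guard above boils down to an element-count budget: src0, src1 and dst together must stay under 8192*2048 + 8192*512 + 2048*512 elements. A condensed sketch of that check, with the ggml tensor types reduced to raw shape arrays (the names here are illustrative, not the patch's):

    #include <cstddef>
    #include <cstdint>

    constexpr size_t k_max_npu_tensor_size = 8192L * 2048 + 8192 * 512 + 2048 * 512;

    static size_t element_count(const int64_t ne[4]) {
        return (size_t)ne[0] * ne[1] * ne[2] * ne[3];
    }

    static bool npu_matmul_size_ok(const int64_t src0[4], const int64_t src1[4], const int64_t dst[4]) {
        // reject the op (and let it fall back to another backend) when the combined size is too large
        return element_count(src0) + element_count(src1) + element_count(dst) < k_max_npu_tensor_size;
    }
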
return true; } -bool ggml_qnn_forward(ggml_backend_qnn_device_context *ctx, struct ggml_tensor *tensor) { - size_t unary_op_idx = tensor->op; - if (tensor->op == GGML_OP_UNARY) { - unary_op_idx = kGgmlUnaryOpStart + ggml_get_unary_op(tensor); - } +bool device_compute_graph(ggml_backend_qnn_device_context *ctx, ggml_cgraph *cgraph) { + QNN_LOG_DEBUG("[%s]compute graph, nodes count: %d", qnn::get_backend_name(ctx->device), (int)cgraph->n_nodes); + for (int i = 0; i < cgraph->n_nodes; i++) { + ggml_tensor *tensor = cgraph->nodes[i]; + if (ggml_is_empty(tensor)) { + continue; + } - auto unary_op = kQnnUnaryOpsTable[unary_op_idx]; - if (unary_op) { - return unary_op(ctx, tensor->src[0], tensor); - } + size_t unary_op_idx = tensor->op; + if (tensor->op == GGML_OP_UNARY) { + unary_op_idx = kGgmlUnaryOpStart + ggml_get_unary_op(tensor); + } + + bool ok = false; + auto unary_op = kQnnUnaryOpsTable[unary_op_idx]; + auto binary_op = kQnnBinaryOpsTable[tensor->op]; + if (unary_op) { + ok = unary_op(ctx, tensor->src[0], tensor); + } else if (binary_op) { + ok = binary_op(ctx, tensor->src[0], tensor->src[1], tensor); + } - auto binary_op = kQnnBinaryOpsTable[tensor->op]; - if (binary_op) { - return binary_op(ctx, tensor->src[0], tensor->src[1], tensor); + if (!ok) { + QNN_LOG_WARN("[%s]unsupported op %s", qnn::get_backend_name(ctx->device), ggml_op_desc(tensor)); + return false; + } } - QNN_LOG_WARN("[forward]unsupported op %s", ggml_op_desc(tensor)); - return false; + return true; } } // namespace qnn diff --git a/ggml/src/ggml-qnn/backend-ops.hpp b/ggml/src/ggml-qnn/backend-ops.hpp index 3df7f4a98a146..c49c4d6dc19d7 100644 --- a/ggml/src/ggml-qnn/backend-ops.hpp +++ b/ggml/src/ggml-qnn/backend-ops.hpp @@ -6,7 +6,7 @@ namespace qnn { -bool ggml_qnn_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op); -bool ggml_qnn_forward(ggml_backend_qnn_device_context *ctx, struct ggml_tensor *tensor); +bool device_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op); +bool device_compute_graph(ggml_backend_qnn_device_context *ctx, ggml_cgraph *cgraph); } // namespace qnn diff --git a/ggml/src/ggml-qnn/buffer.hpp b/ggml/src/ggml-qnn/buffer.hpp index 676e88c0454be..9573e160b4176 100644 --- a/ggml/src/ggml-qnn/buffer.hpp +++ b/ggml/src/ggml-qnn/buffer.hpp @@ -1,28 +1,42 @@ #pragma once #include +#include #include "logger.hpp" #include "qnn-lib.hpp" namespace qnn { -class ggml_qnn_rpc_buffer { + +class qnn_buffer_interface { public: - ggml_qnn_rpc_buffer(std::shared_ptr qnn_instance, const size_t size, const uint32_t rank, - uint32_t *dimensions, Qnn_DataType_t data_type) : - _qnn_instance(qnn_instance), _size(size) { + virtual ~qnn_buffer_interface() = default; + + virtual bool is_valid() const = 0; + virtual uint8_t *get_buffer() = 0; + virtual size_t get_size() const = 0; + virtual Qnn_MemHandle_t get_mem_handle() const = 0; +}; + +using qnn_buffer_ptr = std::shared_ptr; - _qnn_rpc_buffer = static_cast(qnn_instance->alloc_rpcmem(size, alignof(void *))); +class qnn_rpc_buffer : public qnn_buffer_interface { +public: + qnn_rpc_buffer(std::shared_ptr qnn_instance, const size_t size, const uint32_t rank, + uint32_t *dimensions, Qnn_DataType_t data_type) + : _size(size), _qnn_instance(qnn_instance) { + + _qnn_rpc_buffer = static_cast(qnn_instance->alloc_rpcmem(size, alignof(uint8_t *))); _qnn_rpc_mem_handle = qnn_instance->register_rpcmem(_qnn_rpc_buffer, rank, dimensions, data_type); if (!_qnn_rpc_buffer || !_qnn_rpc_mem_handle) { - QNN_LOG_WARN("register rpc mem failure\n"); + 
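A note on the dispatch tables and the kGgmlUnaryOpStart offset used in device_compute_graph above: regular ops and unary ops share one flat table, with the unary entries appended after the regular ones (which is what the static_assert on the table size checks). A toy version of that indexing, using made-up op counts rather than the real ggml enums:

    #include <cstddef>

    enum { OP_COUNT = 4, UNARY_OP_COUNT = 2 };        // stand-in sizes, not the real ggml counts

    static const char *k_op_names[OP_COUNT + UNARY_OP_COUNT] = {
        "none", "add", "mul_mat", "unary",            // regular ops occupy [0, OP_COUNT)
        "gelu", "relu",                               // unary ops start at index OP_COUNT
    };

    static const char *lookup(size_t op, size_t unary_op, bool is_unary) {
        size_t idx = is_unary ? OP_COUNT + unary_op : op;
        return k_op_names[idx];
    }
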
QNN_LOG_WARN("register rpc mem failure"); // let the destructor free the buffer return; } - QNN_LOG_DEBUG("alloc rpcmem(%p) successfully, size %d\n", _qnn_rpc_buffer, (int)size); + QNN_LOG_DEBUG("alloc rpcmem(%p) successfully, size %d", _qnn_rpc_buffer, (int)size); } - ~ggml_qnn_rpc_buffer() { + ~qnn_rpc_buffer() { if (_qnn_instance) { if (_qnn_rpc_mem_handle) { _qnn_instance->unregister_rpcmem(_qnn_rpc_mem_handle); @@ -34,22 +48,58 @@ class ggml_qnn_rpc_buffer { } } - bool is_valid() const { return _qnn_rpc_buffer && _qnn_rpc_mem_handle; } + bool is_valid() const override { return _qnn_rpc_buffer && _qnn_rpc_mem_handle; } - uint8_t *get_buffer() const { return _qnn_rpc_buffer; } - size_t get_size() const { return _size; } - Qnn_MemHandle_t get_mem_handle() const { return _qnn_rpc_mem_handle; } + uint8_t *get_buffer() override { return _qnn_rpc_buffer; } + size_t get_size() const override { return _size; } + Qnn_MemHandle_t get_mem_handle() const override { return _qnn_rpc_mem_handle; } private: - std::shared_ptr _qnn_instance; size_t _size = 0; uint8_t *_qnn_rpc_buffer = nullptr; Qnn_MemHandle_t _qnn_rpc_mem_handle = nullptr; + std::shared_ptr _qnn_instance; + + DISABLE_COPY(qnn_rpc_buffer); + DISABLE_MOVE(qnn_rpc_buffer); +}; + +class qnn_mem_buffer : public qnn_buffer_interface { +public: + explicit qnn_mem_buffer(const uint8_t *data, size_t size) { + _buffer = reinterpret_cast(qnn::page_align_alloc(size)); + + if (!_buffer) { + QNN_LOG_WARN("failed to allocate %.2f MiB", float(size / (1 << 20))); + return; + } + + _size = size; + + if (data) { + memcpy(_buffer, data, size); + } + } + + explicit qnn_mem_buffer(size_t size) : qnn_mem_buffer(nullptr, size) {} + + ~qnn_mem_buffer() { + // the free will do nothing if the _buffer is nullptr + qnn::align_free(_buffer); + } + + bool is_valid() const override { return _buffer != nullptr; } + + uint8_t *get_buffer() override { return _buffer; } + size_t get_size() const override { return _size; } + Qnn_MemHandle_t get_mem_handle() const override { return nullptr; } + +private: + size_t _size = 0; + uint8_t *_buffer = nullptr; - ggml_qnn_rpc_buffer(const ggml_qnn_rpc_buffer &) = delete; - void operator=(const ggml_qnn_rpc_buffer &) = delete; - ggml_qnn_rpc_buffer(ggml_qnn_rpc_buffer &&) = delete; - void operator=(ggml_qnn_rpc_buffer &&) = delete; + DISABLE_COPY(qnn_mem_buffer); + DISABLE_MOVE(qnn_mem_buffer); }; } // namespace qnn diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index c57692b867bc7..933016a62878e 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -1,7 +1,5 @@ #include "ggml-qnn.h" -#include - #include #include #include @@ -87,78 +85,44 @@ static_assert(sizeof(kDeviceCaps) / sizeof(kDeviceCaps[0]) == GGML_QNN_MAX_DEVIC "The number of qnn devices should be equal to GGML_QNN_MAX_DEVICES"); static_assert(kDeviceCaps[QNN_BACKEND_NPU].type == GGML_BACKEND_DEVICE_TYPE_ACCEL, "The NPU device should be an accelerator device"); +static_assert(kDeviceCaps[QNN_BACKEND_GPU].type == GGML_BACKEND_DEVICE_TYPE_GPU, + "The NPU device should be an accelerator device"); -class ggml_backend_qnn_buffer_context { -public: - ggml_backend_qnn_buffer_context(QNNBackend device, std::shared_ptr instance, size_t size) - : _instance(instance), _name(QNN_BACKEND_NAME + std::to_string(device)) { - // TODO: fix this for other platforms - size_t size_page = sysconf(_SC_PAGESIZE); - - // TODO: for qnn npu, a better way here is to reuse the buffer allocated by - // qnn rpc, will save an extra copy - 
_buffer = qnn::align_alloc(size_page, size); - - if (!_buffer) { - QNN_LOG_WARN("failed to allocate %.2f MiB\n", float(size / (1 << 20))); - return; - } - - _buffer_size = size; - } - - ~ggml_backend_qnn_buffer_context() { - // the free will do nothing if the _buffer is nullptr - qnn::align_free(_buffer); - } - - bool is_valid() const { return _buffer != nullptr; } - - void *get_buffer() { return _buffer; } - size_t get_buffer_size() { return _buffer_size; } - -private: - std::shared_ptr _instance; - std::string _name; - void *_buffer = nullptr; - size_t _buffer_size = 0; -}; - -struct ggml_backend_qnn_buffer_type_context { - std::string name; -}; +static_assert(kDeviceCaps[QNN_BACKEND_CPU].type == GGML_BACKEND_DEVICE_TYPE_CPU, + "The NPU device should be an accelerator device"); ggml_backend_qnn_device_context *get_device_context(ggml_backend_dev_t dev) { return reinterpret_cast(dev->context); } +qnn::qnn_buffer_interface *get_buffer_context(ggml_backend_buffer_t buffer) { + return reinterpret_cast(buffer->context); +} + /* * ----------------------------------------------------------------------------------------------- * qnn backend buffer object * ----------------------------------------------------------------------------------------------- */ void ggml_backend_qnn_buffer_free_buffer(ggml_backend_buffer_t buffer) { - ggml_backend_qnn_buffer_context *ctx = (ggml_backend_qnn_buffer_context *)buffer->context; - + auto *ctx = get_buffer_context(buffer); delete ctx; } void *ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t buffer) { - ggml_backend_qnn_buffer_context *ctx = (ggml_backend_qnn_buffer_context *)buffer->context; - + auto *ctx = get_buffer_context(buffer); return ctx->get_buffer(); } void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor) { - // Do nothing here, the qnn tensor will be create along with the graph. 
GGML_UNUSED(buffer); GGML_UNUSED(tensor); + // TODO: we should create the qnn tensor along with the ggml tensor } void ggml_backend_qnn_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor, const void *data, size_t offset, size_t size) { GGML_UNUSED(buffer); - memcpy((char *)tensor->data + offset, data, size); } @@ -168,8 +132,7 @@ void ggml_backend_qnn_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml memcpy(data, (const char *)tensor->data + offset, size); } -bool ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor *src, - struct ggml_tensor *dst) { +bool ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor *src, ggml_tensor *dst) { GGML_UNUSED(buffer); if (ggml_backend_buffer_is_host(src->buffer)) { memcpy(dst->data, src->data, ggml_nbytes(src)); @@ -180,12 +143,11 @@ bool ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const stru } void ggml_backend_qnn_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { - ggml_backend_qnn_buffer_context *ctx = (ggml_backend_qnn_buffer_context *)buffer->context; - - memset(ctx->get_buffer(), value, ctx->get_buffer_size()); + auto *ctx = get_buffer_context(buffer); + memset(ctx->get_buffer(), value, ctx->get_size()); } -ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = { +constexpr const ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = { /* .free_buffer = */ ggml_backend_qnn_buffer_free_buffer, /* .get_base = */ ggml_backend_qnn_buffer_get_base, /* .init_tensor = */ ggml_backend_qnn_buffer_init_tensor, @@ -208,13 +170,13 @@ const char *ggml_backend_qnn_buffer_type_name(ggml_backend_buffer_type_t buft) { } ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - auto *dev_ctx = get_device_context(buft->device); - ggml_backend_qnn_buffer_context *ctx = - new ggml_backend_qnn_buffer_context((QNNBackend)dev_ctx->device, dev_ctx->instance, size); + qnn::qnn_buffer_interface *ctx = new qnn::qnn_mem_buffer(size); if (!ctx->is_valid()) { return nullptr; } + QNN_LOG_DEBUG("[%s]alloc buffer: %p, size: %ld", qnn::get_backend_name(get_device_context(buft->device)->device), + ctx->get_buffer(), size); return ggml_backend_buffer_init(buft, ggml_backend_qnn_buffer_interface, ctx, size); } @@ -227,7 +189,7 @@ size_t ggml_backend_qnn_buffer_type_get_alignment(ggml_backend_buffer_type_t buf size_t ggml_backend_qnn_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { GGML_UNUSED(buft); // TODO: get the max size from device - return 1024 * 1024 * 1024; + return 1024L * 1024 * 1024; } bool ggml_backend_qnn_buffer_is_host(ggml_backend_buffer_type_t buft) { @@ -254,61 +216,52 @@ void ggml_backend_qnn_free(ggml_backend_t backend) { } } +bool ggml_backend_qnn_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor *src, + ggml_tensor *dst) { + GGML_UNUSED(backend_src); + GGML_UNUSED(backend_dst); + GGML_UNUSED(src); + GGML_UNUSED(dst); + return false; +} + ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(ggml_backend_dev_t dev) { - static ggml_backend_qnn_buffer_type_context ggml_backend_qnn_buffer_type_contexts[GGML_QNN_MAX_DEVICES]; static ggml_backend_buffer_type ggml_backend_qnn_buffer_types[GGML_QNN_MAX_DEVICES]; - static bool ggml_backend_qnn_buffer_type_initialized = false; auto *dev_ctx = get_device_context(dev); - if (!ggml_backend_qnn_buffer_type_initialized) { - for (size_t i = 0; i < GGML_QNN_MAX_DEVICES; i++) { - auto &context = 
ggml_backend_qnn_buffer_type_contexts[i]; - context = {std::string(QNN_BACKEND_NAME) + std::to_string(i)}; - ggml_backend_qnn_buffer_types[i] = { - /* .iface = */ { - /* .get_name = */ ggml_backend_qnn_buffer_type_name, - /* .alloc_buffer = */ - ggml_backend_qnn_buffer_type_alloc_buffer, - /* .get_alignment = */ - ggml_backend_qnn_buffer_type_get_alignment, - /* .get_max_size = */ - ggml_backend_qnn_buffer_type_get_max_size, - /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes - /* .is_host = */ ggml_backend_qnn_buffer_is_host, - }, - /* .device */ dev, - /* .context = */ &context, - }; - } - ggml_backend_qnn_buffer_type_initialized = true; + if (!ggml_backend_qnn_buffer_types[dev_ctx->device].device) { + ggml_backend_qnn_buffer_types[dev_ctx->device] = { + /* .iface = */ { + /* .get_name = */ ggml_backend_qnn_buffer_type_name, + /* .alloc_buffer = */ + ggml_backend_qnn_buffer_type_alloc_buffer, + /* .get_alignment = */ + ggml_backend_qnn_buffer_type_get_alignment, + /* .get_max_size = */ + ggml_backend_qnn_buffer_type_get_max_size, + /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes + /* .is_host = */ ggml_backend_qnn_buffer_is_host, + }, + /* .device */ dev, + /* .context = */ nullptr, + }; + } else { + GGML_ASSERT(ggml_backend_qnn_buffer_types[dev_ctx->device].device == dev); } return &ggml_backend_qnn_buffer_types[dev_ctx->device]; } ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, ggml_cgraph *cgraph) { - enum ggml_status result = GGML_STATUS_SUCCESS; - auto *device_ctx = get_device_context(backend->device); - for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_tensor *node = cgraph->nodes[i]; - if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || - node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { - continue; - } - bool ok = qnn::ggml_qnn_forward(device_ctx, node); - if (!ok) { - QNN_LOG_DEBUG("error: op not supported %s (%s)\n", node->name, ggml_op_name(node->op)); - } - } - - return result; + return qnn::device_compute_graph(get_device_context(backend->device), cgraph) ? 
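The buffer-type lookup above switches from eagerly filling the whole static array to lazy, per-device initialization keyed on dev_ctx->device, with an assert that later calls hand in the same device. A generic sketch of that pattern (entry_t and owner are placeholders, not the real ggml types):

    #include <cassert>
    #include <cstddef>

    struct entry_t { const void *owner = nullptr; };

    static entry_t *get_entry(entry_t *table, size_t index, const void *owner) {
        if (table[index].owner == nullptr) {
            table[index].owner = owner;           // first request for this slot: initialize it
        } else {
            assert(table[index].owner == owner);  // later requests must come from the same owner
        }
        return &table[index];
    }
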
GGML_STATUS_SUCCESS + : GGML_STATUS_FAILED; } -ggml_backend_i ggml_backend_qnn_interface = { +constexpr const ggml_backend_i ggml_backend_qnn_interface = { /* .get_name = */ ggml_backend_qnn_name, /* .free = */ ggml_backend_qnn_free, /* .set_tensor_async = */ nullptr, /* .get_tensor_async = */ nullptr, - /* .cpy_tensor_async = */ nullptr, + /* .cpy_tensor_async = */ ggml_backend_qnn_cpy_tensor_async, /* .synchronize = */ nullptr, /* .graph_plan_create = */ nullptr, /* .graph_plan_free = */ nullptr, @@ -345,7 +298,7 @@ enum ggml_backend_dev_type ggml_backend_qnn_device_get_type(ggml_backend_dev_t d return kDeviceCaps[get_device_context(dev)->device].type; } -void ggml_backend_qnn_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props *props) { +void ggml_backend_qnn_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props *props) { props->name = ggml_backend_qnn_device_get_name(dev); props->description = ggml_backend_qnn_device_get_description(dev); props->type = ggml_backend_qnn_device_get_type(dev); @@ -364,6 +317,8 @@ ggml_guid_t ggml_backend_qnn_guid() { return &guid; } +bool ggml_backend_is_qnn(ggml_backend_t backend) { return ggml_guid_matches(backend->guid, ggml_backend_qnn_guid()); } + ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, const char *extend_lib_search_path) { if (!extend_lib_search_path) { extend_lib_search_path = GGML_QNN_DEFAULT_LIB_SEARCH_PATH; @@ -401,9 +356,9 @@ ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, } } else { if (setenv("LD_LIBRARY_PATH", path.c_str(), 1) == 0) { - QNN_LOG_DEBUG("%s backend setenv successfully\n", qnn::get_backend_name(device)); + QNN_LOG_DEBUG("%s backend setenv successfully", qnn::get_backend_name(device)); } else { - QNN_LOG_ERROR("%s backend setenv failure\n", qnn::get_backend_name(device)); + QNN_LOG_ERROR("%s backend setenv failure", qnn::get_backend_name(device)); } } #endif @@ -411,12 +366,12 @@ ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, auto instance = std::make_shared(path, dev_ctx->lib_name, "ggml"); auto result = instance->qnn_init(nullptr); if (result != 0) { - QNN_LOG_WARN("init qnn subsystem failed with qnn backend %s, pls check why\n", qnn::get_backend_name(device)); + QNN_LOG_WARN("init qnn subsystem failed with qnn backend %s, pls check why", qnn::get_backend_name(device)); return nullptr; } auto qnn_interface = instance->get_qnn_interface(); if (!qnn_interface) { - QNN_LOG_WARN("qnn subsystem failure\n"); + QNN_LOG_WARN("qnn subsystem failure"); return nullptr; } @@ -453,10 +408,10 @@ ggml_backend_buffer_t ggml_backend_qnn_device_buffer_from_ptr(ggml_backend_dev_t return ggml_backend_cpu_buffer_from_ptr(ptr, size); } -bool ggml_backend_qnn_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor *op) { +bool ggml_backend_qnn_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor *op) { // Note that this function could be called before the device context is initialized auto *device_ctx = get_device_context(dev); - return qnn::ggml_qnn_supports_op(device_ctx, op); + return qnn::device_supports_op(device_ctx, op); } bool ggml_backend_qnn_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { @@ -464,7 +419,13 @@ bool ggml_backend_qnn_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_ return ggml_backend_buft_is_host(buft); } -const struct ggml_backend_device_i ggml_backend_qnn_device_interface = { +bool 
ggml_backend_qnn_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor *op) { + auto *device_ctx = get_device_context(dev); + QNN_LOG_DEBUG("[%s][%s]offload op", qnn::get_backend_name(device_ctx->device), ggml_op_name(op->op)); + return false; +} + +constexpr const ggml_backend_device_i ggml_backend_qnn_device_interface = { /* .get_name = */ ggml_backend_qnn_device_get_name, /* .get_description = */ ggml_backend_qnn_device_get_description, /* .get_memory = */ ggml_backend_qnn_device_get_memory, @@ -476,7 +437,7 @@ const struct ggml_backend_device_i ggml_backend_qnn_device_interface = { /* .buffer_from_host_ptr = */ ggml_backend_qnn_device_buffer_from_ptr, /* .supports_op = */ ggml_backend_qnn_device_supports_op, /* .supports_buft = */ ggml_backend_qnn_device_supports_buft, - /* .offload_op = */ nullptr, + /* .offload_op = */ ggml_backend_qnn_device_offload_op, /* .event_new = */ nullptr, /* .event_free = */ nullptr, /* .event_synchronize = */ nullptr, @@ -489,27 +450,36 @@ const struct ggml_backend_device_i ggml_backend_qnn_device_interface = { */ struct ggml_backend_qnn_reg_impl : ggml_backend_reg { - std::array, GGML_QNN_MAX_DEVICES> device_contexts; - std::array devices; + std::vector> device_contexts; + std::vector devices; explicit ggml_backend_qnn_reg_impl(ggml_backend_reg_i interface) { context = this; iface = interface; QNN_LOG_DEBUG("qnn backend registry init"); - for (int i = 0; i < GGML_QNN_MAX_DEVICES; i++) { - const auto device_enum = (QNNBackend)(GGML_QNN_MAX_DEVICES - 1 - i); // init from the last device, i.e. NPU - device_contexts[i] = std::make_unique( + for (size_t i = 0; i < QNN_BACKEND_COUNT; i++) { + const auto device_enum = (QNNBackend)(QNN_BACKEND_COUNT - 1 - i); // init from the last device, i.e. NPU + if (device_enum == QNN_BACKEND_CPU) { + /* + * here we skip the initialization of CPU device, + * cause it'll block unsupported ops fallback to ggml cpu backend + */ + continue; + } + + device_contexts.emplace_back(std::make_unique( /* .device = */ device_enum, // init from the last device, i.e. 
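One design note on the registry loop in this hunk: devices are instantiated from the last enum value downwards (so the NPU is registered first), and the QNN CPU device is skipped on purpose so that anything the QNN backend rejects can still fall back to the stock ggml CPU backend. A toy version of that iteration order (the enum values are stand-ins):

    #include <cstdio>

    enum backend_id { BACKEND_CPU = 0, BACKEND_GPU = 1, BACKEND_NPU = 2, BACKEND_COUNT };

    int main() {
        for (int i = 0; i < BACKEND_COUNT; ++i) {
            int device = BACKEND_COUNT - 1 - i;   // walk backwards: NPU, GPU, then CPU
            if (device == BACKEND_CPU) {
                continue;                         // leave CPU work to the default ggml CPU backend
            }
            printf("registering QNN device %d\n", device);
        }
        return 0;
    }
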
NPU /* .threads = */ 1, /* .name = */ qnn::get_backend_name(device_enum), /* .lib_name = */ kDeviceCaps[device_enum].lib_name, - /* .supported_types = */ kDeviceCaps[device_enum].supported_types); + /* .supported_types = */ kDeviceCaps[device_enum].supported_types)); - auto &device = devices[i]; - device.iface = ggml_backend_qnn_device_interface; - device.reg = this; - device.context = device_contexts[i].get(); + devices.emplace_back(ggml_backend_device{ + /* iface = */ ggml_backend_qnn_device_interface, + /* reg = */ this, + /* context = */ device_contexts.back().get(), + }); } } }; diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index 1b0dcd78faa17..1806f41126f3c 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -64,12 +64,12 @@ class ggml_qnn_graph { } if (error != QNN_SUCCESS) { - QNN_LOG_ERROR("[%s][%s]failed to create qnn graph, error: %s\n", get_backend_name(device), - graph_name.c_str(), get_qnn_error_string(error)); + QNN_LOG_ERROR("[%s][%s]failed to create qnn graph, error: %s", get_backend_name(device), graph_name.c_str(), + get_qnn_error_string(error)); return; } - QNN_LOG_INFO("[%s][%s]create succeed\n", get_backend_name(device), graph_name.c_str()); + QNN_LOG_INFO("[%s][%s]create succeed", get_backend_name(device), graph_name.c_str()); _graph_handle = graph_handle; _qnn_interface = qnn_interface; } @@ -80,7 +80,7 @@ class ggml_qnn_graph { const ggml_tensor_array_t &tensor_outputs) { GGML_ASSERT(op_constructor); if (!is_valid()) { - QNN_LOG_ERROR("Invalid graph\n"); + QNN_LOG_ERROR("Invalid graph"); return false; } @@ -92,7 +92,7 @@ class ggml_qnn_graph { } if (!_op_config->add_op_to_graph(_graph_handle)) { - QNN_LOG_ERROR("[%s]add nodes failed\n", _graph_name.c_str()); + QNN_LOG_ERROR("[%s]add nodes failed", _graph_name.c_str()); return false; } @@ -109,12 +109,12 @@ class ggml_qnn_graph { bool execute(const ggml_tensor_array_t &tensor_inputs, const ggml_tensor_array_t &tensor_outputs) { if (!_op_config->bind_input_tensors(tensor_inputs)) { - QNN_LOG_ERROR("[%s][%s]bind input tensors failed\n", get_backend_name(_device), _graph_name.c_str()); + QNN_LOG_ERROR("[%s][%s]bind input tensors failed", get_backend_name(_device), _graph_name.c_str()); return false; } if (!_op_config->bind_output_tensors(tensor_outputs)) { - QNN_LOG_ERROR("[%s][%s]bind output tensors failed\n", get_backend_name(_device), _graph_name.c_str()); + QNN_LOG_ERROR("[%s][%s]bind output tensors failed", get_backend_name(_device), _graph_name.c_str()); return false; } diff --git a/ggml/src/ggml-qnn/op-config.cpp b/ggml/src/ggml-qnn/op-config.cpp index df70d548a44e0..b3c84b5435095 100644 --- a/ggml/src/ggml-qnn/op-config.cpp +++ b/ggml/src/ggml-qnn/op-config.cpp @@ -82,7 +82,7 @@ bool bind_tensors(const qnn::ggml_tensor_array_t &ggml_tensors, qnn::qnn_tensor_ for (size_t i = 0; i < ggml_tensors.size(); i++) { auto *ggml_tensor = ggml_tensors[i]; if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor)) { - QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); + QNN_LOG_ERROR("bind tensor %s failed", ggml_get_name(ggml_tensor)); return false; } @@ -162,12 +162,12 @@ bool ggml_qnn_op_config_base::add_tensor_param(const std::string &name, const qn GGML_ASSERT(data_size > 0); if (!param_tensor->bind_buffer(const_cast(data), data_size)) { - QNN_LOG_ERROR("parameter tensor bind_buffer failed\n"); + QNN_LOG_ERROR("parameter tensor bind_buffer failed"); return false; } if (!param_tensor->alloc_qnn_tensor_id()) { - QNN_LOG_ERROR("parameter tensor 
alloc_qnn_tensor_id failed\n"); + QNN_LOG_ERROR("parameter tensor alloc_qnn_tensor_id failed"); return false; } @@ -185,26 +185,26 @@ bool ggml_qnn_op_config_base::add_op_to_graph(Qnn_GraphHandle_t graph_handle) { GGML_ASSERT(_qnn_tensor_inputs.size() == _tensor_inputs.size()); GGML_ASSERT(_qnn_tensor_outputs.size() == _tensor_outputs.size()); - QNN_LOG_DEBUG("[%s]add to graph start\n", _name.c_str()); + QNN_LOG_DEBUG("[%s]add to graph start", _name.c_str()); for (size_t i = 0; i < _tensor_inputs.size(); i++) { auto tensor = _tensor_inputs[i]; if (!tensor->alloc_qnn_tensor_id()) { - QNN_LOG_ERROR("[%s]input tensor alloc_qnn_tensor_id failed\n", _name.c_str()); + QNN_LOG_ERROR("[%s]input tensor alloc_qnn_tensor_id failed", _name.c_str()); return false; } - QNN_LOG_DEBUG("[%s]input tensor id: %d\n", _name.c_str(), tensor->get_qnn_tensor_id()); + QNN_LOG_DEBUG("[%s]input tensor id: %d", _name.c_str(), tensor->get_qnn_tensor_id()); _qnn_tensor_inputs[i] = tensor->get_qnn_tensor(); } for (size_t i = 0; i < _tensor_outputs.size(); i++) { auto tensor = _tensor_outputs[i]; if (!tensor->alloc_qnn_tensor_id()) { - QNN_LOG_ERROR("[%s]output tensor alloc_qnn_tensor_id failed\n", _name.c_str()); + QNN_LOG_ERROR("[%s]output tensor alloc_qnn_tensor_id failed", _name.c_str()); return false; } - QNN_LOG_DEBUG("[%s]output tensor id: %d\n", _name.c_str(), tensor->get_qnn_tensor_id()); + QNN_LOG_DEBUG("[%s]output tensor id: %d", _name.c_str(), tensor->get_qnn_tensor_id()); _qnn_tensor_outputs[i] = tensor->get_qnn_tensor(); } @@ -215,7 +215,7 @@ bool ggml_qnn_op_config_base::add_op_to_graph(Qnn_GraphHandle_t graph_handle) { return false; } - QNN_LOG_DEBUG("[%s]added to graph succeed\n", _name.c_str()); + QNN_LOG_DEBUG("[%s]added to graph succeed", _name.c_str()); return true; } @@ -280,6 +280,14 @@ bool ggml_qnn_single_op_config::initialize_op_nodes(QNNBackend device, Qnn_Graph return true; } +bool ggml_qnn_aggregate_op_config::bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) { + return bind_tensors(tensor_inputs, _tensor_inputs, _qnn_tensor_inputs); +} + +bool ggml_qnn_aggregate_op_config::bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) { + return bind_tensors(tensor_outputs, _tensor_outputs, _qnn_tensor_outputs); +} + bool ggml_qnn_matmul_op_config::initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const ggml_tensor_array_t &tensor_inputs, const ggml_tensor_array_t &tensor_outputs) { @@ -293,20 +301,21 @@ bool ggml_qnn_matmul_op_config::initialize_op_nodes(QNNBackend device, Qnn_Graph create_tensors_from_ggml_tensor(params, tensor_inputs, &_tensor_inputs, &_qnn_tensor_inputs); // create output tensor - qnn_tensor_array_t mat_mul_tensor_outputs; params.name_prefix = "dst"; params.is_input = false; - create_tensors_from_ggml_tensor(params, tensor_outputs, &mat_mul_tensor_outputs, nullptr); + create_tensors_from_ggml_tensor(params, tensor_outputs, &_tensor_outputs, &_qnn_tensor_outputs); // create convert nodes qnn_tensor_array_t mat_mul_tensor_inputs = _tensor_inputs; + qnn_tensor_array_t mat_mul_tensor_outputs = _tensor_outputs; if (!create_convert_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs, mat_mul_tensor_outputs)) { - QNN_LOG_ERROR("create convert nodes failed\n"); + QNN_LOG_ERROR("create convert nodes failed"); return false; } - mat_mul_tensor_inputs.front() = create_gather_nodes(device, graph_handle, tensor_rank, _tensor_inputs.front(), - _tensor_inputs.back()->get_dimensions()); + mat_mul_tensor_inputs.front() = + 
create_gather_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs.front(), + mat_mul_tensor_inputs.back()->get_dimensions()); return create_mat_mul_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs, mat_mul_tensor_outputs); } @@ -365,15 +374,15 @@ qnn_tensor_ptr_t ggml_qnn_matmul_op_config::create_gather_nodes(QNNBackend devic qnn_dimension_array_t intermediate_dimensions = input_dimensions; intermediate_dimensions[rank - 3] = output_dimensions[rank - 3]; qnn_tensor_ptr_t gather0_out; - _gather0 = create_node(_name + "_gather0", rank, rank - 3, intermediate_dimensions, tensor_input, device, - graph_handle, _qnn_instance, gather0_out); + _operations.push_back(create_node(_name + "_gather0", rank, rank - 3, intermediate_dimensions, tensor_input, device, + graph_handle, _qnn_instance, gather0_out)); if (rank == 3) { return gather0_out; } qnn_tensor_ptr_t gather1_out; - _gather1 = create_node(_name + "_gather1", rank, rank - 4, output_dimensions, gather0_out, device, graph_handle, - _qnn_instance, gather1_out); + _operations.push_back(create_node(_name + "_gather1", rank, rank - 4, output_dimensions, gather0_out, device, + graph_handle, _qnn_instance, gather1_out)); return gather1_out; } @@ -387,9 +396,8 @@ bool ggml_qnn_matmul_op_config::create_convert_nodes(QNNBackend device, Qnn_Grap // create tensors for convert node auto tensor_type = get_tensor_type(tensor_inputs); - QNN_LOG_DEBUG("input tensor type: %s\n", qnn_datatype_to_string(tensor_type)); + QNN_LOG_DEBUG("input tensor type: %s", qnn_datatype_to_string(tensor_type)); - _input_converts.resize(tensor_inputs.size()); for (size_t i = 0; i < tensor_inputs.size(); ++i) { // create input convert nodes auto convert_in = tensor_inputs[i]; @@ -406,7 +414,7 @@ bool ggml_qnn_matmul_op_config::create_convert_nodes(QNNBackend device, Qnn_Grap convert->set_input_tensors({convert_in}); convert->set_output_tensors({convert_out}); tensor_inputs[i] = convert_out; - _input_converts[i] = convert; + _operations.push_back(convert); } if (tensor_outputs.front()->get_data_type() != tensor_type) { @@ -421,7 +429,7 @@ bool ggml_qnn_matmul_op_config::create_convert_nodes(QNNBackend device, Qnn_Grap output_convert->set_input_tensors({convert_in}); output_convert->set_output_tensors({convert_out}); tensor_outputs.front() = convert_in; - _output_convert = output_convert; + _operations.push_back(output_convert); } return true; @@ -432,7 +440,7 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap qnn_tensor_array_t &tensor_outputs) { /* - * First, both the ggml and qnn tensor in memory are stored as row-major format. (For more details, please also: + * First, both the ggml and qnn tensor in memory are stored as row-major format. (For more details, please refer to: * https://pytorch.org/blog/tensor-memory-format-matters/#:~:text=Column%20Major%20Order:%20In%20this%20format,%20the%20matrix) * But the dimensions of the tensor are stored in different order. 
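The comment above about row-major storage deserves a concrete illustration (this block is an editorial aside, not part of the patch): in row-major order a 2x3 matrix is stored row after row, so element (r, c) sits at offset r * 3 + c; per the comment, ggml and QNN only differ in the order in which the two APIs list the dimensions, not in how the bytes are laid out.

    #include <cstdio>

    int main() {
        const float m[2][3] = { {1, 2, 3}, {4, 5, 6} };
        const float *flat = &m[0][0];             // memory order: 1 2 3 4 5 6
        for (int r = 0; r < 2; ++r) {
            for (int c = 0; c < 3; ++c) {
                printf("m[%d][%d] = %g (offset %d)\n", r, c, flat[r * 3 + c], r * 3 + c);
            }
        }
        return 0;
    }
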
* For example, a 2x3 matrix: @@ -515,81 +523,19 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap transpose_out->set_input_tensors(tensors); transpose_out->set_output_tensors(tensor_outputs); - _mat_mul = mat_mul; - _transpose_out = transpose_out; + _operations.push_back(mat_mul); + _operations.push_back(transpose_out); return true; } -bool ggml_qnn_matmul_op_config::add_op_to_graph(Qnn_GraphHandle_t graph_handle) { - for (auto &convert : _input_converts) { - if (convert && !convert->add_op_to_graph(graph_handle)) { - return false; - } - } - - if (_gather0 && !_gather0->add_op_to_graph(graph_handle)) { - return false; - } - - if (_gather1 && !_gather1->add_op_to_graph(graph_handle)) { - return false; - } - - return _mat_mul->add_op_to_graph(graph_handle) && _transpose_out->add_op_to_graph(graph_handle) && - (!_output_convert || _output_convert->add_op_to_graph(graph_handle)); -} - -bool ggml_qnn_matmul_op_config::bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) { - return bind_tensors(tensor_inputs, _tensor_inputs, _qnn_tensor_inputs); -} - -bool ggml_qnn_matmul_op_config::bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) { - if (_output_convert) { - return _output_convert->bind_output_tensors(tensor_outputs); - } else { - return _transpose_out->bind_output_tensors(tensor_outputs); - } -} - -void ggml_qnn_matmul_op_config::unbind_input_tensors() { - _mat_mul->unbind_input_tensors(); - for (auto &convert : _input_converts) { - if (convert) { - convert->unbind_input_tensors(); - } - } -} - -void ggml_qnn_matmul_op_config::unbind_output_tensors() { - _transpose_out->unbind_output_tensors(); - if (_output_convert) { - _output_convert->unbind_output_tensors(); - } -} - -std::vector &ggml_qnn_matmul_op_config::get_qnn_output_tensors() { - if (_output_convert) { - return _output_convert->get_qnn_output_tensors(); - } else { - return _transpose_out->get_qnn_output_tensors(); - } -} - ggml_op_constructor_t create_op_constructor(const std::string &op_name) { if (op_name == QNN_OP_MAT_MUL) { // For QNN_OP_MAT_MUL, we need to transpose the input tensor return [](const std::string &instance_name, std::shared_ptr qnn_instance) -> std::unique_ptr { - QNN_LOG_DEBUG("create QNN_OP_MAT_MUL, name %s\n", instance_name.c_str()); + QNN_LOG_DEBUG("create QNN_OP_MAT_MUL, name %s", instance_name.c_str()); return std::make_unique(instance_name, qnn_instance); }; - } else if (op_name == QNN_OP_TRANSPOSE) { - return [](const std::string &instance_name, - std::shared_ptr qnn_instance) -> std::unique_ptr { - return std::make_unique( - instance_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_TRANSPOSE, QNN_OP_TRANSPOSE_PARAM_PERM, - QNN_DATATYPE_UINT_32, 4 * sizeof(uint32_t), qnn_instance); - }; } return [op_name](const std::string &instance_name, diff --git a/ggml/src/ggml-qnn/op-config.hpp b/ggml/src/ggml-qnn/op-config.hpp index 27571563309a8..a05b75ade7e6a 100644 --- a/ggml/src/ggml-qnn/op-config.hpp +++ b/ggml/src/ggml-qnn/op-config.hpp @@ -82,21 +82,70 @@ class ggml_qnn_single_op_config : public ggml_qnn_op_config_base { DISABLE_MOVE(ggml_qnn_single_op_config); }; -class ggml_qnn_matmul_op_config : public ggml_qnn_op_config { +class ggml_qnn_aggregate_op_config : public ggml_qnn_op_config { public: - ggml_qnn_matmul_op_config(const std::string &name, std::shared_ptr qnn_instance) + explicit ggml_qnn_aggregate_op_config(const std::string &name, std::shared_ptr qnn_instance) : _name(name), _qnn_instance(qnn_instance) {} - bool initialize_op_nodes(QNNBackend 
device, Qnn_GraphHandle_t graph_handle, - const ggml_tensor_array_t &tensor_inputs, - const ggml_tensor_array_t &tensor_outputs) override; - bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) override; + ~ggml_qnn_aggregate_op_config() { + _qnn_tensor_inputs.clear(); + _qnn_tensor_outputs.clear(); + _tensor_inputs.clear(); + _tensor_outputs.clear(); + _operations.clear(); + } + + bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) override { + for (auto &op : _operations) { + if (!op->add_op_to_graph(graph_handle)) { + return false; + } + } + return true; + } + bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) override; + bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) override; - void unbind_input_tensors() override; - void unbind_output_tensors() override; + + void unbind_input_tensors() override { + for (auto &tensor : _tensor_inputs) { + tensor->unbind(); + } + } + + void unbind_output_tensors() override { + for (auto &tensor : _tensor_outputs) { + tensor->unbind(); + } + } + std::vector &get_qnn_input_tensors() override { return _qnn_tensor_inputs; } - std::vector &get_qnn_output_tensors() override; + std::vector &get_qnn_output_tensors() override { return _qnn_tensor_outputs; } + +protected: + std::string _name; + std::shared_ptr _qnn_instance; + + std::vector _operations; + qnn_tensor_array_t _tensor_inputs; + qnn_tensor_array_t _tensor_outputs; + std::vector _qnn_tensor_inputs; + std::vector _qnn_tensor_outputs; + +private: + DISABLE_COPY(ggml_qnn_aggregate_op_config); + DISABLE_MOVE(ggml_qnn_aggregate_op_config); +}; + +class ggml_qnn_matmul_op_config : public ggml_qnn_aggregate_op_config { +public: + ggml_qnn_matmul_op_config(const std::string &name, std::shared_ptr qnn_instance) + : ggml_qnn_aggregate_op_config(name, qnn_instance) {} + + bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, + const ggml_tensor_array_t &tensor_inputs, + const ggml_tensor_array_t &tensor_outputs) override; private: qnn_tensor_ptr_t create_gather_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, @@ -106,17 +155,6 @@ class ggml_qnn_matmul_op_config : public ggml_qnn_op_config { bool create_mat_mul_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, qnn_tensor_array_t &tensor_inputs, qnn_tensor_array_t &tensor_outputs); - std::string _name; - std::shared_ptr _qnn_instance; - qnn_op_config_ptr_t _transpose_out; - qnn_op_config_ptr_t _mat_mul; - qnn_op_config_ptr_t _gather0; - qnn_op_config_ptr_t _gather1; - std::vector _input_converts; - qnn_op_config_ptr_t _output_convert; - qnn_tensor_array_t _tensor_inputs; - std::vector _qnn_tensor_inputs; - DISABLE_COPY(ggml_qnn_matmul_op_config); DISABLE_MOVE(ggml_qnn_matmul_op_config); }; diff --git a/ggml/src/ggml-qnn/qnn-lib.hpp b/ggml/src/ggml-qnn/qnn-lib.hpp index c6801b7771ee9..454c0c6aa32c5 100644 --- a/ggml/src/ggml-qnn/qnn-lib.hpp +++ b/ggml/src/ggml-qnn/qnn-lib.hpp @@ -195,21 +195,21 @@ class qnn_instance { int qnn_init(const QnnSaver_Config_t **saver_config) { BackendIdType backend_id = QNN_BACKEND_ID_NULL; - QNN_LOG_DEBUG("enter qni_init\n"); + QNN_LOG_DEBUG("enter qnn_init"); std::lock_guard lock(_init_mutex); if (load_system() != 0) { - QNN_LOG_WARN("can not load QNN system lib, pls check why?\n"); + QNN_LOG_WARN("can not load QNN system lib, pls check why?"); return 1; } else { - QNN_LOG_DEBUG("load QNN system lib successfully\n"); + QNN_LOG_DEBUG("load QNN system lib successfully"); } std::string backend_lib_path = _lib_path + _backend_name; 
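The ggml_qnn_aggregate_op_config introduced a few hunks above replaces the half-dozen named sub-op members (_gather0, _gather1, _mat_mul, _transpose_out, the convert lists) with one ordered _operations vector, so adding to the graph, binding and unbinding all become uniform loops. A reduced sketch of that composite pattern (type names here are placeholders, not the patch's classes):

    #include <memory>
    #include <vector>

    struct op_node {
        virtual ~op_node() = default;
        virtual bool add_to_graph(void *graph_handle) = 0;
    };

    struct aggregate_op {
        // sub-operations kept in construction order; clearing the vector also fixes destruction order
        std::vector<std::shared_ptr<op_node>> operations;

        bool add_to_graph(void *graph_handle) {
            for (auto &op : operations) {
                if (!op->add_to_graph(graph_handle)) {
                    return false;                 // abort on the first sub-op that fails
                }
            }
            return true;
        }
    };
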
if (_lib_path_to_backend_id.count(backend_lib_path) == 0) { int is_load_ok = load_backend(backend_lib_path, saver_config); if (is_load_ok != 0) { - QNN_LOG_WARN("failed to load QNN backend\n"); + QNN_LOG_WARN("failed to load QNN backend"); return 2; } } @@ -218,7 +218,7 @@ class qnn_instance { if (_loaded_backend.count(backend_id) == 0 || _loaded_lib_handle.count(backend_id) == 0) { QNN_LOG_WARN( "library %s is loaded but loaded backend count=%zu, " - "loaded lib_handle count=%zu\n", + "loaded lib_handle count=%zu", backend_lib_path.c_str(), _loaded_backend.count(backend_id), _loaded_lib_handle.count(backend_id)); return 3; } @@ -227,28 +227,28 @@ class qnn_instance { _qnn_interface->qnn_log_create(qnn::sdk_logcallback, _qnn_log_level, &_qnn_log_handle); if (nullptr == _qnn_log_handle) { // NPU backend not work on Qualcomm SoC equipped low-end phone - QNN_LOG_WARN("why failed to initialize qnn log\n"); + QNN_LOG_WARN("why failed to initialize qnn log"); return 4; } else { - QNN_LOG_DEBUG("initialize qnn log successfully\n"); + QNN_LOG_DEBUG("initialize qnn log successfully"); } std::vector temp_backend_config; _qnn_interface->qnn_backend_create( _qnn_log_handle, temp_backend_config.empty() ? nullptr : temp_backend_config.data(), &_qnn_backend_handle); if (nullptr == _qnn_backend_handle) { - QNN_LOG_WARN("why failed to initialize qnn backend\n"); + QNN_LOG_WARN("why failed to initialize qnn backend"); return 5; } else { - QNN_LOG_DEBUG("initialize qnn backend successfully\n"); + QNN_LOG_DEBUG("initialize qnn backend successfully"); } Qnn_ErrorHandle_t qnn_status = _qnn_interface->qnn_property_has_capability(QNN_PROPERTY_GROUP_DEVICE); if (QNN_PROPERTY_NOT_SUPPORTED == qnn_status) { - QNN_LOG_WARN("device property is not supported\n"); + QNN_LOG_WARN("device property is not supported"); } if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnn_status) { - QNN_LOG_WARN("device property is not known to backend\n"); + QNN_LOG_WARN("device property is not known to backend"); } qnn_status = QNN_SUCCESS; @@ -294,9 +294,9 @@ class qnn_instance { qnn_status = _qnn_interface->qnn_device_create(_qnn_log_handle, nullptr, &_qnn_device_handle); } if (QNN_SUCCESS != qnn_status && QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnn_status) { - QNN_LOG_WARN("failed to create QNN device\n"); + QNN_LOG_WARN("failed to create QNN device"); } else { - QNN_LOG_INFO("create QNN device successfully\n"); + QNN_LOG_INFO("create QNN device successfully"); } if (_profile_level != sdk_profile_level::profile_off) { @@ -306,19 +306,19 @@ class qnn_instance { if (QNN_PROFILE_NO_ERROR != _qnn_interface->qnn_profile_create(_qnn_backend_handle, profile_level, &_qnn_profile_handle)) { - QNN_LOG_WARN("unable to create profile handle in the backend\n"); + QNN_LOG_WARN("unable to create profile handle in the backend"); return 6; } else { - QNN_LOG_DEBUG("initialize qnn profile successfully\n"); + QNN_LOG_DEBUG("initialize qnn profile successfully"); } } _rpc_lib_handle = dl_load("libcdsprpc.so"); if (nullptr == _rpc_lib_handle) { - QNN_LOG_WARN("failed to load qualcomm's rpc lib, error:%s\n", dl_error()); + QNN_LOG_WARN("failed to load qualcomm's rpc lib, error:%s", dl_error()); return 8; } else { - QNN_LOG_DEBUG("load rpcmem lib successfully\n"); + QNN_LOG_DEBUG("load rpcmem lib successfully"); set_rpcmem_initialized(true); } _pfn_rpc_mem_init = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_init")); @@ -343,10 +343,10 @@ class qnn_instance { */ _qnn_interface->qnn_context_create(_qnn_backend_handle, _qnn_device_handle, nullptr, 
&_qnn_context_handle); if (nullptr == _qnn_context_handle) { - QNN_LOG_WARN("why failed to initialize qnn context\n"); + QNN_LOG_WARN("why failed to initialize qnn context"); return 10; } else { - QNN_LOG_DEBUG("initialize qnn context successfully\n"); + QNN_LOG_DEBUG("initialize qnn context successfully"); } if (_backend_name.find("Htp") != _backend_name.npos) { @@ -359,7 +359,7 @@ class qnn_instance { for (size_t idx = 0; idx < probe_counts; idx++) { rpc_buffer = static_cast(alloc_rpcmem(probe_slots[idx] * size_in_mb, sizeof(void *))); if (!rpc_buffer) { - QNN_LOG_DEBUG("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], strerror(errno)); + QNN_LOG_DEBUG("alloc rpcmem %d (MB) failure, %s", probe_slots[idx], strerror(errno)); break; } else { candidate_size = probe_slots[idx]; @@ -369,7 +369,7 @@ class qnn_instance { } _rpcmem_capacity = std::max(candidate_size, _rpcmem_capacity); - QNN_LOG_INFO("capacity of QNN rpc ion memory is about %d MB\n", _rpcmem_capacity); + QNN_LOG_INFO("capacity of QNN rpc ion memory is about %d MB", _rpcmem_capacity); if (0 != init_htp_perfinfra()) { QNN_LOG_WARN("initialize HTP performance failure"); @@ -382,7 +382,7 @@ class qnn_instance { } } - QNN_LOG_DEBUG("leave qni_init\n"); + QNN_LOG_DEBUG("leave qnn_init"); return 0; } @@ -395,9 +395,9 @@ class qnn_instance { _pfn_rpc_mem_deinit(); if (dl_unload(_rpc_lib_handle) != 0) { - QNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s\n", dl_error()); + QNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s", dl_error()); } else { - QNN_LOG_DEBUG("succeed to close rpcmem lib\n"); + QNN_LOG_DEBUG("succeed to close rpcmem lib"); } if (_backend_name.find("Htp") != _backend_name.npos) { @@ -407,7 +407,7 @@ class qnn_instance { if (nullptr != _qnn_context_handle) { error = _qnn_interface->qnn_context_free(_qnn_context_handle, _qnn_profile_handle); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(), + QNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d", _qnn_interface->get_backend_id(), QNN_GET_ERROR_CODE(error)); } _qnn_context_handle = nullptr; @@ -416,7 +416,7 @@ class qnn_instance { if (nullptr != _qnn_profile_handle) { error = _qnn_interface->qnn_profile_free(_qnn_profile_handle); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(), + QNN_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d", _qnn_interface->get_backend_id(), QNN_GET_ERROR_CODE(error)); } _qnn_profile_handle = nullptr; @@ -425,7 +425,7 @@ class qnn_instance { if (nullptr != _qnn_device_handle) { error = _qnn_interface->qnn_device_free(_qnn_device_handle); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN device_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(), + QNN_LOG_WARN("failed to free QNN device_handle: ID %u, error %d", _qnn_interface->get_backend_id(), QNN_GET_ERROR_CODE(error)); } _qnn_device_handle = nullptr; @@ -434,7 +434,7 @@ class qnn_instance { if (nullptr != _qnn_backend_handle) { error = _qnn_interface->qnn_backend_free(_qnn_backend_handle); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(), + QNN_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d", _qnn_interface->get_backend_id(), QNN_GET_ERROR_CODE(error)); } _qnn_backend_handle = nullptr; @@ -443,7 +443,7 @@ class qnn_instance { if (nullptr != _qnn_log_handle) { error 
= _qnn_interface->qnn_log_free(_qnn_log_handle); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN log_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(), + QNN_LOG_WARN("failed to free QNN log_handle: ID %u, error %d", _qnn_interface->get_backend_id(), QNN_GET_ERROR_CODE(error)); } _qnn_log_handle = nullptr; @@ -458,7 +458,7 @@ class qnn_instance { std::shared_ptr get_qnn_interface() { if (!_qnn_interface) { - QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + QNN_LOG_WARN("pls check why _qnn_interface is not loaded"); } return _qnn_interface; } @@ -479,10 +479,10 @@ class qnn_instance { QnnDevice_Infrastructure_t device_infra = nullptr; int error = _qnn_interface->qnn_device_get_infrastructure(&device_infra); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to get qnn device infra\n"); + QNN_LOG_WARN("failed to get qnn device infra"); return 1; } else { - QNN_LOG_INFO("HTP backend perf_infrastructure creation ok\n"); + QNN_LOG_INFO("HTP backend perf_infrastructure creation ok"); } QnnHtpDevice_Infrastructure_t *htp_infra = static_cast(device_infra); @@ -494,7 +494,7 @@ class qnn_instance { if (htp_infra->infraType != QNN_HTP_DEVICE_INFRASTRUCTURE_TYPE_PERF) { QNN_LOG_INFO("HTP infra type = %d, which is not perf infra type", htp_infra->infraType); } else { - QNN_LOG_INFO("HTP infra type = %d, which is perf infra type\n", htp_infra->infraType); + QNN_LOG_INFO("HTP infra type = %d, which is perf infra type", htp_infra->infraType); } _qnn_htp_perfinfra = htp_perfinfra; _qnn_power_configid = power_configid; @@ -520,12 +520,12 @@ class qnn_instance { nullptr}; Qnn_ErrorHandle_t qnn_status = _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs); if (qnn_status != QNN_SUCCESS) { - QNN_LOG_WARN("set htp perf failed\n"); + QNN_LOG_WARN("set htp perf failed"); } else { - QNN_LOG_DEBUG("set htp perf ok\n"); + QNN_LOG_DEBUG("set htp perf ok"); } } else { - QNN_LOG_WARN("can't set htp perf\n"); + QNN_LOG_WARN("can't set htp perf"); } return 0; @@ -533,7 +533,7 @@ class qnn_instance { int set_high_performance_mode() { if (nullptr == _qnn_htp_perfinfra) { - QNN_LOG_WARN("perf intra is null\n"); + QNN_LOG_WARN("perf intra is null"); return 1; } @@ -566,9 +566,9 @@ class qnn_instance { Qnn_ErrorHandle_t qnn_status = QNN_SUCCESS; qnn_status = _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs); if (qnn_status != QNN_SUCCESS) { - QNN_LOG_WARN("set htp high performance mode failed\n"); + QNN_LOG_WARN("set htp high performance mode failed"); } else { - QNN_LOG_DEBUG("set htp high performance mode ok\n"); + QNN_LOG_DEBUG("set htp high performance mode ok"); } return 0; @@ -584,21 +584,21 @@ class qnn_instance { void *alloc_rpcmem(size_t bytes, size_t alignment) { if (!_rpcmem_initialized) { - QNN_LOG_WARN("rpc memory not initialized\n"); + QNN_LOG_WARN("rpc memory not initialized"); return nullptr; } auto allocate_bytes = static_cast(bytes + alignment); void *buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, (int)allocate_bytes); if (!buf) { - QNN_LOG_WARN("failed to allocate rpc memory, size: %d MB\n", (int)(allocate_bytes / (1 << 20))); + QNN_LOG_WARN("failed to allocate rpc memory, size: %d MB", (int)(allocate_bytes / (1 << 20))); return nullptr; } auto aligned_buf = reinterpret_cast(qnn::align_to(alignment, reinterpret_cast(buf))); bool status = _rpcmem_store_map.insert(std::pair(aligned_buf, buf)).second; if (!status) { - QNN_LOG_WARN("failed to allocate rpc memory\n"); + QNN_LOG_WARN("failed to allocate rpc 
memory"); _pfn_rpc_mem_free(buf); } @@ -607,9 +607,9 @@ class qnn_instance { void free_rpcmem(void *buf) { if (!_rpcmem_initialized) { - QNN_LOG_WARN("rpc memory not initialized\n"); + QNN_LOG_WARN("rpc memory not initialized"); } else if (_rpcmem_store_map.count(buf) == 0) { - QNN_LOG_WARN("no allocated tensor\n"); + QNN_LOG_WARN("no allocated tensor"); } else { _pfn_rpc_mem_free(_rpcmem_store_map[buf]); _rpcmem_store_map.erase(buf); @@ -619,7 +619,7 @@ class qnn_instance { int32_t rpcmem_to_fd(void *buf) { int32_t mem_fd = -1; if (!is_rpcmem_initialized()) { - QNN_LOG_WARN("rpc memory not initialized\n"); + QNN_LOG_WARN("rpc memory not initialized"); } else { mem_fd = _pfn_rpc_mem_to_fd(buf); } @@ -629,52 +629,51 @@ class qnn_instance { Qnn_MemHandle_t register_rpcmem(void *p_data, const uint32_t rank, uint32_t *dimensions, Qnn_DataType_t data_type) { if (!p_data) { - QNN_LOG_WARN("invalid param\n"); + QNN_LOG_WARN("invalid param"); return nullptr; } if (!is_rpcmem_initialized()) { - QNN_LOG_WARN("rpc memory not initialized\n"); + QNN_LOG_WARN("rpc memory not initialized"); return nullptr; } if (is_rpcmem_registered(p_data)) { - QNN_LOG_WARN("rpc memory already registered\n"); + QNN_LOG_WARN("rpc memory already registered"); return _qnn_rpc_buffer_to_handles[p_data]; } auto mem_fd = rpcmem_to_fd(p_data); if (mem_fd == -1) { - QNN_LOG_WARN("failed to get file descriptor\n"); + QNN_LOG_WARN("failed to get file descriptor"); return nullptr; } - QNN_LOG_DEBUG("mem_fd %d\n", mem_fd); + QNN_LOG_DEBUG("mem_fd %d", mem_fd); Qnn_MemDescriptor_t descriptor = {{rank, dimensions, nullptr}, data_type, QNN_MEM_TYPE_ION, {{mem_fd}}}; Qnn_MemHandle_t handle = nullptr; auto error = _qnn_interface->qnn_mem_register(_qnn_context_handle, &descriptor, /*numDescriptors=*/1, &handle); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to register shared memory, error %d, %s\n", QNN_GET_ERROR_CODE(error), - strerror(error)); + QNN_LOG_WARN("failed to register shared memory, error %d, %s", QNN_GET_ERROR_CODE(error), strerror(error)); return nullptr; } _qnn_rpc_buffer_to_handles.insert({p_data, handle}); - QNN_LOG_DEBUG("successfully register shared memory handler: %p\n", handle); + QNN_LOG_DEBUG("successfully register shared memory handler: %p", handle); return handle; } void unregister_rpcmem(Qnn_MemHandle_t mem_handle) { Qnn_ErrorHandle_t error = _qnn_interface->qnn_mem_de_register(&mem_handle, 1); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to unregister shared memory, error %d\n", QNN_GET_ERROR_CODE(error)); + QNN_LOG_WARN("failed to unregister shared memory, error %d", QNN_GET_ERROR_CODE(error)); } auto it = std::find_if(_qnn_rpc_buffer_to_handles.begin(), _qnn_rpc_buffer_to_handles.end(), [mem_handle](const auto &kv) { return kv.second == mem_handle; }); if (it == _qnn_rpc_buffer_to_handles.end()) { - QNN_LOG_WARN("failed to find shared memory handler: %p\n", mem_handle); + QNN_LOG_WARN("failed to find shared memory handler: %p", mem_handle); return; } @@ -691,18 +690,18 @@ class qnn_instance { Qnn_ErrorHandle_t error = QNN_SUCCESS; std::string system_lib_path = _lib_path + "libQnnSystem.so"; - QNN_LOG_DEBUG("system_lib_path:%s\n", system_lib_path.c_str()); + QNN_LOG_DEBUG("system_lib_path:%s", system_lib_path.c_str()); auto system_lib_handle = dl_load(system_lib_path); if (!system_lib_handle) { - QNN_LOG_WARN("can not load QNN library %s, error: %s\n", system_lib_path.c_str(), dl_error()); + QNN_LOG_WARN("can not load QNN library %s, error: %s", system_lib_path.c_str(), dl_error()); return 1; } 
auto *get_providers = dl_sym_typed( system_lib_handle, "QnnSystemInterface_getProviders"); if (!get_providers) { - QNN_LOG_WARN("can not load QNN symbol QnnSystemInterface_getProviders: %s\n", dl_error()); + QNN_LOG_WARN("can not load QNN symbol QnnSystemInterface_getProviders: %s", dl_error()); return 2; } @@ -710,17 +709,17 @@ class qnn_instance { const QnnSystemInterface_t **provider_list = nullptr; error = get_providers(&provider_list, &num_providers); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to get providers, error %d\n", QNN_GET_ERROR_CODE(error)); + QNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error)); return 3; } if (num_providers != _required_num_providers) { - QNN_LOG_WARN("providers is %d instead of required %d\n", num_providers, _required_num_providers); + QNN_LOG_WARN("providers is %d instead of required %d", num_providers, _required_num_providers); return 4; } if (!provider_list) { - QNN_LOG_WARN("can not get providers\n"); + QNN_LOG_WARN("can not get providers"); return 5; } @@ -735,15 +734,15 @@ class qnn_instance { } } if (!found_valid_system_interface) { - QNN_LOG_WARN("unable to find a valid qnn system interface\n"); + QNN_LOG_WARN("unable to find a valid qnn system interface"); return 6; } else { - QNN_LOG_DEBUG("find a valid qnn system interface\n"); + QNN_LOG_DEBUG("find a valid qnn system interface"); } auto qnn_sys_interface = std::make_shared(*provider_list[0], system_lib_handle); if (!qnn_sys_interface->is_valid()) { - QNN_LOG_WARN("failed to create QNN system interface\n"); + QNN_LOG_WARN("failed to create QNN system interface"); return 7; } @@ -753,7 +752,7 @@ class qnn_instance { int load_backend(std::string &lib_path, const QnnSaver_Config_t ** /*saver_config*/) { Qnn_ErrorHandle_t error = QNN_SUCCESS; - QNN_LOG_DEBUG("lib_path:%s\n", lib_path.c_str()); + QNN_LOG_DEBUG("lib_path:%s", lib_path.c_str()); auto lib_handle = dl_load(lib_path.c_str()); if (!lib_handle) { @@ -775,14 +774,14 @@ class qnn_instance { QNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error)); return 3; } - QNN_LOG_DEBUG("num_providers=%d\n", num_providers); + QNN_LOG_DEBUG("num_providers=%d", num_providers); if (num_providers != _required_num_providers) { QNN_LOG_WARN("providers is %d instead of required %d", num_providers, _required_num_providers); return 4; } if (!provider_list) { - QNN_LOG_WARN("failed to get qnn interface providers\n"); + QNN_LOG_WARN("failed to get qnn interface providers"); return 5; } bool found_valid_interface = false; @@ -797,23 +796,23 @@ class qnn_instance { } if (!found_valid_interface) { - QNN_LOG_WARN("unable to find a valid qnn interface\n"); + QNN_LOG_WARN("unable to find a valid qnn interface"); return 6; } else { - QNN_LOG_DEBUG("find a valid qnn interface\n"); + QNN_LOG_DEBUG("find a valid qnn interface"); } BackendIdType backend_id = provider_list[0]->backendId; _lib_path_to_backend_id[lib_path] = backend_id; if (_loaded_backend.count(backend_id) > 0) { - QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists\n", lib_path.c_str(), backend_id); + QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists", lib_path.c_str(), backend_id); } _loaded_backend[backend_id] = provider_list[0]; if (_loaded_lib_handle.count(backend_id) > 0) { - QNN_LOG_WARN("closing %p\n", _loaded_lib_handle[backend_id]); + QNN_LOG_WARN("closing %p", _loaded_lib_handle[backend_id]); int dlclose_error = dl_unload(_loaded_lib_handle[backend_id]); if (dlclose_error != 0) { - QNN_LOG_WARN("fail to close %p with 
error %s\n", _loaded_lib_handle[backend_id], dl_error()); + QNN_LOG_WARN("fail to close %p with error %s", _loaded_lib_handle[backend_id], dl_error()); } } _loaded_lib_handle[backend_id] = lib_handle; @@ -827,7 +826,7 @@ class qnn_instance { for (auto &it : _loaded_lib_handle) { dlclose_error = dl_unload(it.second); if (dlclose_error != 0) { - QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, dl_error()); + QNN_LOG_WARN("failed to close QNN backend %d, error %s", it.first, dl_error()); } } diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index 0a9a367015127..833c620971e0d 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -32,10 +32,10 @@ class ggml_qnn_tensor { if (!_tensor_name.empty()) { QNN_TENSOR_SET_NAME(_qnn_tensor, _tensor_name.c_str()); } - QNN_TENSOR_SET_DIMENSIONS(_qnn_tensor, _dimensions.data()); - QNN_TENSOR_SET_DATA_FORMAT(_qnn_tensor, QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER); _dimensions = dimensions; + QNN_TENSOR_SET_DIMENSIONS(_qnn_tensor, _dimensions.data()); + QNN_TENSOR_SET_DATA_FORMAT(_qnn_tensor, QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER); update_params_from_ggml_tensor(tensor_type, data_type, rank); QNN_LOG_DEBUG("[%s][%s]created, rank: %d, dims: [%d, %d, %d, %d], type: %s", get_backend_name(device), _tensor_name.c_str(), rank, (int)_dimensions[0], (int)_dimensions[1], (int)_dimensions[2], @@ -51,7 +51,7 @@ class ggml_qnn_tensor { ~ggml_qnn_tensor() { _buffer_storage.clear(); unbind(); - _qnn_rpc_buffer.reset(); + _rpc_buffer.reset(); } bool set_data_buffer(std::vector &&buffer) { @@ -73,7 +73,7 @@ class ggml_qnn_tensor { auto qnn_interface = _qnn_instance->get_qnn_interface(); auto error = qnn_interface->qnn_tensor_create_graph_tensor(_graph_handle, &qnn_tensor); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("[%s]allocate id failed , error: %d\n", _tensor_name.c_str(), error); + QNN_LOG_WARN("[%s]allocate id failed , error: %d", _tensor_name.c_str(), error); return false; } @@ -162,21 +162,29 @@ class ggml_qnn_tensor { } if (should_use_mem_handle()) { - if (!_qnn_rpc_buffer) { - auto qnn_rpc_buffer = std::make_unique( + if (!_rpc_buffer) { + auto rpc_buffer = std::make_shared( _qnn_instance, buffer_size, QNN_TENSOR_GET_RANK(_qnn_tensor), QNN_TENSOR_GET_DIMENSIONS(_qnn_tensor), QNN_TENSOR_GET_DATA_TYPE(_qnn_tensor)); - if (!qnn_rpc_buffer->is_valid()) { - QNN_LOG_WARN("[%s]alloc rpc mem failed", _tensor_name.c_str()); + if (!rpc_buffer->is_valid()) { + QNN_LOG_WARN("[%s][%s]alloc rpc mem failed", get_backend_name(_device), _tensor_name.c_str()); return false; } - _qnn_rpc_buffer = std::move(qnn_rpc_buffer); + _rpc_buffer = std::move(rpc_buffer); } QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_MEMHANDLE); - QNN_TENSOR_SET_MEM_HANDLE(_qnn_tensor, _qnn_rpc_buffer->get_mem_handle()); - QNN_LOG_DEBUG("[%s]use mem handle %p", _tensor_name.c_str(), QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor)); + auto mem_handle = _rpc_buffer->get_mem_handle(); + if (!mem_handle) { + QNN_LOG_WARN("[%s][%s]can't find rpcmem from qnn mem handle", get_backend_name(_device), + _tensor_name.c_str()); + return false; + } + + QNN_TENSOR_SET_MEM_HANDLE(_qnn_tensor, mem_handle); + QNN_LOG_DEBUG("[%s][%s]use mem handle %p", get_backend_name(_device), _tensor_name.c_str(), + QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor)); } else { QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); Qnn_ClientBuffer_t client_buf = {buffer, (uint32_t)buffer_size}; @@ -205,13 +213,8 @@ class ggml_qnn_tensor { return true; } - if (should_use_mem_handle()) { - 
if (_qnn_rpc_buffer) { - memcpy(_qnn_rpc_buffer->get_buffer(), _buffer, _buffer_size); - } else { - QNN_LOG_WARN("[%s]can't find rpcmem from qnn mem handle\n", _tensor_name.c_str()); - return false; - } + if (_rpc_buffer) { + memcpy(_rpc_buffer->get_buffer(), _buffer, _buffer_size); } // For CPU and GPU, the data is already in the tensor. @@ -226,13 +229,8 @@ class ggml_qnn_tensor { return true; } - if (should_use_mem_handle()) { - if (_qnn_rpc_buffer) { - memcpy(_buffer, _qnn_rpc_buffer->get_buffer(), _buffer_size); - } else { - QNN_LOG_WARN("[%s]can't find rpcmem from qnn mem handle", _tensor_name.c_str()); - return false; - } + if (_rpc_buffer) { + memcpy(_buffer, _rpc_buffer->get_buffer(), _buffer_size); } // For CPU and GPU, the data is already in the tensor. @@ -283,7 +281,7 @@ class ggml_qnn_tensor { Qnn_Tensor_t _qnn_tensor = qnn_tensor_init(kDefaultQnnTensorVersion); qnn_dimension_array_t _dimensions = {}; Qnn_GraphHandle_t _graph_handle = nullptr; - std::unique_ptr _qnn_rpc_buffer; + qnn_buffer_ptr _rpc_buffer; DISABLE_COPY(ggml_qnn_tensor); DISABLE_MOVE(ggml_qnn_tensor); diff --git a/ggml/src/ggml-qnn/utils.cpp b/ggml/src/ggml-qnn/utils.cpp index ebfc0372375fd..eaabe60cdb262 100644 --- a/ggml/src/ggml-qnn/utils.cpp +++ b/ggml/src/ggml-qnn/utils.cpp @@ -1,6 +1,8 @@ #include "utils.hpp" +#include + #include #include "ggml-qnn.h" @@ -37,6 +39,28 @@ qnn_dimension_array_t get_internal_dimension(const ggml_dimension_array_t &dims, return internal_dims; } +qnn_dimension_array_t get_view_internal_dimension(const ggml_tensor *tensor, size_t &element_offset_out) { + + element_offset_out = 0; + + auto *parent_tensor = tensor; + while (parent_tensor->view_src) { + element_offset_out += parent_tensor->view_offs; + parent_tensor = parent_tensor->view_src; + } + + const auto rank = get_ggml_tensor_rank(tensor); + const auto parent_rank = get_ggml_tensor_rank(parent_tensor); + GGML_ASSERT(parent_tensor->type == tensor->type); + GGML_ASSERT(parent_rank == rank); + + const auto block_size = ggml_blck_size(tensor->type); + element_offset_out = + element_offset_out * block_size / tensor->nb[0]; // calculate the element offset in the view tensor + + return get_internal_dimension(parent_tensor->ne, parent_rank); +} + // TODO: mapping more ggml data type to QNN data type // ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684 Qnn_DataType_t qnn_datatype_from_ggml_datatype(ggml_type ggml_type) { @@ -199,6 +223,12 @@ intptr_t align_to(size_t alignment, intptr_t offset) { uint32_t get_ggml_tensor_data_size(const ggml_tensor *tensor) { return ggml_nbytes(tensor); } +void *page_align_alloc(size_t size) { + // TODO: fix this for other platforms + const size_t alignment = sysconf(_SC_PAGESIZE); + return align_alloc(alignment, size); +} + void *align_alloc(size_t alignment, size_t size) { size_t size_aligned = size; if ((size_aligned % alignment) != 0) { diff --git a/ggml/src/ggml-qnn/utils.hpp b/ggml/src/ggml-qnn/utils.hpp index 40dff321b970e..1ec0af4c96f77 100644 --- a/ggml/src/ggml-qnn/utils.hpp +++ b/ggml/src/ggml-qnn/utils.hpp @@ -21,9 +21,11 @@ namespace qnn { using ggml_dimension_array_t = int64_t[GGML_MAX_DIMS]; +using ggml_stride_array_t = size_t[GGML_MAX_DIMS]; using qnn_dimension_array_t = std::array; qnn_dimension_array_t get_internal_dimension(const ggml_dimension_array_t &dims, uint32_t rank); +qnn_dimension_array_t get_view_internal_dimension(const ggml_tensor *tensor, size_t &element_offser_out); uint32_t get_ggml_tensor_rank(const ggml_tensor *tensor); const char 
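The get_view_internal_dimension helper added above does two things: it returns the dimensions of the root (non-view) tensor, and it reports where the view starts inside that root as an element offset. The offset part is shown below as a free-standing sketch, with a worked example: for an F32 view whose accumulated view_offs is 1024 bytes, the block size is 1 and nb[0] is 4, so the element offset is 256. The name view_element_offset is illustrative; the real helper also returns the parent dimensions.

    #include "ggml.h"

    // Sketch: walk up the view chain, sum the byte offsets, then convert
    // bytes to elements of the view's type (mirrors the utils.cpp hunk above).
    static size_t view_element_offset(const ggml_tensor *tensor) {
        size_t offset_bytes = 0;
        const ggml_tensor *parent = tensor;
        while (parent->view_src) {
            offset_bytes += parent->view_offs;
            parent = parent->view_src;
        }
        // nb[0] is the byte stride of one block along dim 0; for F32/F16
        // ggml_blck_size() is 1, so this is just bytes / bytes-per-element.
        return offset_bytes * ggml_blck_size(tensor->type) / tensor->nb[0];
    }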
*get_ggml_type_name(ggml_type type); @@ -33,6 +35,7 @@ const char *get_htparch_desc(size_t htp_arch); intptr_t align_to(size_t alignment, intptr_t offset); uint32_t get_ggml_tensor_data_size(const ggml_tensor *tensor); +void *page_align_alloc(size_t size); void *align_alloc(size_t alignment, size_t size); void align_free(void *ptr); From 79f124a6999b5931a301c7cbdecd52142c2f737a Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sat, 14 Dec 2024 15:49:44 +0800 Subject: [PATCH 136/166] add missing op --- ggml/src/ggml-qnn/backend-ops.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 990338c953524..6bd9006851cc1 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -183,6 +183,7 @@ constexpr const char *kGgmlOpToQnnOp[] = { nullptr, // GGML_OP_POOL_2D_BACK nullptr, // GGML_OP_UPSCALE nullptr, // GGML_OP_PAD + nullptr, // GGML_OP_PAD_REFLECT_1D nullptr, // GGML_OP_ARANGE nullptr, // GGML_OP_TIMESTEP_EMBEDDING nullptr, // GGML_OP_ARGSORT @@ -196,7 +197,7 @@ constexpr const char *kGgmlOpToQnnOp[] = { nullptr, // GGML_OP_WIN_UNPART nullptr, // GGML_OP_GET_REL_POS nullptr, // GGML_OP_ADD_REL_POS - nullptr, // GGML_OP_RWKV_WKV + nullptr, // GGML_OP_RWKV_WKV6 nullptr, // GGML_OP_UNARY @@ -392,6 +393,7 @@ constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = { nullptr, // GGML_OP_POOL_2D_BACK nullptr, // GGML_OP_UPSCALE nullptr, // GGML_OP_PAD + nullptr, // GGML_OP_PAD_REFLECT_1D nullptr, // GGML_OP_ARANGE nullptr, // GGML_OP_TIMESTEP_EMBEDDING nullptr, // GGML_OP_ARGSORT @@ -405,7 +407,7 @@ constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = { nullptr, // GGML_OP_WIN_UNPART nullptr, // GGML_OP_GET_REL_POS nullptr, // GGML_OP_ADD_REL_POS - nullptr, // GGML_OP_RWKV_WKV + nullptr, // GGML_OP_RWKV_WKV6 nullptr, // GGML_OP_UNARY @@ -503,6 +505,7 @@ constexpr const ggml_qnn_binary_op_t kQnnBinaryOpsTable[] = { nullptr, // GGML_OP_POOL_2D_BACK nullptr, // GGML_OP_UPSCALE nullptr, // GGML_OP_PAD + nullptr, // GGML_OP_PAD_REFLECT_1D nullptr, // GGML_OP_ARANGE nullptr, // GGML_OP_TIMESTEP_EMBEDDING nullptr, // GGML_OP_ARGSORT From f2d8d017da4d69cb8af2faa43b8f59f828d34b10 Mon Sep 17 00:00:00 2001 From: nullname Date: Fri, 10 Jan 2025 11:13:25 +0800 Subject: [PATCH 137/166] [feat] Port ggml graph to QNN graph (#16) * more log * split graph implementation into cpp file * rename: ggml_qnn_graph -> qnn_graph * add imput/output tensor to graph * fix assert * wip * add _ggml_tensor field in qnn tensor * add comments * add set_data_buffer with raw memory buffer * use set_data_buffer * op param buffer use qnn_buffer_ptr * add qnn_mem_buffer_slice * use qnn_buffer_ptr as tensor buffer * use new set_data_buffer to reduce copy * ggml_qnn_op_config: add function to set input/output tensor before init node * remove ggml_qnn_connectable_op_config and use ggml_qnn_single_op_config instead * wip * add initialize_op_nodes without tensor params * wip * add op caps table * merge kGgmlOpToQnnOp and kOpCaps tables * wip * add cache parameter to create_tensors * add init_from_ggml_graph * disable gelu for all backend * wip * move op index calc to op config module * use the ggml_tensor as parameter of build_graph * add log * use create_operation_from_op_tensor in old build_graph function * remove unused constructors * fix parameter count * remove unused member func/var * make init_from_ggml_graph as a class member: build_graph_from_ggml_graph * move graph finalize into member function 
`finalize()` * get graph key from ggml op tensor directly * append output type * reduce tensor key length * add function to generate key from ggml_cgraph * simplify graph cache insert and delete * remove template param at get_qnn_graph_from_cache * wip * merge kQnnUnaryOpsTable and kQnnBinaryOpsTable * refactor device_supports_op * add log * wip * use framework function to check same shape * wip * extract some logic into separated function * wip * add execution function that runs graph * add function to create qnn graph from ggml_cgraph with cache * execute graph directly * return null graph key for empty graph * add more qualcomm chipset enums * add cap for reshape * disable some ops * try to skip GGML_OP_VIEW * moew log for view tensor * append param tensor into intermedia tensor key * use 'ordered' set * fix warning in release * wip --- ggml/src/ggml-qnn/backend-ops.cpp | 752 ++++++++++----------------- ggml/src/ggml-qnn/backend.hpp | 4 +- ggml/src/ggml-qnn/buffer.hpp | 76 ++- ggml/src/ggml-qnn/ggml-qnn.cpp | 12 +- ggml/src/ggml-qnn/graph.cpp | 386 ++++++++++++++ ggml/src/ggml-qnn/graph.hpp | 155 +----- ggml/src/ggml-qnn/op-config-base.hpp | 55 +- ggml/src/ggml-qnn/op-config-caps.cpp | 223 ++++++++ ggml/src/ggml-qnn/op-config.cpp | 226 +++----- ggml/src/ggml-qnn/op-config.hpp | 76 +-- ggml/src/ggml-qnn/qnn-types.hpp | 16 +- ggml/src/ggml-qnn/tensor.hpp | 179 +++++-- ggml/src/ggml-qnn/utils.cpp | 12 +- 13 files changed, 1310 insertions(+), 862 deletions(-) create mode 100644 ggml/src/ggml-qnn/graph.cpp create mode 100644 ggml/src/ggml-qnn/op-config-caps.cpp diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 6bd9006851cc1..1ed01bfd6851d 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -11,12 +11,10 @@ #include "tensor.hpp" #include "utils.hpp" -#ifndef NDEBUG - namespace { -bool qnn_is_valid_params(ggml_backend_qnn_device_context *ctx, const ggml_tensor *src, ggml_tensor *dst) { - if (!ctx || !src || !dst) { +bool qnn_is_op_valid(ggml_backend_qnn_device_context *ctx, const ggml_tensor *dst) { + if (!ctx || !dst) { QNN_LOG_WARN("invalid params"); return false; } @@ -27,243 +25,151 @@ bool qnn_is_valid_params(ggml_backend_qnn_device_context *ctx, const ggml_tensor return false; } - return true; -} - -bool qnn_is_valid_params(ggml_backend_qnn_device_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) { - if (!ctx || !src0 || !src1 || !dst) { - QNN_LOG_WARN("invalid params"); - return false; - } - - auto instance = ctx->instance; - if (!instance) { - QNN_LOG_WARN("invalid instance"); - return false; + const auto param_count = qnn::get_qnn_op_input_param_count(qnn::get_qnn_op_index(dst)); + switch (param_count) { + case 1: + return dst->src[0]; + case 2: + return dst->src[0] && dst->src[1]; + default: + QNN_LOG_WARN("invalid op param count %d", (int)param_count); + break; } - return true; + return false; } +#ifndef NDEBUG void print_ggml_tensor(const ggml_tensor *tensor) { QNN_LOG_DEBUG("%s: type:%s ne: %ldx%ldx%ldx%ld, nb: %ldx%ldx%ldx%ld", tensor->name, ggml_type_name(tensor->type), (long)tensor->ne[0], (long)tensor->ne[1], (long)tensor->ne[2], (long)tensor->ne[3], (long)tensor->nb[0], (long)tensor->nb[1], (long)tensor->nb[2], (long)tensor->nb[3]); } +#endif } // namespace -#define CHECK_PARAMS(ctx, ...) \ - if (!qnn_is_valid_params((ctx), __VA_ARGS__)) { \ - return false; \ - } - -#else -#define CHECK_PARAMS(ctx, ...) 
-#endif - namespace { -bool is_tensor_dimensions_equal(const ggml_tensor *l, const ggml_tensor *r) { - const auto dim_l = ggml_n_dims(l); - if (dim_l != ggml_n_dims(r)) { - return false; - } +typedef bool (*ggml_qnn_op_t)(ggml_backend_qnn_device_context *ctx, ggml_tensor *dst); - for (int i = 0; i < dim_l; i++) { - if (l->ne[i] != r->ne[i]) { - return false; - } +bool execute_graph(qnn::qnn_graph *graph, ggml_tensor *output) { + if (!graph->execute(output)) { + QNN_LOG_WARN("execute failed"); + return false; } return true; } -typedef bool (*ggml_qnn_unary_op_t)(ggml_backend_qnn_device_context *ctx, ggml_tensor *src, ggml_tensor *dst); -typedef bool (*ggml_qnn_binary_op_t)(ggml_backend_qnn_device_context *ctx, ggml_tensor *src0, ggml_tensor *src1, - ggml_tensor *dst); - -typedef const ggml_qnn_unary_op_t (&ggml_qnn_unary_op_array_t)[GGML_OP_COUNT + GGML_UNARY_OP_COUNT]; -typedef const ggml_qnn_binary_op_t (&ggml_qnn_binary_op_array_t)[GGML_OP_COUNT]; - -constexpr const size_t kGgmlUnaryOpStart = GGML_OP_COUNT; - -template -qnn::ggml_tensor_array_t to_ggml_tensor_array(const std::array &array) { - return qnn::ggml_tensor_array_t(array.data(), array.data() + _Size); +void append_tensor_dimensions(const ggml_tensor *tensor, std::string &output) { + char buffer[256] = {}; + const auto *type_name = qnn::get_ggml_type_name(tensor->type); + int len = 0; + switch (ggml_n_dims(tensor)) { + case 1: + len = snprintf(buffer, sizeof(buffer), "%ld%s", (long)tensor->ne[0], type_name); + break; + case 2: + len = snprintf(buffer, sizeof(buffer), "%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1], type_name); + break; + case 3: + len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1], + (long)tensor->ne[2], type_name); + break; + case 4: + default: + len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1], + (long)tensor->ne[2], (long)tensor->ne[3], type_name); + break; + } + GGML_ASSERT(len > 0 && len < (int)sizeof(buffer)); + output.append(buffer, len); } -template -bool execute_graph(qnn::ggml_qnn_graph *graph, const std::array &inputs, - ggml_tensor *output) { - if (!graph->execute(to_ggml_tensor_array<_InputSize>(inputs), to_ggml_tensor_array<1>({output}))) { - QNN_LOG_WARN("execute failed"); - return false; +void get_graph_key_from_op(const ggml_tensor *op, std::string &output) { + GGML_ASSERT(op->op != GGML_OP_NONE); + output += ggml_op_desc(op); + output += qnn::get_ggml_type_name(op->type); + const auto param_count = qnn::get_qnn_op_input_param_count(qnn::get_qnn_op_index(op)); + for (size_t i = 0; i < param_count; ++i) { + auto *input = op->src[i]; + output += '_'; + append_tensor_dimensions(input, output); } - - return true; } -template -std::string get_graph_key(const std::string &op_name, const std::array &inputs, - const std::array &outputs) { - constexpr static const auto append_dimensions = [](std::string &key, const ggml_tensor *tensor) { - char buffer[256] = {}; - snprintf(buffer, sizeof(buffer), "_%ldx%ldx%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1], - (long)tensor->ne[2], (long)tensor->ne[3], qnn::get_ggml_type_name(tensor->type)); - key += buffer; - }; - - std::string graph_key(op_name); - for (auto &input : inputs) { - append_dimensions(graph_key, input); +void get_op_key_with_src_op_desc(const ggml_tensor *op, std::string &output) { + output += ggml_op_desc(op); + output += '('; + if (op->src[0]) { + output += ggml_op_desc(op->src[0]); } - - graph_key += 
qnn::get_ggml_type_name(outputs.front()->type); - return graph_key; + for (size_t i = 1; i < GGML_MAX_DIMS && op->src[i]; ++i) { + output += ','; + output += ggml_op_desc(op->src[i]); + } + output += ')'; } -constexpr const char *kGgmlOpToQnnOp[] = { - nullptr, // GGML_OP_NONE - nullptr, // GGML_OP_DUP - QNN_OP_ELEMENT_WISE_ADD, // GGML_OP_ADD - nullptr, // GGML_OP_ADD1 - nullptr, // GGML_OP_ACC - QNN_OP_ELEMENT_WISE_SUBTRACT, // GGML_OP_SUB - QNN_OP_ELEMENT_WISE_MULTIPLY, // GGML_OP_MUL - QNN_OP_ELEMENT_WISE_DIVIDE, // GGML_OP_DIV - nullptr, // GGML_OP_SQR - QNN_OP_ELEMENT_WISE_SQUARE_ROOT, // GGML_OP_SQRT - QNN_OP_ELEMENT_WISE_LOG, // GGML_OP_LOG - nullptr, // GGML_OP_SIN - nullptr, // GGML_OP_COS - nullptr, // GGML_OP_SUM - nullptr, // GGML_OP_SUM_ROWS - nullptr, // GGML_OP_MEAN - nullptr, // GGML_OP_ARGMAX - nullptr, // GGML_OP_COUNT_EQUAL - nullptr, // GGML_OP_REPEAT - nullptr, // GGML_OP_REPEAT_BACK - nullptr, // GGML_OP_CONCAT - nullptr, // GGML_OP_SILU_BACK - nullptr, // GGML_OP_NORM - nullptr, // GGML_OP_RMS_NORM - nullptr, // GGML_OP_RMS_NORM_BACK - nullptr, // GGML_OP_GROUP_NORM - - QNN_OP_MAT_MUL, // GGML_OP_MUL_MAT - nullptr, // GGML_OP_MUL_MAT_ID - nullptr, // GGML_OP_OUT_PROD - - nullptr, // GGML_OP_SCALE - nullptr, // GGML_OP_SET - nullptr, // GGML_OP_CPY - nullptr, // GGML_OP_CONT - nullptr, // GGML_OP_RESHAPE - nullptr, // GGML_OP_VIEW - QNN_OP_TRANSPOSE, // GGML_OP_PERMUTE - nullptr, // GGML_OP_TRANSPOSE - nullptr, // GGML_OP_GET_ROWS - nullptr, // GGML_OP_GET_ROWS_BACK - nullptr, // GGML_OP_DIAG - nullptr, // GGML_OP_DIAG_MASK_INF - nullptr, // GGML_OP_DIAG_MASK_ZERO - nullptr, // GGML_OP_SOFT_MAX - nullptr, // GGML_OP_SOFT_MAX_BACK - nullptr, // GGML_OP_ROPE - nullptr, // GGML_OP_ROPE_BACK - nullptr, // GGML_OP_CLAMP - nullptr, // GGML_OP_CONV_TRANSPOSE_1D - nullptr, // GGML_OP_IM2COL - nullptr, // GGML_OP_IM2COL_BACK - nullptr, // GGML_OP_CONV_TRANSPOSE_2D - nullptr, // GGML_OP_POOL_1D - nullptr, // GGML_OP_POOL_2D - nullptr, // GGML_OP_POOL_2D_BACK - nullptr, // GGML_OP_UPSCALE - nullptr, // GGML_OP_PAD - nullptr, // GGML_OP_PAD_REFLECT_1D - nullptr, // GGML_OP_ARANGE - nullptr, // GGML_OP_TIMESTEP_EMBEDDING - nullptr, // GGML_OP_ARGSORT - nullptr, // GGML_OP_LEAKY_RELU - - nullptr, // GGML_OP_FLASH_ATTN_EXT - nullptr, // GGML_OP_FLASH_ATTN_BACK - nullptr, // GGML_OP_SSM_CONV - nullptr, // GGML_OP_SSM_SCAN - nullptr, // GGML_OP_WIN_PART - nullptr, // GGML_OP_WIN_UNPART - nullptr, // GGML_OP_GET_REL_POS - nullptr, // GGML_OP_ADD_REL_POS - nullptr, // GGML_OP_RWKV_WKV6 - - nullptr, // GGML_OP_UNARY - - nullptr, // GGML_OP_MAP_UNARY - nullptr, // GGML_OP_MAP_BINARY - - nullptr, // GGML_OP_MAP_CUSTOM1_F32 - nullptr, // GGML_OP_MAP_CUSTOM2_F32 - nullptr, // GGML_OP_MAP_CUSTOM3_F32 - - nullptr, // GGML_OP_MAP_CUSTOM1 - nullptr, // GGML_OP_MAP_CUSTOM2 - nullptr, // GGML_OP_MAP_CUSTOM3 +void get_graph_key_from_cgraph(const ggml_cgraph *cgraph, std::string &output) { + // generate key from the graph, the key is used to cache the graph, like: + // "MUL_MATf32_256x16x10f32_256x1x10f32#LOG#ADD#ADDf32_16x1x10f32" + if (cgraph->n_nodes == 0) { + QNN_LOG_DEBUG("empty cgraph"); + return; + } - nullptr, // GGML_OP_CROSS_ENTROPY_LOSS - nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK - nullptr, // GGML_OP_OPT_STEP_ADAMW + { + bool is_start = true; + for (int i = 0; i < cgraph->n_nodes; ++i) { + auto *op = cgraph->nodes[i]; + if (ggml_is_empty(op)) { + QNN_LOG_DEBUG("empty op in graph, skipping"); + continue; + } - // ggml_unary_op - nullptr, // GGML_UNARY_OP_ABS - nullptr, // 
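For reference, get_graph_key_from_op above composes keys as <op name><dst type> followed by one _<dims><type> fragment per source tensor, so a two-input F32 MUL_MAT with src0 = 256x16x10 and src1 = 256x1x10 yields "MUL_MATf32_256x16x10f32_256x1x10f32", the same form quoted in the cgraph-key comment. A trimmed sketch of the per-tensor fragment follows; append_dims_rank3 is an illustrative name (the real append_tensor_dimensions switches on ggml_n_dims()).

    #include <cstdio>
    #include <string>

    // Build one "<ne0>x<ne1>x<ne2><type>" fragment of the graph key for a rank-3 tensor.
    static void append_dims_rank3(long ne0, long ne1, long ne2, const char *type_name, std::string &out) {
        char buf[64];
        int len = snprintf(buf, sizeof(buf), "%ldx%ldx%ld%s", ne0, ne1, ne2, type_name);
        out.append(buf, len);
    }

    // std::string key = "MUL_MAT"; key += "f32";
    // key += '_'; append_dims_rank3(256, 16, 10, "f32", key);
    // key += '_'; append_dims_rank3(256, 1, 10, "f32", key);
    // key == "MUL_MATf32_256x16x10f32_256x1x10f32"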
GGML_UNARY_OP_SGN - nullptr, // GGML_UNARY_OP_NEG - nullptr, // GGML_UNARY_OP_STEP - nullptr, // GGML_UNARY_OP_TANH - nullptr, // GGML_UNARY_OP_ELU - nullptr, // GGML_UNARY_OP_RELU - nullptr, // GGML_UNARY_OP_SIGMOID - QNN_OP_GELU, // GGML_UNARY_OP_GELU - nullptr, // GGML_UNARY_OP_GELU_QUICK - nullptr, // GGML_UNARY_OP_SILU - nullptr, // GGML_UNARY_OP_HARDSWISH - nullptr, // GGML_UNARY_OP_HARDSIGMOID - nullptr, // GGML_UNARY_OP_EXP -}; + if (op->op == GGML_OP_NONE) { + QNN_LOG_DEBUG("GGML_OP_NONE in graph, skipping"); + continue; + } -static_assert(sizeof(kGgmlOpToQnnOp) / sizeof(kGgmlOpToQnnOp[0]) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), - "GGML_OP_COUNT does not match the size of the kGgmlOpToQnnOp table"); -static_assert(kGgmlOpToQnnOp[GGML_UNARY_OP_GELU + kGgmlUnaryOpStart] != nullptr, - "GGML_UNARY_OP_GELU does not correspond to QNN_OP_GELU"); + if (is_start) { + get_graph_key_from_op(cgraph->nodes[0], output); + is_start = false; + } else { + output += '#'; + get_op_key_with_src_op_desc(op, output); + } + } + } -template -qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *ctx, size_t op, - const std::array &inputs, - ggml_tensor *output) { - GGML_ASSERT(op < (GGML_OP_COUNT + GGML_UNARY_OP_COUNT)); + if (cgraph->n_nodes > 1) { + auto *last_op = cgraph->nodes[cgraph->n_nodes - 1]; + output += qnn::get_ggml_type_name(last_op->type); + output += '_'; + append_tensor_dimensions(last_op, output); + } +} +qnn::qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *ctx, ggml_tensor *output) { auto &graph_cache = ctx->qnn_graph_cache; - const auto *op_name = - op < kGgmlUnaryOpStart ? ggml_op_name(ggml_op(op)) : ggml_unary_op_name(ggml_unary_op(op - kGgmlUnaryOpStart)); - auto graph_key = get_graph_key<_InputSize, 1>(op_name, inputs, {output}); + std::string graph_key; + get_graph_key_from_op(output, graph_key); auto it = graph_cache.find(graph_key); - qnn::ggml_qnn_graph *graph_ptr = nullptr; + qnn::qnn_graph *graph_ptr = nullptr; if (it != graph_cache.end()) { QNN_LOG_DEBUG("[%s]found graph %s in cache", qnn::get_backend_name(ctx->device), graph_key.c_str()); graph_ptr = it->second.get(); } else { auto graph = - std::make_unique(graph_key, ctx->device, ctx->instance, ctx->socinfo.vtcm_size_in_mb); + std::make_unique(graph_key, ctx->device, ctx->instance, ctx->socinfo.vtcm_size_in_mb); if (!graph->is_valid()) { return nullptr; } - auto op_constructor = qnn::create_op_constructor(kGgmlOpToQnnOp[op]); - if (!graph->build_graph(op_constructor, to_ggml_tensor_array<_InputSize>(inputs), - to_ggml_tensor_array<1>({output}))) { - QNN_LOG_ERROR("[%s]build_graph failed", qnn::get_backend_name(ctx->device)); + if (!graph->build_graph_from_op(output)) { + QNN_LOG_ERROR("[%s]build_graph_from_op failed", qnn::get_backend_name(ctx->device)); return nullptr; } @@ -274,44 +180,54 @@ qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *c return graph_ptr; } -template -bool qnn_binary_op_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src0, ggml_tensor *src1, ggml_tensor *dst) { - static_assert(kGgmlOpToQnnOp[_GgmlOp] != nullptr, "GGML_OP does not have a corresponding QNN_OP"); +qnn::qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *ctx, const ggml_cgraph *cgraph) { + auto &graph_cache = ctx->qnn_graph_cache; + std::string graph_key; + get_graph_key_from_cgraph(cgraph, graph_key); + if (graph_key.empty()) { + QNN_LOG_DEBUG("[%s]empty graph key for cgraph: %p, size: %d", qnn::get_backend_name(ctx->device), cgraph, + 
(int)cgraph->n_nodes); + return nullptr; + } - CHECK_PARAMS(ctx, src0, src1, dst); + auto it = graph_cache.find(graph_key); + qnn::qnn_graph *graph_ptr = nullptr; + if (it != graph_cache.end()) { + QNN_LOG_DEBUG("[%s]found graph %s in cache", qnn::get_backend_name(ctx->device), graph_key.c_str()); + graph_ptr = it->second.get(); + } else { + auto graph = + std::make_unique(graph_key, ctx->device, ctx->instance, ctx->socinfo.vtcm_size_in_mb); + if (!graph->is_valid()) { + return nullptr; + } - bool succeed = false; - auto *graph_ptr = get_qnn_graph_from_cache<2>(ctx, _GgmlOp, {src0, src1}, dst); - if (graph_ptr) { - succeed = execute_graph<2>(graph_ptr, {src0, src1}, dst); - } + if (!graph->build_graph_from_ggml_graph(cgraph)) { + QNN_LOG_ERROR("[%s]build_graph_from_op failed", qnn::get_backend_name(ctx->device)); + return nullptr; + } -#ifndef NDEBUG - if (!succeed) { - print_ggml_tensor(src0); - print_ggml_tensor(src1); - print_ggml_tensor(dst); + graph_ptr = graph.get(); + graph_cache[graph_key] = std::move(graph); } -#endif - return succeed; + return graph_ptr; } -template -bool qnn_unary_op_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src, ggml_tensor *dst) { - static_assert(kGgmlOpToQnnOp[_GgmlOp] != nullptr, "GGML_OP does not have a corresponding QNN_OP"); - - CHECK_PARAMS(ctx, src, dst); - - bool succeed = false; - auto *graph_ptr = get_qnn_graph_from_cache<1>(ctx, _GgmlOp, {src}, dst); - if (graph_ptr) { - succeed = execute_graph<1>(graph_ptr, {src}, dst); +bool qnn_generic_op_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *dst) { + if (!qnn_is_op_valid(ctx, dst)) { + return false; } + auto *graph_ptr = get_qnn_graph_from_cache(ctx, dst); + bool succeed = graph_ptr && execute_graph(graph_ptr, dst); + #ifndef NDEBUG if (!succeed) { - print_ggml_tensor(src); + const auto param_count = qnn::get_qnn_op_input_param_count(qnn::get_qnn_op_index(dst)); + for (size_t i = 0; i < param_count; ++i) { + print_ggml_tensor(dst->src[i]); + } print_ggml_tensor(dst); } #endif @@ -319,85 +235,76 @@ bool qnn_unary_op_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src, g return succeed; } -bool qnn_unary_nop_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src, ggml_tensor *dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(src); - GGML_UNUSED(dst); - return true; -} - -bool qnn_binary_nop_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src0, ggml_tensor *src1, ggml_tensor *dst) { +bool qnn_nop_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *dst) { GGML_UNUSED(ctx); - GGML_UNUSED(src0); - GGML_UNUSED(src1); GGML_UNUSED(dst); return true; } -constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = { - qnn_unary_nop_impl, // GGML_OP_NONE - nullptr, // GGML_OP_DUP - nullptr, // GGML_OP_ADD - nullptr, // GGML_OP_ADD1 - nullptr, // GGML_OP_ACC - nullptr, // GGML_OP_SUB - nullptr, // GGML_OP_MUL - nullptr, // GGML_OP_DIV - nullptr, // GGML_OP_SQR - qnn_unary_op_impl, // GGML_OP_SQRT - qnn_unary_op_impl, // GGML_OP_LOG - nullptr, // GGML_OP_SIN - nullptr, // GGML_OP_COS - nullptr, // GGML_OP_SUM - nullptr, // GGML_OP_SUM_ROWS - nullptr, // GGML_OP_MEAN - nullptr, // GGML_OP_ARGMAX - nullptr, // GGML_OP_COUNT_EQUAL - nullptr, // GGML_OP_REPEAT - nullptr, // GGML_OP_REPEAT_BACK - nullptr, // GGML_OP_CONCAT - nullptr, // GGML_OP_SILU_BACK - nullptr, // GGML_OP_NORM - nullptr, // GGML_OP_RMS_NORM - nullptr, // GGML_OP_RMS_NORM_BACK - nullptr, // GGML_OP_GROUP_NORM - - nullptr, // GGML_OP_MUL_MAT - nullptr, // GGML_OP_MUL_MAT_ID - nullptr, // GGML_OP_OUT_PROD - - nullptr, 
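The merged kQnnOpsTable covers both ggml_op and ggml_unary_op entries in a single array: plain ops index at their ggml_op value, and unary ops are appended after GGML_OP_COUNT, which is why the table's size is GGML_OP_COUNT + GGML_UNARY_OP_COUNT. The real helper for this is qnn::get_qnn_op_index from the op-config module; the free function below is only an illustrative sketch of that mapping.

    #include "ggml.h"

    // Table index scheme: [0, GGML_OP_COUNT)                         -> ggml_op
    //                     [GGML_OP_COUNT, GGML_OP_COUNT + UNARY ...) -> ggml_unary_op
    static size_t qnn_op_table_index(const ggml_tensor *op) {
        if (op->op == GGML_OP_UNARY) {
            return GGML_OP_COUNT + static_cast<size_t>(ggml_get_unary_op(op));
        }
        return static_cast<size_t>(op->op);
    }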
// GGML_OP_SCALE - nullptr, // GGML_OP_SET - nullptr, // GGML_OP_CPY - nullptr, // GGML_OP_CONT - qnn_unary_nop_impl, // GGML_OP_RESHAPE - qnn_unary_nop_impl, // GGML_OP_VIEW - qnn_unary_nop_impl, // GGML_OP_PERMUTE - qnn_unary_nop_impl, // GGML_OP_TRANSPOSE - qnn_unary_nop_impl, // GGML_OP_GET_ROWS - nullptr, // GGML_OP_GET_ROWS_BACK - nullptr, // GGML_OP_DIAG - nullptr, // GGML_OP_DIAG_MASK_INF - nullptr, // GGML_OP_DIAG_MASK_ZERO - nullptr, // GGML_OP_SOFT_MAX - nullptr, // GGML_OP_SOFT_MAX_BACK - nullptr, // GGML_OP_ROPE - nullptr, // GGML_OP_ROPE_BACK - nullptr, // GGML_OP_CLAMP - nullptr, // GGML_OP_CONV_TRANSPOSE_1D - nullptr, // GGML_OP_IM2COL - nullptr, // GGML_OP_IM2COL_BACK - nullptr, // GGML_OP_CONV_TRANSPOSE_2D - nullptr, // GGML_OP_POOL_1D - nullptr, // GGML_OP_POOL_2D - nullptr, // GGML_OP_POOL_2D_BACK - nullptr, // GGML_OP_UPSCALE - nullptr, // GGML_OP_PAD - nullptr, // GGML_OP_PAD_REFLECT_1D - nullptr, // GGML_OP_ARANGE - nullptr, // GGML_OP_TIMESTEP_EMBEDDING - nullptr, // GGML_OP_ARGSORT - nullptr, // GGML_OP_LEAKY_RELU +constexpr const ggml_qnn_op_t kQnnOpsTable[] = { + qnn_nop_impl, // GGML_OP_NONE + nullptr, // GGML_OP_DUP + qnn_generic_op_impl, // GGML_OP_ADD + nullptr, // GGML_OP_ADD1 + nullptr, // GGML_OP_ACC + qnn_generic_op_impl, // GGML_OP_SUB + qnn_generic_op_impl, // GGML_OP_MUL + qnn_generic_op_impl, // GGML_OP_DIV + nullptr, // GGML_OP_SQR + qnn_generic_op_impl, // GGML_OP_SQRT + qnn_generic_op_impl, // GGML_OP_LOG + nullptr, // GGML_OP_SIN + nullptr, // GGML_OP_COS + nullptr, // GGML_OP_SUM + nullptr, // GGML_OP_SUM_ROWS + nullptr, // GGML_OP_MEAN + nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_COUNT_EQUAL + nullptr, // GGML_OP_REPEAT + nullptr, // GGML_OP_REPEAT_BACK + nullptr, // GGML_OP_CONCAT + nullptr, // GGML_OP_SILU_BACK + nullptr, // GGML_OP_NORM + nullptr, // GGML_OP_RMS_NORM + nullptr, // GGML_OP_RMS_NORM_BACK + nullptr, // GGML_OP_GROUP_NORM + + qnn_generic_op_impl, // GGML_OP_MUL_MAT + nullptr, // GGML_OP_MUL_MAT_ID + nullptr, // GGML_OP_OUT_PROD + + nullptr, // GGML_OP_SCALE + nullptr, // GGML_OP_SET + nullptr, // GGML_OP_CPY + nullptr, // GGML_OP_CONT + qnn_nop_impl, // GGML_OP_RESHAPE + nullptr, // GGML_OP_VIEW + nullptr, // GGML_OP_PERMUTE + nullptr, // GGML_OP_TRANSPOSE + nullptr, // GGML_OP_GET_ROWS + nullptr, // GGML_OP_GET_ROWS_BACK + nullptr, // GGML_OP_DIAG + nullptr, // GGML_OP_DIAG_MASK_INF + nullptr, // GGML_OP_DIAG_MASK_ZERO + nullptr, // GGML_OP_SOFT_MAX + nullptr, // GGML_OP_SOFT_MAX_BACK + nullptr, // GGML_OP_ROPE + nullptr, // GGML_OP_ROPE_BACK + nullptr, // GGML_OP_CLAMP + nullptr, // GGML_OP_CONV_TRANSPOSE_1D + nullptr, // GGML_OP_IM2COL + nullptr, // GGML_OP_IM2COL_BACK + nullptr, // GGML_OP_CONV_TRANSPOSE_2D + nullptr, // GGML_OP_POOL_1D + nullptr, // GGML_OP_POOL_2D + nullptr, // GGML_OP_POOL_2D_BACK + nullptr, // GGML_OP_UPSCALE + nullptr, // GGML_OP_PAD + nullptr, // GGML_OP_PAD_REFLECT_1D + nullptr, // GGML_OP_ARANGE + nullptr, // GGML_OP_TIMESTEP_EMBEDDING + nullptr, // GGML_OP_ARGSORT + nullptr, // GGML_OP_LEAKY_RELU nullptr, // GGML_OP_FLASH_ATTN_EXT nullptr, // GGML_OP_FLASH_ATTN_BACK @@ -407,7 +314,7 @@ constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = { nullptr, // GGML_OP_WIN_UNPART nullptr, // GGML_OP_GET_REL_POS nullptr, // GGML_OP_ADD_REL_POS - nullptr, // GGML_OP_RWKV_WKV6 + nullptr, // GGML_OP_RWKV_WKV nullptr, // GGML_OP_UNARY @@ -427,120 +334,34 @@ constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = { nullptr, // GGML_OP_OPT_STEP_ADAMW // ggml_unary_op - nullptr, // GGML_UNARY_OP_ABS - 
nullptr, // GGML_UNARY_OP_SGN - nullptr, // GGML_UNARY_OP_NEG - nullptr, // GGML_UNARY_OP_STEP - nullptr, // GGML_UNARY_OP_TANH - nullptr, // GGML_UNARY_OP_ELU - nullptr, // GGML_UNARY_OP_RELU - nullptr, // GGML_UNARY_OP_SIGMOID - qnn_unary_op_impl, // GGML_UNARY_OP_GELU - nullptr, // GGML_UNARY_OP_GELU_QUICK - nullptr, // GGML_UNARY_OP_SILU - nullptr, // GGML_UNARY_OP_HARDSWISH - nullptr, // GGML_UNARY_OP_HARDSIGMOID - nullptr, // GGML_UNARY_OP_EXP + nullptr, // GGML_UNARY_OP_ABS + nullptr, // GGML_UNARY_OP_SGN + nullptr, // GGML_UNARY_OP_NEG + nullptr, // GGML_UNARY_OP_STEP + nullptr, // GGML_UNARY_OP_TANH + nullptr, // GGML_UNARY_OP_ELU + nullptr, // GGML_UNARY_OP_RELU + nullptr, // GGML_UNARY_OP_SIGMOID + qnn_generic_op_impl, // GGML_UNARY_OP_GELU + nullptr, // GGML_UNARY_OP_GELU_QUICK + nullptr, // GGML_UNARY_OP_SILU + nullptr, // GGML_UNARY_OP_HARDSWISH + nullptr, // GGML_UNARY_OP_HARDSIGMOID + nullptr, // GGML_UNARY_OP_EXP }; -static_assert(sizeof(kQnnUnaryOpsTable) / sizeof(kQnnUnaryOpsTable[0]) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), - "GGML_OP_COUNT does not match the size of the kQnnUnaryOpsTable table"); - -constexpr const ggml_qnn_binary_op_t kQnnBinaryOpsTable[] = { - nullptr, // GGML_OP_NONE - nullptr, // GGML_OP_DUP - qnn_binary_op_impl, // GGML_OP_ADD - nullptr, // GGML_OP_ADD1 - nullptr, // GGML_OP_ACC - qnn_binary_op_impl, // GGML_OP_SUB - qnn_binary_op_impl, // GGML_OP_MUL - qnn_binary_op_impl, // GGML_OP_DIV - nullptr, // GGML_OP_SQR - nullptr, // GGML_OP_SQRT - nullptr, // GGML_OP_LOG - nullptr, // GGML_OP_SIN - nullptr, // GGML_OP_COS - nullptr, // GGML_OP_SUM - nullptr, // GGML_OP_SUM_ROWS - nullptr, // GGML_OP_MEAN - nullptr, // GGML_OP_ARGMAX - nullptr, // GGML_OP_COUNT_EQUAL - nullptr, // GGML_OP_REPEAT - nullptr, // GGML_OP_REPEAT_BACK - nullptr, // GGML_OP_CONCAT - nullptr, // GGML_OP_SILU_BACK - nullptr, // GGML_OP_NORM - nullptr, // GGML_OP_RMS_NORM - nullptr, // GGML_OP_RMS_NORM_BACK - nullptr, // GGML_OP_GROUP_NORM - - qnn_binary_op_impl, // GGML_OP_MUL_MAT - nullptr, // GGML_OP_MUL_MAT_ID - nullptr, // GGML_OP_OUT_PROD - - nullptr, // GGML_OP_SCALE - nullptr, // GGML_OP_SET - nullptr, // GGML_OP_CPY - nullptr, // GGML_OP_CONT - nullptr, // GGML_OP_RESHAPE - nullptr, // GGML_OP_VIEW - nullptr, // GGML_OP_PERMUTE - nullptr, // GGML_OP_TRANSPOSE - nullptr, // GGML_OP_GET_ROWS - nullptr, // GGML_OP_GET_ROWS_BACK - nullptr, // GGML_OP_DIAG - nullptr, // GGML_OP_DIAG_MASK_INF - nullptr, // GGML_OP_DIAG_MASK_ZERO - nullptr, // GGML_OP_SOFT_MAX - nullptr, // GGML_OP_SOFT_MAX_BACK - nullptr, // GGML_OP_ROPE - nullptr, // GGML_OP_ROPE_BACK - nullptr, // GGML_OP_CLAMP - nullptr, // GGML_OP_CONV_TRANSPOSE_1D - nullptr, // GGML_OP_IM2COL - nullptr, // GGML_OP_IM2COL_BACK - nullptr, // GGML_OP_CONV_TRANSPOSE_2D - nullptr, // GGML_OP_POOL_1D - nullptr, // GGML_OP_POOL_2D - nullptr, // GGML_OP_POOL_2D_BACK - nullptr, // GGML_OP_UPSCALE - nullptr, // GGML_OP_PAD - nullptr, // GGML_OP_PAD_REFLECT_1D - nullptr, // GGML_OP_ARANGE - nullptr, // GGML_OP_TIMESTEP_EMBEDDING - nullptr, // GGML_OP_ARGSORT - nullptr, // GGML_OP_LEAKY_RELU - - nullptr, // GGML_OP_FLASH_ATTN_EXT - nullptr, // GGML_OP_FLASH_ATTN_BACK - nullptr, // GGML_OP_SSM_CONV - nullptr, // GGML_OP_SSM_SCAN - nullptr, // GGML_OP_WIN_PART - nullptr, // GGML_OP_WIN_UNPART - nullptr, // GGML_OP_GET_REL_POS - nullptr, // GGML_OP_ADD_REL_POS - nullptr, // GGML_OP_RWKV_WKV - - nullptr, // GGML_OP_UNARY - - nullptr, // GGML_OP_MAP_UNARY - nullptr, // GGML_OP_MAP_BINARY - - nullptr, // GGML_OP_MAP_CUSTOM1_F32 - nullptr, 
// GGML_OP_MAP_CUSTOM2_F32 - nullptr, // GGML_OP_MAP_CUSTOM3_F32 - - nullptr, // GGML_OP_MAP_CUSTOM1 - nullptr, // GGML_OP_MAP_CUSTOM2 - nullptr, // GGML_OP_MAP_CUSTOM3 - - nullptr, // GGML_OP_CROSS_ENTROPY_LOSS - nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK - nullptr, // GGML_OP_OPT_STEP_ADAMW -}; - -static_assert(sizeof(kQnnBinaryOpsTable) / sizeof(kQnnBinaryOpsTable[0]) == GGML_OP_COUNT, - "GGML_OP_COUNT does not match the size of the kQnnBinaryOpsTable table"); +static_assert(kQnnOpsTable[GGML_OP_NONE] == qnn_nop_impl, "GGML_OP_NONE does not match the qnn_nop_impl function"); +static_assert(kQnnOpsTable[GGML_OP_ADD] == qnn_generic_op_impl, + "GGML_OP_ADD does not match the qnn_generic_op_impl function"); +static_assert(kQnnOpsTable[GGML_OP_MUL] == qnn_generic_op_impl, + "GGML_OP_MUL does not match the qnn_generic_op_impl function"); +static_assert(kQnnOpsTable[GGML_OP_MUL_MAT] == qnn_generic_op_impl, + "GGML_OP_MUL_MAT does not match the qnn_generic_op_impl function"); +static_assert(kQnnOpsTable[GGML_OP_RESHAPE] == qnn_nop_impl, + "GGML_OP_RESHAPE does not match the qnn_nop_impl function"); +static_assert(kQnnOpsTable[GGML_OP_VIEW] == nullptr, "GGML_OP_VIEW is not nullptr"); +static_assert(std::size(kQnnOpsTable) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), + "GGML_OP_COUNT does not match the size of the kQnnOpsTable table"); bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_tensor *tensor) { if (!tensor) { @@ -548,6 +369,7 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_t return false; } +#ifndef NDEBUG if (tensor->view_src) { auto *src_tensor = tensor->view_src; QNN_LOG_DEBUG("[%s]tensor(%s_%dx%dx%dx%d) is a view, src: %s_%dx%dx%dx%d", qnn::get_backend_name(ctx->device), @@ -555,6 +377,7 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_t ggml_get_name(src_tensor), src_tensor->ne[0], src_tensor->ne[1], src_tensor->ne[2], src_tensor->ne[3]); } +#endif switch (tensor->type) { case GGML_TYPE_F32: @@ -576,6 +399,25 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_t return true; } +bool ggnl_qnn_supports_op_tensor(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op) { + if (op->op == GGML_OP_NONE) { + return true; + } + + if (!ggml_qnn_supports_tensor(ctx, op)) { + return false; + } + + const auto param_count = qnn::get_qnn_op_input_param_count(qnn::get_qnn_op_index(op)); + for (size_t i = 0; i < param_count; ++i) { + if (!ggml_qnn_supports_tensor(ctx, op->src[i])) { + return false; + } + } + + return true; +} + bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op) { constexpr const size_t kMaxNpuTensorSize = 8192L * 2048 + 8192 * 512 + 2048 * 512; constexpr const auto get_tensor_size = [](const ggml_tensor *tensor) -> size_t { @@ -591,11 +433,11 @@ bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggm * TODO: remove the blocker here when NPU backend supports mul_mat like this: * [ne03, ne02, n, k] * [ne03 * x, ne02 * y, m, k] -> [ne03 * x, ne02 * y, m, n] */ - QNN_LOG_DEBUG("[qnn-npu] src0 and src1 dimensions are not equal, support/unsupported: %d/%d", + QNN_LOG_DEBUG("[qnn-npu][MUL_MAT]src0 and src1 dimensions are not equal, support/unsupported: %d/%d", ctx->support_op_count.load(), ++(ctx->unsupported_op_count)); return false; } else if (get_tensor_size(src0) + get_tensor_size(src1) + get_tensor_size(op) >= kMaxNpuTensorSize) { - QNN_LOG_DEBUG("[qnn-npu] tensor size is too large, 
support/unsupported: %d/%d", + QNN_LOG_DEBUG("[qnn-npu][MUL_MAT]tensor size is too large, support/unsupported: %d/%d", ctx->support_op_count.load(), ++(ctx->unsupported_op_count)); return false; } @@ -604,9 +446,9 @@ bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggm case QNN_BACKEND_GPU: if (src0->type != src1->type || src0->type != op->type) { // there's no convert op for GPU. - QNN_LOG_DEBUG("[qnn-gpu]type src0(%d), src1(%d) and op(%d) are not equal, support/unsupported: %d/%d", - src0->type, src1->type, op->type, ctx->support_op_count.load(), - ++(ctx->unsupported_op_count)); + QNN_LOG_DEBUG( + "[qnn-gpu][MUL_MAT]type src0(%d), src1(%d) and op(%d) are not equal, support/unsupported: %d/%d", + src0->type, src1->type, op->type, ctx->support_op_count.load(), ++(ctx->unsupported_op_count)); return false; } break; @@ -615,12 +457,12 @@ bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggm } if ((src1->ne[2] % src0->ne[2]) != 0 || (src1->ne[3] % src0->ne[3]) != 0) { - QNN_LOG_DEBUG("[%s] src0 and src1 dimensions are not equal, support/unsupported: %d/%d", + QNN_LOG_DEBUG("[%s][MUL_MAT]src0 and src1 dimensions are not equal, support/unsupported: %d/%d", qnn::get_backend_name(ctx->device), ctx->support_op_count.load(), ++(ctx->unsupported_op_count)); return false; } - QNN_LOG_DEBUG("[%s] supported matmul op, support/unsupported: %d/%d", qnn::get_backend_name(ctx->device), + QNN_LOG_DEBUG("[%s][MUL_MAT]supported matmul op, support/unsupported: %d/%d", qnn::get_backend_name(ctx->device), ++(ctx->support_op_count), ctx->unsupported_op_count.load()); return true; } @@ -635,41 +477,30 @@ bool device_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor return true; } - auto *src0 = op->src[0]; - if (op->op == GGML_OP_UNARY) { - const auto unary_op = ggml_get_unary_op(op); - if (unary_op == GGML_UNARY_OP_GELU && ctx->device == QNN_BACKEND_NPU) { - // TODO: fix this when NPU supports GELU - QNN_LOG_DEBUG("unsupported unary op GGML_UNARY_OP_GELU for NPU"); - return false; - } + if (!kQnnOpsTable[qnn::get_qnn_op_index(op)]) { + QNN_LOG_DEBUG("[%s]unsupported op", ggml_op_name(op->op)); + return false; + } - if (!kQnnUnaryOpsTable[kGgmlUnaryOpStart + unary_op]) { - QNN_LOG_DEBUG("unsupported unary op %d", unary_op); - return false; - } + if (!ggnl_qnn_supports_op_tensor(ctx, op)) { + QNN_LOG_DEBUG("[%s]unsupported tensor", ggml_op_name(op->op)); + return false; + } - if (!ggml_qnn_supports_tensor(ctx, src0) || !ggml_qnn_supports_tensor(ctx, op)) { - QNN_LOG_DEBUG("src0 is nullptr"); + if (op->op == GGML_OP_UNARY) { + const auto unary_op = ggml_get_unary_op(op); + if (unary_op == GGML_UNARY_OP_GELU) { + // TODO: fix this + QNN_LOG_DEBUG("[GELU]unsupported unary op GGML_UNARY_OP_GELU for NPU"); return false; } } else { - if (!kQnnUnaryOpsTable[op->op] && !kQnnBinaryOpsTable[op->op]) { - QNN_LOG_DEBUG("[%s] unsupported op", ggml_op_name(op->op)); - return false; - } - + auto *src0 = op->src[0]; auto *src1 = op->src[1]; - if (!ggml_qnn_supports_tensor(ctx, src0) || !ggml_qnn_supports_tensor(ctx, op) || - (kQnnBinaryOpsTable[op->op] && !ggml_qnn_supports_tensor(ctx, src1))) { - QNN_LOG_DEBUG("[%s] unsupported tensor", ggml_op_name(op->op)); - return false; - } - switch (op->op) { case GGML_OP_ADD: - if (!is_tensor_dimensions_equal(src0, src1)) { - QNN_LOG_DEBUG("src0 and src1 dimensions are not equal"); + if (!ggml_are_same_shape(src0, src1)) { + QNN_LOG_DEBUG("[ADD] src0 and src1 dimensions are not equal"); return false; } break; @@ 
-686,34 +517,13 @@ bool device_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor } bool device_compute_graph(ggml_backend_qnn_device_context *ctx, ggml_cgraph *cgraph) { - QNN_LOG_DEBUG("[%s]compute graph, nodes count: %d", qnn::get_backend_name(ctx->device), (int)cgraph->n_nodes); - for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_tensor *tensor = cgraph->nodes[i]; - if (ggml_is_empty(tensor)) { - continue; - } + QNN_LOG_DEBUG("[%s]compute graph start, nodes count: %d", qnn::get_backend_name(ctx->device), (int)cgraph->n_nodes); - size_t unary_op_idx = tensor->op; - if (tensor->op == GGML_OP_UNARY) { - unary_op_idx = kGgmlUnaryOpStart + ggml_get_unary_op(tensor); - } - - bool ok = false; - auto unary_op = kQnnUnaryOpsTable[unary_op_idx]; - auto binary_op = kQnnBinaryOpsTable[tensor->op]; - if (unary_op) { - ok = unary_op(ctx, tensor->src[0], tensor); - } else if (binary_op) { - ok = binary_op(ctx, tensor->src[0], tensor->src[1], tensor); - } + auto qnn_graph = get_qnn_graph_from_cache(ctx, cgraph); + bool success = qnn_graph && qnn_graph->execute(cgraph); - if (!ok) { - QNN_LOG_WARN("[%s]unsupported op %s", qnn::get_backend_name(ctx->device), ggml_op_desc(tensor)); - return false; - } - } - - return true; + QNN_LOG_DEBUG("[%s]compute graph, success: %d", qnn::get_backend_name(ctx->device), (int)success); + return success; } } // namespace qnn diff --git a/ggml/src/ggml-qnn/backend.hpp b/ggml/src/ggml-qnn/backend.hpp index 17823ed577aaa..df5e2eb08fb8f 100644 --- a/ggml/src/ggml-qnn/backend.hpp +++ b/ggml/src/ggml-qnn/backend.hpp @@ -19,7 +19,7 @@ #include "qnn-lib.hpp" namespace qnn { -typedef std::unordered_map> ggml_qnn_graph_cache_t; +typedef std::unordered_map> qnn_graph_cache_t; } // namespace qnn struct ggml_backend_qnn_device_context { @@ -35,7 +35,7 @@ struct ggml_backend_qnn_device_context { std::shared_ptr instance; std::shared_ptr qnn_interface; - qnn::ggml_qnn_graph_cache_t qnn_graph_cache; + qnn::qnn_graph_cache_t qnn_graph_cache; #ifndef NDEBUG std::atomic_uint32_t support_op_count = 0; diff --git a/ggml/src/ggml-qnn/buffer.hpp b/ggml/src/ggml-qnn/buffer.hpp index 9573e160b4176..af165b394eefb 100644 --- a/ggml/src/ggml-qnn/buffer.hpp +++ b/ggml/src/ggml-qnn/buffer.hpp @@ -8,18 +8,65 @@ namespace qnn { +/** + * @brief An interface for managing generic QNN buffers. + * + * This abstract class defines the interface for managing generic memory buffers in a QNN context. + */ class qnn_buffer_interface { public: virtual ~qnn_buffer_interface() = default; + /** + * @brief Checks if the buffer is valid. + * + * This pure virtual function must be implemented by derived classes to check + * the validity of the buffer. + * + * @return true if the buffer is valid, false otherwise. + */ virtual bool is_valid() const = 0; + + /** + * @brief Gets the buffer pointer. + * + * This pure virtual function must be implemented by derived classes to return + * a pointer to the buffer. + * + * @return A pointer to the buffer. + */ virtual uint8_t *get_buffer() = 0; + + /** + * @brief Gets the buffer pointer. + * + * This pure virtual function must be implemented by derived classes to return + * a pointer to the buffer. + * + * @return A pointer to the buffer. + */ virtual size_t get_size() const = 0; + + /** + * @brief Gets the QNN memory handle associated with the buffer. + * + * This pure virtual function must be implemented by derived classes to return + * the memory handle associated with the buffer. 
+ * + * @return The memory handle, or null if no valid QNN memory handle is attached. + */ virtual Qnn_MemHandle_t get_mem_handle() const = 0; }; using qnn_buffer_ptr = std::shared_ptr; +/** + * @brief A class for managing QNN RPC memory buffers. + * + * This class is responsible for allocating, registering, and managing a buffer in RPC memory. + * It ensures that the buffer is properly allocated and registered with the QNN instance, and + * handles cleanup of the buffer and its associated memory handle upon destruction. + */ class qnn_rpc_buffer : public qnn_buffer_interface { public: qnn_rpc_buffer(std::shared_ptr qnn_instance, const size_t size, const uint32_t rank, @@ -29,7 +76,7 @@ class qnn_rpc_buffer : public qnn_buffer_interface { _qnn_rpc_buffer = static_cast(qnn_instance->alloc_rpcmem(size, alignof(uint8_t *))); _qnn_rpc_mem_handle = qnn_instance->register_rpcmem(_qnn_rpc_buffer, rank, dimensions, data_type); if (!_qnn_rpc_buffer || !_qnn_rpc_mem_handle) { - QNN_LOG_WARN("register rpc mem failure"); + QNN_LOG_WARN("Failed to register RPC memory: buffer or memory handle is null"); // let the destructor free the buffer return; } @@ -64,6 +111,13 @@ class qnn_rpc_buffer : public qnn_buffer_interface { DISABLE_MOVE(qnn_rpc_buffer); }; +/** + * @brief A class for managing QNN memory buffers allocated in regular memory. + * + * This class is responsible for allocating, managing, and freeing memory buffers + * in regular (non-RPC) memory. It implements the qnn_buffer_interface to provide + * a consistent interface for buffer management. + */ class qnn_mem_buffer : public qnn_buffer_interface { public: explicit qnn_mem_buffer(const uint8_t *data, size_t size) { @@ -102,4 +156,24 @@ class qnn_mem_buffer : public qnn_buffer_interface { DISABLE_MOVE(qnn_mem_buffer); }; +class qnn_mem_buffer_slice : public qnn_buffer_interface { +public: + qnn_mem_buffer_slice(const uint8_t *buffer, size_t size) : _buffer(const_cast(buffer)), _size(size) {} + + bool is_valid() const override { return _buffer && _size; } + + uint8_t *get_buffer() override { return _buffer; } + + size_t get_size() const override { return _size; } + + Qnn_MemHandle_t get_mem_handle() const override { return nullptr; } + +private: + uint8_t *_buffer = nullptr; + size_t _size = 0; + + DISABLE_COPY(qnn_mem_buffer_slice); + DISABLE_MOVE(qnn_mem_buffer_slice); +}; + } // namespace qnn diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index 933016a62878e..b3673eb35a5f3 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -222,6 +222,9 @@ bool ggml_backend_qnn_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_ GGML_UNUSED(backend_dst); GGML_UNUSED(src); GGML_UNUSED(dst); + + QNN_LOG_DEBUG("opy form %s to %s, src_is_qnn: %d, dst_is_qnn: %d", ggml_get_name(src), ggml_get_name(dst), + (int)ggml_backend_is_qnn(backend_src), (int)ggml_backend_is_qnn(backend_dst)); return false; } @@ -317,8 +320,6 @@ ggml_guid_t ggml_backend_qnn_guid() { return &guid; } -bool ggml_backend_is_qnn(ggml_backend_t backend) { return ggml_guid_matches(backend->guid, ggml_backend_qnn_guid()); } - ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, const char *extend_lib_search_path) { if (!extend_lib_search_path) { extend_lib_search_path = GGML_QNN_DEFAULT_LIB_SEARCH_PATH; @@ -420,8 +421,13 @@ bool ggml_backend_qnn_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_ } bool ggml_backend_qnn_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor *op) { +#ifdef 
NDEBUG + GGML_UNUSED(dev); + GGML_UNUSED(op); +#else auto *device_ctx = get_device_context(dev); QNN_LOG_DEBUG("[%s][%s]offload op", qnn::get_backend_name(device_ctx->device), ggml_op_name(op->op)); +#endif return false; } @@ -509,6 +515,8 @@ const ggml_backend_reg_i ggml_backend_qnn_reg_interface = { } // namespace +bool ggml_backend_is_qnn(ggml_backend_t backend) { return ggml_guid_matches(backend->guid, ggml_backend_qnn_guid()); } + ggml_backend_reg_t ggml_backend_qnn_reg() { static ggml_backend_qnn_reg_impl reg{ggml_backend_qnn_reg_interface}; return ® diff --git a/ggml/src/ggml-qnn/graph.cpp b/ggml/src/ggml-qnn/graph.cpp new file mode 100644 index 0000000000000..0210e1554a8ab --- /dev/null +++ b/ggml/src/ggml-qnn/graph.cpp @@ -0,0 +1,386 @@ + +#include "graph.hpp" + +#include +#include + +#include "ggml-impl.h" + +#include "logger.hpp" +#include "op-config.hpp" +#include "tensor.hpp" + +namespace { +using qnn_tensor_cache_t = std::unordered_map; + +int get_op_max_rank(const ggml_tensor *op) { + int max_rank = ggml_n_dims(op); + const int count = (int)qnn::get_qnn_op_input_param_count(qnn::get_qnn_op_index(op)); + for (int i = 0; i < count; ++i) { + max_rank = std::max(max_rank, ggml_n_dims(op->src[i])); + } + + return max_rank; +} + +qnn::qnn_tensor_ptr_t create_tensor_with_cache(ggml_tensor *tensor, qnn::ggml_qnn_tensor::tensor_type_t type, int rank, + QNNBackend device, Qnn_GraphHandle_t graph_handle, + std::shared_ptr qnn_instance, + qnn_tensor_cache_t &tensor_cache) { + GGML_ASSERT(tensor); + if (tensor_cache.count(tensor)) { + return tensor_cache[tensor]; + } + + auto qnn_tensor = std::make_shared(type, tensor->name, tensor->ne, tensor->type, rank, device, + graph_handle, qnn_instance); + tensor_cache[tensor] = qnn_tensor; + return qnn_tensor; +} + +qnn::qnn_tensor_array_t create_tensors_with_cache(const qnn::ggml_tensor_array_t &ggml_tensors, + qnn::ggml_qnn_tensor::tensor_type_t type, int rank, QNNBackend device, + Qnn_GraphHandle_t graph_handle, + std::shared_ptr qnn_instance, + qnn_tensor_cache_t &tensor_cache) { + qnn::qnn_tensor_array_t tensors; + for (auto *tensor : ggml_tensors) { + tensors.push_back( + create_tensor_with_cache(tensor, type, rank, device, graph_handle, qnn_instance, tensor_cache)); + } + + return tensors; +} + +qnn::qnn_op_config_ptr_t create_operation_from_op_tensor(ggml_tensor *dst, const std::string &name, int rank, + QNNBackend device, Qnn_GraphHandle_t graph_handle, + std::shared_ptr qnn_instance, + bool is_intermediate, qnn_tensor_cache_t &tensor_cache) { + const auto op_index = qnn::get_qnn_op_index(dst); + auto qnn_op = qnn::create_op_constructor(op_index); + auto operation = qnn_op(name, qnn_instance); + + // input tensors + qnn::qnn_tensor_array_t input_qnn_tensors; + auto tensor_type = is_intermediate ? qnn::ggml_qnn_tensor::INTERMEDIATE : qnn::ggml_qnn_tensor::INPUT; + for (size_t i = 0; i < qnn::get_qnn_op_input_param_count(op_index); ++i) { + auto input_qnn_tensor = + create_tensor_with_cache(dst->src[i], tensor_type, rank, device, graph_handle, qnn_instance, tensor_cache); + input_qnn_tensors.push_back(input_qnn_tensor); + } + operation->set_input_tensors(input_qnn_tensors); + + // output tensor + tensor_type = is_intermediate ? 
qnn::ggml_qnn_tensor::INTERMEDIATE : qnn::ggml_qnn_tensor::OUTPUT; + qnn::qnn_tensor_array_t output_qnn_tensors = + create_tensors_with_cache({dst}, tensor_type, rank, device, graph_handle, qnn_instance, tensor_cache); + operation->set_output_tensors(output_qnn_tensors); + + // initialize operation + if (!operation->initialize_op_nodes(device, graph_handle)) { + QNN_LOG_ERROR("[%s][%s]initialize_op_nodes failed", qnn::get_backend_name(device), name.c_str()); + return nullptr; + } + + return operation; +} + +bool bind_src_tensors(ggml_tensor *op, qnn::qnn_tensor_array_t &tensor_wrappers, + std::vector &qnn_tensors) { + if (op->op == GGML_OP_NONE) { + QNN_LOG_DEBUG("op %s is not a valid op", ggml_get_name(op)); + return false; + } + + const auto param_count = qnn::get_qnn_op_input_param_count(qnn::get_qnn_op_index(op)); + GGML_ASSERT(tensor_wrappers.size() == param_count); + qnn_tensors.resize(param_count); + for (size_t i = 0; i < param_count; ++i) { + auto *ggml_tensor = op->src[i]; + if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor)) { + QNN_LOG_ERROR("bind tensor %s failed", ggml_get_name(ggml_tensor)); + return false; + } + + qnn_tensors[i] = tensor_wrappers[i]->get_qnn_tensor(); + } + + return true; +} + +int get_io_tensors_from_graph(const ggml_cgraph *cgraph, qnn::ggml_tensor_array_t &inputs, + qnn::ggml_tensor_array_t &outputs) { + using ggml_tensor_set_t = std::set; + + ggml_tensor_set_t input_set; + ggml_tensor_set_t output_set; + ggml_tensor_set_t visited_set; + int rank = 0; + for (int i = 0; i < cgraph->n_nodes; i++) { + ggml_tensor *dst = cgraph->nodes[i]; + if (ggml_is_empty(dst)) { + continue; + } + + if (dst->op == GGML_OP_NONE || dst->op == GGML_OP_VIEW) { + // TODO: remove GGML_OP_VIEW after view op is supported + continue; + } + + rank = std::max(rank, ggml_n_dims(dst)); + input_set.erase(dst); + if (!visited_set.count(dst)) { + output_set.insert(dst); + visited_set.insert(dst); + } + + for (size_t i = 0; i < GGML_MAX_DIMS && dst->src[i]; ++i) { + auto *src = dst->src[i]; + rank = std::max(rank, ggml_n_dims(src)); + output_set.erase(src); + if (!visited_set.count(src)) { + input_set.insert(src); + visited_set.insert(src); + } + } + } + + inputs.assign(input_set.begin(), input_set.end()); + outputs.assign(output_set.begin(), output_set.end()); + return rank; +} + +} // namespace + +namespace qnn { + +qnn_graph::qnn_graph(const std::string &graph_name, QNNBackend device, std::shared_ptr qnn_instance, + size_t vtcm_size_in_mb) + : _graph_name(graph_name), _device(device), _qnn_instance(qnn_instance) { + QNN_LOG_DEBUG("[%s][%s]created", get_backend_name(device), graph_name.c_str()); + + auto qnn_interface = qnn_instance->get_qnn_interface(); + auto qnn_context = qnn_instance->get_qnn_context_handle(); + Qnn_ErrorHandle_t error = QNN_SUCCESS; + Qnn_GraphHandle_t graph_handle = nullptr; + if (device == QNN_BACKEND_NPU) { + // TODO: fix graph config here for NPU + QnnHtpGraph_CustomConfig_t hvx_config; + hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; + hvx_config.numHvxThreads = 8; + QnnGraph_Config_t graph_hvx_config; + graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_hvx_config.customConfig = &hvx_config; + + QnnHtpGraph_CustomConfig_t dlbc_config; + dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; + dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; + dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC + QnnGraph_Config_t graph_dlbc_config; + 
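// Note on the pattern used here: each HTP-specific setting (HVX threads, DLBC,
// finalize optimization, VTCM size) is filled into its own QnnHtpGraph_CustomConfig_t,
// wrapped in a QnnGraph_Config_t with option = QNN_GRAPH_CONFIG_OPTION_CUSTOM, and the
// wrappers are collected into a null-terminated array handed to qnn_graph_create() below.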
graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_dlbc_config.customConfig = &dlbc_config; + + QnnHtpGraph_CustomConfig_t opt_config; + opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; + opt_config.optimizationOption.floatValue = 1; // 1 / 3 + QnnGraph_Config_t graph_opt_config; + graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_opt_config.customConfig = &opt_config; + + QnnHtpGraph_CustomConfig_t vtcm_config; + vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; + vtcm_config.vtcmSizeInMB = vtcm_size_in_mb; + QnnGraph_Config_t graph_vtcm_config; + graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_vtcm_config.customConfig = &vtcm_config; + + const QnnGraph_Config_t *graph_configs[] = {&graph_hvx_config, &graph_dlbc_config, &graph_vtcm_config, + &graph_opt_config, nullptr}; + error = qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), graph_configs, &graph_handle); + } else { + error = qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), nullptr, &graph_handle); + } + + if (error != QNN_SUCCESS) { + QNN_LOG_ERROR("[%s][%s]failed to create qnn graph, error: %s", get_backend_name(device), graph_name.c_str(), + get_qnn_error_string(error)); + return; + } + + QNN_LOG_INFO("[%s][%s]create succeed", get_backend_name(device), graph_name.c_str()); + _graph_handle = graph_handle; + _qnn_interface = qnn_interface; +} + +qnn_graph::~qnn_graph() { QNN_LOG_DEBUG("[%s][%s]destroy", get_backend_name(_device), _graph_name.c_str()); } + +bool qnn_graph::build_graph_from_op(ggml_tensor *op) { + if (!is_valid()) { + QNN_LOG_ERROR("Invalid graph"); + return false; + } + + QNN_LOG_DEBUG("[%s][%s]build start", get_backend_name(_device), _graph_name.c_str()); + qnn_tensor_cache_t tensor_cache; + const auto rank = get_op_max_rank(op); + auto operation = create_operation_from_op_tensor(op, _graph_name, rank, _device, _graph_handle, _qnn_instance, + false, tensor_cache); + if (!operation) { + QNN_LOG_ERROR("[%s][%s]create_operation_from_op_tensor failed", get_backend_name(_device), _graph_name.c_str()); + return false; + } + + _tensor_inputs = operation->get_input_tensors(); + _tensor_outputs = operation->get_output_tensors(); + _operations.push_back(std::move(operation)); + if (!finalize()) { + return false; + } + + QNN_LOG_DEBUG("[%s][%s]build succeed", get_backend_name(_device), _graph_name.c_str()); + return true; +} + +bool qnn_graph::build_graph_from_ggml_graph(const ggml_cgraph *cgraph) { + QNN_LOG_DEBUG("[%s][%s]build start", get_backend_name(_device), _graph_name.c_str()); + + ggml_tensor_array_t inputs; + ggml_tensor_array_t outputs; + int rank = get_io_tensors_from_graph(cgraph, inputs, outputs); + QNN_LOG_DEBUG("[%s]rank: %d, input_set: %d, output_set: %d", get_backend_name(_device), rank, int(inputs.size()), + int(outputs.size())); + + { + qnn_tensor_cache_t tensor_cache; + auto input_tensors = create_tensors_with_cache(inputs, ggml_qnn_tensor::INPUT, rank, _device, _graph_handle, + _qnn_instance, tensor_cache); + auto output_tensors = create_tensors_with_cache(outputs, ggml_qnn_tensor::OUTPUT, rank, _device, _graph_handle, + _qnn_instance, tensor_cache); + qnn_op_config_array_t operations; + for (int i = 0; i < cgraph->n_nodes; i++) { + ggml_tensor *dst = cgraph->nodes[i]; + if (ggml_is_empty(dst)) { + continue; + } + + if (dst->op == GGML_OP_NONE || dst->op == GGML_OP_VIEW) { + // TODO: remove GGML_OP_VIEW after view op is supported + continue; + } + + 
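// Each remaining node becomes one QNN operation; create_operation_from_op_tensor()
// checks tensor_cache first, so tensors shared between nodes are created once and
// reused rather than duplicated.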
QNN_LOG_DEBUG("[%s]create op: %s", get_backend_name(_device), get_qnn_op_name(dst->op)); + auto operation = create_operation_from_op_tensor(dst, dst->name, rank, _device, _graph_handle, + _qnn_instance, true, tensor_cache); // TODO: fix op name + operations.push_back(operation); + } + + _tensor_inputs = std::move(input_tensors); + _tensor_outputs = std::move(output_tensors); + _operations = std::move(operations); + if (!finalize()) { + return false; + } + } + + QNN_LOG_DEBUG("[%s][%s]build succeed", get_backend_name(_device), _graph_name.c_str()); + return true; +} + +bool qnn_graph::execute(ggml_tensor *op) { + if (!bind_src_tensors(op, _tensor_inputs, _qnn_tensor_inputs)) { + QNN_LOG_ERROR("[%s][%s]bind input tensors failed", get_backend_name(_device), _graph_name.c_str()); + return false; + } + + if (!qnn::bind_tensors({op}, _tensor_outputs, _qnn_tensor_outputs)) { + QNN_LOG_ERROR("[%s][%s]bind output tensors failed", get_backend_name(_device), _graph_name.c_str()); + return false; + } + + auto &qnn_tensor_inputs = _qnn_tensor_inputs; + auto &qnn_tensor_outputs = _qnn_tensor_outputs; + auto error = + _qnn_interface->qnn_graph_execute(_graph_handle, qnn_tensor_inputs.data(), qnn_tensor_inputs.size(), + qnn_tensor_outputs.data(), qnn_tensor_outputs.size(), nullptr, nullptr); + unbind_tensors(_tensor_inputs); + unbind_tensors(_tensor_outputs); + + if (error != QNN_SUCCESS) { + if (_device == QNN_BACKEND_NPU && error == QNN_COMMON_ERROR_SYSTEM_COMMUNICATION) { + QNN_LOG_WARN("[%s][%s]NPU crashed. SSR detected. Caused QNN graph execute error.", + get_backend_name(_device), _graph_name.c_str()); + } else { + QNN_LOG_ERROR("[%s][%s]error: %s", get_backend_name(_device), _graph_name.c_str(), + get_qnn_error_string(error)); + } + return false; + } + + QNN_LOG_DEBUG("[%s][%s]execute succeed", get_backend_name(_device), _graph_name.c_str()); + return true; +} + +bool qnn_graph::execute(const ggml_cgraph *cgraph) { + ggml_tensor_array_t inputs; + ggml_tensor_array_t outputs; +#ifdef NDEBUG + get_io_tensors_from_graph(cgraph, inputs, outputs); +#else + int rank = get_io_tensors_from_graph(cgraph, inputs, outputs); + QNN_LOG_DEBUG("[%s]rank: %d, input_set: %d, output_set: %d", get_backend_name(_device), rank, int(inputs.size()), + int(outputs.size())); +#endif + + { + if (!qnn::bind_tensors(inputs, _tensor_inputs, _qnn_tensor_inputs)) { + QNN_LOG_ERROR("[%s][%s]bind input tensors failed", get_backend_name(_device), _graph_name.c_str()); + return false; + } + + if (!qnn::bind_tensors(outputs, _tensor_outputs, _qnn_tensor_outputs)) { + QNN_LOG_ERROR("[%s][%s]bind output tensors failed", get_backend_name(_device), _graph_name.c_str()); + return false; + } + + auto &qnn_tensor_inputs = _qnn_tensor_inputs; + auto &qnn_tensor_outputs = _qnn_tensor_outputs; + auto error = + _qnn_interface->qnn_graph_execute(_graph_handle, qnn_tensor_inputs.data(), qnn_tensor_inputs.size(), + qnn_tensor_outputs.data(), qnn_tensor_outputs.size(), nullptr, nullptr); + unbind_tensors(_tensor_inputs); + unbind_tensors(_tensor_outputs); + + if (error != QNN_SUCCESS) { + if (_device == QNN_BACKEND_NPU && error == QNN_COMMON_ERROR_SYSTEM_COMMUNICATION) { + QNN_LOG_WARN("[%s][%s]NPU crashed. SSR detected. 
Caused QNN graph execute error.", + get_backend_name(_device), _graph_name.c_str()); + } else { + QNN_LOG_ERROR("[%s][%s]error: %s", get_backend_name(_device), _graph_name.c_str(), + get_qnn_error_string(error)); + } + return false; + } + + QNN_LOG_DEBUG("[%s][%s]execute succeed", get_backend_name(_device), _graph_name.c_str()); + return true; + } +} + +bool qnn_graph::finalize() { + if (!qnn::add_op_to_graph(_graph_handle, _operations)) { + QNN_LOG_ERROR("[%s]add nodes failed", _graph_name.c_str()); + return false; + } + + auto error = _qnn_interface->qnn_graph_finalize(_graph_handle, nullptr, nullptr); + if (error != QNN_SUCCESS) { + QNN_LOG_ERROR("[%s][%s]qnn_graph_finalize.error: %s", get_backend_name(_device), _graph_name.c_str(), + get_qnn_error_string(error)); + return false; + } + + QNN_LOG_DEBUG("[%s][%s]finalize succeed", get_backend_name(_device), _graph_name.c_str()); + return true; +} + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index 1806f41126f3c..521186f790ee5 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -1,164 +1,53 @@ #pragma once -#include #include #include #include #include "ggml-qnn.h" -#include "logger.hpp" #include "op-config.hpp" #include "qnn-lib.hpp" namespace qnn { -class ggml_qnn_graph { +class qnn_graph { public: - explicit ggml_qnn_graph(const std::string &graph_name, QNNBackend device, - std::shared_ptr qnn_instance, size_t vtcm_size_in_mb) - : _graph_name(graph_name), _device(device), _qnn_instance(qnn_instance) { - QNN_LOG_DEBUG("[%s][%s]created", get_backend_name(device), graph_name.c_str()); + explicit qnn_graph(const std::string &graph_name, QNNBackend device, std::shared_ptr qnn_instance, + size_t vtcm_size_in_mb); + ~qnn_graph(); - auto qnn_interface = qnn_instance->get_qnn_interface(); - auto qnn_context = qnn_instance->get_qnn_context_handle(); - Qnn_ErrorHandle_t error = QNN_SUCCESS; - Qnn_GraphHandle_t graph_handle = nullptr; - if (device == QNN_BACKEND_NPU) { - // TODO: fix graph config here for NPU - QnnHtpGraph_CustomConfig_t hvx_config; - hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; - hvx_config.numHvxThreads = 8; - QnnGraph_Config_t graph_hvx_config; - graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_hvx_config.customConfig = &hvx_config; - - QnnHtpGraph_CustomConfig_t dlbc_config; - dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; - dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; - dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC - QnnGraph_Config_t graph_dlbc_config; - graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_dlbc_config.customConfig = &dlbc_config; - - QnnHtpGraph_CustomConfig_t opt_config; - opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; - opt_config.optimizationOption.floatValue = 1; // 1 / 3 - QnnGraph_Config_t graph_opt_config; - graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_opt_config.customConfig = &opt_config; - - QnnHtpGraph_CustomConfig_t vtcm_config; - vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; - vtcm_config.vtcmSizeInMB = vtcm_size_in_mb; - QnnGraph_Config_t graph_vtcm_config; - graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_vtcm_config.customConfig = &vtcm_config; - - const QnnGraph_Config_t *graph_configs[] = {&graph_hvx_config, &graph_dlbc_config, &graph_vtcm_config, - &graph_opt_config, nullptr}; - 
error = qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), graph_configs, &graph_handle); - } else { - error = qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), nullptr, &graph_handle); - } - - if (error != QNN_SUCCESS) { - QNN_LOG_ERROR("[%s][%s]failed to create qnn graph, error: %s", get_backend_name(device), graph_name.c_str(), - get_qnn_error_string(error)); - return; - } - - QNN_LOG_INFO("[%s][%s]create succeed", get_backend_name(device), graph_name.c_str()); - _graph_handle = graph_handle; - _qnn_interface = qnn_interface; - } - - ~ggml_qnn_graph() { QNN_LOG_DEBUG("[%s][%s]destroy", get_backend_name(_device), _graph_name.c_str()); } - - bool build_graph(ggml_op_constructor_t op_constructor, const ggml_tensor_array_t &tensor_inputs, - const ggml_tensor_array_t &tensor_outputs) { - GGML_ASSERT(op_constructor); - if (!is_valid()) { - QNN_LOG_ERROR("Invalid graph"); - return false; - } - - QNN_LOG_DEBUG("[%s][%s]build_graph start", get_backend_name(_device), _graph_name.c_str()); - _op_config = op_constructor(_graph_name, _qnn_instance); - if (!_op_config->initialize_op_nodes(_device, _graph_handle, tensor_inputs, tensor_outputs)) { - QNN_LOG_ERROR("[%s][%s]initialize_op_nodes failed", get_backend_name(_device), _graph_name.c_str()); - return false; - } - - if (!_op_config->add_op_to_graph(_graph_handle)) { - QNN_LOG_ERROR("[%s]add nodes failed", _graph_name.c_str()); - return false; - } - - auto error = _qnn_interface->qnn_graph_finalize(_graph_handle, nullptr, nullptr); - if (error != QNN_SUCCESS) { - QNN_LOG_ERROR("[%s][%s]qnn_graph_finalize.error: %s", get_backend_name(_device), _graph_name.c_str(), - get_qnn_error_string(error)); - return false; - } - - QNN_LOG_DEBUG("[%s][%s]build_graph succeed", get_backend_name(_device), _graph_name.c_str()); - return true; - } - - bool execute(const ggml_tensor_array_t &tensor_inputs, const ggml_tensor_array_t &tensor_outputs) { - if (!_op_config->bind_input_tensors(tensor_inputs)) { - QNN_LOG_ERROR("[%s][%s]bind input tensors failed", get_backend_name(_device), _graph_name.c_str()); - return false; - } - - if (!_op_config->bind_output_tensors(tensor_outputs)) { - QNN_LOG_ERROR("[%s][%s]bind output tensors failed", get_backend_name(_device), _graph_name.c_str()); - return false; - } - - auto &qnn_tensor_inputs = _op_config->get_qnn_input_tensors(); - auto &qnn_tensor_outputs = _op_config->get_qnn_output_tensors(); - - auto error = - _qnn_interface->qnn_graph_execute(_graph_handle, qnn_tensor_inputs.data(), qnn_tensor_inputs.size(), - qnn_tensor_outputs.data(), qnn_tensor_outputs.size(), nullptr, nullptr); - _op_config->unbind_input_tensors(); - _op_config->unbind_output_tensors(); - - if (error != QNN_SUCCESS) { - if (_device == QNN_BACKEND_NPU && error == QNN_COMMON_ERROR_SYSTEM_COMMUNICATION) { - QNN_LOG_WARN("[%s][%s]NPU crashed. SSR detected. 
Caused QNN graph execute error.", - get_backend_name(_device), _graph_name.c_str()); - } else { - QNN_LOG_ERROR("[%s][%s]error: %s", get_backend_name(_device), _graph_name.c_str(), - get_qnn_error_string(error)); - } - return false; - } - - QNN_LOG_DEBUG("[%s][%s]execute succeed", get_backend_name(_device), _graph_name.c_str()); - return true; - } + bool build_graph_from_op(ggml_tensor *op); + bool build_graph_from_ggml_graph(const ggml_cgraph *cgraph); + bool execute(ggml_tensor *op); + bool execute(const ggml_cgraph *cgraph); bool is_valid() const { return _graph_handle != nullptr; } - Qnn_GraphHandle_t get_graph_handler() const { return _graph_handle; } - + std::shared_ptr get_qnn_instance() { return _qnn_instance; } const std::string &get_name() const { return _graph_name; } + QNNBackend get_device() const { return _device; } private: + bool finalize(); + const std::string _graph_name; const QNNBackend _device; Qnn_GraphHandle_t _graph_handle = nullptr; std::shared_ptr _qnn_instance; std::shared_ptr _qnn_interface; - std::unique_ptr _op_config; - std::vector _param_types; + qnn_op_config_array_t _operations; - DISABLE_COPY(ggml_qnn_graph); - DISABLE_MOVE(ggml_qnn_graph); + qnn_tensor_array_t _tensor_inputs; + qnn_tensor_array_t _tensor_outputs; + std::vector _qnn_tensor_inputs; + std::vector _qnn_tensor_outputs; + + DISABLE_COPY(qnn_graph); + DISABLE_MOVE(qnn_graph); }; +using qnn_graph_ptr_t = std::shared_ptr; + } // namespace qnn diff --git a/ggml/src/ggml-qnn/op-config-base.hpp b/ggml/src/ggml-qnn/op-config-base.hpp index 159944a7d7f60..274bb8318ff99 100644 --- a/ggml/src/ggml-qnn/op-config-base.hpp +++ b/ggml/src/ggml-qnn/op-config-base.hpp @@ -10,8 +10,6 @@ namespace qnn { -using ggml_tensor_array_t = std::vector; - /** * @class ggml_qnn_op_config * @brief Abstract base class for configuring QNN operations. @@ -23,6 +21,34 @@ class ggml_qnn_op_config { public: virtual ~ggml_qnn_op_config() {} + /** + * @brief Sets custom input tensors for the operation. This method should be called before `initialize_op_nodes`. + * If no custom input tensors are provided, the input tensors will be automatically created from the input ggml + * tensors. + * + * This pure virtual function must be overridden by derived classes to set + * the input tensors for the operation. The function takes a reference to a + * vector of qnn_tensor_ptr_t objects, which represent the input tensors. + * + * @param tensor_inputs A reference to a vector of qnn_tensor_ptr_t objects representing the input tensors. + */ + virtual void set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) = 0; + virtual void set_input_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) = 0; + + /** + * @brief Sets custom output tensors for the operation. This method should be called before `initialize_op_nodes`. + * If no custom output tensors are provided, the output tensors will be automatically created from the output ggml + * tensors. + * + * This pure virtual function must be overridden by derived classes to set + * the output tensors for the operation. The function takes a reference to a + * vector of qnn_tensor_ptr_t objects, which represent the output tensors. + * + * @param tensor_outputs A reference to a vector of qnn_tensor_ptr_t objects representing the output tensors. + */ + virtual void set_output_tensors(qnn::qnn_tensor_array_t &tensor_outputs) = 0; + virtual void set_output_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) = 0; + /** * @brief Creates tensors and internal nodes for constructing the calculation graph. 
* @@ -31,36 +57,32 @@ class ggml_qnn_op_config { * the internal nodes necessary for constructing the calculation graph. It takes * input and output tensor arrays as parameters. * - * @param device The backend device where tensors will be created. - * @param graph_handle The handle to the graph where tensors and nodes will be associated. - * @param tensor_inputs An array of input tensors. - * @param tensor_outputs An array of output tensors. + * @param device + * @param graph_handle * @return true if tensors and nodes are successfully created, false otherwise. */ - virtual bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, - const ggml_tensor_array_t &tensor_inputs, - const ggml_tensor_array_t &tensor_outputs) = 0; + virtual bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) = 0; /** - * @brief Pure virtual function to retrieve the input tensors for QNN (Quantized Neural Network). + * @brief Pure virtual function to retrieve the input tensors. * * This function must be overridden by derived classes to provide the specific implementation * for retrieving the input tensors used in QNN operations. * - * @return A reference to a vector of Qnn_Tensor_t objects representing the input tensors. + * @return A reference to a vector of qnn_tensor_ptr_t objects representing the input tensors. */ - virtual std::vector &get_qnn_input_tensors() = 0; + virtual const qnn_tensor_array_t &get_input_tensors() = 0; /** - * @brief Pure virtual function to retrieve the output tensors of a QNN (Quantized Neural Network). + * @brief Pure virtual function to retrieve the output tensors of a QNN. * * This function must be overridden by any derived class to provide access to the * output tensors of the QNN. The function returns a reference to a vector of - * Qnn_Tensor_t objects, which represent the output tensors. + * qnn_tensor_ptr_t objects, which represent the output tensors. * - * @return std::vector& Reference to a vector of Qnn_Tensor_t objects. + * @return A reference to a vector of qnn_tensor_ptr_t objects representing the output tensors. */ - virtual std::vector &get_qnn_output_tensors() = 0; + virtual const qnn_tensor_array_t &get_output_tensors() = 0; /** * @brief Adds an operation to the given graph. 
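A usage sketch (not part of the patch): under the reworked interface, input and output tensor wrappers are injected with set_input_tensors / set_output_tensors before initialize_op_nodes is called, instead of being passed to initialize_op_nodes as ggml tensor arrays. The include, the exact namespaces and the helper name wire_single_op are assumptions for illustration; the call order mirrors create_operation_from_op_tensor() in graph.cpp.

#include <memory>
#include <string>

#include "op-config.hpp" // assumed include path

// Hypothetical helper: wire one operation from pre-built tensor wrappers.
static qnn::qnn_op_config_ptr_t wire_single_op(const std::string &name, size_t op_index,
                                               qnn::qnn_tensor_array_t inputs, qnn::qnn_tensor_array_t outputs,
                                               QNNBackend device, Qnn_GraphHandle_t graph_handle,
                                               std::shared_ptr<qnn::qnn_instance> instance) {
    auto make_op = qnn::create_op_constructor(op_index);
    auto op = make_op(name, instance);
    op->set_input_tensors(std::move(inputs));   // tensors are injected first ...
    op->set_output_tensors(std::move(outputs));
    if (!op->initialize_op_nodes(device, graph_handle)) { // ... then graph nodes are created
        return nullptr;
    }
    return op; // add_op_to_graph() is invoked later, when the whole graph is finalized
}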
@@ -125,5 +147,6 @@ class ggml_qnn_op_config { }; using qnn_op_config_ptr_t = std::shared_ptr; +using qnn_op_config_array_t = std::vector; } // namespace qnn diff --git a/ggml/src/ggml-qnn/op-config-caps.cpp b/ggml/src/ggml-qnn/op-config-caps.cpp new file mode 100644 index 0000000000000..aab8f65958bf1 --- /dev/null +++ b/ggml/src/ggml-qnn/op-config-caps.cpp @@ -0,0 +1,223 @@ + +#include "op-config.hpp" + +namespace { + +using op_dims_calc_func_t = void (*)(const std::vector &input_dims, + qnn::ggml_dimension_array_t &output_dims); + +void element_wise_op_dims(const std::vector &input_dims, + qnn::ggml_dimension_array_t &output_dims) { + for (size_t i = 1; i < std::size(output_dims); i++) { + output_dims[i] = input_dims.front()[i]; + } +} + +void mat_mul_op_dims(const std::vector &input_dims, + qnn::ggml_dimension_array_t &output_dims) { + GGML_ASSERT(input_dims.size() == 2); + output_dims[0] = input_dims.front()[1]; + output_dims[1] = input_dims.back()[1]; +} + +struct qnn_op_caps_t { + const char *qnn_op_name = nullptr; + const size_t input_param_count = 0; + op_dims_calc_func_t calc_dims_func = nullptr; +}; + +constexpr const qnn_op_caps_t kOpCaps[] = { + {}, // GGML_OP_NONE + {}, // GGML_OP_DUP + { + // GGML_OP_ADD + QNN_OP_ELEMENT_WISE_ADD, // qnn_op_name + 2, // input_param_count + element_wise_op_dims, // calc_dims_func + }, + {}, // GGML_OP_ADD1 + {}, // GGML_OP_ACC + { + // GGML_OP_SUB + QNN_OP_ELEMENT_WISE_SUBTRACT, // qnn_op_name + 2, // input_param_count + element_wise_op_dims, // calc_dims_func + }, + { + // GGML_OP_MUL + QNN_OP_ELEMENT_WISE_MULTIPLY, // qnn_op_name + 2, // input_param_count + element_wise_op_dims, // calc_dims_func + }, + { + // GGML_OP_DIV + QNN_OP_ELEMENT_WISE_DIVIDE, // qnn_op_name + 2, // input_param_count + element_wise_op_dims, // calc_dims_func + }, + {}, // GGML_OP_SQR + { + // GGML_OP_SQRT + QNN_OP_ELEMENT_WISE_SQUARE_ROOT, // qnn_op_name + 1, // input_param_count + element_wise_op_dims, // calc_dims_func + }, + { + // GGML_OP_LOG + QNN_OP_ELEMENT_WISE_LOG, // qnn_op_name + 1, // input_param_count + element_wise_op_dims, // calc_dims_func + }, + {}, // GGML_OP_SIN + {}, // GGML_OP_COS + {}, // GGML_OP_SUM + {}, // GGML_OP_SUM_ROWS + {}, // GGML_OP_MEAN + {}, // GGML_OP_ARGMAX + {}, // GGML_OP_COUNT_EQUAL + {}, // GGML_OP_REPEAT + {}, // GGML_OP_REPEAT_BACK + {}, // GGML_OP_CONCAT + {}, // GGML_OP_SILU_BACK + {}, // GGML_OP_NORM + {}, // GGML_OP_RMS_NORM + {}, // GGML_OP_RMS_NORM_BACK + {}, // GGML_OP_GROUP_NORM + { + // GGML_OP_MUL_MAT + QNN_OP_MAT_MUL, // qnn_op_name + 2, // input_param_count + mat_mul_op_dims, // calc_dims_func + }, + {}, // GGML_OP_MUL_MAT_ID + {}, // GGML_OP_OUT_PROD + {}, // GGML_OP_SCALE + {}, // GGML_OP_SET + {}, // GGML_OP_CPY + {}, // GGML_OP_CONT + { + // GGML_OP_RESHAPE + QNN_OP_RESHAPE, // qnn_op_name + 1, // input_param_count + nullptr, // TODO: calc_dims_func + }, + {}, // GGML_OP_VIEW + {}, // GGML_OP_PERMUTE + {}, // GGML_OP_TRANSPOSE + {}, // GGML_OP_GET_ROWS + {}, // GGML_OP_GET_ROWS_BACK + {}, // GGML_OP_DIAG + {}, // GGML_OP_DIAG_MASK_INF + {}, // GGML_OP_DIAG_MASK_ZERO + {}, // GGML_OP_SOFT_MAX + {}, // GGML_OP_SOFT_MAX_BACK + {}, // GGML_OP_ROPE + {}, // GGML_OP_ROPE_BACK + {}, // GGML_OP_CLAMP + {}, // GGML_OP_CONV_TRANSPOSE_1D + {}, // GGML_OP_IM2COL + {}, // GGML_OP_IM2COL_BACK + {}, // GGML_OP_CONV_TRANSPOSE_2D + {}, // GGML_OP_POOL_1D + {}, // GGML_OP_POOL_2D + {}, // GGML_OP_POOL_2D_BACK + {}, // GGML_OP_UPSCALE + {}, // GGML_OP_PAD + {}, // GGML_OP_PAD_REFLECT_1D + {}, // GGML_OP_ARANGE + + {}, // 
GGML_OP_TIMESTEP_EMBEDDING + {}, // GGML_OP_ARGSORT + {}, // GGML_OP_LEAKY_RELU + + {}, // GGML_OP_FLASH_ATTN_EXT + {}, // GGML_OP_FLASH_ATTN_BACK + {}, // GGML_OP_SSM_CONV + {}, // GGML_OP_SSM_SCAN + {}, // GGML_OP_WIN_PART + {}, // GGML_OP_WIN_UNPART + {}, // GGML_OP_GET_REL_POS + {}, // GGML_OP_ADD_REL_POS + {}, // GGML_OP_RWKV_WKV6 + + {}, // GGML_OP_UNARY + + {}, // GGML_OP_MAP_UNARY + {}, // GGML_OP_MAP_BINARY + + {}, // GGML_OP_MAP_CUSTOM1_F32 + {}, // GGML_OP_MAP_CUSTOM2_F32 + {}, // GGML_OP_MAP_CUSTOM3_F32 + + {}, // GGML_OP_MAP_CUSTOM1 + {}, // GGML_OP_MAP_CUSTOM2 + {}, // GGML_OP_MAP_CUSTOM3 + + {}, // GGML_OP_CROSS_ENTROPY_LOSS + {}, // GGML_OP_CROSS_ENTROPY_LOSS_BACK + {}, // GGML_OP_OPT_STEP_ADAMW + + // ggml_unary_op + {}, // GGML_UNARY_OP_ABS + {}, // GGML_UNARY_OP_SGN + {}, // GGML_UNARY_OP_NEG + {}, // GGML_UNARY_OP_STEP + {}, // GGML_UNARY_OP_TANH + {}, // GGML_UNARY_OP_ELU + {}, // GGML_UNARY_OP_RELU + {}, // GGML_UNARY_OP_SIGMOID + { + // GGML_UNARY_OP_GELU + QNN_OP_GELU, // qnn_op_name + 1, // input_param_count + nullptr, // TODO: calc_dims_func + }, + {}, // GGML_UNARY_OP_GELU_QUICK + {}, // GGML_UNARY_OP_SILU + {}, // GGML_UNARY_OP_HARDSWISH + {}, // GGML_UNARY_OP_HARDSIGMOID + {}, // GGML_UNARY_OP_EXP +}; + +static_assert(kOpCaps[GGML_OP_NONE].calc_dims_func == nullptr, "GGML_OP_NONE should not have calc_dims_func function"); +static_assert(kOpCaps[GGML_OP_ADD].calc_dims_func == element_wise_op_dims, + "GGML_OP_ADD does not have element_wise_op_dims function"); +static_assert(kOpCaps[GGML_OP_MUL_MAT].calc_dims_func == mat_mul_op_dims, + "GGML_OP_ADD does not have element_wise_op_dims function"); +static_assert(kOpCaps[GGML_OP_LOG].calc_dims_func == element_wise_op_dims, + "GGML_OP_LOG does not have element_wise_op_dims function"); +static_assert(std::size(kOpCaps) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), + "GGML_OP_COUNT does not match the size of the kOpCaps table"); + +} // namespace + +namespace qnn { + +size_t get_qnn_op_index(const ggml_tensor *tensor) { + if (tensor->op == GGML_OP_UNARY) { + return kGgmlUnaryOpStart + ggml_get_unary_op(tensor); + } + + return tensor->op; +} + +void get_ggml_op_output_dimensions(const std::vector &input_dims, size_t op, + ggml_dimension_array_t &output_dims) { + GGML_ASSERT(op < std::size(kOpCaps)); + auto get_dims = kOpCaps[op].calc_dims_func; + GGML_ASSERT(get_dims); + get_dims(input_dims, output_dims); +} + +const char *get_qnn_op_name(size_t op) { + GGML_ASSERT(op < std::size(kOpCaps)); + GGML_ASSERT(kOpCaps[op].qnn_op_name); + return kOpCaps[op].qnn_op_name; +} + +size_t get_qnn_op_input_param_count(size_t op) { + GGML_ASSERT(op < std::size(kOpCaps)); + return kOpCaps[op].input_param_count; +} + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/op-config.cpp b/ggml/src/ggml-qnn/op-config.cpp index b3c84b5435095..7edb4078a57df 100644 --- a/ggml/src/ggml-qnn/op-config.cpp +++ b/ggml/src/ggml-qnn/op-config.cpp @@ -24,16 +24,7 @@ qnn::qnn_dimension_array_t get_transposed_dimensions(const qnn::qnn_dimension_ar } int get_rank(const qnn::ggml_tensor_array_t &tensor_inputs, const qnn::ggml_tensor_array_t &tensor_outputs) { - int tensor_rank = 0; - // get the max tensor rank - for (auto tensor : tensor_inputs) { - tensor_rank = std::max(tensor_rank, ggml_n_dims(tensor)); - } - for (auto tensor : tensor_outputs) { - tensor_rank = std::max(tensor_rank, ggml_n_dims(tensor)); - } - - return tensor_rank; + return std::max(qnn::get_ggml_tensors_max_rank(tensor_inputs), qnn::get_ggml_tensors_max_rank(tensor_outputs)); } Qnn_DataType_t 
get_tensor_type(const qnn::qnn_tensor_array_t &tensors) { @@ -49,93 +40,6 @@ Qnn_DataType_t get_tensor_type(const qnn::qnn_tensor_array_t &tensors) { return type; } -struct tensor_common_params { - const char *name_prefix; - int tensor_rank; - bool is_input; - QNNBackend device; - Qnn_GraphHandle_t graph_handle; - std::shared_ptr qnn_instance; -}; - -void create_tensors_from_ggml_tensor(const tensor_common_params ¶ms, const qnn::ggml_tensor_array_t &ggml_tensors, - qnn::qnn_tensor_array_t *tensor_wrappers, std::vector *qnn_tensors) { - using namespace qnn; - - tensor_wrappers->resize(ggml_tensors.size()); - if (qnn_tensors) { - qnn_tensors->resize(ggml_tensors.size()); - } - char buffer[GGML_MAX_NAME] = {}; - auto tensor_type = params.is_input ? ggml_qnn_tensor::INPUT : ggml_qnn_tensor::OUTPUT; - for (size_t i = 0; i < ggml_tensors.size(); i++) { - snprintf(buffer, GGML_MAX_NAME, "%s%d", params.name_prefix, (int)i); - auto *ggml_tensor = ggml_tensors[i]; - (*tensor_wrappers)[i] = std::make_shared(tensor_type, std::string(buffer), ggml_tensor->ne, - ggml_tensor->type, params.tensor_rank, params.device, - params.graph_handle, params.qnn_instance); - } -} - -bool bind_tensors(const qnn::ggml_tensor_array_t &ggml_tensors, qnn::qnn_tensor_array_t &tensor_wrappers, - std::vector &qnn_tensors) { - for (size_t i = 0; i < ggml_tensors.size(); i++) { - auto *ggml_tensor = ggml_tensors[i]; - if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor)) { - QNN_LOG_ERROR("bind tensor %s failed", ggml_get_name(ggml_tensor)); - return false; - } - - qnn_tensors[i] = tensor_wrappers[i]->get_qnn_tensor(); - } - - return true; -} - -class ggml_qnn_connectable_op_config : public qnn::ggml_qnn_op_config_base { -public: - explicit ggml_qnn_connectable_op_config(const std::string &name, const std::string &package_name, - const std::string &op_type, std::shared_ptr qnn_instance) - : ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {} - - bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, - const qnn::ggml_tensor_array_t &tensor_inputs, - const qnn::ggml_tensor_array_t &tensor_outputs) override { - GGML_UNUSED(device); - GGML_UNUSED(graph_handle); - GGML_UNUSED(tensor_inputs); - GGML_UNUSED(tensor_outputs); - return true; - } - - void set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) { - _tensor_inputs = tensor_inputs; - _qnn_tensor_inputs.resize(_tensor_inputs.size()); - } - - void set_input_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) { - _tensor_inputs = std::move(tensor_inputs); - _qnn_tensor_inputs.resize(_tensor_inputs.size()); - } - - void set_output_tensors(qnn::qnn_tensor_array_t &tensor_outputs) { - _tensor_outputs = tensor_outputs; - _qnn_tensor_outputs.resize(_tensor_outputs.size()); - } - - void set_output_tensors(qnn::qnn_tensor_array_t &&tensor_outputs) { - _tensor_outputs = std::move(tensor_outputs); - _qnn_tensor_outputs.resize(_tensor_outputs.size()); - } - - qnn::qnn_tensor_array_t &get_input_tensors() { return _tensor_inputs; } - qnn::qnn_tensor_array_t &get_output_tensors() { return _tensor_outputs; } - -private: - DISABLE_COPY(ggml_qnn_connectable_op_config); - DISABLE_MOVE(ggml_qnn_connectable_op_config); -}; - } // namespace namespace qnn { @@ -161,7 +65,7 @@ bool ggml_qnn_op_config_base::add_tensor_param(const std::string &name, const qn } GGML_ASSERT(data_size > 0); - if (!param_tensor->bind_buffer(const_cast(data), data_size)) { + if (!param_tensor->set_data_buffer(data, data_size)) { QNN_LOG_ERROR("parameter tensor bind_buffer failed"); 
return false; } @@ -181,6 +85,26 @@ bool ggml_qnn_op_config_base::add_tensor_param(const std::string &name, const qn return true; } +void ggml_qnn_op_config_base::set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) { + _tensor_inputs = tensor_inputs; + _qnn_tensor_inputs.resize(_tensor_inputs.size()); +} + +void ggml_qnn_op_config_base::set_input_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) { + _tensor_inputs = tensor_inputs; + _qnn_tensor_inputs.resize(_tensor_inputs.size()); +} + +void ggml_qnn_op_config_base::set_output_tensors(qnn::qnn_tensor_array_t &tensor_outputs) { + _tensor_outputs = std::move(tensor_outputs); + _qnn_tensor_outputs.resize(_tensor_outputs.size()); +} + +void ggml_qnn_op_config_base::set_output_tensors(qnn::qnn_tensor_array_t &&tensor_outputs) { + _tensor_outputs = std::move(tensor_outputs); + _qnn_tensor_outputs.resize(_tensor_outputs.size()); +} + bool ggml_qnn_op_config_base::add_op_to_graph(Qnn_GraphHandle_t graph_handle) { GGML_ASSERT(_qnn_tensor_inputs.size() == _tensor_inputs.size()); GGML_ASSERT(_qnn_tensor_outputs.size() == _tensor_outputs.size()); @@ -221,12 +145,12 @@ bool ggml_qnn_op_config_base::add_op_to_graph(Qnn_GraphHandle_t graph_handle) { bool ggml_qnn_op_config_base::bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) { GGML_ASSERT(tensor_inputs.size() == _tensor_inputs.size()); - return bind_tensors(tensor_inputs, _tensor_inputs, _qnn_tensor_inputs); + return qnn::bind_tensors(tensor_inputs, _tensor_inputs, _qnn_tensor_inputs); } bool ggml_qnn_op_config_base::bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) { GGML_ASSERT(tensor_outputs.size() == _tensor_outputs.size()); - return bind_tensors(tensor_outputs, _tensor_outputs, _qnn_tensor_outputs); + return qnn::bind_tensors(tensor_outputs, _tensor_outputs, _qnn_tensor_outputs); } void ggml_qnn_op_config_base::unbind_input_tensors() { @@ -257,55 +181,42 @@ Qnn_OpConfig_t ggml_qnn_op_config_base::get_op_config() { return config; } -bool ggml_qnn_single_op_config::initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, - const ggml_tensor_array_t &tensor_inputs, - const ggml_tensor_array_t &tensor_outputs) { - const auto tensor_rank = get_rank(tensor_inputs, tensor_outputs); - tensor_common_params params = {"src", tensor_rank, true, device, graph_handle, _qnn_instance}; - create_tensors_from_ggml_tensor(params, tensor_inputs, &_tensor_inputs, &_qnn_tensor_inputs); - params.name_prefix = "dst"; - params.is_input = false; - create_tensors_from_ggml_tensor(params, tensor_outputs, &_tensor_outputs, &_qnn_tensor_outputs); - - if (_param_buffer.size() > 0) { - // handle parameters in output tensor - auto *params = tensor_outputs.front()->op_params; - memcpy(_param_buffer.data(), params, _param_buffer.size()); - - const uint32_t count = uint32_t(_param_buffer.size() / qnn_datatype_size(_param_type)); - const qnn_dimension_array_t param_dims = {count, 1, 1, 1}; - add_tensor_param(_param_name, param_dims, 1, _param_buffer.data(), _param_type, device, graph_handle); - } - +bool ggml_qnn_single_op_config::initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) { + GGML_UNUSED(device); + GGML_UNUSED(graph_handle); return true; } -bool ggml_qnn_aggregate_op_config::bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) { - return bind_tensors(tensor_inputs, _tensor_inputs, _qnn_tensor_inputs); +void ggml_qnn_aggregate_op_config::set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) { + _tensor_inputs = tensor_inputs; } -bool 
ggml_qnn_aggregate_op_config::bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) { - return bind_tensors(tensor_outputs, _tensor_outputs, _qnn_tensor_outputs); +void ggml_qnn_aggregate_op_config::set_input_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) { + _tensor_inputs = std::move(tensor_inputs); } -bool ggml_qnn_matmul_op_config::initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, - const ggml_tensor_array_t &tensor_inputs, - const ggml_tensor_array_t &tensor_outputs) { - GGML_ASSERT(tensor_inputs.size() == 2); - GGML_ASSERT(tensor_outputs.size() == 1); - const auto tensor_rank = get_rank(tensor_inputs, tensor_outputs); - GGML_ASSERT(tensor_rank >= 2); +void ggml_qnn_aggregate_op_config::set_output_tensors(qnn::qnn_tensor_array_t &tensor_outputs) { + _tensor_outputs = tensor_outputs; +} - // create input tensors - tensor_common_params params = {"src", tensor_rank, true, device, graph_handle, _qnn_instance}; - create_tensors_from_ggml_tensor(params, tensor_inputs, &_tensor_inputs, &_qnn_tensor_inputs); +void ggml_qnn_aggregate_op_config::set_output_tensors(qnn::qnn_tensor_array_t &&tensor_outputs) { + _tensor_outputs = std::move(tensor_outputs); +} + +bool ggml_qnn_aggregate_op_config::bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) { + return qnn::bind_tensors(tensor_inputs, _tensor_inputs); +} + +bool ggml_qnn_aggregate_op_config::bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) { + return qnn::bind_tensors(tensor_outputs, _tensor_outputs); +} - // create output tensor - params.name_prefix = "dst"; - params.is_input = false; - create_tensors_from_ggml_tensor(params, tensor_outputs, &_tensor_outputs, &_qnn_tensor_outputs); +bool ggml_qnn_matmul_op_config::initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) { + GGML_ASSERT(_tensor_inputs.size() == 2); + GGML_ASSERT(_tensor_outputs.size() == 1); // create convert nodes + const auto tensor_rank = _tensor_inputs.front()->get_rank(); qnn_tensor_array_t mat_mul_tensor_inputs = _tensor_inputs; qnn_tensor_array_t mat_mul_tensor_outputs = _tensor_outputs; if (!create_convert_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs, mat_mul_tensor_outputs)) { @@ -343,8 +254,8 @@ qnn_tensor_ptr_t ggml_qnn_matmul_op_config::create_gather_nodes(QNNBackend devic auto gather_out = std::make_shared(ggml_qnn_tensor::INTERMEDIATE, name + "_out", dimensions, tensor_input->get_data_type(), rank, device, graph_handle, qnn_instance); - auto gather_op = std::make_shared(name, QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_GATHER, qnn_instance); + auto gather_op = std::make_shared(name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_GATHER, + qnn_instance); Qnn_Scalar_t scalar = QNN_SCALAR_INIT; scalar.dataType = QNN_DATATYPE_INT_32; @@ -355,16 +266,16 @@ qnn_tensor_ptr_t ggml_qnn_matmul_op_config::create_gather_nodes(QNNBackend devic // here we calculate the index mapping, will generate a 1d tensor like [0, 0, 0, 1, 1, 1, 2, 2, 2, ...], // by repeating each index [scale] times. 
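// Worked example: with dimensions[axis] == 6 and an input dimension of 2, scale == 3
// and the buffer below becomes [0, 0, 0, 1, 1, 1], i.e. each input slice is gathered
// three times to realize the broadcast.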
const auto scale = dimensions[axis] / tensor_input->get_dimensions()[axis]; - std::vector index_buffer(dimensions[axis] * sizeof(uint32_t)); - for (uint32_t *curr = reinterpret_cast(index_buffer.data()), *end = curr + dimensions[axis]; + auto index_buffer = std::make_shared(dimensions[axis] * sizeof(uint32_t)); + for (uint32_t *curr = reinterpret_cast(index_buffer->get_buffer()), *end = curr + dimensions[axis]; curr < end; curr++) { - *curr = (curr - reinterpret_cast(index_buffer.data())) / scale; + *curr = (curr - reinterpret_cast(index_buffer->get_buffer())) / scale; } auto gather_index = std::make_shared( ggml_qnn_tensor::PARAMETER, name + "_index", qnn_dimension_array_t{dimensions[axis]}, QNN_DATATYPE_UINT_32, 1, device, graph_handle, qnn_instance); - gather_index->set_data_buffer(std::move(index_buffer)); + gather_index->set_data_buffer(index_buffer); gather_op->set_input_tensors({tensor_input, gather_index}); tensor_output = gather_out; @@ -409,8 +320,8 @@ bool ggml_qnn_matmul_op_config::create_convert_nodes(QNNBackend device, Qnn_Grap auto convert_out = std::make_shared(ggml_qnn_tensor::INTERMEDIATE, convert_name + "_out", convert_in->get_dimensions(), tensor_type, rank, device, graph_handle, _qnn_instance); - auto convert = std::make_shared(convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_CONVERT, _qnn_instance); + auto convert = std::make_shared(convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_CONVERT, _qnn_instance); convert->set_input_tensors({convert_in}); convert->set_output_tensors({convert_out}); tensor_inputs[i] = convert_out; @@ -424,8 +335,8 @@ bool ggml_qnn_matmul_op_config::create_convert_nodes(QNNBackend device, Qnn_Grap auto convert_in = std::make_shared(ggml_qnn_tensor::INTERMEDIATE, convert_name + "_in", convert_out->get_dimensions(), tensor_type, rank, device, graph_handle, _qnn_instance); - auto output_convert = std::make_shared( - convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_CONVERT, _qnn_instance); + auto output_convert = std::make_shared(convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_CONVERT, _qnn_instance); output_convert->set_input_tensors({convert_in}); output_convert->set_output_tensors({convert_out}); tensor_outputs.front() = convert_in; @@ -495,12 +406,12 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap dst->get_data_type(), rank, device, graph_handle, _qnn_instance); // create transpose_out - auto transpose_out = std::make_shared( - _name + "_trans1", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_TRANSPOSE, _qnn_instance); + auto transpose_out = std::make_shared(_name + "_trans1", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_TRANSPOSE, _qnn_instance); // create mat_mul - auto mat_mul = std::make_shared(_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL, - _qnn_instance); + auto mat_mul = + std::make_shared(_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL, _qnn_instance); Qnn_Scalar_t scalar = QNN_SCALAR_INIT; scalar.dataType = QNN_DATATYPE_BOOL_8; @@ -528,19 +439,20 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap return true; } -ggml_op_constructor_t create_op_constructor(const std::string &op_name) { +ggml_op_constructor_t create_op_constructor(size_t op) { + std::string op_name = get_qnn_op_name(op); if (op_name == QNN_OP_MAT_MUL) { // For QNN_OP_MAT_MUL, we need to transpose the input tensor return [](const std::string &instance_name, - std::shared_ptr qnn_instance) -> std::unique_ptr { + std::shared_ptr qnn_instance) -> std::shared_ptr { QNN_LOG_DEBUG("create 
QNN_OP_MAT_MUL, name %s", instance_name.c_str()); - return std::make_unique(instance_name, qnn_instance); + return std::make_shared(instance_name, qnn_instance); }; } return [op_name](const std::string &instance_name, - std::shared_ptr qnn_instance) -> std::unique_ptr { - return std::make_unique(instance_name, QNN_OP_PACKAGE_NAME_QTI_AISW, op_name, + std::shared_ptr qnn_instance) -> std::shared_ptr { + return std::make_shared(instance_name, QNN_OP_PACKAGE_NAME_QTI_AISW, op_name, qnn_instance); }; } diff --git a/ggml/src/ggml-qnn/op-config.hpp b/ggml/src/ggml-qnn/op-config.hpp index a05b75ade7e6a..ca066520bc171 100644 --- a/ggml/src/ggml-qnn/op-config.hpp +++ b/ggml/src/ggml-qnn/op-config.hpp @@ -1,7 +1,7 @@ #pragma once -#include #include +#include #include #include @@ -13,9 +13,28 @@ namespace qnn { using ggml_op_constructor_t = - std::function(const std::string &, std::shared_ptr)>; + std::function(const std::string &, std::shared_ptr)>; -ggml_op_constructor_t create_op_constructor(const std::string &op_name); +constexpr const size_t kGgmlUnaryOpStart = GGML_OP_COUNT; + +size_t get_qnn_op_index(const ggml_tensor *tensor); +void get_ggml_op_output_dimensions(const std::vector &input_dims, size_t op, + ggml_dimension_array_t &output_dims); + +const char *get_qnn_op_name(size_t op); +size_t get_qnn_op_input_param_count(size_t op); + +ggml_op_constructor_t create_op_constructor(size_t op); + +inline bool add_op_to_graph(Qnn_GraphHandle_t graph_handle, std::vector &operations) { + for (auto &op : operations) { + if (!op->add_op_to_graph(graph_handle)) { + return false; + } + } + + return true; +} class ggml_qnn_op_config_base : public ggml_qnn_op_config { public: @@ -27,13 +46,18 @@ class ggml_qnn_op_config_base : public ggml_qnn_op_config { bool add_tensor_param(const std::string &name, const qnn_dimension_array_t &dimensions, int rank, const uint8_t *data, const Qnn_DataType_t data_type, QNNBackend device, Qnn_GraphHandle_t graph_handle); + + void set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) override; + void set_input_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) override; + void set_output_tensors(qnn::qnn_tensor_array_t &tensor_inputs) override; + void set_output_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) override; bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) override; bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) override; bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) override; void unbind_input_tensors() override; void unbind_output_tensors() override; - std::vector &get_qnn_input_tensors() override { return _qnn_tensor_inputs; } - std::vector &get_qnn_output_tensors() override { return _qnn_tensor_outputs; } + const qnn_tensor_array_t &get_input_tensors() override { return _tensor_inputs; } + const qnn_tensor_array_t &get_output_tensors() override { return _tensor_outputs; } protected: Qnn_OpConfig_t get_op_config(); @@ -60,24 +84,9 @@ class ggml_qnn_single_op_config : public ggml_qnn_op_config_base { const std::string &op_type, std::shared_ptr qnn_instance) : ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {} - explicit ggml_qnn_single_op_config(const std::string &name, const std::string &package_name, - const std::string &op_type, const std::string ¶m_name, - const Qnn_DataType_t param_type, const size_t param_size, - std::shared_ptr qnn_instance) - : ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance), - _param_name(param_name), - _param_type(param_type), - 
_param_buffer(param_size) {} - - bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, - const ggml_tensor_array_t &tensor_inputs, - const ggml_tensor_array_t &tensor_outputs) override; + bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) override; private: - const std::string _param_name; - const Qnn_DataType_t _param_type = QNN_DATATYPE_UINT_32; - std::vector _param_buffer; - DISABLE_COPY(ggml_qnn_single_op_config); DISABLE_MOVE(ggml_qnn_single_op_config); }; @@ -88,26 +97,21 @@ class ggml_qnn_aggregate_op_config : public ggml_qnn_op_config { : _name(name), _qnn_instance(qnn_instance) {} ~ggml_qnn_aggregate_op_config() { - _qnn_tensor_inputs.clear(); - _qnn_tensor_outputs.clear(); _tensor_inputs.clear(); _tensor_outputs.clear(); _operations.clear(); } + void set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) override; + void set_input_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) override; + void set_output_tensors(qnn::qnn_tensor_array_t &tensor_inputs) override; + void set_output_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) override; bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) override { - for (auto &op : _operations) { - if (!op->add_op_to_graph(graph_handle)) { - return false; - } - } - return true; + return qnn::add_op_to_graph(graph_handle, _operations); } bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) override; - bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) override; - void unbind_input_tensors() override { for (auto &tensor : _tensor_inputs) { tensor->unbind(); @@ -120,8 +124,8 @@ class ggml_qnn_aggregate_op_config : public ggml_qnn_op_config { } } - std::vector &get_qnn_input_tensors() override { return _qnn_tensor_inputs; } - std::vector &get_qnn_output_tensors() override { return _qnn_tensor_outputs; } + const qnn_tensor_array_t &get_input_tensors() override { return _tensor_inputs; } + const qnn_tensor_array_t &get_output_tensors() override { return _tensor_outputs; } protected: std::string _name; @@ -130,8 +134,6 @@ class ggml_qnn_aggregate_op_config : public ggml_qnn_op_config { std::vector _operations; qnn_tensor_array_t _tensor_inputs; qnn_tensor_array_t _tensor_outputs; - std::vector _qnn_tensor_inputs; - std::vector _qnn_tensor_outputs; private: DISABLE_COPY(ggml_qnn_aggregate_op_config); @@ -143,9 +145,7 @@ class ggml_qnn_matmul_op_config : public ggml_qnn_aggregate_op_config { ggml_qnn_matmul_op_config(const std::string &name, std::shared_ptr qnn_instance) : ggml_qnn_aggregate_op_config(name, qnn_instance) {} - bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, - const ggml_tensor_array_t &tensor_inputs, - const ggml_tensor_array_t &tensor_outputs) override; + bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) override; private: qnn_tensor_ptr_t create_gather_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, diff --git a/ggml/src/ggml-qnn/qnn-types.hpp b/ggml/src/ggml-qnn/qnn-types.hpp index 7461ac3012755..ec30602843301 100644 --- a/ggml/src/ggml-qnn/qnn-types.hpp +++ b/ggml/src/ggml-qnn/qnn-types.hpp @@ -12,7 +12,9 @@ namespace qnn { // // helper data type / data structure / macros / functions of // Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK -// ref:https://github.com/pytorch/executorch/tree/main/backends/qualcomm +// ref: +// 
https://github.com/pytorch/executorch/blob/ae3d558d5e6aa04fc52a3065399fe6a773702f52/backends/qualcomm/serialization/qc_schema.py#L53 +// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/overview.html#supported-snapdragon-devices // ================================================================================================= enum sdk_profile_level { profile_off = 0, profile_basic, profile_detail }; @@ -22,14 +24,18 @@ enum qcom_htp_arch { V69 = 69, V73 = 73, V75 = 75, + V79 = 79, // SD 8 Gen 4 (SM8750) }; enum qcom_chipset { UNKNOWN_SM = 0, - SM8450 = 36, // v69 - SM8475 = 42, // v69 - SM8550 = 43, // v73 - SM8650 = 57, // v75 + SM8450 = 36, // v69, SD 8 Gen 1 + SM8475 = 42, // v69, SD 8+ Gen 1 + SM8550 = 43, // v73, SD 8 Gen 2 + SSG2115P = 46, // v73 + SM8650 = 57, // v75, SD 8 Gen 3 + SA8295 = 39, // v68 + SM8750 = 69, // v79, SD 8 Gen 4 }; struct qcom_socinfo { diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index 833c620971e0d..3bd86891cb18f 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -20,9 +20,9 @@ namespace qnn { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS should be 4"); -class ggml_qnn_tensor { +class ggml_qnn_tensor : public std::enable_shared_from_this { public: - typedef enum _tensor_type { INPUT, OUTPUT, INTERMEDIATE, PARAMETER } tensor_type_t; + typedef enum _tensor_type { INPUT, OUTPUT, INTERMEDIATE, PARAMETER, BIDIRECTION } tensor_type_t; explicit ggml_qnn_tensor(tensor_type_t tensor_type, const std::string &name, const qnn_dimension_array_t &dimensions, Qnn_DataType_t data_type, int rank, @@ -49,18 +49,27 @@ class ggml_qnn_tensor { qnn_datatype_from_ggml_datatype(data_type), rank, device, graph_handle, qnn_instance) {} ~ggml_qnn_tensor() { - _buffer_storage.clear(); - unbind(); _rpc_buffer.reset(); + unbind(); } - bool set_data_buffer(std::vector &&buffer) { - if (!bind_buffer_impl(buffer.data(), buffer.size())) { - return false; + bool set_data_buffer(const uint8_t *buffer, const size_t buffer_size) { + auto qnn_buffer = std::make_shared(buffer, buffer_size); + if (bind_buffer_impl(qnn_buffer)) { + return true; } - _buffer_storage = std::move(buffer); - return true; + can_unbind = false; + return false; + } + + bool set_data_buffer(qnn_buffer_ptr buffer) { + if (bind_buffer_impl(buffer)) { + return true; + } + + can_unbind = false; + return false; } bool alloc_qnn_tensor_id() { @@ -83,23 +92,32 @@ class ggml_qnn_tensor { return true; } - bool bind_buffer(uint8_t *buffer, const size_t buffer_size) { - if (!_buffer_storage.empty()) { + bool bind_ggml_tensor(ggml_tensor *tensor) { + if (!can_unbind) { QNN_LOG_DEBUG("[%s]already has buffer storage, skip bind", _tensor_name.c_str()); return true; } - return bind_buffer_impl(buffer, buffer_size); - } +#ifndef NDEBUG + if (tensor->view_src) { + auto *src = tensor->view_src; + QNN_LOG_DEBUG("[%s]tensor(%s_%dx%dx%dx%d) is a view, src: %s_%dx%dx%dx%d", get_backend_name(_device), + tensor->name, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], src->name, + src->ne[0], src->ne[1], src->ne[2], src->ne[3]); + } +#endif - bool bind_ggml_tensor(ggml_tensor *tensor) { - if (!bind_buffer(reinterpret_cast(tensor->data), ggml_nbytes(tensor))) { + auto buffer = + std::make_shared(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); + if (!bind_buffer_impl(buffer)) { QNN_LOG_WARN("[%s]failed to bind ggml tensor(%s)", _tensor_name.c_str(), ggml_get_name(tensor)); return false; } QNN_LOG_DEBUG("[%s][%s]bind to ggml tensor(%s)", 
get_backend_name(_device), _tensor_name.c_str(), ggml_get_name(tensor)); + tensor->extra = this; + _ggml_tensor = tensor; return true; } @@ -110,7 +128,7 @@ class ggml_qnn_tensor { } if (!_buffer) { - QNN_LOG_DEBUG("[%s]bound to ggml tensor", _tensor_name.c_str()); + QNN_LOG_DEBUG("[%s]unbind to ggml tensor", _tensor_name.c_str()); return true; } @@ -119,7 +137,7 @@ class ggml_qnn_tensor { return false; } - if (!_buffer_storage.empty()) { + if (!can_unbind) { QNN_LOG_DEBUG("[%s]already has buffer storage, stop unbind", _tensor_name.c_str()); return true; } @@ -132,26 +150,32 @@ class ggml_qnn_tensor { } QNN_LOG_DEBUG("[%s][%s]unbind from buffer: %p, size: %d", get_backend_name(_device), _tensor_name.c_str(), - _buffer, (int)_buffer_size); - _buffer = nullptr; - _buffer_size = 0; + _buffer.get(), (int)_buffer->get_size()); + _buffer.reset(); + + if (_ggml_tensor) { + _ggml_tensor->extra = nullptr; + _ggml_tensor = nullptr; + } + return true; } const Qnn_Tensor_t &get_qnn_tensor() const { return _qnn_tensor; } Qnn_DataType_t get_data_type() const { return QNN_TENSOR_GET_DATA_TYPE(_qnn_tensor); } const qnn_dimension_array_t &get_dimensions() const { return _dimensions; } + uint32_t get_rank() const { return QNN_TENSOR_GET_RANK(_qnn_tensor); } uint32_t get_qnn_tensor_id() const { return QNN_TENSOR_GET_ID(_qnn_tensor); } private: - bool bind_buffer_impl(uint8_t *buffer, const size_t buffer_size) { + bool bind_buffer_impl(qnn_buffer_ptr buffer) { if (_buffer) { if (_buffer != buffer) { - QNN_LOG_WARN("[%s]has been bound to another buffer %p", _tensor_name.c_str(), _buffer); + QNN_LOG_WARN("[%s]has been bound to another buffer %p", _tensor_name.c_str(), _buffer.get()); return false; } - QNN_LOG_DEBUG("[%s]already bound to same ggml tensor %p", _tensor_name.c_str(), _buffer); + QNN_LOG_DEBUG("[%s]already bound to same ggml tensor %p", _tensor_name.c_str(), _buffer.get()); return true; } @@ -164,7 +188,7 @@ class ggml_qnn_tensor { if (should_use_mem_handle()) { if (!_rpc_buffer) { auto rpc_buffer = std::make_shared( - _qnn_instance, buffer_size, QNN_TENSOR_GET_RANK(_qnn_tensor), + _qnn_instance, buffer->get_size(), QNN_TENSOR_GET_RANK(_qnn_tensor), QNN_TENSOR_GET_DIMENSIONS(_qnn_tensor), QNN_TENSOR_GET_DATA_TYPE(_qnn_tensor)); if (!rpc_buffer->is_valid()) { QNN_LOG_WARN("[%s][%s]alloc rpc mem failed", get_backend_name(_device), _tensor_name.c_str()); @@ -187,22 +211,21 @@ class ggml_qnn_tensor { QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor)); } else { QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); - Qnn_ClientBuffer_t client_buf = {buffer, (uint32_t)buffer_size}; + Qnn_ClientBuffer_t client_buf = {buffer->get_buffer(), (uint32_t)buffer->get_size()}; QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); QNN_LOG_DEBUG("[%s]use client buffer %p size %d", _tensor_name.c_str(), client_buf.data, (int)client_buf.dataSize); } _buffer = buffer; - _buffer_size = buffer_size; if (!write_to_qnn_tensor()) { QNN_LOG_WARN("[%s]write to qnn tensor failed", _tensor_name.c_str()); return false; } - QNN_LOG_DEBUG("[%s][%s]bind to buffer: %p, size: %d", get_backend_name(_device), _tensor_name.c_str(), buffer, - (int)buffer_size); + QNN_LOG_DEBUG("[%s][%s]bind to buffer: %p, size: %d", get_backend_name(_device), _tensor_name.c_str(), + buffer.get(), (int)buffer->get_size()); return true; } @@ -214,7 +237,7 @@ class ggml_qnn_tensor { } if (_rpc_buffer) { - memcpy(_rpc_buffer->get_buffer(), _buffer, _buffer_size); + memcpy(_rpc_buffer->get_buffer(), _buffer->get_buffer(), _buffer->get_size()); } // For CPU and GPU, 
the data is already in the tensor. @@ -230,7 +253,7 @@ class ggml_qnn_tensor { } if (_rpc_buffer) { - memcpy(_buffer, _rpc_buffer->get_buffer(), _buffer_size); + memcpy(_buffer->get_buffer(), _rpc_buffer->get_buffer(), _buffer->get_size()); } // For CPU and GPU, the data is already in the tensor. @@ -258,6 +281,9 @@ class ggml_qnn_tensor { case PARAMETER: new_tensor_type = QNN_TENSOR_TYPE_STATIC; break; + case BIDIRECTION: + new_tensor_type = QNN_TENSOR_TYPE_APP_READWRITE; + break; case INTERMEDIATE: default: new_tensor_type = QNN_TENSOR_TYPE_NATIVE; @@ -273,15 +299,15 @@ class ggml_qnn_tensor { } std::string _tensor_name; - uint8_t *_buffer = nullptr; - size_t _buffer_size = 0; - std::vector _buffer_storage; + qnn_buffer_ptr _buffer; + bool can_unbind = true; QNNBackend _device; std::shared_ptr _qnn_instance; Qnn_Tensor_t _qnn_tensor = qnn_tensor_init(kDefaultQnnTensorVersion); qnn_dimension_array_t _dimensions = {}; Qnn_GraphHandle_t _graph_handle = nullptr; qnn_buffer_ptr _rpc_buffer; + ggml_tensor *_ggml_tensor = nullptr; DISABLE_COPY(ggml_qnn_tensor); DISABLE_MOVE(ggml_qnn_tensor); @@ -289,5 +315,92 @@ class ggml_qnn_tensor { using qnn_tensor_ptr_t = std::shared_ptr; using qnn_tensor_array_t = std::vector; +using ggml_tensor_array_t = std::vector; + +inline qnn_tensor_ptr_t get_qnn_tensor_ptr(ggml_tensor *ggml_tensor) { + return ggml_tensor->extra ? reinterpret_cast(ggml_tensor->extra)->shared_from_this() + : qnn_tensor_ptr_t(); +} + +inline int get_ggml_tensors_max_rank(const qnn::ggml_tensor_array_t &tensors) { + int max_rank = 0; + for (auto tensor : tensors) { + max_rank = std::max(max_rank, ggml_n_dims(tensor)); + } + + return max_rank; +} + +inline bool bind_tensors(const ggml_tensor_array_t &ggml_tensors, qnn_tensor_array_t &tensor_wrappers, + std::vector &qnn_tensors) { + GGML_ASSERT(tensor_wrappers.size() == ggml_tensors.size()); + qnn_tensors.resize(ggml_tensors.size()); + for (size_t i = 0; i < ggml_tensors.size(); i++) { + auto *ggml_tensor = ggml_tensors[i]; + if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor)) { + QNN_LOG_ERROR("bind tensor %s failed", ggml_get_name(ggml_tensor)); + return false; + } + + qnn_tensors[i] = tensor_wrappers[i]->get_qnn_tensor(); + } + + return true; +} + +inline bool bind_tensors(const ggml_tensor_array_t &ggml_tensors, qnn_tensor_array_t &tensor_wrappers) { + GGML_ASSERT(tensor_wrappers.size() == ggml_tensors.size()); + for (size_t i = 0; i < ggml_tensors.size(); i++) { + auto *ggml_tensor = ggml_tensors[i]; + if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor)) { + QNN_LOG_ERROR("bind tensor %s failed", ggml_get_name(ggml_tensor)); + return false; + } + } + + return true; +} + +inline void unbind_tensors(qnn_tensor_array_t &tensor_wrappers) { + for (auto &tensor : tensor_wrappers) { + tensor->unbind(); + } +} + +struct tensor_create_common_params { + const char *name_prefix; + int tensor_rank; + bool is_input; + QNNBackend device; + Qnn_GraphHandle_t graph_handle; + std::shared_ptr qnn_instance; +}; + +inline void create_tensors_from_ggml_tensor(const tensor_create_common_params ¶ms, + const ggml_tensor_array_t &ggml_tensors, + qnn_tensor_array_t *tensor_wrappers, + std::vector *qnn_tensors) { + if (qnn_tensors) { + qnn_tensors->resize(ggml_tensors.size()); + } + + if (!tensor_wrappers->empty()) { + QNN_LOG_DEBUG("tensor_wrappers is not empty, skip create tensors"); + GGML_ASSERT(tensor_wrappers->size() == ggml_tensors.size()); + return; + } + + tensor_wrappers->resize(ggml_tensors.size()); + + char buffer[GGML_MAX_NAME] = {}; + 
auto tensor_type = params.is_input ? ggml_qnn_tensor::INPUT : ggml_qnn_tensor::OUTPUT; + for (size_t i = 0; i < ggml_tensors.size(); i++) { + snprintf(buffer, GGML_MAX_NAME, "%s%d", params.name_prefix, (int)i); + auto *ggml_tensor = ggml_tensors[i]; + (*tensor_wrappers)[i] = std::make_shared(tensor_type, std::string(buffer), ggml_tensor->ne, + ggml_tensor->type, params.tensor_rank, params.device, + params.graph_handle, params.qnn_instance); + } +} } // namespace qnn diff --git a/ggml/src/ggml-qnn/utils.cpp b/ggml/src/ggml-qnn/utils.cpp index eaabe60cdb262..6e77ee5f5f287 100644 --- a/ggml/src/ggml-qnn/utils.cpp +++ b/ggml/src/ggml-qnn/utils.cpp @@ -188,13 +188,15 @@ const char *get_backend_name(QNNBackend device_index) { const char *get_chipset_desc(uint32_t chipset_id) { switch (chipset_id) { case SM8450: - return "SM8450"; + return "SD 8 Gen 1 (SM8450)"; case SM8475: - return "SM8475"; + return "SD 8+ Gen 1 (SM8475)"; case SM8550: - return "SM8550"; + return "SD 8 Gen 2 (SM8550)"; case SM8650: - return "SM8650"; + return "SD 8 Gen 3 (SM8650)"; + case SM8750: + return "SD 8 Gen 4 (SM8750)"; default: return "unknown"; } @@ -210,6 +212,8 @@ const char *get_htparch_desc(size_t htp_arch) { return "QCOM_HTP_V73"; case V75: return "QCOM_HTP_V75"; + case V79: + return "QCOM_HTP_V79"; default: return "unknown"; } From 5f93376f6703829aa15b068be52a84748507fca2 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 10 Jan 2025 11:30:00 +0800 Subject: [PATCH 138/166] fix compiling error after merged --- ggml/src/ggml-qnn/backend-ops.cpp | 3 ++- ggml/src/ggml-qnn/op-config-caps.cpp | 1 + src/llama.cpp | 4 ---- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 1ed01bfd6851d..75c90e235bbc2 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -314,7 +314,8 @@ constexpr const ggml_qnn_op_t kQnnOpsTable[] = { nullptr, // GGML_OP_WIN_UNPART nullptr, // GGML_OP_GET_REL_POS nullptr, // GGML_OP_ADD_REL_POS - nullptr, // GGML_OP_RWKV_WKV + nullptr, // GGML_OP_RWKV_WKV6 + nullptr, // GGML_OP_GATED_LINEAR_ATTN nullptr, // GGML_OP_UNARY diff --git a/ggml/src/ggml-qnn/op-config-caps.cpp b/ggml/src/ggml-qnn/op-config-caps.cpp index aab8f65958bf1..7fa3d11affc18 100644 --- a/ggml/src/ggml-qnn/op-config-caps.cpp +++ b/ggml/src/ggml-qnn/op-config-caps.cpp @@ -139,6 +139,7 @@ constexpr const qnn_op_caps_t kOpCaps[] = { {}, // GGML_OP_GET_REL_POS {}, // GGML_OP_ADD_REL_POS {}, // GGML_OP_RWKV_WKV6 + {}, // GGML_OP_GATED_LINEAR_ATTN {}, // GGML_OP_UNARY diff --git a/src/llama.cpp b/src/llama.cpp index 2a3409eacbfc7..a364861d3c803 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -11848,10 +11848,6 @@ struct llama_sampler_chain_params llama_sampler_chain_default_params() { size_t llama_max_devices(void) { return 16; } -#if defined(GGML_USE_QNN) - return GGML_QNN_MAX_DEVICES; -#else -#endif bool llama_supports_mmap(void) { return llama_mmap::SUPPORTED; From 10bd671c08f7094c97316edbd12a59e207e0da34 Mon Sep 17 00:00:00 2001 From: nullname Date: Sat, 18 Jan 2025 22:15:27 +0800 Subject: [PATCH 139/166] [feat]add more op support (#18) * disable rpc buffer for npu * append input/output tensor size into unsupported op log * log dimensions for unsupported tensor * wip * split op config classes into separated file * fix reshape * wip * add op_constructor_with_type_param * set parameter for op_constructor_with_type_param func --- ggml/src/ggml-qnn/backend-ops.cpp | 24 +- ggml/src/ggml-qnn/graph.cpp | 12 +- 
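A minimal usage sketch of the tensor binding helpers added to tensor.hpp above; the `inputs`/`outputs` ggml tensor arrays and the `input_wrappers`/`output_wrappers` (prepared earlier, e.g. via create_tensors_from_ggml_tensor) are illustrative names, not part of the patch:

    // Bind the ggml buffers to the QNN tensor wrappers, run the graph, then unbind.
    std::vector<Qnn_Tensor_t> qnn_inputs;
    std::vector<Qnn_Tensor_t> qnn_outputs;
    if (!qnn::bind_tensors(inputs, input_wrappers, qnn_inputs) ||
        !qnn::bind_tensors(outputs, output_wrappers, qnn_outputs)) {
        return false;  // a failed bind is already logged by the helper
    }

    // ... execute the QNN graph with qnn_inputs / qnn_outputs here ...

    // unbind() also copies data back out of the RPC buffer when one is in use
    qnn::unbind_tensors(input_wrappers);
    qnn::unbind_tensors(output_wrappers);
    return true;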
ggml/src/ggml-qnn/op-config-caps.cpp | 208 +++++++++++++++++- .../{op-config.cpp => op-config-impl.cpp} | 27 +-- ggml/src/ggml-qnn/op-config-impl.hpp | 151 +++++++++++++ ggml/src/ggml-qnn/op-config.hpp | 136 +----------- ggml/src/ggml-qnn/tensor.hpp | 4 +- 7 files changed, 384 insertions(+), 178 deletions(-) rename ggml/src/ggml-qnn/{op-config.cpp => op-config-impl.cpp} (95%) create mode 100644 ggml/src/ggml-qnn/op-config-impl.hpp diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 75c90e235bbc2..8bbf26da5275e 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -25,7 +25,7 @@ bool qnn_is_op_valid(ggml_backend_qnn_device_context *ctx, const ggml_tensor *ds return false; } - const auto param_count = qnn::get_qnn_op_input_param_count(qnn::get_qnn_op_index(dst)); + const auto param_count = qnn::get_qnn_op_input_param_count(dst); switch (param_count) { case 1: return dst->src[0]; @@ -91,9 +91,13 @@ void get_graph_key_from_op(const ggml_tensor *op, std::string &output) { GGML_ASSERT(op->op != GGML_OP_NONE); output += ggml_op_desc(op); output += qnn::get_ggml_type_name(op->type); - const auto param_count = qnn::get_qnn_op_input_param_count(qnn::get_qnn_op_index(op)); + const auto param_count = qnn::get_qnn_op_input_param_count(op); for (size_t i = 0; i < param_count; ++i) { auto *input = op->src[i]; + if (!input) { + break; + } + output += '_'; append_tensor_dimensions(input, output); } @@ -224,7 +228,7 @@ bool qnn_generic_op_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *dst) #ifndef NDEBUG if (!succeed) { - const auto param_count = qnn::get_qnn_op_input_param_count(qnn::get_qnn_op_index(dst)); + const auto param_count = qnn::get_qnn_op_input_param_count(dst); for (size_t i = 0; i < param_count; ++i) { print_ggml_tensor(dst->src[i]); } @@ -409,7 +413,7 @@ bool ggnl_qnn_supports_op_tensor(ggml_backend_qnn_device_context *ctx, const ggm return false; } - const auto param_count = qnn::get_qnn_op_input_param_count(qnn::get_qnn_op_index(op)); + const auto param_count = qnn::get_qnn_op_input_param_count(op); for (size_t i = 0; i < param_count; ++i) { if (!ggml_qnn_supports_tensor(ctx, op->src[i])) { return false; @@ -479,12 +483,20 @@ bool device_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor } if (!kQnnOpsTable[qnn::get_qnn_op_index(op)]) { - QNN_LOG_DEBUG("[%s]unsupported op", ggml_op_name(op->op)); +#ifndef NDEBUG + std::string op_key; + get_graph_key_from_op(op, op_key); + QNN_LOG_DEBUG("[%s]unsupported op", op_key.c_str()); +#endif return false; } if (!ggnl_qnn_supports_op_tensor(ctx, op)) { - QNN_LOG_DEBUG("[%s]unsupported tensor", ggml_op_name(op->op)); +#ifndef NDEBUG + std::string tensor_dims; + append_tensor_dimensions(op, tensor_dims); + QNN_LOG_DEBUG("[%s]unsupported tensor(%s)", ggml_op_name(op->op), tensor_dims.c_str()); +#endif return false; } diff --git a/ggml/src/ggml-qnn/graph.cpp b/ggml/src/ggml-qnn/graph.cpp index 0210e1554a8ab..680f5e23bd9f3 100644 --- a/ggml/src/ggml-qnn/graph.cpp +++ b/ggml/src/ggml-qnn/graph.cpp @@ -15,7 +15,7 @@ using qnn_tensor_cache_t = std::unordered_mapsrc[i])); } @@ -56,14 +56,12 @@ qnn::qnn_op_config_ptr_t create_operation_from_op_tensor(ggml_tensor *dst, const QNNBackend device, Qnn_GraphHandle_t graph_handle, std::shared_ptr qnn_instance, bool is_intermediate, qnn_tensor_cache_t &tensor_cache) { - const auto op_index = qnn::get_qnn_op_index(dst); - auto qnn_op = qnn::create_op_constructor(op_index); - auto operation = qnn_op(name, qnn_instance); + 
auto operation = qnn::create_op(dst, name, qnn_instance); // input tensors qnn::qnn_tensor_array_t input_qnn_tensors; auto tensor_type = is_intermediate ? qnn::ggml_qnn_tensor::INTERMEDIATE : qnn::ggml_qnn_tensor::INPUT; - for (size_t i = 0; i < qnn::get_qnn_op_input_param_count(op_index); ++i) { + for (size_t i = 0; i < qnn::get_qnn_op_input_param_count(dst); ++i) { auto input_qnn_tensor = create_tensor_with_cache(dst->src[i], tensor_type, rank, device, graph_handle, qnn_instance, tensor_cache); input_qnn_tensors.push_back(input_qnn_tensor); @@ -92,7 +90,7 @@ bool bind_src_tensors(ggml_tensor *op, qnn::qnn_tensor_array_t &tensor_wrappers, return false; } - const auto param_count = qnn::get_qnn_op_input_param_count(qnn::get_qnn_op_index(op)); + const auto param_count = qnn::get_qnn_op_input_param_count(op); GGML_ASSERT(tensor_wrappers.size() == param_count); qnn_tensors.resize(param_count); for (size_t i = 0; i < param_count; ++i) { @@ -268,7 +266,7 @@ bool qnn_graph::build_graph_from_ggml_graph(const ggml_cgraph *cgraph) { continue; } - QNN_LOG_DEBUG("[%s]create op: %s", get_backend_name(_device), get_qnn_op_name(dst->op)); + QNN_LOG_DEBUG("[%s]create op: %s", get_backend_name(_device), get_qnn_op_name(dst)); auto operation = create_operation_from_op_tensor(dst, dst->name, rank, _device, _graph_handle, _qnn_instance, true, tensor_cache); // TODO: fix op name operations.push_back(operation); diff --git a/ggml/src/ggml-qnn/op-config-caps.cpp b/ggml/src/ggml-qnn/op-config-caps.cpp index 7fa3d11affc18..9b28a76dd1dcf 100644 --- a/ggml/src/ggml-qnn/op-config-caps.cpp +++ b/ggml/src/ggml-qnn/op-config-caps.cpp @@ -1,8 +1,10 @@ -#include "op-config.hpp" +#include "op-config-impl.hpp" namespace { +using op_constructor_t = std::shared_ptr (*)(const ggml_tensor *, const std::string &, + std::shared_ptr); using op_dims_calc_func_t = void (*)(const std::vector &input_dims, qnn::ggml_dimension_array_t &output_dims); @@ -24,6 +26,7 @@ struct qnn_op_caps_t { const char *qnn_op_name = nullptr; const size_t input_param_count = 0; op_dims_calc_func_t calc_dims_func = nullptr; + const char *qnn_param_name = nullptr; }; constexpr const qnn_op_caps_t kOpCaps[] = { @@ -80,7 +83,13 @@ constexpr const qnn_op_caps_t kOpCaps[] = { {}, // GGML_OP_CONCAT {}, // GGML_OP_SILU_BACK {}, // GGML_OP_NORM - {}, // GGML_OP_RMS_NORM + { + // GGML_OP_RMS_NORM + QNN_OP_RMS_NORM, // qnn_op_name + 1, // input_param_count + nullptr, // TODO: calc_dims_func + QNN_OP_RMS_NORM_PARAM_EPSILON, // qnn_param_name + }, {}, // GGML_OP_RMS_NORM_BACK {}, // GGML_OP_GROUP_NORM { @@ -187,9 +196,172 @@ static_assert(kOpCaps[GGML_OP_MUL_MAT].calc_dims_func == mat_mul_op_dims, "GGML_OP_ADD does not have element_wise_op_dims function"); static_assert(kOpCaps[GGML_OP_LOG].calc_dims_func == element_wise_op_dims, "GGML_OP_LOG does not have element_wise_op_dims function"); +static_assert(kOpCaps[GGML_OP_COUNT + GGML_UNARY_OP_GELU].input_param_count == 1, + "GGML_UNARY_OP_GELU does not have 1 input parameter"); static_assert(std::size(kOpCaps) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), "GGML_OP_COUNT does not match the size of the kOpCaps table"); +std::shared_ptr mat_mul_op_constructor(const ggml_tensor *op, const std::string &instance_name, + std::shared_ptr qnn_instance) { + GGML_UNUSED(op); + QNN_LOG_DEBUG("create QNN_OP_MAT_MUL, name %s", instance_name.c_str()); + return std::make_shared(instance_name, qnn_instance); +} + +template +std::shared_ptr generic_op_constructor(const ggml_tensor *op, const std::string &instance_name, + std::shared_ptr 
qnn_instance) { + GGML_UNUSED(op); + static_assert(_op < std::size(kOpCaps)); + static_assert(kOpCaps[_op].qnn_op_name != nullptr); + return std::make_shared(instance_name, QNN_OP_PACKAGE_NAME_QTI_AISW, + kOpCaps[_op].qnn_op_name, qnn_instance); +} + +void add_type_parameters(std::shared_ptr op, const char *name, float value) { + Qnn_Scalar_t scalar = QNN_SCALAR_INIT; + scalar.dataType = QNN_DATATYPE_FLOAT_32; + scalar.floatValue = value; + op->add_scalar_param(name, scalar); +} + +template +std::shared_ptr op_constructor_with_type_param( + const ggml_tensor *op, const std::string &instance_name, std::shared_ptr qnn_instance) { + static_assert(std::is_base_of::value); + static_assert(_op < std::size(kOpCaps)); + + constexpr auto &op_caps = kOpCaps[_op]; + static_assert(op_caps.qnn_op_name != nullptr); + + _ggml_op_param_type op_param; + memcpy(&op_param, op->op_params, sizeof(op_param)); + auto qnn_op = std::make_shared<_qnn_op_type_name>(instance_name, QNN_OP_PACKAGE_NAME_QTI_AISW, op_caps.qnn_op_name, + qnn_instance); + if (op_caps.qnn_param_name) { + add_type_parameters(qnn_op, op_caps.qnn_param_name, op_param); + } + return qnn_op; +} + +constexpr const op_constructor_t kOpConstructors[] = { + nullptr, // GGML_OP_NONE + nullptr, // GGML_OP_DUP + generic_op_constructor, // GGML_OP_ADD + nullptr, // GGML_OP_ADD1 + nullptr, // GGML_OP_ACC + generic_op_constructor, // GGML_OP_SUB + generic_op_constructor, // GGML_OP_MUL + generic_op_constructor, // GGML_OP_DIV + nullptr, // GGML_OP_SQR + generic_op_constructor, // GGML_OP_SQRT + generic_op_constructor, // GGML_OP_LOG + nullptr, // GGML_OP_SIN + nullptr, // GGML_OP_COS + nullptr, // GGML_OP_SUM + nullptr, // GGML_OP_SUM_ROWS + nullptr, // GGML_OP_MEAN + nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_COUNT_EQUAL + nullptr, // GGML_OP_REPEAT + nullptr, // GGML_OP_REPEAT_BACK + nullptr, // GGML_OP_CONCAT + nullptr, // GGML_OP_SILU_BACK + nullptr, // GGML_OP_NORM + op_constructor_with_type_param, // GGML_OP_RMS_NORM + nullptr, // GGML_OP_RMS_NORM_BACK + nullptr, // GGML_OP_GROUP_NORM + + mat_mul_op_constructor, // GGML_OP_MUL_MAT + nullptr, // GGML_OP_MUL_MAT_ID + nullptr, // GGML_OP_OUT_PROD + + nullptr, // GGML_OP_SCALE + nullptr, // GGML_OP_SET + nullptr, // GGML_OP_CPY + nullptr, // GGML_OP_CONT + generic_op_constructor, // GGML_OP_RESHAPE + nullptr, // GGML_OP_VIEW + nullptr, // GGML_OP_PERMUTE + nullptr, // GGML_OP_TRANSPOSE + nullptr, // GGML_OP_GET_ROWS + nullptr, // GGML_OP_GET_ROWS_BACK + nullptr, // GGML_OP_DIAG + nullptr, // GGML_OP_DIAG_MASK_INF + nullptr, // GGML_OP_DIAG_MASK_ZERO + nullptr, // GGML_OP_SOFT_MAX + nullptr, // GGML_OP_SOFT_MAX_BACK + nullptr, // GGML_OP_ROPE + nullptr, // GGML_OP_ROPE_BACK + nullptr, // GGML_OP_CLAMP + nullptr, // GGML_OP_CONV_TRANSPOSE_1D + nullptr, // GGML_OP_IM2COL + nullptr, // GGML_OP_IM2COL_BACK + nullptr, // GGML_OP_CONV_TRANSPOSE_2D + nullptr, // GGML_OP_POOL_1D + nullptr, // GGML_OP_POOL_2D + nullptr, // GGML_OP_POOL_2D_BACK + nullptr, // GGML_OP_UPSCALE + nullptr, // GGML_OP_PAD + nullptr, // GGML_OP_PAD_REFLECT_1D + nullptr, // GGML_OP_ARANGE + nullptr, // GGML_OP_TIMESTEP_EMBEDDING + nullptr, // GGML_OP_ARGSORT + nullptr, // GGML_OP_LEAKY_RELU + + nullptr, // GGML_OP_FLASH_ATTN_EXT + nullptr, // GGML_OP_FLASH_ATTN_BACK + nullptr, // GGML_OP_SSM_CONV + nullptr, // GGML_OP_SSM_SCAN + nullptr, // GGML_OP_WIN_PART + nullptr, // GGML_OP_WIN_UNPART + nullptr, // GGML_OP_GET_REL_POS + nullptr, // GGML_OP_ADD_REL_POS + nullptr, // GGML_OP_RWKV_WKV6 + nullptr, // GGML_OP_GATED_LINEAR_ATTN + + 
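A hedged sketch of what op_constructor_with_type_param above effectively does for GGML_OP_RMS_NORM; `op` (a ggml_tensor whose op is GGML_OP_RMS_NORM) and `qnn_op` (the freshly constructed op config) are assumed to exist and are illustrative:

    // The RMS-norm epsilon is stored as a float at the start of op->op_params;
    // it is forwarded to QNN as a FLOAT_32 scalar parameter.
    float epsilon = 0.0f;
    memcpy(&epsilon, op->op_params, sizeof(epsilon));

    Qnn_Scalar_t scalar = QNN_SCALAR_INIT;
    scalar.dataType   = QNN_DATATYPE_FLOAT_32;
    scalar.floatValue = epsilon;
    qnn_op->add_scalar_param(QNN_OP_RMS_NORM_PARAM_EPSILON, scalar);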
nullptr, // GGML_OP_UNARY + + nullptr, // GGML_OP_MAP_UNARY + nullptr, // GGML_OP_MAP_BINARY + + nullptr, // GGML_OP_MAP_CUSTOM1_F32 + nullptr, // GGML_OP_MAP_CUSTOM2_F32 + nullptr, // GGML_OP_MAP_CUSTOM3_F32 + + nullptr, // GGML_OP_MAP_CUSTOM1 + nullptr, // GGML_OP_MAP_CUSTOM2 + nullptr, // GGML_OP_MAP_CUSTOM3 + + nullptr, // GGML_OP_CROSS_ENTROPY_LOSS + nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK + nullptr, // GGML_OP_OPT_STEP_ADAMW + + // ggml_unary_op + nullptr, // GGML_UNARY_OP_ABS + nullptr, // GGML_UNARY_OP_SGN + nullptr, // GGML_UNARY_OP_NEG + nullptr, // GGML_UNARY_OP_STEP + nullptr, // GGML_UNARY_OP_TANH + nullptr, // GGML_UNARY_OP_ELU + nullptr, // GGML_UNARY_OP_RELU + nullptr, // GGML_UNARY_OP_SIGMOID + nullptr, // GGML_UNARY_OP_GELU + nullptr, // GGML_UNARY_OP_GELU_QUICK + nullptr, // GGML_UNARY_OP_SILU + nullptr, // GGML_UNARY_OP_HARDSWISH + nullptr, // GGML_UNARY_OP_HARDSIGMOID + nullptr, // GGML_UNARY_OP_EXP +}; + +static_assert(kOpConstructors[GGML_OP_NONE] == nullptr, "GGML_OP_NONE does not match the nullptr function"); +static_assert(kOpConstructors[GGML_OP_ADD] == generic_op_constructor, + "GGML_OP_ADD does not match the generic_op_constructor function"); +static_assert(kOpConstructors[GGML_OP_MUL_MAT] == mat_mul_op_constructor, + "GGML_OP_MUL_MAT does not match the mat_mul_op_constructor function"); +static_assert(std::size(kOpConstructors) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), + "GGML_OP_COUNT does not match the size of the kOpConstructors table"); + } // namespace namespace qnn { @@ -202,23 +374,35 @@ size_t get_qnn_op_index(const ggml_tensor *tensor) { return tensor->op; } -void get_ggml_op_output_dimensions(const std::vector &input_dims, size_t op, +void get_ggml_op_output_dimensions(const std::vector &input_dims, const ggml_tensor *op, ggml_dimension_array_t &output_dims) { - GGML_ASSERT(op < std::size(kOpCaps)); - auto get_dims = kOpCaps[op].calc_dims_func; + auto op_index = get_qnn_op_index(op); + GGML_ASSERT(op_index < std::size(kOpCaps)); + auto get_dims = kOpCaps[op_index].calc_dims_func; GGML_ASSERT(get_dims); get_dims(input_dims, output_dims); } -const char *get_qnn_op_name(size_t op) { - GGML_ASSERT(op < std::size(kOpCaps)); - GGML_ASSERT(kOpCaps[op].qnn_op_name); - return kOpCaps[op].qnn_op_name; +const char *get_qnn_op_name(const ggml_tensor *op) { + auto op_index = get_qnn_op_index(op); + GGML_ASSERT(op_index < std::size(kOpCaps)); + GGML_ASSERT(kOpCaps[op_index].qnn_op_name); + return kOpCaps[op_index].qnn_op_name; +} + +size_t get_qnn_op_input_param_count(const ggml_tensor *op) { + auto op_index = get_qnn_op_index(op); + GGML_ASSERT(op_index < std::size(kOpCaps)); + return kOpCaps[op_index].input_param_count; } -size_t get_qnn_op_input_param_count(size_t op) { - GGML_ASSERT(op < std::size(kOpCaps)); - return kOpCaps[op].input_param_count; +std::shared_ptr create_op(const ggml_tensor *op, const std::string &name, + std::shared_ptr qnn_instance) { + auto op_index = get_qnn_op_index(op); + GGML_ASSERT(op_index < std::size(kOpCaps)); + auto op_constructor = kOpConstructors[op_index]; + GGML_ASSERT(op_constructor); + return op_constructor(op, name, qnn_instance); } } // namespace qnn diff --git a/ggml/src/ggml-qnn/op-config.cpp b/ggml/src/ggml-qnn/op-config-impl.cpp similarity index 95% rename from ggml/src/ggml-qnn/op-config.cpp rename to ggml/src/ggml-qnn/op-config-impl.cpp index 7edb4078a57df..19a1bf46ee9dc 100644 --- a/ggml/src/ggml-qnn/op-config.cpp +++ b/ggml/src/ggml-qnn/op-config-impl.cpp @@ -1,4 +1,4 @@ -#include "op-config.hpp" +#include 
"op-config-impl.hpp" #include @@ -187,6 +187,13 @@ bool ggml_qnn_single_op_config::initialize_op_nodes(QNNBackend device, Qnn_Graph return true; } +bool ggml_qnn_rmsnorm_op_config::initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) { + constexpr const uint32_t kAxes[] = {0}; + add_tensor_param(QNN_OP_RMS_NORM_PARAM_AXES, {1}, 1, reinterpret_cast(kAxes), QNN_DATATYPE_UINT_32, + device, graph_handle); + return true; +} + void ggml_qnn_aggregate_op_config::set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) { _tensor_inputs = tensor_inputs; } @@ -439,22 +446,4 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap return true; } -ggml_op_constructor_t create_op_constructor(size_t op) { - std::string op_name = get_qnn_op_name(op); - if (op_name == QNN_OP_MAT_MUL) { - // For QNN_OP_MAT_MUL, we need to transpose the input tensor - return [](const std::string &instance_name, - std::shared_ptr qnn_instance) -> std::shared_ptr { - QNN_LOG_DEBUG("create QNN_OP_MAT_MUL, name %s", instance_name.c_str()); - return std::make_shared(instance_name, qnn_instance); - }; - } - - return [op_name](const std::string &instance_name, - std::shared_ptr qnn_instance) -> std::shared_ptr { - return std::make_shared(instance_name, QNN_OP_PACKAGE_NAME_QTI_AISW, op_name, - qnn_instance); - }; -} - } // namespace qnn diff --git a/ggml/src/ggml-qnn/op-config-impl.hpp b/ggml/src/ggml-qnn/op-config-impl.hpp new file mode 100644 index 0000000000000..4a00ed2cc7ac3 --- /dev/null +++ b/ggml/src/ggml-qnn/op-config-impl.hpp @@ -0,0 +1,151 @@ +#pragma once + +#include +#include +#include +#include + +#include "op-config.hpp" +#include "qnn-lib.hpp" +#include "qnn-types.hpp" +#include "tensor.hpp" + +namespace qnn { + +class ggml_qnn_op_config_base : public ggml_qnn_op_config { +public: + explicit ggml_qnn_op_config_base(const std::string &name, const std::string &package_name, + const std::string &op_type, std::shared_ptr qnn_instance) + : _name(name), _package_name(package_name), _op_type(op_type), _qnn_instance(qnn_instance) {} + + void add_scalar_param(const std::string &name, const Qnn_Scalar_t scalar); + bool add_tensor_param(const std::string &name, const qnn_dimension_array_t &dimensions, int rank, + const uint8_t *data, const Qnn_DataType_t data_type, QNNBackend device, + Qnn_GraphHandle_t graph_handle); + + void set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) override; + void set_input_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) override; + void set_output_tensors(qnn::qnn_tensor_array_t &tensor_inputs) override; + void set_output_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) override; + bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) override; + bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) override; + bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) override; + void unbind_input_tensors() override; + void unbind_output_tensors() override; + const qnn_tensor_array_t &get_input_tensors() override { return _tensor_inputs; } + const qnn_tensor_array_t &get_output_tensors() override { return _tensor_outputs; } + +protected: + Qnn_OpConfig_t get_op_config(); + + std::string _name; + std::string _package_name; + std::string _op_type; + std::shared_ptr _qnn_instance; + qnn_tensor_array_t _tensor_inputs; + qnn_tensor_array_t _tensor_outputs; + qnn_tensor_array_t _tensor_parameters; + std::vector _qnn_tensor_inputs; + std::vector _qnn_tensor_outputs; + std::vector _qnn_parameters; + std::vector _param_names; + + 
DISABLE_COPY(ggml_qnn_op_config_base); + DISABLE_MOVE(ggml_qnn_op_config_base); +}; + +class ggml_qnn_single_op_config : public ggml_qnn_op_config_base { +public: + explicit ggml_qnn_single_op_config(const std::string &name, const std::string &package_name, + const std::string &op_type, std::shared_ptr qnn_instance) + : ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {} + + bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) override; + +private: + DISABLE_COPY(ggml_qnn_single_op_config); + DISABLE_MOVE(ggml_qnn_single_op_config); +}; + +class ggml_qnn_rmsnorm_op_config : public ggml_qnn_op_config_base { +public: + explicit ggml_qnn_rmsnorm_op_config(const std::string &name, const std::string &package_name, + const std::string &op_type, std::shared_ptr qnn_instance) + : ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {} + + bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) override; + +private: + DISABLE_COPY(ggml_qnn_rmsnorm_op_config); + DISABLE_MOVE(ggml_qnn_rmsnorm_op_config); +}; + +class ggml_qnn_aggregate_op_config : public ggml_qnn_op_config { +public: + explicit ggml_qnn_aggregate_op_config(const std::string &name, std::shared_ptr qnn_instance) + : _name(name), _qnn_instance(qnn_instance) {} + + ~ggml_qnn_aggregate_op_config() { + _tensor_inputs.clear(); + _tensor_outputs.clear(); + _operations.clear(); + } + + void set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) override; + void set_input_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) override; + void set_output_tensors(qnn::qnn_tensor_array_t &tensor_inputs) override; + void set_output_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) override; + bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) override { + return qnn::add_op_to_graph(graph_handle, _operations); + } + + bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) override; + bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) override; + void unbind_input_tensors() override { + for (auto &tensor : _tensor_inputs) { + tensor->unbind(); + } + } + + void unbind_output_tensors() override { + for (auto &tensor : _tensor_outputs) { + tensor->unbind(); + } + } + + const qnn_tensor_array_t &get_input_tensors() override { return _tensor_inputs; } + const qnn_tensor_array_t &get_output_tensors() override { return _tensor_outputs; } + +protected: + std::string _name; + std::shared_ptr _qnn_instance; + + std::vector _operations; + qnn_tensor_array_t _tensor_inputs; + qnn_tensor_array_t _tensor_outputs; + +private: + DISABLE_COPY(ggml_qnn_aggregate_op_config); + DISABLE_MOVE(ggml_qnn_aggregate_op_config); +}; + +class ggml_qnn_matmul_op_config : public ggml_qnn_aggregate_op_config { +public: + ggml_qnn_matmul_op_config(const std::string &name, std::shared_ptr qnn_instance) + : ggml_qnn_aggregate_op_config(name, qnn_instance) {} + + bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) override; + +private: + qnn_tensor_ptr_t create_gather_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, + qnn_tensor_ptr_t tensor_input, qnn_dimension_array_t output_dimensions); + bool create_convert_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, + qnn_tensor_array_t &tensor_inputs, qnn_tensor_array_t &tensor_outputs); + bool create_mat_mul_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, + qnn_tensor_array_t &tensor_inputs, qnn_tensor_array_t &tensor_outputs); + + 
DISABLE_COPY(ggml_qnn_matmul_op_config); + DISABLE_MOVE(ggml_qnn_matmul_op_config); +}; + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/op-config.hpp b/ggml/src/ggml-qnn/op-config.hpp index ca066520bc171..075c56fed6e13 100644 --- a/ggml/src/ggml-qnn/op-config.hpp +++ b/ggml/src/ggml-qnn/op-config.hpp @@ -12,19 +12,16 @@ namespace qnn { -using ggml_op_constructor_t = - std::function(const std::string &, std::shared_ptr)>; - constexpr const size_t kGgmlUnaryOpStart = GGML_OP_COUNT; size_t get_qnn_op_index(const ggml_tensor *tensor); -void get_ggml_op_output_dimensions(const std::vector &input_dims, size_t op, +void get_ggml_op_output_dimensions(const std::vector &input_dims, const ggml_tensor *op, ggml_dimension_array_t &output_dims); -const char *get_qnn_op_name(size_t op); -size_t get_qnn_op_input_param_count(size_t op); - -ggml_op_constructor_t create_op_constructor(size_t op); +const char *get_qnn_op_name(const ggml_tensor *op); +size_t get_qnn_op_input_param_count(const ggml_tensor *op); +std::shared_ptr create_op(const ggml_tensor *op, const std::string &name, + std::shared_ptr qnn_instance); inline bool add_op_to_graph(Qnn_GraphHandle_t graph_handle, std::vector &operations) { for (auto &op : operations) { @@ -36,127 +33,4 @@ inline bool add_op_to_graph(Qnn_GraphHandle_t graph_handle, std::vector qnn_instance) - : _name(name), _package_name(package_name), _op_type(op_type), _qnn_instance(qnn_instance) {} - - void add_scalar_param(const std::string &name, const Qnn_Scalar_t scalar); - bool add_tensor_param(const std::string &name, const qnn_dimension_array_t &dimensions, int rank, - const uint8_t *data, const Qnn_DataType_t data_type, QNNBackend device, - Qnn_GraphHandle_t graph_handle); - - void set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) override; - void set_input_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) override; - void set_output_tensors(qnn::qnn_tensor_array_t &tensor_inputs) override; - void set_output_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) override; - bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) override; - bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) override; - bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) override; - void unbind_input_tensors() override; - void unbind_output_tensors() override; - const qnn_tensor_array_t &get_input_tensors() override { return _tensor_inputs; } - const qnn_tensor_array_t &get_output_tensors() override { return _tensor_outputs; } - -protected: - Qnn_OpConfig_t get_op_config(); - - std::string _name; - std::string _package_name; - std::string _op_type; - std::shared_ptr _qnn_instance; - qnn_tensor_array_t _tensor_inputs; - qnn_tensor_array_t _tensor_outputs; - qnn_tensor_array_t _tensor_parameters; - std::vector _qnn_tensor_inputs; - std::vector _qnn_tensor_outputs; - std::vector _qnn_parameters; - std::vector _param_names; - - DISABLE_COPY(ggml_qnn_op_config_base); - DISABLE_MOVE(ggml_qnn_op_config_base); -}; - -class ggml_qnn_single_op_config : public ggml_qnn_op_config_base { -public: - explicit ggml_qnn_single_op_config(const std::string &name, const std::string &package_name, - const std::string &op_type, std::shared_ptr qnn_instance) - : ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {} - - bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) override; - -private: - DISABLE_COPY(ggml_qnn_single_op_config); - DISABLE_MOVE(ggml_qnn_single_op_config); -}; - -class ggml_qnn_aggregate_op_config : public 
ggml_qnn_op_config { -public: - explicit ggml_qnn_aggregate_op_config(const std::string &name, std::shared_ptr qnn_instance) - : _name(name), _qnn_instance(qnn_instance) {} - - ~ggml_qnn_aggregate_op_config() { - _tensor_inputs.clear(); - _tensor_outputs.clear(); - _operations.clear(); - } - - void set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) override; - void set_input_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) override; - void set_output_tensors(qnn::qnn_tensor_array_t &tensor_inputs) override; - void set_output_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) override; - bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) override { - return qnn::add_op_to_graph(graph_handle, _operations); - } - - bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) override; - bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) override; - void unbind_input_tensors() override { - for (auto &tensor : _tensor_inputs) { - tensor->unbind(); - } - } - - void unbind_output_tensors() override { - for (auto &tensor : _tensor_outputs) { - tensor->unbind(); - } - } - - const qnn_tensor_array_t &get_input_tensors() override { return _tensor_inputs; } - const qnn_tensor_array_t &get_output_tensors() override { return _tensor_outputs; } - -protected: - std::string _name; - std::shared_ptr _qnn_instance; - - std::vector _operations; - qnn_tensor_array_t _tensor_inputs; - qnn_tensor_array_t _tensor_outputs; - -private: - DISABLE_COPY(ggml_qnn_aggregate_op_config); - DISABLE_MOVE(ggml_qnn_aggregate_op_config); -}; - -class ggml_qnn_matmul_op_config : public ggml_qnn_aggregate_op_config { -public: - ggml_qnn_matmul_op_config(const std::string &name, std::shared_ptr qnn_instance) - : ggml_qnn_aggregate_op_config(name, qnn_instance) {} - - bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) override; - -private: - qnn_tensor_ptr_t create_gather_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, - qnn_tensor_ptr_t tensor_input, qnn_dimension_array_t output_dimensions); - bool create_convert_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, - qnn_tensor_array_t &tensor_inputs, qnn_tensor_array_t &tensor_outputs); - bool create_mat_mul_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, - qnn_tensor_array_t &tensor_inputs, qnn_tensor_array_t &tensor_outputs); - - DISABLE_COPY(ggml_qnn_matmul_op_config); - DISABLE_MOVE(ggml_qnn_matmul_op_config); -}; - } // namespace qnn diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index 3bd86891cb18f..9720e682c81d2 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -294,9 +294,7 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { new_tensor_type); } - bool should_use_mem_handle() const { - return _device == QNN_BACKEND_NPU && QNN_TENSOR_GET_TYPE(_qnn_tensor) != QNN_TENSOR_TYPE_STATIC; - } + bool should_use_mem_handle() const { return false; } std::string _tensor_name; qnn_buffer_ptr _buffer; From a822d0075392defba5a83524f3a3564dc71c7f72 Mon Sep 17 00:00:00 2001 From: nullname Date: Mon, 24 Feb 2025 10:47:47 +0800 Subject: [PATCH 140/166] feat: run on win (#24) * move qnn_instance function implementation into cpp * wip * wip * move dl related function into separated file * use cast op for gpu * Revert "use cast op for gpu" This reverts commit 05df7362a15c022d05940d682e84cf480a082c6a. * Reapply "use cast op for gpu" This reverts commit 2520e5922a216faceb6d7efcde23dafe6947a4b3. 
* fix compiling error in win * fix align_alloc in win * fix compiling error * add get sys free/total mem for win * wip * suppress warning in win * add missing chrono header * set the correct qnn lib name for windows * add flag to control cpu backend * wip * wip * Revert "Reapply "use cast op for gpu"" This reverts commit f56519c374a7d46faac706cf214de48ff5fc5139. * fix compiling error for linux build * fix cdsprpc dynamic library name * wip * skip rpc load fail * fix page_align_alloc * suppress some warning in gcc * wip * reuse align to function * more log * add log and fix warning * wip * fix asan errors and memory leaks * fix the get_io_tensors_from_graph * improve comment * print GGML_QNN_DEFAULT_LIB_SEARCH_PATH * revert some unused changes * move library search path setter into qnn module * fix android library loading * skip qnn_device_get_platform_info for npu emulator --- ggml/src/ggml-qnn/CMakeLists.txt | 18 +- ggml/src/ggml-qnn/backend-ops.cpp | 2 +- ggml/src/ggml-qnn/buffer.hpp | 3 + ggml/src/ggml-qnn/dl_loader.hpp | 71 ++++ ggml/src/ggml-qnn/ggml-qnn.cpp | 72 +--- ggml/src/ggml-qnn/graph.cpp | 73 +++- ggml/src/ggml-qnn/logger.cpp | 15 +- ggml/src/ggml-qnn/op-config-caps.cpp | 15 +- ggml/src/ggml-qnn/op-config-impl.cpp | 2 +- ggml/src/ggml-qnn/op-config.hpp | 3 - ggml/src/ggml-qnn/qnn-lib.cpp | 521 ++++++++++++++++++++++++++- ggml/src/ggml-qnn/qnn-lib.hpp | 469 ++---------------------- ggml/src/ggml-qnn/tensor.hpp | 15 +- ggml/src/ggml-qnn/utils.cpp | 84 +++-- ggml/src/ggml-qnn/utils.hpp | 9 +- 15 files changed, 781 insertions(+), 591 deletions(-) create mode 100644 ggml/src/ggml-qnn/dl_loader.hpp diff --git a/ggml/src/ggml-qnn/CMakeLists.txt b/ggml/src/ggml-qnn/CMakeLists.txt index 7bbb9be76b4f6..ccf51e1a55a07 100644 --- a/ggml/src/ggml-qnn/CMakeLists.txt +++ b/ggml/src/ggml-qnn/CMakeLists.txt @@ -4,12 +4,15 @@ if(CMAKE_SYSTEM_NAME STREQUAL "Android") find_library(LOG_LIB log) set(QNN_LINK_LIBRARIES ${LOG_LIB}) set(QNN_DEFAULT_LIB_SEARCH_PATH "/data/local/tmp/" CACHE STRING "customized library search path for QNN backend") +elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows" OR CMAKE_SYSTEM_NAME STREQUAL "Linux") + set(QNN_DEFAULT_LIB_SEARCH_PATH "" CACHE STRING "customized library search path for QNN backend") else() - message(FATAL_ERROR "QNN now only available on Android") + message(FATAL_ERROR "QNN now only available on Android, Windows and Linux") endif() if(NOT DEFINED GGML_QNN_SDK_PATH) # try read from environment variable + # TODO: create a function to search for the SDK path if(DEFINED ENV{QNN_SDK_PATH}) set(GGML_QNN_SDK_PATH $ENV{QNN_SDK_PATH}) else() @@ -29,5 +32,14 @@ ggml_add_backend_library(ggml-qnn target_include_directories(ggml-qnn PRIVATE ${GGML_QNN_SDK_PATH}/include/QNN ${CMAKE_CURRENT_LIST_DIR}) target_link_libraries(ggml-qnn PRIVATE ${QNN_LINK_LIBRARIES}) -string(REGEX REPLACE "/$" "" GGML_QNN_DEFAULT_LIB_SEARCH_PATH "${QNN_DEFAULT_LIB_SEARCH_PATH}") -target_compile_definitions(ggml-qnn PRIVATE GGML_QNN_DEFAULT_LIB_SEARCH_PATH="${QNN_DEFAULT_LIB_SEARCH_PATH}/") +if(NOT "${QNN_DEFAULT_LIB_SEARCH_PATH}" STREQUAL "") + string(REGEX REPLACE "/$" "" QNN_DEFAULT_LIB_SEARCH_PATH "${QNN_DEFAULT_LIB_SEARCH_PATH}") +endif() + +message("GGML_QNN_DEFAULT_LIB_SEARCH_PATH: ${QNN_DEFAULT_LIB_SEARCH_PATH}") +target_compile_definitions(ggml-qnn PRIVATE GGML_QNN_DEFAULT_LIB_SEARCH_PATH="${QNN_DEFAULT_LIB_SEARCH_PATH}") + +if(GGML_QNN_ENABLE_CPU_BACKEND) + message("GGML_QNN_ENABLE_CPU_BACKEND is enabled") + target_compile_definitions(ggml-qnn PRIVATE GGML_QNN_ENABLE_CPU_BACKEND) 
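# When QNN_DEFAULT_LIB_SEARCH_PATH is left empty (the Windows/Linux default above),
# the loader falls back to resolving the QNN libraries through the normal system
# library search path.
# GGML_QNN_ENABLE_CPU_BACKEND compiles in the QNN CPU device, which the backend
# registry otherwise skips during initialization.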
+endif() diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 8bbf26da5275e..f62fc60d5c055 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -389,7 +389,7 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_t case GGML_TYPE_F16: case GGML_TYPE_Q8_0: case GGML_TYPE_Q4_0: - if (!(ctx->supported_types & (1 << tensor->type))) { + if (!(ctx->supported_types & (uint64_t(1) << tensor->type))) { QNN_LOG_DEBUG("[%s]unsupported data type %s, supported_types: 0x%x", qnn::get_backend_name(ctx->device), ggml_type_name(tensor->type), ctx->supported_types); return false; diff --git a/ggml/src/ggml-qnn/buffer.hpp b/ggml/src/ggml-qnn/buffer.hpp index af165b394eefb..ce796cbe4df08 100644 --- a/ggml/src/ggml-qnn/buffer.hpp +++ b/ggml/src/ggml-qnn/buffer.hpp @@ -133,11 +133,14 @@ class qnn_mem_buffer : public qnn_buffer_interface { if (data) { memcpy(_buffer, data, size); } + + QNN_LOG_DEBUG("alloc buffer: %p, size: %ld", _buffer, size); } explicit qnn_mem_buffer(size_t size) : qnn_mem_buffer(nullptr, size) {} ~qnn_mem_buffer() { + QNN_LOG_DEBUG("free buffer: %p, size: %ld", _buffer, _size); // the free will do nothing if the _buffer is nullptr qnn::align_free(_buffer); } diff --git a/ggml/src/ggml-qnn/dl_loader.hpp b/ggml/src/ggml-qnn/dl_loader.hpp new file mode 100644 index 0000000000000..1beec8866ba4c --- /dev/null +++ b/ggml/src/ggml-qnn/dl_loader.hpp @@ -0,0 +1,71 @@ +#pragma once + +#ifdef __linux__ +#include +#include +#elif defined(_WIN32) +#define WIN32_LEAN_AND_MEAN +#ifndef NOMINMAX +#define NOMINMAX +#endif +#include +#endif + +#include + +namespace qnn { + +#ifdef __linux__ +typedef void *dl_handler_t; + +inline qnn::dl_handler_t dl_load(const std::string &lib_path) { + return dlopen(lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); +} + +inline void *dl_sym(qnn::dl_handler_t handle, const std::string &symbol) { return dlsym(handle, symbol.c_str()); } + +inline bool dl_unload(qnn::dl_handler_t handle) { return dlclose(handle) == 0; } + +inline const char *dl_error() { return dlerror(); } +#elif defined(_WIN32) +using dl_handler_t = HMODULE; + +inline qnn::dl_handler_t dl_load(const std::string &lib_path) { + // suppress error dialogs for missing DLLs + auto old_mode = SetErrorMode(SEM_FAILCRITICALERRORS); + SetErrorMode(old_mode | SEM_FAILCRITICALERRORS); + + auto handle = LoadLibraryA(lib_path.c_str()); // TODO: use wstring version for unicode paths + + SetErrorMode(old_mode); + return handle; +} + +inline void *dl_sym(qnn::dl_handler_t handle, const std::string &symbol) { + auto old_mode = SetErrorMode(SEM_FAILCRITICALERRORS); + SetErrorMode(old_mode | SEM_FAILCRITICALERRORS); + + void *p = (void *)GetProcAddress(handle, symbol.c_str()); + + SetErrorMode(old_mode); + return p; +} + +inline bool dl_unload(qnn::dl_handler_t handle) { + FreeLibrary(handle); + return true; +} + +inline const char *dl_error() { + // TODO: implement dl_error for Windows + return nullptr; +} + +#endif + +template +Fn dl_sym_typed(qnn::dl_handler_t handle, const std::string &function_name) { + return reinterpret_cast(dl_sym(handle, function_name)); +} + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index b3673eb35a5f3..8150dcb9ea240 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -1,23 +1,7 @@ #include "ggml-qnn.h" -#include -#include -#include -#include #include -#include -#include #include -#include -#include -#include -#include 
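A minimal usage sketch of the dl_loader.hpp helpers introduced above; the library name, symbol name and function type are placeholders for illustration, not real QNN entry points:

    // Load a shared library, resolve one symbol with a typed wrapper, call it, unload.
    typedef int (*example_fn_t)(int);

    auto handle = qnn::dl_load("libexample.so");
    if (!handle) {
        QNN_LOG_WARN("failed to load libexample.so");
        return;
    }

    auto fn = qnn::dl_sym_typed<example_fn_t>(handle, "example_function");
    if (fn) {
        int result = fn(42);  // call through the resolved symbol
        (void)result;
    }

    qnn::dl_unload(handle);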
-#include -#include -#include -#include -#include -#include #include #include "ggml-backend-impl.h" @@ -44,6 +28,16 @@ namespace { +#ifdef _WIN32 +constexpr const char *kQnnCpuLibName = "QnnCpu.dll"; +constexpr const char *kQnnGpuLibName = "QnnGpu.dll"; +constexpr const char *kQnnNpuLibName = "QnnHtp.dll"; +#else +constexpr const char *kQnnCpuLibName = "libQnnCpu.so"; +constexpr const char *kQnnGpuLibName = "libQnnGpu.so"; +constexpr const char *kQnnNpuLibName = "libQnnHtp.so"; +#endif + struct qnn_device_caps { const char *name; const char *description; @@ -59,7 +53,7 @@ constexpr const qnn_device_caps kDeviceCaps[] = { // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/CpuOpDefSupplement.html#matmul "qnn-cpu", "Qualcomm Kryo CPU", - "libQnnCpu.so", + kQnnCpuLibName, GGML_BACKEND_DEVICE_TYPE_CPU, (1 << GGML_TYPE_I8) | (1 << GGML_TYPE_F32), }, @@ -67,7 +61,7 @@ constexpr const qnn_device_caps kDeviceCaps[] = { // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/GpuOpDefSupplement.html#matmul "qnn-gpu", "Qualcomm Adreno GPU", - "libQnnGpu.so", + kQnnGpuLibName, GGML_BACKEND_DEVICE_TYPE_GPU, (1 << GGML_TYPE_F32) | (1 << GGML_TYPE_F16), }, @@ -75,7 +69,7 @@ constexpr const qnn_device_caps kDeviceCaps[] = { // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/HtpOpDefSupplement.html#matmul "qnn-npu", "Qualcomm NPU", - "libQnnHtp.so", + kQnnNpuLibName, GGML_BACKEND_DEVICE_TYPE_ACCEL, (1 << GGML_TYPE_F32) | (1 << GGML_TYPE_F16) | (1 << GGML_TYPE_I16) | (1 << GGML_TYPE_I8), }, @@ -214,6 +208,8 @@ void ggml_backend_qnn_free(ggml_backend_t backend) { instance->qnn_finalize(); instance.reset(); } + + delete backend; } bool ggml_backend_qnn_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor *src, @@ -332,42 +328,10 @@ ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, const auto device = dev_ctx->device; QNN_LOG_DEBUG("device %s", qnn::get_backend_name(device)); QNN_LOG_DEBUG("extend_lib_search_path %s", extend_lib_search_path); - std::string path = extend_lib_search_path; - -// TODO: Fix this for other platforms -#if defined(__ANDROID__) || defined(ANDROID) - if (device == QNN_BACKEND_NPU) { - if (setenv("LD_LIBRARY_PATH", - (path + ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/" - "dsp:/vendor/dsp/images") - .c_str(), - 1) == 0) { - QNN_LOG_DEBUG("QNN NPU backend setenv successfully"); - } else { - QNN_LOG_ERROR("QNN NPU backend setenv failure"); - } - if (setenv("ADSP_LIBRARY_PATH", - (path + ";/vendor/dsp/cdsp;/vendor/lib/rfsa/adsp;/system/lib/" - "rfsa/adsp;/vendor/dsp/dsp;/vendor/dsp/images;/dsp") - .c_str(), - 1) == 0) { - QNN_LOG_DEBUG("QNN NPU backend setenv successfully"); - } else { - QNN_LOG_ERROR("QNN NPU backend setenv failure"); - } - } else { - if (setenv("LD_LIBRARY_PATH", path.c_str(), 1) == 0) { - QNN_LOG_DEBUG("%s backend setenv successfully", qnn::get_backend_name(device)); - } else { - QNN_LOG_ERROR("%s backend setenv failure", qnn::get_backend_name(device)); - } - } -#endif - - auto instance = std::make_shared(path, dev_ctx->lib_name, "ggml"); + auto instance = std::make_shared(extend_lib_search_path, dev_ctx->lib_name); auto result = instance->qnn_init(nullptr); if (result != 0) { - QNN_LOG_WARN("init qnn subsystem failed with qnn backend %s, pls check why", qnn::get_backend_name(device)); + QNN_LOG_WARN("failed to init qnn backend %s", qnn::get_backend_name(device)); return nullptr; } auto qnn_interface = instance->get_qnn_interface(); @@ -466,6 +430,7 @@ 
struct ggml_backend_qnn_reg_impl : ggml_backend_reg { QNN_LOG_DEBUG("qnn backend registry init"); for (size_t i = 0; i < QNN_BACKEND_COUNT; i++) { const auto device_enum = (QNNBackend)(QNN_BACKEND_COUNT - 1 - i); // init from the last device, i.e. NPU +#ifndef GGML_QNN_ENABLE_CPU_BACKEND if (device_enum == QNN_BACKEND_CPU) { /* * here we skip the initialization of CPU device, @@ -473,6 +438,7 @@ struct ggml_backend_qnn_reg_impl : ggml_backend_reg { */ continue; } +#endif device_contexts.emplace_back(std::make_unique( /* .device = */ device_enum, // init from the last device, i.e. NPU diff --git a/ggml/src/ggml-qnn/graph.cpp b/ggml/src/ggml-qnn/graph.cpp index 680f5e23bd9f3..25ce5b8fb2754 100644 --- a/ggml/src/ggml-qnn/graph.cpp +++ b/ggml/src/ggml-qnn/graph.cpp @@ -1,7 +1,7 @@ #include "graph.hpp" -#include +#include #include #include "ggml-impl.h" @@ -106,13 +106,29 @@ bool bind_src_tensors(ggml_tensor *op, qnn::qnn_tensor_array_t &tensor_wrappers, return true; } +/** + * @brief Extracts input and output tensors from a computational graph. + * + * This function identifies the input and output tensors of a computational graph by analyzing the connectivity between + * tensor nodes. It does this by iterating over each node in the graph, using a connectivity map that associates every + * tensor with its number of incoming connections (in_degree), outgoing connections (out_degree), and an insertion index + * that preserves order. The insertion index is used later to sort the tensors in their original discovery order. + * + * TODO: this algorithm is not perfect and may not work for all cases. It assumes that the tensors are + * connected in a way that allows for unambiguous categorization. + * It also assumes that the tensors are connected in a way that allows for unambiguous categorization. 
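 *
 * As a concrete example, for a two-node graph t2 = add(t0, t1), t3 = mul(t2, t1):
 * t0 and t1 only ever appear as sources (in_degree == 0) and are reported as inputs,
 * t3 is never consumed by another node (out_degree == 0) and is reported as an output,
 * while t2 has both a producer and a consumer and is treated as an intermediate tensor.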
+ */ int get_io_tensors_from_graph(const ggml_cgraph *cgraph, qnn::ggml_tensor_array_t &inputs, qnn::ggml_tensor_array_t &outputs) { - using ggml_tensor_set_t = std::set; + struct _tensor_connectivity_info { + size_t in_degree = 0; + size_t out_degree = 0; + size_t insert_index = 0; + }; - ggml_tensor_set_t input_set; - ggml_tensor_set_t output_set; - ggml_tensor_set_t visited_set; + using ggml_tensor_connectivity_map_t = std::unordered_map; + + ggml_tensor_connectivity_map_t connectivity_map; int rank = 0; for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor *dst = cgraph->nodes[i]; @@ -126,25 +142,50 @@ int get_io_tensors_from_graph(const ggml_cgraph *cgraph, qnn::ggml_tensor_array_ } rank = std::max(rank, ggml_n_dims(dst)); - input_set.erase(dst); - if (!visited_set.count(dst)) { - output_set.insert(dst); - visited_set.insert(dst); + if (connectivity_map.count(dst) == 0) { + connectivity_map[dst] = { + 1, // in-degree, at least 1 + 0, + connectivity_map.size(), + }; + } else { + ++(connectivity_map[dst].in_degree); } for (size_t i = 0; i < GGML_MAX_DIMS && dst->src[i]; ++i) { auto *src = dst->src[i]; rank = std::max(rank, ggml_n_dims(src)); - output_set.erase(src); - if (!visited_set.count(src)) { - input_set.insert(src); - visited_set.insert(src); + + if (connectivity_map.count(src) == 0) { + connectivity_map[src] = { + 0, + 1, // out-degree, at least 1 + connectivity_map.size(), + }; + } else { + ++(connectivity_map[src].out_degree); } } } - inputs.assign(input_set.begin(), input_set.end()); - outputs.assign(output_set.begin(), output_set.end()); + for (const auto &kv : connectivity_map) { + if (kv.second.in_degree == 0) { + inputs.push_back(kv.first); + } + + if (kv.second.out_degree == 0) { + outputs.push_back(kv.first); + } + } + + std::sort(inputs.begin(), inputs.end(), [&connectivity_map](ggml_tensor *lhs, ggml_tensor *rhs) { + return connectivity_map[lhs].insert_index < connectivity_map[rhs].insert_index; + }); + + std::sort(outputs.begin(), outputs.end(), [&connectivity_map](ggml_tensor *lhs, ggml_tensor *rhs) { + return connectivity_map[lhs].insert_index < connectivity_map[rhs].insert_index; + }); + return rank; } @@ -187,7 +228,7 @@ qnn_graph::qnn_graph(const std::string &graph_name, QNNBackend device, std::shar QnnHtpGraph_CustomConfig_t vtcm_config; vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; - vtcm_config.vtcmSizeInMB = vtcm_size_in_mb; + vtcm_config.vtcmSizeInMB = (uint32_t)vtcm_size_in_mb; QnnGraph_Config_t graph_vtcm_config; graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; graph_vtcm_config.customConfig = &vtcm_config; diff --git a/ggml/src/ggml-qnn/logger.cpp b/ggml/src/ggml-qnn/logger.cpp index 1e781721d629c..23a3f305c060f 100644 --- a/ggml/src/ggml-qnn/logger.cpp +++ b/ggml/src/ggml-qnn/logger.cpp @@ -1,8 +1,7 @@ #include "logger.hpp" -#include - +#include #include #if defined(__ANDROID__) || defined(ANDROID) @@ -23,10 +22,12 @@ void qnn::internal_log(ggml_log_level level, const char * /*file*/, const char * int len = vsnprintf(s_qnn_internal_log_buf + len_prefix, QNN_LOGBUF_LEN - len_prefix, format, args); if (len < (QNN_LOGBUF_LEN - len_prefix)) { #if defined(__ANDROID__) || defined(ANDROID) - // for Android APK + // print to android logcat __android_log_print(level, "ggml-qnn", "%s\n", s_qnn_internal_log_buf); +#else + (void)level; #endif - // for Android command line application or WoA(Windows on ARM) + // print to stdout printf("%s\n", s_qnn_internal_log_buf); } va_end(args); @@ -36,7 +37,7 @@ void qnn::internal_log(ggml_log_level 
level, const char * /*file*/, const char * #if ENABLE_QNNSDK_LOG void qnn::sdk_logcallback(const char *fmt, QnnLog_Level_t level, uint64_t /*timestamp*/, va_list argp) { static std::mutex log_mutex; - static unsigned char s_ggml_qnn_logbuf[QNN_LOGBUF_LEN]; + static char s_ggml_qnn_logbuf[QNN_LOGBUF_LEN]; const char *log_level_desc = ""; switch (level) { @@ -62,9 +63,7 @@ void qnn::sdk_logcallback(const char *fmt, QnnLog_Level_t level, uint64_t /*time { std::lock_guard lock(log_mutex); - - memset(s_ggml_qnn_logbuf, 0, QNN_LOGBUF_LEN); - vsnprintf(reinterpret_cast(s_ggml_qnn_logbuf), QNN_LOGBUF_LEN, fmt, argp); + vsnprintf(s_ggml_qnn_logbuf, QNN_LOGBUF_LEN, fmt, argp); QNN_LOG_INFO("[%s]%s", log_level_desc, s_ggml_qnn_logbuf); } } diff --git a/ggml/src/ggml-qnn/op-config-caps.cpp b/ggml/src/ggml-qnn/op-config-caps.cpp index 9b28a76dd1dcf..b250c214a3ad9 100644 --- a/ggml/src/ggml-qnn/op-config-caps.cpp +++ b/ggml/src/ggml-qnn/op-config-caps.cpp @@ -5,17 +5,17 @@ namespace { using op_constructor_t = std::shared_ptr (*)(const ggml_tensor *, const std::string &, std::shared_ptr); -using op_dims_calc_func_t = void (*)(const std::vector &input_dims, +using op_dims_calc_func_t = void (*)(const std::vector &input_dims, qnn::ggml_dimension_array_t &output_dims); -void element_wise_op_dims(const std::vector &input_dims, +void element_wise_op_dims(const std::vector &input_dims, qnn::ggml_dimension_array_t &output_dims) { for (size_t i = 1; i < std::size(output_dims); i++) { output_dims[i] = input_dims.front()[i]; } } -void mat_mul_op_dims(const std::vector &input_dims, +void mat_mul_op_dims(const std::vector &input_dims, qnn::ggml_dimension_array_t &output_dims) { GGML_ASSERT(input_dims.size() == 2); output_dims[0] = input_dims.front()[1]; @@ -374,15 +374,6 @@ size_t get_qnn_op_index(const ggml_tensor *tensor) { return tensor->op; } -void get_ggml_op_output_dimensions(const std::vector &input_dims, const ggml_tensor *op, - ggml_dimension_array_t &output_dims) { - auto op_index = get_qnn_op_index(op); - GGML_ASSERT(op_index < std::size(kOpCaps)); - auto get_dims = kOpCaps[op_index].calc_dims_func; - GGML_ASSERT(get_dims); - get_dims(input_dims, output_dims); -} - const char *get_qnn_op_name(const ggml_tensor *op) { auto op_index = get_qnn_op_index(op); GGML_ASSERT(op_index < std::size(kOpCaps)); diff --git a/ggml/src/ggml-qnn/op-config-impl.cpp b/ggml/src/ggml-qnn/op-config-impl.cpp index 19a1bf46ee9dc..934dbadfdcaf8 100644 --- a/ggml/src/ggml-qnn/op-config-impl.cpp +++ b/ggml/src/ggml-qnn/op-config-impl.cpp @@ -276,7 +276,7 @@ qnn_tensor_ptr_t ggml_qnn_matmul_op_config::create_gather_nodes(QNNBackend devic auto index_buffer = std::make_shared(dimensions[axis] * sizeof(uint32_t)); for (uint32_t *curr = reinterpret_cast(index_buffer->get_buffer()), *end = curr + dimensions[axis]; curr < end; curr++) { - *curr = (curr - reinterpret_cast(index_buffer->get_buffer())) / scale; + *curr = uint32_t((curr - reinterpret_cast(index_buffer->get_buffer())) / scale); } auto gather_index = std::make_shared( diff --git a/ggml/src/ggml-qnn/op-config.hpp b/ggml/src/ggml-qnn/op-config.hpp index 075c56fed6e13..6b8c6946b8e86 100644 --- a/ggml/src/ggml-qnn/op-config.hpp +++ b/ggml/src/ggml-qnn/op-config.hpp @@ -15,9 +15,6 @@ namespace qnn { constexpr const size_t kGgmlUnaryOpStart = GGML_OP_COUNT; size_t get_qnn_op_index(const ggml_tensor *tensor); -void get_ggml_op_output_dimensions(const std::vector &input_dims, const ggml_tensor *op, - ggml_dimension_array_t &output_dims); - const char *get_qnn_op_name(const ggml_tensor 
*op); size_t get_qnn_op_input_param_count(const ggml_tensor *op); std::shared_ptr create_op(const ggml_tensor *op, const std::string &name, diff --git a/ggml/src/ggml-qnn/qnn-lib.cpp b/ggml/src/ggml-qnn/qnn-lib.cpp index a7553c4ac2b75..1f9a68333c05b 100644 --- a/ggml/src/ggml-qnn/qnn-lib.cpp +++ b/ggml/src/ggml-qnn/qnn-lib.cpp @@ -1,35 +1,536 @@ #include "qnn-lib.hpp" +#include + +#if defined(__linux__) +#include +#endif + +namespace { + +#ifdef _WIN32 +constexpr const char *kQnnSystemLibName = "QnnSystem.dll"; +constexpr const char *kQnnRpcLibName = "libcdsprpc.dll"; +#else +constexpr const char *kQnnSystemLibName = "libQnnSystem.so"; +constexpr const char *kQnnRpcLibName = "libcdsprpc.so"; + +#endif + +void insert_path(std::string &path, std::string insert_path, const char separator = ':') { + if (!insert_path.empty() && !path.empty()) { + insert_path += separator; + } + + path.insert(0, insert_path); +} + +// TODO: Fix this for other platforms, or use a more portable way to set the library search path +bool set_qnn_lib_search_path(const std::string &custom_lib_search_path) { +#if defined(__linux__) + { + auto *original = getenv("LD_LIBRARY_PATH"); + std::string lib_search_path = original ? original : ""; + insert_path(lib_search_path, + "/vendor/dsp/cdsp:/vendor/lib64:" + "/vendor/dsp/dsp:/vendor/dsp/images"); + insert_path(lib_search_path, custom_lib_search_path); + if (setenv("LD_LIBRARY_PATH", lib_search_path.c_str(), 1)) { + return false; + } + } + +#if defined(__ANDROID__) || defined(ANDROID) + { + // See also: https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-2/dsp_runtime.html + std::string adsp_lib_search_path = custom_lib_search_path + + ";/vendor/dsp/cdsp;/vendor/lib/rfsa/adsp;/system/lib/" + "rfsa/adsp;/vendor/dsp/dsp;/vendor/dsp/images;/dsp"; + if (setenv("ADSP_LIBRARY_PATH", adsp_lib_search_path.c_str(), 1)) { + return false; + } + + QNN_LOG_DEBUG("ADSP_LIBRARY_PATH=%s", getenv("ADSP_LIBRARY_PATH")); + } +#endif + + QNN_LOG_DEBUG("LD_LIBRARY_PATH=%s", getenv("LD_LIBRARY_PATH")); +#else + (void)custom_lib_search_path; +#endif + + return true; +} + +qnn::dl_handler_t load_lib_with_fallback(const std::string &lib_path, const std::string &load_directory) { + std::filesystem::path full_path(load_directory); + full_path /= std::filesystem::path(lib_path).filename(); + auto handle = qnn::dl_load(full_path.string()); + if (!handle) { + QNN_LOG_WARN("failed to load %s, fallback to %s", full_path.c_str(), lib_path.c_str()); + handle = qnn::dl_load(lib_path); + } + + return handle; +} + +} // namespace + namespace qnn { -qnn_system_interface::qnn_system_interface(const QnnSystemInterface_t &qnn_sys_interface, dl_handler_t lib_handle) : - _qnn_sys_interface(qnn_sys_interface), _lib_handle(lib_handle) { +qnn_system_interface::qnn_system_interface(const QnnSystemInterface_t &qnn_sys_interface, dl_handler_t lib_handle) + : _qnn_sys_interface(qnn_sys_interface), _lib_handle(lib_handle) { qnn_system_context_create(&_qnn_system_handle); if (_qnn_system_handle) { - QNN_LOG_INFO("initialize qnn system successfully\n"); + QNN_LOG_INFO("initialize qnn system successfully"); } else { - QNN_LOG_WARN("can not create QNN system contenxt\n"); + QNN_LOG_WARN("can not create QNN system contenxt"); } } qnn_system_interface::~qnn_system_interface() { if (_qnn_system_handle) { if (qnn_system_context_free(_qnn_system_handle) != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN system context\n"); + QNN_LOG_WARN("failed to free QNN system context"); } } else { - QNN_LOG_WARN("system handle is 
null\n"); + QNN_LOG_WARN("system handle is null"); } if (_lib_handle) { - int dlclose_error = dl_unload(_lib_handle); - if (dlclose_error != 0) { - QNN_LOG_WARN("failed to close QnnSystem library, error %s\n", dl_error()); + if (!dl_unload(_lib_handle)) { + QNN_LOG_WARN("failed to close QnnSystem library, error %s", dl_error()); + } + } else { + QNN_LOG_WARN("system lib handle is null"); + } +} + +qnn_instance::qnn_instance(const std::string &lib_path, const std::string &backend_lib_name) + : _additional_lib_load_path(lib_path), _backend_lib_name(std::move(backend_lib_name)) { + if (set_qnn_lib_search_path(lib_path)) { + QNN_LOG_DEBUG("[%s] set_qnn_lib_search_path succeed", _backend_lib_name.c_str()); + } else { + QNN_LOG_ERROR("[%s] set_qnn_lib_search_path failed", _backend_lib_name.c_str()); + } +} + +int qnn_instance::qnn_init(const QnnSaver_Config_t **saver_config) { + BackendIdType backend_id = QNN_BACKEND_ID_NULL; + QNN_LOG_DEBUG("enter qnn_init"); + + std::lock_guard lock(_init_mutex); + if (load_system() != 0) { + QNN_LOG_WARN("failed to load QNN system lib"); + return 1; + } else { + QNN_LOG_DEBUG("load QNN system lib successfully"); + } + + std::string backend_lib_path = _backend_lib_name; + if (_lib_path_to_backend_id.count(backend_lib_path) == 0) { + if (load_backend(backend_lib_path, saver_config) != 0) { + QNN_LOG_WARN("failed to load QNN backend"); + return 2; + } + } + + backend_id = _lib_path_to_backend_id[backend_lib_path]; + if (_loaded_backend.count(backend_id) == 0 || _loaded_lib_handle.count(backend_id) == 0) { + QNN_LOG_WARN( + "library %s is loaded but loaded backend count=%zu, " + "loaded lib_handle count=%zu", + backend_lib_path.c_str(), _loaded_backend.count(backend_id), _loaded_lib_handle.count(backend_id)); + return 3; + } + + _qnn_interface = std::make_shared(*_loaded_backend[backend_id]); + _qnn_interface->qnn_log_create(qnn::sdk_logcallback, _qnn_log_level, &_qnn_log_handle); + if (!_qnn_log_handle) { + // NPU backend not work on Qualcomm SoC equipped low-end phone + QNN_LOG_WARN("why failed to initialize qnn log"); + return 4; + } else { + QNN_LOG_DEBUG("initialize qnn log successfully"); + } + + std::vector temp_backend_config; + _qnn_interface->qnn_backend_create( + _qnn_log_handle, temp_backend_config.empty() ? 
nullptr : temp_backend_config.data(), &_qnn_backend_handle); + if (!_qnn_backend_handle) { + QNN_LOG_WARN("why failed to initialize qnn backend"); + return 5; + } else { + QNN_LOG_DEBUG("initialize qnn backend successfully"); + } + + auto qnn_status = _qnn_interface->qnn_property_has_capability(QNN_PROPERTY_GROUP_DEVICE); + if (QNN_PROPERTY_NOT_SUPPORTED == qnn_status) { + QNN_LOG_WARN("device property is not supported"); + } + if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnn_status) { + QNN_LOG_WARN("device property is not known to backend"); + } + + qnn_status = QNN_SUCCESS; + if (_backend_lib_name.find("Htp") != _backend_lib_name.npos) { + const QnnDevice_PlatformInfo_t *p_info = nullptr; + qnn_status = _qnn_interface->qnn_device_get_platform_info(nullptr, &p_info); + if (qnn_status == QNN_SUCCESS) { + QNN_LOG_INFO("device counts %d", p_info->v1.numHwDevices); + QnnDevice_HardwareDeviceInfo_t *infos = p_info->v1.hwDevices; + QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = {}; + for (uint32_t i = 0; i < p_info->v1.numHwDevices; i++) { + QNN_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d", infos[i].v1.deviceId, infos[i].v1.deviceType, + infos[i].v1.numCores); + QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; + chipinfo = devinfo->onChipDevice; + size_t htp_arch = (size_t)chipinfo.arch; + QNN_LOG_INFO("htp_type:%d(%s)", devinfo->devType, + (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? "ON_CHIP" : ""); + QNN_LOG_INFO("qualcomm soc_model:%d(%s), htp_arch:%d(%s), vtcm_size:%d MB", chipinfo.socModel, + qnn::get_chipset_desc(chipinfo.socModel), htp_arch, qnn::get_htparch_desc(htp_arch), + chipinfo.vtcmSize); + _soc_info = {chipinfo.socModel, htp_arch, chipinfo.vtcmSize}; + } + _qnn_interface->qnn_device_free_platform_info(nullptr, p_info); + } else { + // For emulator, we can't get platform info + QNN_LOG_WARN("failed to get platform info, are we in emulator?"); + _soc_info = {NONE, UNKNOWN_SM, 0}; + } + + QnnHtpDevice_CustomConfig_t soc_customconfig; + soc_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_SOC; + soc_customconfig.socModel = _soc_info.soc_model; + QnnDevice_Config_t soc_devconfig; + soc_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; + soc_devconfig.customConfig = &soc_customconfig; + + QnnHtpDevice_CustomConfig_t arch_customconfig; + arch_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_ARCH; + arch_customconfig.arch.arch = (QnnHtpDevice_Arch_t)_soc_info.htp_arch; + arch_customconfig.arch.deviceId = 0; // Id of device to be used. 0 will use by default. + QnnDevice_Config_t arch_devconfig; + arch_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; + arch_devconfig.customConfig = &arch_customconfig; + + const QnnDevice_Config_t *p_deviceconfig[] = {&soc_devconfig, &arch_devconfig, nullptr}; + qnn_status = _qnn_interface->qnn_device_create(_qnn_log_handle, p_deviceconfig, &_qnn_device_handle); + } else { + qnn_status = _qnn_interface->qnn_device_create(_qnn_log_handle, nullptr, &_qnn_device_handle); + } + if (QNN_SUCCESS != qnn_status && QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnn_status) { + QNN_LOG_WARN("failed to create QNN device"); + } else { + QNN_LOG_INFO("create QNN device successfully"); + } + + if (_profile_level != sdk_profile_level::profile_off) { + QNN_LOG_INFO("profiling turned on; level = %d", _profile_level); + auto profile_level = + _profile_level == sdk_profile_level::profile_detail ? 
QNN_PROFILE_LEVEL_DETAILED : QNN_PROFILE_LEVEL_BASIC; + + if (QNN_PROFILE_NO_ERROR != + _qnn_interface->qnn_profile_create(_qnn_backend_handle, profile_level, &_qnn_profile_handle)) { + QNN_LOG_WARN("unable to create profile handle in the backend"); + return 6; + } else { + QNN_LOG_DEBUG("initialize qnn profile successfully"); + } + } + + _rpc_lib_handle = load_lib_with_fallback(kQnnRpcLibName, _additional_lib_load_path); + if (_rpc_lib_handle) { + _pfn_rpc_mem_alloc = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_alloc")); + _pfn_rpc_mem_free = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_free")); + _pfn_rpc_mem_to_fd = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_to_fd")); + if (!_pfn_rpc_mem_alloc || !_pfn_rpc_mem_free || !_pfn_rpc_mem_to_fd) { + QNN_LOG_WARN("unable to access symbols in QNN RPC lib. error: %s", dl_error()); + dl_unload(_rpc_lib_handle); + return 9; + } + + _pfn_rpc_mem_init = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_init")); + _pfn_rpc_mem_deinit = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_deinit")); + if (_pfn_rpc_mem_init) { + _pfn_rpc_mem_init(); + } + + _rpcmem_initialized = true; + QNN_LOG_DEBUG("load rpcmem lib successfully"); + } else { + QNN_LOG_WARN("failed to load qualcomm rpc lib, skipping, error:%s", dl_error()); + } + + /* TODO: not used, keep it for further usage + QnnContext_Config_t qnn_context_config = QNN_CONTEXT_CONFIG_INIT; + qnn_context_config.priority = QNN_PRIORITY_DEFAULT; + const QnnContext_Config_t * context_configs[] = {&qnn_context_config, nullptr}; + */ + _qnn_interface->qnn_context_create(_qnn_backend_handle, _qnn_device_handle, nullptr, &_qnn_context_handle); + if (nullptr == _qnn_context_handle) { + QNN_LOG_WARN("why failed to initialize qnn context"); + return 10; + } else { + QNN_LOG_DEBUG("initialize qnn context successfully"); + } + + if (_backend_lib_name.find("Htp") != _backend_lib_name.npos) { + // TODO: faster approach to probe the accurate capacity of rpc ion memory + size_t candidate_size = 0; + uint8_t *rpc_buffer = nullptr; + const int size_in_mb = (1 << 20); + size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048}; + size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); + for (size_t idx = 0; idx < probe_counts; idx++) { + rpc_buffer = static_cast(alloc_rpcmem(probe_slots[idx] * size_in_mb, sizeof(void *))); + if (!rpc_buffer) { + QNN_LOG_DEBUG("alloc rpcmem %d (MB) failure, %s", probe_slots[idx], strerror(errno)); + break; + } else { + candidate_size = probe_slots[idx]; + free_rpcmem(rpc_buffer); + rpc_buffer = nullptr; + } + } + + _rpcmem_capacity = std::max(candidate_size, _rpcmem_capacity); + QNN_LOG_INFO("capacity of QNN rpc ion memory is about %d MB", _rpcmem_capacity); + + if (init_htp_perfinfra() != 0) { + QNN_LOG_WARN("initialize HTP performance failure"); } + if (set_rpc_polling() != 0) { + QNN_LOG_WARN("set RPC polling failure"); + } + if (set_high_performance_mode() != 0) { + QNN_LOG_WARN("set HTP high performance mode failure"); + } + } + + QNN_LOG_DEBUG("leave qnn_init"); + + return 0; +} + +int qnn_instance::qnn_finalize() { + int ret_status = 0; + Qnn_ErrorHandle_t error = QNN_SUCCESS; + + if (_rpc_lib_handle) { + if (_pfn_rpc_mem_deinit) { + _pfn_rpc_mem_deinit(); + _pfn_rpc_mem_deinit = nullptr; + } + + if (dl_unload(_rpc_lib_handle)) { + QNN_LOG_DEBUG("succeed to close rpcmem lib"); + } else { + QNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s", dl_error()); + } + } + + if (_backend_lib_name.find("Htp") != _backend_lib_name.npos) { + 
_qnn_htp_perfinfra->destroyPowerConfigId(_qnn_power_configid); + } + + if (_qnn_context_handle) { + error = _qnn_interface->qnn_context_free(_qnn_context_handle, _qnn_profile_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d", _qnn_interface->get_backend_id(), + QNN_GET_ERROR_CODE(error)); + } + _qnn_context_handle = nullptr; + } + + if (_qnn_profile_handle) { + error = _qnn_interface->qnn_profile_free(_qnn_profile_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d", _qnn_interface->get_backend_id(), + QNN_GET_ERROR_CODE(error)); + } + _qnn_profile_handle = nullptr; + } + + if (_qnn_device_handle) { + error = _qnn_interface->qnn_device_free(_qnn_device_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN device_handle: ID %u, error %d", _qnn_interface->get_backend_id(), + QNN_GET_ERROR_CODE(error)); + } + _qnn_device_handle = nullptr; + } + + if (_qnn_backend_handle) { + error = _qnn_interface->qnn_backend_free(_qnn_backend_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d", _qnn_interface->get_backend_id(), + QNN_GET_ERROR_CODE(error)); + } + _qnn_backend_handle = nullptr; + } + + if (nullptr != _qnn_log_handle) { + error = _qnn_interface->qnn_log_free(_qnn_log_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN log_handle: ID %u, error %d", _qnn_interface->get_backend_id(), + QNN_GET_ERROR_CODE(error)); + } + _qnn_log_handle = nullptr; + } + + unload_backend(); + + _qnn_sys_interface.reset(); + + return ret_status; +} + +int qnn_instance::load_system() { + QNN_LOG_DEBUG("[%s]lib: %s", _backend_lib_name.c_str(), kQnnSystemLibName); + auto system_lib_handle = load_lib_with_fallback(kQnnSystemLibName, _additional_lib_load_path); + if (!system_lib_handle) { + QNN_LOG_WARN("can not load QNN library %s, error: %s", kQnnSystemLibName, dl_error()); + return 1; + } + + auto *get_providers = + dl_sym_typed(system_lib_handle, "QnnSystemInterface_getProviders"); + if (!get_providers) { + QNN_LOG_WARN("can not load QNN symbol QnnSystemInterface_getProviders: %s", dl_error()); + return 2; + } + + uint32_t num_providers = 0; + const QnnSystemInterface_t **provider_list = nullptr; + Qnn_ErrorHandle_t error = get_providers(&provider_list, &num_providers); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error)); + return 3; + } + + QNN_LOG_DEBUG("num_providers: %d", num_providers); + if (num_providers != _required_num_providers) { + QNN_LOG_WARN("providers is %d instead of required %d", num_providers, _required_num_providers); + return 4; + } + + if (!provider_list) { + QNN_LOG_WARN("can not get providers"); + return 5; + } + + QNN_SYSTEM_INTERFACE_VER_TYPE qnn_system_interface; + bool found_valid_system_interface = false; + for (size_t idx = 0; idx < num_providers; idx++) { + if (QNN_SYSTEM_API_VERSION_MAJOR == provider_list[idx]->systemApiVersion.major && + QNN_SYSTEM_API_VERSION_MINOR <= provider_list[idx]->systemApiVersion.minor) { + found_valid_system_interface = true; + qnn_system_interface = provider_list[idx]->QNN_SYSTEM_INTERFACE_VER_NAME; + break; + } + } + + if (!found_valid_system_interface) { + QNN_LOG_WARN("unable to find a valid qnn system interface"); + return 6; + } else { + QNN_LOG_DEBUG("find a valid qnn system interface"); + } + + auto qnn_sys_interface = std::make_shared(*provider_list[0], system_lib_handle); + if 
(!qnn_sys_interface->is_valid()) { + QNN_LOG_WARN("failed to create QNN system interface"); + return 7; + } + + _qnn_sys_interface = qnn_sys_interface; + return 0; +} + +int qnn_instance::load_backend(std::string &lib_path, const QnnSaver_Config_t ** /*saver_config*/) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + QNN_LOG_DEBUG("lib_path:%s", lib_path.c_str()); + + auto lib_handle = load_lib_with_fallback(lib_path, _additional_lib_load_path); + if (!lib_handle) { + QNN_LOG_WARN("can not open QNN library %s, with error: %s", lib_path.c_str(), dl_error()); + return 1; + } + + auto get_providers = dl_sym_typed(lib_handle, "QnnInterface_getProviders"); + if (!get_providers) { + QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", dl_error()); + return 2; + } + + std::uint32_t num_providers = 0; + const QnnInterface_t **provider_list = nullptr; + error = get_providers(&provider_list, &num_providers); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error)); + return 3; + } + QNN_LOG_DEBUG("num_providers=%d", num_providers); + if (num_providers != _required_num_providers) { + QNN_LOG_WARN("providers is %d instead of required %d", num_providers, _required_num_providers); + return 4; + } + + if (!provider_list) { + QNN_LOG_WARN("failed to get qnn interface providers"); + return 5; + } + bool found_valid_interface = false; + QNN_INTERFACE_VER_TYPE qnn_interface; + for (size_t idx = 0; idx < num_providers; idx++) { + if (QNN_API_VERSION_MAJOR == provider_list[idx]->apiVersion.coreApiVersion.major && + QNN_API_VERSION_MINOR <= provider_list[idx]->apiVersion.coreApiVersion.minor) { + found_valid_interface = true; + qnn_interface = provider_list[idx]->QNN_INTERFACE_VER_NAME; + break; + } + } + + if (!found_valid_interface) { + QNN_LOG_WARN("unable to find a valid qnn interface"); + return 6; } else { - QNN_LOG_WARN("system lib handle is null\n"); + QNN_LOG_DEBUG("find a valid qnn interface"); + } + + BackendIdType backend_id = provider_list[0]->backendId; + _lib_path_to_backend_id[lib_path] = backend_id; + if (_loaded_backend.count(backend_id) > 0) { + QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists", lib_path.c_str(), backend_id); } + _loaded_backend[backend_id] = provider_list[0]; + if (_loaded_lib_handle.count(backend_id) > 0) { + QNN_LOG_WARN("closing %p", _loaded_lib_handle[backend_id]); + if (!dl_unload(_loaded_lib_handle[backend_id])) { + QNN_LOG_WARN("fail to close %p with error %s", _loaded_lib_handle[backend_id], dl_error()); + } + } + _loaded_lib_handle[backend_id] = lib_handle; + _backend_id = backend_id; + + return 0; +} + +int qnn_instance::unload_backend() { + for (auto &it : _loaded_lib_handle) { + if (!dl_unload(it.second)) { + QNN_LOG_WARN("failed to close QNN backend %d, error %s", it.first, dl_error()); + } + } + + _loaded_lib_handle.clear(); + _lib_path_to_backend_id.clear(); + _loaded_backend.clear(); + + return 0; } } // namespace qnn diff --git a/ggml/src/ggml-qnn/qnn-lib.hpp b/ggml/src/ggml-qnn/qnn-lib.hpp index 454c0c6aa32c5..968df5bcf297d 100644 --- a/ggml/src/ggml-qnn/qnn-lib.hpp +++ b/ggml/src/ggml-qnn/qnn-lib.hpp @@ -1,8 +1,10 @@ #pragma once -#include - #include +#include +#include +#include +#include #include #include #include @@ -22,27 +24,12 @@ #include #include +#include "dl_loader.hpp" #include "qnn-types.hpp" #include "utils.hpp" namespace qnn { -// TODO: those function should be moved to a separate file, and have separate implementation for each platform -typedef void *dl_handler_t; 
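The dlopen/dlsym wrappers removed in this hunk move into the new "dl_loader.hpp" header so that a Windows implementation can sit next to the POSIX one. As a rough sketch only (an assumption for illustration; the real dl_loader.hpp may differ, and dl_error()/dl_sym_typed() are omitted for brevity), such a loader could look like:

```cpp
// Hypothetical cross-platform loader following the qnn::dl_* naming used in this patch.
#include <string>
#ifdef _WIN32
#include <windows.h>
namespace qnn {
typedef HMODULE dl_handler_t;
inline dl_handler_t dl_load(const std::string &lib_path) { return LoadLibraryA(lib_path.c_str()); }
inline void *dl_sym(dl_handler_t handle, const std::string &symbol) {
    return reinterpret_cast<void *>(GetProcAddress(handle, symbol.c_str()));
}
inline bool dl_unload(dl_handler_t handle) { return FreeLibrary(handle) != 0; }  // true on success
}  // namespace qnn
#else
#include <dlfcn.h>
namespace qnn {
typedef void *dl_handler_t;
inline dl_handler_t dl_load(const std::string &lib_path) { return dlopen(lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); }
inline void *dl_sym(dl_handler_t handle, const std::string &symbol) { return dlsym(handle, symbol.c_str()); }
inline bool dl_unload(dl_handler_t handle) { return dlclose(handle) == 0; }  // true on success
}  // namespace qnn
#endif
```

Returning bool from dl_unload matches the new call sites in qnn-lib.cpp, which treat a truthy result as success.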
- -inline dl_handler_t dl_load(const std::string &lib_path) { return dlopen(lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); } - -inline void *dl_sym(dl_handler_t handle, const std::string &symbol) { return dlsym(handle, symbol.c_str()); } - -inline int dl_unload(dl_handler_t handle) { return dlclose(handle); } - -inline const char *dl_error() { return dlerror(); } - -template -Fn dl_sym_typed(dl_handler_t handle, const std::string &function_name) { - return reinterpret_cast(dl_sym(handle, function_name)); -} - // ================================================================================================= // // wrapper class of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK @@ -52,6 +39,7 @@ Fn dl_sym_typed(dl_handler_t handle, const std::string &function_name) { // TODO: fix this for other compilers #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wextra-semi" +#pragma GCC diagnostic ignored "-Wpedantic" class qnn_system_interface { @@ -188,273 +176,10 @@ class qnn_instance { public: using BackendIdType = decltype(QnnInterface_t{}.backendId); - explicit qnn_instance(const std::string &lib_path, const std::string &backend_name, const std::string &model_name) - : _lib_path(std::move(lib_path)), _backend_name(std::move(backend_name)), _model_name(std::move(model_name)) {} - + explicit qnn_instance(const std::string &lib_path, const std::string &backend_lib_name); ~qnn_instance() {} - - int qnn_init(const QnnSaver_Config_t **saver_config) { - BackendIdType backend_id = QNN_BACKEND_ID_NULL; - QNN_LOG_DEBUG("enter qnn_init"); - - std::lock_guard lock(_init_mutex); - if (load_system() != 0) { - QNN_LOG_WARN("can not load QNN system lib, pls check why?"); - return 1; - } else { - QNN_LOG_DEBUG("load QNN system lib successfully"); - } - - std::string backend_lib_path = _lib_path + _backend_name; - if (_lib_path_to_backend_id.count(backend_lib_path) == 0) { - int is_load_ok = load_backend(backend_lib_path, saver_config); - if (is_load_ok != 0) { - QNN_LOG_WARN("failed to load QNN backend"); - return 2; - } - } - - backend_id = _lib_path_to_backend_id[backend_lib_path]; - if (_loaded_backend.count(backend_id) == 0 || _loaded_lib_handle.count(backend_id) == 0) { - QNN_LOG_WARN( - "library %s is loaded but loaded backend count=%zu, " - "loaded lib_handle count=%zu", - backend_lib_path.c_str(), _loaded_backend.count(backend_id), _loaded_lib_handle.count(backend_id)); - return 3; - } - - _qnn_interface = std::make_shared(*_loaded_backend[backend_id]); - _qnn_interface->qnn_log_create(qnn::sdk_logcallback, _qnn_log_level, &_qnn_log_handle); - if (nullptr == _qnn_log_handle) { - // NPU backend not work on Qualcomm SoC equipped low-end phone - QNN_LOG_WARN("why failed to initialize qnn log"); - return 4; - } else { - QNN_LOG_DEBUG("initialize qnn log successfully"); - } - - std::vector temp_backend_config; - _qnn_interface->qnn_backend_create( - _qnn_log_handle, temp_backend_config.empty() ? 
nullptr : temp_backend_config.data(), &_qnn_backend_handle); - if (nullptr == _qnn_backend_handle) { - QNN_LOG_WARN("why failed to initialize qnn backend"); - return 5; - } else { - QNN_LOG_DEBUG("initialize qnn backend successfully"); - } - - Qnn_ErrorHandle_t qnn_status = _qnn_interface->qnn_property_has_capability(QNN_PROPERTY_GROUP_DEVICE); - if (QNN_PROPERTY_NOT_SUPPORTED == qnn_status) { - QNN_LOG_WARN("device property is not supported"); - } - if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnn_status) { - QNN_LOG_WARN("device property is not known to backend"); - } - - qnn_status = QNN_SUCCESS; - if (_backend_name.find("Htp") != _backend_name.npos) { - const QnnDevice_PlatformInfo_t *p_info = nullptr; - _qnn_interface->qnn_device_get_platform_info(nullptr, &p_info); - QNN_LOG_INFO("device counts %d", p_info->v1.numHwDevices); - QnnDevice_HardwareDeviceInfo_t *infos = p_info->v1.hwDevices; - QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = {}; - for (uint32_t i = 0; i < p_info->v1.numHwDevices; i++) { - QNN_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d", infos[i].v1.deviceId, infos[i].v1.deviceType, - infos[i].v1.numCores); - QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; - chipinfo = devinfo->onChipDevice; - QnnHtpDevice_Arch_t htp_arch = chipinfo.arch; - QNN_LOG_INFO("htp_type:%d(%s)", devinfo->devType, - (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? "ON_CHIP" : ""); - QNN_LOG_INFO("qualcomm soc_model:%d(%s), htp_arch:%d(%s), vtcm_size:%d MB", chipinfo.socModel, - qnn::get_chipset_desc(chipinfo.socModel), htp_arch, qnn::get_htparch_desc(htp_arch), - chipinfo.vtcmSize); - _soc_info = {chipinfo.socModel, htp_arch, chipinfo.vtcmSize}; - } - _qnn_interface->qnn_device_free_platform_info(nullptr, p_info); - - QnnHtpDevice_CustomConfig_t soc_customconfig; - soc_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_SOC; - soc_customconfig.socModel = chipinfo.socModel; - QnnDevice_Config_t soc_devconfig; - soc_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; - soc_devconfig.customConfig = &soc_customconfig; - - QnnHtpDevice_CustomConfig_t arch_customconfig; - arch_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_ARCH; - arch_customconfig.arch.arch = chipinfo.arch; - arch_customconfig.arch.deviceId = 0; // Id of device to be used. If single device is used by default 0. - QnnDevice_Config_t arch_devconfig; - arch_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; - arch_devconfig.customConfig = &arch_customconfig; - - const QnnDevice_Config_t *p_deviceconfig[] = {&soc_devconfig, &arch_devconfig, nullptr}; - qnn_status = _qnn_interface->qnn_device_create(_qnn_log_handle, p_deviceconfig, &_qnn_device_handle); - } else { - qnn_status = _qnn_interface->qnn_device_create(_qnn_log_handle, nullptr, &_qnn_device_handle); - } - if (QNN_SUCCESS != qnn_status && QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnn_status) { - QNN_LOG_WARN("failed to create QNN device"); - } else { - QNN_LOG_INFO("create QNN device successfully"); - } - - if (_profile_level != sdk_profile_level::profile_off) { - QNN_LOG_INFO("profiling turned on; level = %d", _profile_level); - auto profile_level = _profile_level == sdk_profile_level::profile_detail ? 
QNN_PROFILE_LEVEL_DETAILED - : QNN_PROFILE_LEVEL_BASIC; - - if (QNN_PROFILE_NO_ERROR != - _qnn_interface->qnn_profile_create(_qnn_backend_handle, profile_level, &_qnn_profile_handle)) { - QNN_LOG_WARN("unable to create profile handle in the backend"); - return 6; - } else { - QNN_LOG_DEBUG("initialize qnn profile successfully"); - } - } - - _rpc_lib_handle = dl_load("libcdsprpc.so"); - if (nullptr == _rpc_lib_handle) { - QNN_LOG_WARN("failed to load qualcomm's rpc lib, error:%s", dl_error()); - return 8; - } else { - QNN_LOG_DEBUG("load rpcmem lib successfully"); - set_rpcmem_initialized(true); - } - _pfn_rpc_mem_init = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_init")); - _pfn_rpc_mem_deinit = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_deinit")); - _pfn_rpc_mem_alloc = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_alloc")); - _pfn_rpc_mem_free = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_free")); - _pfn_rpc_mem_to_fd = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_to_fd")); - if (nullptr == _pfn_rpc_mem_alloc || nullptr == _pfn_rpc_mem_free || nullptr == _pfn_rpc_mem_to_fd) { - QNN_LOG_WARN("unable to access symbols in QNN RPC lib. error: %s", dl_error()); - dl_unload(_rpc_lib_handle); - return 9; - } - - if (nullptr != _pfn_rpc_mem_init) { // make Qualcomm's SoC equipped low-end phone happy - _pfn_rpc_mem_init(); - } - - /* TODO: not used, keep it for further usage - QnnContext_Config_t qnn_context_config = QNN_CONTEXT_CONFIG_INIT; - qnn_context_config.priority = QNN_PRIORITY_DEFAULT; - const QnnContext_Config_t * context_configs[] = {&qnn_context_config, nullptr}; - */ - _qnn_interface->qnn_context_create(_qnn_backend_handle, _qnn_device_handle, nullptr, &_qnn_context_handle); - if (nullptr == _qnn_context_handle) { - QNN_LOG_WARN("why failed to initialize qnn context"); - return 10; - } else { - QNN_LOG_DEBUG("initialize qnn context successfully"); - } - - if (_backend_name.find("Htp") != _backend_name.npos) { - // TODO: faster approach to probe the accurate capacity of rpc ion memory - size_t candidate_size = 0; - uint8_t *rpc_buffer = nullptr; - const int size_in_mb = (1 << 20); - size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048}; - size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); - for (size_t idx = 0; idx < probe_counts; idx++) { - rpc_buffer = static_cast(alloc_rpcmem(probe_slots[idx] * size_in_mb, sizeof(void *))); - if (!rpc_buffer) { - QNN_LOG_DEBUG("alloc rpcmem %d (MB) failure, %s", probe_slots[idx], strerror(errno)); - break; - } else { - candidate_size = probe_slots[idx]; - free_rpcmem(rpc_buffer); - rpc_buffer = nullptr; - } - } - - _rpcmem_capacity = std::max(candidate_size, _rpcmem_capacity); - QNN_LOG_INFO("capacity of QNN rpc ion memory is about %d MB", _rpcmem_capacity); - - if (0 != init_htp_perfinfra()) { - QNN_LOG_WARN("initialize HTP performance failure"); - } - if (0 != set_rpc_polling()) { - QNN_LOG_WARN("set RPC polling failure"); - } - if (0 != set_high_performance_mode()) { - QNN_LOG_WARN("set HTP high performance mode failure"); - } - } - - QNN_LOG_DEBUG("leave qnn_init"); - - return 0; - } - - int qnn_finalize() { - int ret_status = 0; - Qnn_ErrorHandle_t error = QNN_SUCCESS; - - if (nullptr != _pfn_rpc_mem_deinit) // make Qualcomm's SoC equipped low-end phone happy - _pfn_rpc_mem_deinit(); - - if (dl_unload(_rpc_lib_handle) != 0) { - QNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s", dl_error()); - } else { - QNN_LOG_DEBUG("succeed to close rpcmem lib"); - } - - if (_backend_name.find("Htp") != 
_backend_name.npos) { - _qnn_htp_perfinfra->destroyPowerConfigId(_qnn_power_configid); - } - - if (nullptr != _qnn_context_handle) { - error = _qnn_interface->qnn_context_free(_qnn_context_handle, _qnn_profile_handle); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d", _qnn_interface->get_backend_id(), - QNN_GET_ERROR_CODE(error)); - } - _qnn_context_handle = nullptr; - } - - if (nullptr != _qnn_profile_handle) { - error = _qnn_interface->qnn_profile_free(_qnn_profile_handle); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d", _qnn_interface->get_backend_id(), - QNN_GET_ERROR_CODE(error)); - } - _qnn_profile_handle = nullptr; - } - - if (nullptr != _qnn_device_handle) { - error = _qnn_interface->qnn_device_free(_qnn_device_handle); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN device_handle: ID %u, error %d", _qnn_interface->get_backend_id(), - QNN_GET_ERROR_CODE(error)); - } - _qnn_device_handle = nullptr; - } - - if (nullptr != _qnn_backend_handle) { - error = _qnn_interface->qnn_backend_free(_qnn_backend_handle); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d", _qnn_interface->get_backend_id(), - QNN_GET_ERROR_CODE(error)); - } - _qnn_backend_handle = nullptr; - } - - if (nullptr != _qnn_log_handle) { - error = _qnn_interface->qnn_log_free(_qnn_log_handle); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN log_handle: ID %u, error %d", _qnn_interface->get_backend_id(), - QNN_GET_ERROR_CODE(error)); - } - _qnn_log_handle = nullptr; - } - - unload_backend(); - - _qnn_sys_interface.reset(); - - return ret_status; - } + int qnn_init(const QnnSaver_Config_t **saver_config); + int qnn_finalize(); std::shared_ptr get_qnn_interface() { if (!_qnn_interface) { @@ -477,7 +202,7 @@ class qnn_instance { int init_htp_perfinfra() { QnnDevice_Infrastructure_t device_infra = nullptr; - int error = _qnn_interface->qnn_device_get_infrastructure(&device_infra); + auto error = _qnn_interface->qnn_device_get_infrastructure(&device_infra); if (error != QNN_SUCCESS) { QNN_LOG_WARN("failed to get qnn device infra"); return 1; @@ -578,8 +303,6 @@ class qnn_instance { bool is_rpcmem_initialized() { return _rpcmem_initialized; } - void set_rpcmem_initialized(bool initialized) { _rpcmem_initialized = initialized; } - size_t get_rpcmem_capacity() { return _rpcmem_capacity; } void *alloc_rpcmem(size_t bytes, size_t alignment) { @@ -665,7 +388,7 @@ class qnn_instance { } void unregister_rpcmem(Qnn_MemHandle_t mem_handle) { - Qnn_ErrorHandle_t error = _qnn_interface->qnn_mem_de_register(&mem_handle, 1); + auto error = _qnn_interface->qnn_mem_de_register(&mem_handle, 1); if (error != QNN_SUCCESS) { QNN_LOG_WARN("failed to unregister shared memory, error %d", QNN_GET_ERROR_CODE(error)); } @@ -686,163 +409,15 @@ class qnn_instance { const qnn::qcom_socinfo &get_soc_info() { return _soc_info; } private: - int load_system() { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - - std::string system_lib_path = _lib_path + "libQnnSystem.so"; - QNN_LOG_DEBUG("system_lib_path:%s", system_lib_path.c_str()); - - auto system_lib_handle = dl_load(system_lib_path); - if (!system_lib_handle) { - QNN_LOG_WARN("can not load QNN library %s, error: %s", system_lib_path.c_str(), dl_error()); - return 1; - } - - auto *get_providers = dl_sym_typed( - system_lib_handle, "QnnSystemInterface_getProviders"); - if (!get_providers) { - QNN_LOG_WARN("can not load QNN symbol 
QnnSystemInterface_getProviders: %s", dl_error()); - return 2; - } - - uint32_t num_providers = 0; - const QnnSystemInterface_t **provider_list = nullptr; - error = get_providers(&provider_list, &num_providers); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error)); - return 3; - } - - if (num_providers != _required_num_providers) { - QNN_LOG_WARN("providers is %d instead of required %d", num_providers, _required_num_providers); - return 4; - } - - if (!provider_list) { - QNN_LOG_WARN("can not get providers"); - return 5; - } - - QNN_SYSTEM_INTERFACE_VER_TYPE qnn_system_interface; - bool found_valid_system_interface = false; - for (size_t idx = 0; idx < num_providers; idx++) { - if (QNN_SYSTEM_API_VERSION_MAJOR == provider_list[idx]->systemApiVersion.major && - QNN_SYSTEM_API_VERSION_MINOR <= provider_list[idx]->systemApiVersion.minor) { - found_valid_system_interface = true; - qnn_system_interface = provider_list[idx]->QNN_SYSTEM_INTERFACE_VER_NAME; - break; - } - } - if (!found_valid_system_interface) { - QNN_LOG_WARN("unable to find a valid qnn system interface"); - return 6; - } else { - QNN_LOG_DEBUG("find a valid qnn system interface"); - } - - auto qnn_sys_interface = std::make_shared(*provider_list[0], system_lib_handle); - if (!qnn_sys_interface->is_valid()) { - QNN_LOG_WARN("failed to create QNN system interface"); - return 7; - } - - _qnn_sys_interface = qnn_sys_interface; - return 0; - } - - int load_backend(std::string &lib_path, const QnnSaver_Config_t ** /*saver_config*/) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - QNN_LOG_DEBUG("lib_path:%s", lib_path.c_str()); - - auto lib_handle = dl_load(lib_path.c_str()); - if (!lib_handle) { - QNN_LOG_WARN("can not open QNN library %s, with error: %s", lib_path.c_str(), dl_error()); - return 1; - } - - auto get_providers = - qnn::dl_sym_typed(lib_handle, "QnnInterface_getProviders"); - if (!get_providers) { - QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", dl_error()); - return 2; - } - - std::uint32_t num_providers = 0; - const QnnInterface_t **provider_list = nullptr; - error = get_providers(&provider_list, &num_providers); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error)); - return 3; - } - QNN_LOG_DEBUG("num_providers=%d", num_providers); - if (num_providers != _required_num_providers) { - QNN_LOG_WARN("providers is %d instead of required %d", num_providers, _required_num_providers); - return 4; - } - - if (!provider_list) { - QNN_LOG_WARN("failed to get qnn interface providers"); - return 5; - } - bool found_valid_interface = false; - QNN_INTERFACE_VER_TYPE qnn_interface; - for (size_t idx = 0; idx < num_providers; idx++) { - if (QNN_API_VERSION_MAJOR == provider_list[idx]->apiVersion.coreApiVersion.major && - QNN_API_VERSION_MINOR <= provider_list[idx]->apiVersion.coreApiVersion.minor) { - found_valid_interface = true; - qnn_interface = provider_list[idx]->QNN_INTERFACE_VER_NAME; - break; - } - } - - if (!found_valid_interface) { - QNN_LOG_WARN("unable to find a valid qnn interface"); - return 6; - } else { - QNN_LOG_DEBUG("find a valid qnn interface"); - } - - BackendIdType backend_id = provider_list[0]->backendId; - _lib_path_to_backend_id[lib_path] = backend_id; - if (_loaded_backend.count(backend_id) > 0) { - QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists", lib_path.c_str(), backend_id); - } - _loaded_backend[backend_id] = provider_list[0]; - if 
(_loaded_lib_handle.count(backend_id) > 0) { - QNN_LOG_WARN("closing %p", _loaded_lib_handle[backend_id]); - int dlclose_error = dl_unload(_loaded_lib_handle[backend_id]); - if (dlclose_error != 0) { - QNN_LOG_WARN("fail to close %p with error %s", _loaded_lib_handle[backend_id], dl_error()); - } - } - _loaded_lib_handle[backend_id] = lib_handle; - _backend_id = backend_id; - - return 0; - } - - int unload_backend() { - int dlclose_error = 0; - for (auto &it : _loaded_lib_handle) { - dlclose_error = dl_unload(it.second); - if (dlclose_error != 0) { - QNN_LOG_WARN("failed to close QNN backend %d, error %s", it.first, dl_error()); - } - } - - _loaded_lib_handle.clear(); - _lib_path_to_backend_id.clear(); - _loaded_backend.clear(); - - return 0; - } + int load_system(); + int load_backend(std::string &lib_path, const QnnSaver_Config_t ** /*saver_config*/); + int unload_backend(); private: static constexpr const int _required_num_providers = 1; - std::string _lib_path; - std::string _backend_name; - std::string _model_name; // Qualcomm's dedicated prebuilt model name, keep it for further usage + std::string _additional_lib_load_path; + std::string _backend_lib_name; BackendIdType _backend_id; QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; @@ -874,17 +449,17 @@ class qnn_instance { std::unordered_map _qnn_rpc_buffer_to_handles; std::mutex _init_mutex; - std::unordered_map _loaded_lib_handle; + std::unordered_map _loaded_lib_handle; std::unordered_map _lib_path_to_backend_id; std::unordered_map _loaded_backend; dl_handler_t _rpc_lib_handle = nullptr; std::atomic_bool _rpcmem_initialized{false}; - qnn::pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; - qnn::pfn_rpc_mem_free _pfn_rpc_mem_free; - qnn::pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd; - qnn::pfn_rpc_mem_init _pfn_rpc_mem_init; - qnn::pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; + qnn::pfn_rpc_mem_alloc _pfn_rpc_mem_alloc = nullptr; + qnn::pfn_rpc_mem_free _pfn_rpc_mem_free = nullptr; + qnn::pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd = nullptr; + qnn::pfn_rpc_mem_init _pfn_rpc_mem_init = nullptr; + qnn::pfn_rpc_mem_deinit _pfn_rpc_mem_deinit = nullptr; std::unordered_map _rpcmem_store_map; size_t _rpcmem_capacity = 512; diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index 9720e682c81d2..423c3ba7fa8c1 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -59,7 +59,7 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { return true; } - can_unbind = false; + _can_unbind = false; return false; } @@ -68,7 +68,7 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { return true; } - can_unbind = false; + _can_unbind = false; return false; } @@ -93,7 +93,7 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { } bool bind_ggml_tensor(ggml_tensor *tensor) { - if (!can_unbind) { + if (!_can_unbind) { QNN_LOG_DEBUG("[%s]already has buffer storage, skip bind", _tensor_name.c_str()); return true; } @@ -137,7 +137,7 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { return false; } - if (!can_unbind) { + if (!_can_unbind) { QNN_LOG_DEBUG("[%s]already has buffer storage, stop unbind", _tensor_name.c_str()); return true; } @@ -294,11 +294,14 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { new_tensor_type); } - bool should_use_mem_handle() const { return false; } + bool should_use_mem_handle() const { + // TODO: figure out how to set rpc mem to multiple tensor + return false; + } std::string _tensor_name; qnn_buffer_ptr _buffer; - bool can_unbind = true; + bool 
_can_unbind = true; QNNBackend _device; std::shared_ptr _qnn_instance; Qnn_Tensor_t _qnn_tensor = qnn_tensor_init(kDefaultQnnTensorVersion); diff --git a/ggml/src/ggml-qnn/utils.cpp b/ggml/src/ggml-qnn/utils.cpp index 6e77ee5f5f287..e9aa4d37374a6 100644 --- a/ggml/src/ggml-qnn/utils.cpp +++ b/ggml/src/ggml-qnn/utils.cpp @@ -1,8 +1,6 @@ #include "utils.hpp" -#include - #include #include "ggml-qnn.h" @@ -10,11 +8,23 @@ #include "QnnGraph.h" #include "qnn-types.hpp" -#ifdef __linux__ +#ifdef _WIN32 +#include +#else #include #include #endif +namespace { + +template +_Ty align_to_generic(size_t alignment, _Ty offset) { + return offset % alignment == 0 ? offset + : offset + (static_cast<_Ty>(alignment) - (offset % static_cast<_Ty>(alignment))); +} + +} // namespace + namespace qnn { qnn_dimension_array_t get_internal_dimension(const ggml_dimension_array_t &dims, uint32_t rank) { @@ -33,7 +43,7 @@ qnn_dimension_array_t get_internal_dimension(const ggml_dimension_array_t &dims, * The ggml tensor will have dimensions [3, 2], while the qnn tensor will have dimensions [2, 3]. */ for (uint32_t i = 0; i < rank; i++) { - internal_dims[i] = std::max(dims[rank - 1 - i], 1); + internal_dims[i] = std::max((uint32_t)dims[rank - 1 - i], 1); } return internal_dims; @@ -219,37 +229,41 @@ const char *get_htparch_desc(size_t htp_arch) { } } -intptr_t align_to(size_t alignment, intptr_t offset) { - return offset % alignment == 0 - ? offset - : offset + (static_cast(alignment) - (offset % static_cast(alignment))); -} +intptr_t align_to(size_t alignment, intptr_t offset) { return align_to_generic(alignment, offset); } -uint32_t get_ggml_tensor_data_size(const ggml_tensor *tensor) { return ggml_nbytes(tensor); } +uint32_t get_ggml_tensor_data_size(const ggml_tensor *tensor) { return (uint32_t)ggml_nbytes(tensor); } -void *page_align_alloc(size_t size) { - // TODO: fix this for other platforms - const size_t alignment = sysconf(_SC_PAGESIZE); - return align_alloc(alignment, size); +#ifdef _WIN32 +static void *_align_alloc(size_t alignment, size_t size) { return _aligned_malloc(size, alignment); } + +static size_t _get_page_size() { + SYSTEM_INFO si; + GetSystemInfo(&si); + return si.dwPageSize; } -void *align_alloc(size_t alignment, size_t size) { - size_t size_aligned = size; - if ((size_aligned % alignment) != 0) { - size_aligned += (alignment - (size_aligned % alignment)); - } +void align_free(void *ptr) { _aligned_free(ptr); } +#else +static void *_align_alloc(size_t alignment, size_t size) { return std::aligned_alloc(alignment, size); } - void *data = std::aligned_alloc(alignment, size_aligned); +static size_t _get_page_size() { return sysconf(_SC_PAGESIZE); } + +void align_free(void *ptr) { std::free(ptr); } +#endif + +void *page_align_alloc(size_t size) { + const size_t alignment = _get_page_size(); + size_t size_aligned = align_to_generic(alignment, size); + QNN_LOG_DEBUG("_align_alloc success, alignment: %ld, size: %ld, size_aligned: %ld", alignment, size, size_aligned); + void *data = _align_alloc(alignment, size_aligned); if (!data) { - QNN_LOG_WARN("aligned_alloc failed\n"); + QNN_LOG_WARN("_align_alloc failed, alignment: %ld, size: %ld, size_aligned: %ld", alignment, size, size_aligned); return nullptr; } return data; } -void align_free(void *ptr) { std::free(ptr); } - // ================================================================================================= // // QNN backend internal helper functions @@ -359,7 +373,29 @@ const char *get_qnn_error_string(Qnn_ErrorHandle_t error) { } } -#ifdef 
__linux__ +#ifdef _WIN32 + +size_t get_system_total_memory_in_bytes() { + MEMORYSTATUSEX mem = {}; + mem.dwLength = sizeof(mem); + if (GlobalMemoryStatusEx(&mem)) { + return mem.ullTotalPhys; + } + + return 0; +} + +size_t get_system_free_memory_in_bytes() { + MEMORYSTATUSEX mem = {}; + mem.dwLength = sizeof(mem); + if (GlobalMemoryStatusEx(&mem)) { + return mem.ullAvailPhys; + } + + return 0; +} + +#else size_t get_system_total_memory_in_bytes() { struct sysinfo info = {}; diff --git a/ggml/src/ggml-qnn/utils.hpp b/ggml/src/ggml-qnn/utils.hpp index 1ec0af4c96f77..cdff53e77314d 100644 --- a/ggml/src/ggml-qnn/utils.hpp +++ b/ggml/src/ggml-qnn/utils.hpp @@ -1,12 +1,8 @@ #pragma once -#include -#include -#include -#include -#include - #include +#include +#include #include #include "ggml.h" @@ -36,7 +32,6 @@ intptr_t align_to(size_t alignment, intptr_t offset); uint32_t get_ggml_tensor_data_size(const ggml_tensor *tensor); void *page_align_alloc(size_t size); -void *align_alloc(size_t alignment, size_t size); void align_free(void *ptr); const char *opname_from_ggmlop(enum ggml_op ggmlop); From ff033e1e23d91f332d5ef3ec29fcdfaa9c8a6051 Mon Sep 17 00:00:00 2001 From: nullname Date: Tue, 25 Feb 2025 19:46:48 +0800 Subject: [PATCH 141/166] opt mulmat base on official doc (#25) https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md --- ggml/src/ggml-qnn/op-config-impl.cpp | 44 +++++++--------------------- 1 file changed, 11 insertions(+), 33 deletions(-) diff --git a/ggml/src/ggml-qnn/op-config-impl.cpp b/ggml/src/ggml-qnn/op-config-impl.cpp index 934dbadfdcaf8..1b05b3581a419 100644 --- a/ggml/src/ggml-qnn/op-config-impl.cpp +++ b/ggml/src/ggml-qnn/op-config-impl.cpp @@ -385,36 +385,26 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap * [5, 4], * ]) * # Perform matrix multiplication - * result = torch.matmul(A, B.T) - * print(result.T) + * C = torch.matmul(A, B.T) + * print(C.T) * ``` * Here, the B.T is the transpose of B. + * So C.T = A * B.T which is equivalent to C = B * A.T. 
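+ *
+ * In QNN terms: a MatMul node with its transpose-in1 parameter (QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN1)
+ * set computes in0 * in1.T, so feeding (src1, src0) as (in0, in1) yields dst = src1 * src0.T in a
+ * single node; the extra Transpose node that previously followed MatMul is no longer needed.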
+ * See: https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md * * So here we need to create graph like: * ```mermaid * graph TD; - * i1>ggml_tensor_in0] --src0--> mat_mul0; - * i2>ggml_tensor_in1] --src1--> mat_mul0; - * mat_mul0 --dst_trans--> transpose_out; - * transpose1 --dst0--> o1>ggml_tensor_out]; + * i1>ggml_tensor_in0] --src1--> mat_mul0; + * i2>ggml_tensor_in1] --src0.T--> mat_mul0; + * mat_mul0 --dst0--> o1>ggml_tensor_out]; * ``` */ // create src0_trans tensor - auto src1 = tensor_inputs.back(); static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS does not match the expected value"); - - qnn_dimension_array_t dimensions = get_transposed_dimensions(src1->get_dimensions(), rank); - - // create dst_trans tensor - auto dst = tensor_outputs.front(); - dimensions = get_transposed_dimensions(dst->get_dimensions(), rank); - auto dst_trans = std::make_shared(ggml_qnn_tensor::INTERMEDIATE, "dst_trans", dimensions, - dst->get_data_type(), rank, device, graph_handle, _qnn_instance); - - // create transpose_out - auto transpose_out = std::make_shared(_name + "_trans1", QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_TRANSPOSE, _qnn_instance); + GGML_ASSERT(tensor_inputs.size() == 2); + GGML_ASSERT(tensor_outputs.size() == 1); // create mat_mul auto mat_mul = @@ -425,24 +415,12 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap scalar.bool8Value = 1; mat_mul->add_scalar_param(QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN1, scalar); - // set transpose_out parameters - auto *params_data = reinterpret_cast(kTransposeParamData[rank - 1].data()); - const qnn_dimension_array_t param_dims = {(uint32_t)rank, 1, 1, 1}; - transpose_out->add_tensor_param(QNN_OP_TRANSPOSE_PARAM_PERM, param_dims, 1, params_data, QNN_DATATYPE_UINT_32, - device, graph_handle); - // set tensor to mat_mul + std::swap(tensor_inputs[0], tensor_inputs[1]); mat_mul->set_input_tensors(tensor_inputs); - qnn_tensor_array_t tensors = {dst_trans}; - mat_mul->set_output_tensors(tensors); - - // set tensor to transpose_out - tensors = {dst_trans}; - transpose_out->set_input_tensors(tensors); - transpose_out->set_output_tensors(tensor_outputs); + mat_mul->set_output_tensors(tensor_outputs); _operations.push_back(mat_mul); - _operations.push_back(transpose_out); return true; } From c8676412228f932b087775d69f82312ee370be13 Mon Sep 17 00:00:00 2001 From: nullname Date: Thu, 27 Feb 2025 23:16:08 +0800 Subject: [PATCH 142/166] feat: fix some TODO item in upstream PR #26 (#27) * fix warning * wip * add todo for graph key generate * rename some file to meet upstream guideline * remove local .clang-format * expend supported/unsupported counter to all ops * append device name to log * port to ggml logger * fix warning after adapt to ggml logger * append \n to all log * use case op instead of convert * Revert "use case op instead of convert" This reverts commit e662fc2dfee41719aaf7bc9d75e03e8d0f7ded0f. 
* fix op that needs same shape * opt kQnnOpsTable * refresh params name field when getting op config * opt npu log print * remove unused functions --- ggml/src/ggml-qnn/.clang-format | 65 ---- ggml/src/ggml-qnn/backend-ops.cpp | 506 +++++++++++++-------------- ggml/src/ggml-qnn/backend-ops.hpp | 9 +- ggml/src/ggml-qnn/backend.hpp | 30 +- ggml/src/ggml-qnn/buffer.hpp | 65 ++-- ggml/src/ggml-qnn/dl-loader.hpp | 76 ++++ ggml/src/ggml-qnn/dl_loader.hpp | 71 ---- ggml/src/ggml-qnn/ggml-qnn.cpp | 221 ++++++------ ggml/src/ggml-qnn/graph.cpp | 235 +++++-------- ggml/src/ggml-qnn/graph.hpp | 44 ++- ggml/src/ggml-qnn/logger.cpp | 63 +--- ggml/src/ggml-qnn/logger.hpp | 45 +-- ggml/src/ggml-qnn/op-config-base.hpp | 23 +- ggml/src/ggml-qnn/op-config-caps.cpp | 344 +++++++++--------- ggml/src/ggml-qnn/op-config-impl.cpp | 197 +++++------ ggml/src/ggml-qnn/op-config-impl.hpp | 136 +++---- ggml/src/ggml-qnn/op-config.hpp | 14 +- ggml/src/ggml-qnn/qnn-lib.cpp | 278 +++++++-------- ggml/src/ggml-qnn/qnn-lib.hpp | 231 ++++++------ ggml/src/ggml-qnn/qnn-types.hpp | 52 +-- ggml/src/ggml-qnn/tensor.hpp | 206 +++++------ ggml/src/ggml-qnn/utils.cpp | 104 +++--- ggml/src/ggml-qnn/utils.hpp | 165 ++++----- 23 files changed, 1508 insertions(+), 1672 deletions(-) delete mode 100644 ggml/src/ggml-qnn/.clang-format create mode 100644 ggml/src/ggml-qnn/dl-loader.hpp delete mode 100644 ggml/src/ggml-qnn/dl_loader.hpp diff --git a/ggml/src/ggml-qnn/.clang-format b/ggml/src/ggml-qnn/.clang-format deleted file mode 100644 index 0c67c54239623..0000000000000 --- a/ggml/src/ggml-qnn/.clang-format +++ /dev/null @@ -1,65 +0,0 @@ ---- -BasedOnStyle: Google -IndentWidth: 4 -AccessModifierOffset: -4 -AlignAfterOpenBracket: Align -AlignConsecutiveMacros: false -AlignConsecutiveAssignments: false -AlignConsecutiveDeclarations: false -AlignEscapedNewlines: Left -AlignOperands: true -AlignTrailingComments: true -AllowAllArgumentsOnNextLine: true -AllowAllConstructorInitializersOnNextLine: true -AllowAllParametersOfDeclarationOnNextLine: true -AllowShortBlocksOnASingleLine: Never -AllowShortCaseLabelsOnASingleLine: false -AllowShortFunctionsOnASingleLine: All -AllowShortLambdasOnASingleLine: All -AllowShortIfStatementsOnASingleLine: WithoutElse -AllowShortLoopsOnASingleLine: true -AlwaysBreakAfterDefinitionReturnType: None -AlwaysBreakAfterReturnType: None -AlwaysBreakBeforeMultilineStrings: true -AlwaysBreakTemplateDeclarations: Yes -BinPackArguments: true -BinPackParameters: true -BraceWrapping: - AfterCaseLabel: false - AfterClass: false - AfterControlStatement: false - AfterEnum: false - AfterFunction: false - AfterNamespace: false - AfterObjCDeclaration: false - AfterStruct: false - AfterUnion: false - AfterExternBlock: false - BeforeCatch: false - BeforeElse: false - IndentBraces: false - SplitEmptyFunction: true - SplitEmptyRecord: true - SplitEmptyNamespace: true -ColumnLimit: 120 -ConstructorInitializerIndentWidth: 4 -ContinuationIndentWidth: 4 -Cpp11BracedListStyle: true -DerivePointerAlignment: false -IncludeCategories: - - Regex: '^<.*\.h>' - Priority: 1 - - Regex: '^<.*' - Priority: 2 - - Regex: '^"ggml\.h"' - Priority: 3 - - Regex: '^"ggml-.+\.h"' - Priority: 4 - - Regex: '.*' - Priority: 5 -KeepEmptyLinesAtTheStartOfBlocks: true -MaxEmptyLinesToKeep: 1 -PointerAlignment: Right -SortIncludes: true -SpacesBeforeTrailingComments: 1 -UseTab: Never diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index f62fc60d5c055..3a401dd037b97 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ 
b/ggml/src/ggml-qnn/backend-ops.cpp @@ -4,7 +4,6 @@ #include #include "ggml-impl.h" - #include "graph.hpp" #include "logger.hpp" #include "op-config.hpp" @@ -13,15 +12,15 @@ namespace { -bool qnn_is_op_valid(ggml_backend_qnn_device_context *ctx, const ggml_tensor *dst) { +bool qnn_is_op_valid(ggml_backend_qnn_device_context * ctx, const ggml_tensor * dst) { if (!ctx || !dst) { - QNN_LOG_WARN("invalid params"); + QNN_LOG_WARN("invalid params\n"); return false; } auto instance = ctx->instance; if (!instance) { - QNN_LOG_WARN("invalid instance"); + QNN_LOG_WARN("invalid instance\n"); return false; } @@ -32,7 +31,7 @@ bool qnn_is_op_valid(ggml_backend_qnn_device_context *ctx, const ggml_tensor *ds case 2: return dst->src[0] && dst->src[1]; default: - QNN_LOG_WARN("invalid op param count %d", (int)param_count); + QNN_LOG_WARN("invalid op param count %d\n", (int) param_count); break; } @@ -40,60 +39,51 @@ bool qnn_is_op_valid(ggml_backend_qnn_device_context *ctx, const ggml_tensor *ds } #ifndef NDEBUG -void print_ggml_tensor(const ggml_tensor *tensor) { - QNN_LOG_DEBUG("%s: type:%s ne: %ldx%ldx%ldx%ld, nb: %ldx%ldx%ldx%ld", tensor->name, ggml_type_name(tensor->type), - (long)tensor->ne[0], (long)tensor->ne[1], (long)tensor->ne[2], (long)tensor->ne[3], - (long)tensor->nb[0], (long)tensor->nb[1], (long)tensor->nb[2], (long)tensor->nb[3]); +void print_ggml_tensor(const ggml_tensor * tensor) { + QNN_LOG_DEBUG("%s: type:%s ne: %ldx%ldx%ldx%ld, nb: %ldx%ldx%ldx%ld\n", tensor->name, ggml_type_name(tensor->type), + (long) tensor->ne[0], (long) tensor->ne[1], (long) tensor->ne[2], (long) tensor->ne[3], + (long) tensor->nb[0], (long) tensor->nb[1], (long) tensor->nb[2], (long) tensor->nb[3]); } #endif -} // namespace +} // namespace namespace { -typedef bool (*ggml_qnn_op_t)(ggml_backend_qnn_device_context *ctx, ggml_tensor *dst); - -bool execute_graph(qnn::qnn_graph *graph, ggml_tensor *output) { - if (!graph->execute(output)) { - QNN_LOG_WARN("execute failed"); - return false; - } - - return true; -} +typedef bool (*ggml_qnn_op_t)(ggml_backend_qnn_device_context * ctx, ggml_tensor * dst); -void append_tensor_dimensions(const ggml_tensor *tensor, std::string &output) { - char buffer[256] = {}; - const auto *type_name = qnn::get_ggml_type_name(tensor->type); - int len = 0; +void append_tensor_dimensions(const ggml_tensor * tensor, std::string & output) { + char buffer[256] = {}; + const auto * type_name = qnn::get_ggml_type_name(tensor->type); + int len = 0; switch (ggml_n_dims(tensor)) { case 1: - len = snprintf(buffer, sizeof(buffer), "%ld%s", (long)tensor->ne[0], type_name); + len = snprintf(buffer, sizeof(buffer), "%ld%s", (long) tensor->ne[0], type_name); break; case 2: - len = snprintf(buffer, sizeof(buffer), "%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1], type_name); + len = snprintf(buffer, sizeof(buffer), "%ldx%ld%s", (long) tensor->ne[0], (long) tensor->ne[1], type_name); break; case 3: - len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1], - (long)tensor->ne[2], type_name); + len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ld%s", (long) tensor->ne[0], (long) tensor->ne[1], + (long) tensor->ne[2], type_name); break; case 4: default: - len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1], - (long)tensor->ne[2], (long)tensor->ne[3], type_name); + len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ldx%ld%s", (long) tensor->ne[0], (long) tensor->ne[1], + (long) tensor->ne[2], (long) tensor->ne[3], 
type_name); break; } - GGML_ASSERT(len > 0 && len < (int)sizeof(buffer)); + GGML_ASSERT(len > 0 && len < (int) sizeof(buffer)); output.append(buffer, len); } -void get_graph_key_from_op(const ggml_tensor *op, std::string &output) { +void get_graph_key_from_op(const ggml_tensor * op, std::string & output) { GGML_ASSERT(op->op != GGML_OP_NONE); output += ggml_op_desc(op); output += qnn::get_ggml_type_name(op->type); const auto param_count = qnn::get_qnn_op_input_param_count(op); for (size_t i = 0; i < param_count; ++i) { - auto *input = op->src[i]; + auto * input = op->src[i]; if (!input) { break; } @@ -103,7 +93,7 @@ void get_graph_key_from_op(const ggml_tensor *op, std::string &output) { } } -void get_op_key_with_src_op_desc(const ggml_tensor *op, std::string &output) { +void get_op_key_with_src_op_desc(const ggml_tensor * op, std::string & output) { output += ggml_op_desc(op); output += '('; if (op->src[0]) { @@ -116,25 +106,37 @@ void get_op_key_with_src_op_desc(const ggml_tensor *op, std::string &output) { output += ')'; } -void get_graph_key_from_cgraph(const ggml_cgraph *cgraph, std::string &output) { - // generate key from the graph, the key is used to cache the graph, like: - // "MUL_MATf32_256x16x10f32_256x1x10f32#LOG#ADD#ADDf32_16x1x10f32" +/** + * @brief Generates a unique key for a given computation graph (cgraph). + * + * This key is used to cache the graph, enabling efficient reuse of previously + * compiled graphs. The key is constructed by concatenating the descriptions + * of the operations and their associated tensor dimensions within the graph. + * + * Example key format: "MUL_MATf32_256x16x10f32_256x1x10f32#LOG#ADD#ADDf32_16x1x10f32" + * + * @param cgraph The computation graph for which the key is generated. + * @param output The string where the generated key will be stored. + * + * TODO: Improve the key generation logic to handle more complex graph structures and edge cases. 
+ */ +void get_graph_key_from_cgraph(const ggml_cgraph * cgraph, std::string & output) { if (cgraph->n_nodes == 0) { - QNN_LOG_DEBUG("empty cgraph"); + QNN_LOG_DEBUG("empty cgraph\n"); return; } { bool is_start = true; for (int i = 0; i < cgraph->n_nodes; ++i) { - auto *op = cgraph->nodes[i]; + auto * op = cgraph->nodes[i]; if (ggml_is_empty(op)) { - QNN_LOG_DEBUG("empty op in graph, skipping"); + QNN_LOG_DEBUG("empty op in graph, skipping\n"); continue; } if (op->op == GGML_OP_NONE) { - QNN_LOG_DEBUG("GGML_OP_NONE in graph, skipping"); + QNN_LOG_DEBUG("GGML_OP_NONE in graph, skipping\n"); continue; } @@ -149,55 +151,27 @@ void get_graph_key_from_cgraph(const ggml_cgraph *cgraph, std::string &output) { } if (cgraph->n_nodes > 1) { - auto *last_op = cgraph->nodes[cgraph->n_nodes - 1]; + auto * last_op = cgraph->nodes[cgraph->n_nodes - 1]; output += qnn::get_ggml_type_name(last_op->type); output += '_'; append_tensor_dimensions(last_op, output); } } -qnn::qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *ctx, ggml_tensor *output) { - auto &graph_cache = ctx->qnn_graph_cache; - std::string graph_key; - get_graph_key_from_op(output, graph_key); - auto it = graph_cache.find(graph_key); - qnn::qnn_graph *graph_ptr = nullptr; - if (it != graph_cache.end()) { - QNN_LOG_DEBUG("[%s]found graph %s in cache", qnn::get_backend_name(ctx->device), graph_key.c_str()); - graph_ptr = it->second.get(); - } else { - auto graph = - std::make_unique(graph_key, ctx->device, ctx->instance, ctx->socinfo.vtcm_size_in_mb); - if (!graph->is_valid()) { - return nullptr; - } - - if (!graph->build_graph_from_op(output)) { - QNN_LOG_ERROR("[%s]build_graph_from_op failed", qnn::get_backend_name(ctx->device)); - return nullptr; - } - - graph_ptr = graph.get(); - graph_cache[graph_key] = std::move(graph); - } - - return graph_ptr; -} - -qnn::qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *ctx, const ggml_cgraph *cgraph) { - auto &graph_cache = ctx->qnn_graph_cache; +qnn::qnn_graph * get_qnn_graph_from_cache(ggml_backend_qnn_device_context * ctx, const ggml_cgraph * cgraph) { + auto & graph_cache = ctx->qnn_graph_cache; std::string graph_key; get_graph_key_from_cgraph(cgraph, graph_key); if (graph_key.empty()) { - QNN_LOG_DEBUG("[%s]empty graph key for cgraph: %p, size: %d", qnn::get_backend_name(ctx->device), cgraph, - (int)cgraph->n_nodes); + QNN_LOG_DEBUG("[%s]empty graph key for cgraph: %p, size: %d\n", qnn::get_backend_name(ctx->device), + (const void *) cgraph, (int) cgraph->n_nodes); return nullptr; } - auto it = graph_cache.find(graph_key); - qnn::qnn_graph *graph_ptr = nullptr; + auto it = graph_cache.find(graph_key); + qnn::qnn_graph * graph_ptr = nullptr; if (it != graph_cache.end()) { - QNN_LOG_DEBUG("[%s]found graph %s in cache", qnn::get_backend_name(ctx->device), graph_key.c_str()); + QNN_LOG_DEBUG("[%s]found graph %s in cache\n", qnn::get_backend_name(ctx->device), graph_key.c_str()); graph_ptr = it->second.get(); } else { auto graph = @@ -207,180 +181,151 @@ qnn::qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *ctx, c } if (!graph->build_graph_from_ggml_graph(cgraph)) { - QNN_LOG_ERROR("[%s]build_graph_from_op failed", qnn::get_backend_name(ctx->device)); + QNN_LOG_ERROR("[%s]build_graph_from_op failed\n", qnn::get_backend_name(ctx->device)); return nullptr; } - graph_ptr = graph.get(); + graph_ptr = graph.get(); graph_cache[graph_key] = std::move(graph); } return graph_ptr; } -bool qnn_generic_op_impl(ggml_backend_qnn_device_context *ctx, 
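// An illustrative sketch of the find-or-build caching pattern described in the
// comment above: graphs are stored in a map keyed by the generated string and
// only compiled on a cache miss. toy_graph and its construction are hypothetical
// stand-ins for qnn::qnn_graph and its build step, not part of this patch.
#include <memory>
#include <string>
#include <unordered_map>

struct toy_graph {
    explicit toy_graph(std::string key) : key(std::move(key)) {}
    std::string key;  // the cache key this graph was built for
};

using toy_graph_cache = std::unordered_map<std::string, std::unique_ptr<toy_graph>>;

// returns the cached graph for key, building and inserting it on a miss
inline toy_graph * find_or_build(toy_graph_cache & cache, const std::string & key) {
    auto it = cache.find(key);
    if (it != cache.end()) {
        return it->second.get();  // hit: reuse the previously built graph
    }

    auto   graph = std::make_unique<toy_graph>(key);
    auto * ptr   = graph.get();
    cache[key]   = std::move(graph);
    return ptr;
}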
ggml_tensor *dst) { - if (!qnn_is_op_valid(ctx, dst)) { - return false; - } - - auto *graph_ptr = get_qnn_graph_from_cache(ctx, dst); - bool succeed = graph_ptr && execute_graph(graph_ptr, dst); - -#ifndef NDEBUG - if (!succeed) { - const auto param_count = qnn::get_qnn_op_input_param_count(dst); - for (size_t i = 0; i < param_count; ++i) { - print_ggml_tensor(dst->src[i]); - } - print_ggml_tensor(dst); - } -#endif - - return succeed; -} - -bool qnn_nop_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); - return true; -} - -constexpr const ggml_qnn_op_t kQnnOpsTable[] = { - qnn_nop_impl, // GGML_OP_NONE - nullptr, // GGML_OP_DUP - qnn_generic_op_impl, // GGML_OP_ADD - nullptr, // GGML_OP_ADD1 - nullptr, // GGML_OP_ACC - qnn_generic_op_impl, // GGML_OP_SUB - qnn_generic_op_impl, // GGML_OP_MUL - qnn_generic_op_impl, // GGML_OP_DIV - nullptr, // GGML_OP_SQR - qnn_generic_op_impl, // GGML_OP_SQRT - qnn_generic_op_impl, // GGML_OP_LOG - nullptr, // GGML_OP_SIN - nullptr, // GGML_OP_COS - nullptr, // GGML_OP_SUM - nullptr, // GGML_OP_SUM_ROWS - nullptr, // GGML_OP_MEAN - nullptr, // GGML_OP_ARGMAX - nullptr, // GGML_OP_COUNT_EQUAL - nullptr, // GGML_OP_REPEAT - nullptr, // GGML_OP_REPEAT_BACK - nullptr, // GGML_OP_CONCAT - nullptr, // GGML_OP_SILU_BACK - nullptr, // GGML_OP_NORM - nullptr, // GGML_OP_RMS_NORM - nullptr, // GGML_OP_RMS_NORM_BACK - nullptr, // GGML_OP_GROUP_NORM - - qnn_generic_op_impl, // GGML_OP_MUL_MAT - nullptr, // GGML_OP_MUL_MAT_ID - nullptr, // GGML_OP_OUT_PROD - - nullptr, // GGML_OP_SCALE - nullptr, // GGML_OP_SET - nullptr, // GGML_OP_CPY - nullptr, // GGML_OP_CONT - qnn_nop_impl, // GGML_OP_RESHAPE - nullptr, // GGML_OP_VIEW - nullptr, // GGML_OP_PERMUTE - nullptr, // GGML_OP_TRANSPOSE - nullptr, // GGML_OP_GET_ROWS - nullptr, // GGML_OP_GET_ROWS_BACK - nullptr, // GGML_OP_DIAG - nullptr, // GGML_OP_DIAG_MASK_INF - nullptr, // GGML_OP_DIAG_MASK_ZERO - nullptr, // GGML_OP_SOFT_MAX - nullptr, // GGML_OP_SOFT_MAX_BACK - nullptr, // GGML_OP_ROPE - nullptr, // GGML_OP_ROPE_BACK - nullptr, // GGML_OP_CLAMP - nullptr, // GGML_OP_CONV_TRANSPOSE_1D - nullptr, // GGML_OP_IM2COL - nullptr, // GGML_OP_IM2COL_BACK - nullptr, // GGML_OP_CONV_TRANSPOSE_2D - nullptr, // GGML_OP_POOL_1D - nullptr, // GGML_OP_POOL_2D - nullptr, // GGML_OP_POOL_2D_BACK - nullptr, // GGML_OP_UPSCALE - nullptr, // GGML_OP_PAD - nullptr, // GGML_OP_PAD_REFLECT_1D - nullptr, // GGML_OP_ARANGE - nullptr, // GGML_OP_TIMESTEP_EMBEDDING - nullptr, // GGML_OP_ARGSORT - nullptr, // GGML_OP_LEAKY_RELU - - nullptr, // GGML_OP_FLASH_ATTN_EXT - nullptr, // GGML_OP_FLASH_ATTN_BACK - nullptr, // GGML_OP_SSM_CONV - nullptr, // GGML_OP_SSM_SCAN - nullptr, // GGML_OP_WIN_PART - nullptr, // GGML_OP_WIN_UNPART - nullptr, // GGML_OP_GET_REL_POS - nullptr, // GGML_OP_ADD_REL_POS - nullptr, // GGML_OP_RWKV_WKV6 - nullptr, // GGML_OP_GATED_LINEAR_ATTN - - nullptr, // GGML_OP_UNARY - - nullptr, // GGML_OP_MAP_UNARY - nullptr, // GGML_OP_MAP_BINARY - - nullptr, // GGML_OP_MAP_CUSTOM1_F32 - nullptr, // GGML_OP_MAP_CUSTOM2_F32 - nullptr, // GGML_OP_MAP_CUSTOM3_F32 - - nullptr, // GGML_OP_MAP_CUSTOM1 - nullptr, // GGML_OP_MAP_CUSTOM2 - nullptr, // GGML_OP_MAP_CUSTOM3 - - nullptr, // GGML_OP_CROSS_ENTROPY_LOSS - nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK - nullptr, // GGML_OP_OPT_STEP_ADAMW +// TODO: could be merge into op caps array +constexpr const bool kQnnSupportedOps[] = { + true, // GGML_OP_NONE + false, // GGML_OP_DUP + true, // GGML_OP_ADD + false, // GGML_OP_ADD1 + false, 
// GGML_OP_ACC + true, // GGML_OP_SUB + true, // GGML_OP_MUL + true, // GGML_OP_DIV + false, // GGML_OP_SQR + true, // GGML_OP_SQRT + true, // GGML_OP_LOG + false, // GGML_OP_SIN + false, // GGML_OP_COS + false, // GGML_OP_SUM + false, // GGML_OP_SUM_ROWS + false, // GGML_OP_MEAN + false, // GGML_OP_ARGMAX + false, // GGML_OP_COUNT_EQUAL + false, // GGML_OP_REPEAT + false, // GGML_OP_REPEAT_BACK + false, // GGML_OP_CONCAT + false, // GGML_OP_SILU_BACK + false, // GGML_OP_NORM + false, // GGML_OP_RMS_NORM + false, // GGML_OP_RMS_NORM_BACK + false, // GGML_OP_GROUP_NORM + + true, // GGML_OP_MUL_MAT + false, // GGML_OP_MUL_MAT_ID + false, // GGML_OP_OUT_PROD + + false, // GGML_OP_SCALE + false, // GGML_OP_SET + false, // GGML_OP_CPY + false, // GGML_OP_CONT + true, // GGML_OP_RESHAPE + false, // GGML_OP_VIEW + false, // GGML_OP_PERMUTE + false, // GGML_OP_TRANSPOSE + false, // GGML_OP_GET_ROWS + false, // GGML_OP_GET_ROWS_BACK + false, // GGML_OP_DIAG + false, // GGML_OP_DIAG_MASK_INF + false, // GGML_OP_DIAG_MASK_ZERO + false, // GGML_OP_SOFT_MAX + false, // GGML_OP_SOFT_MAX_BACK + false, // GGML_OP_ROPE + false, // GGML_OP_ROPE_BACK + false, // GGML_OP_CLAMP + false, // GGML_OP_CONV_TRANSPOSE_1D + false, // GGML_OP_IM2COL + false, // GGML_OP_IM2COL_BACK + false, // GGML_OP_CONV_TRANSPOSE_2D + false, // GGML_OP_POOL_1D + false, // GGML_OP_POOL_2D + false, // GGML_OP_POOL_2D_BACK + false, // GGML_OP_UPSCALE + false, // GGML_OP_PAD + false, // GGML_OP_PAD_REFLECT_1D + false, // GGML_OP_ARANGE + false, // GGML_OP_TIMESTEP_EMBEDDING + false, // GGML_OP_ARGSORT + false, // GGML_OP_LEAKY_RELU + + false, // GGML_OP_FLASH_ATTN_EXT + false, // GGML_OP_FLASH_ATTN_BACK + false, // GGML_OP_SSM_CONV + false, // GGML_OP_SSM_SCAN + false, // GGML_OP_WIN_PART + false, // GGML_OP_WIN_UNPART + false, // GGML_OP_GET_REL_POS + false, // GGML_OP_ADD_REL_POS + false, // GGML_OP_RWKV_WKV6 + false, // GGML_OP_GATED_LINEAR_ATTN + + false, // GGML_OP_UNARY + + false, // GGML_OP_MAP_UNARY + false, // GGML_OP_MAP_BINARY + + false, // GGML_OP_MAP_CUSTOM1_F32 + false, // GGML_OP_MAP_CUSTOM2_F32 + false, // GGML_OP_MAP_CUSTOM3_F32 + + false, // GGML_OP_MAP_CUSTOM1 + false, // GGML_OP_MAP_CUSTOM2 + false, // GGML_OP_MAP_CUSTOM3 + + false, // GGML_OP_CROSS_ENTROPY_LOSS + false, // GGML_OP_CROSS_ENTROPY_LOSS_BACK + false, // GGML_OP_OPT_STEP_ADAMW // ggml_unary_op - nullptr, // GGML_UNARY_OP_ABS - nullptr, // GGML_UNARY_OP_SGN - nullptr, // GGML_UNARY_OP_NEG - nullptr, // GGML_UNARY_OP_STEP - nullptr, // GGML_UNARY_OP_TANH - nullptr, // GGML_UNARY_OP_ELU - nullptr, // GGML_UNARY_OP_RELU - nullptr, // GGML_UNARY_OP_SIGMOID - qnn_generic_op_impl, // GGML_UNARY_OP_GELU - nullptr, // GGML_UNARY_OP_GELU_QUICK - nullptr, // GGML_UNARY_OP_SILU - nullptr, // GGML_UNARY_OP_HARDSWISH - nullptr, // GGML_UNARY_OP_HARDSIGMOID - nullptr, // GGML_UNARY_OP_EXP + false, // GGML_UNARY_OP_ABS + false, // GGML_UNARY_OP_SGN + false, // GGML_UNARY_OP_NEG + false, // GGML_UNARY_OP_STEP + false, // GGML_UNARY_OP_TANH + false, // GGML_UNARY_OP_ELU + false, // GGML_UNARY_OP_RELU + false, // GGML_UNARY_OP_SIGMOID + true, // GGML_UNARY_OP_GELU + false, // GGML_UNARY_OP_GELU_QUICK + false, // GGML_UNARY_OP_SILU + false, // GGML_UNARY_OP_HARDSWISH + false, // GGML_UNARY_OP_HARDSIGMOID + false, // GGML_UNARY_OP_EXP }; -static_assert(kQnnOpsTable[GGML_OP_NONE] == qnn_nop_impl, "GGML_OP_NONE does not match the qnn_nop_impl function"); -static_assert(kQnnOpsTable[GGML_OP_ADD] == qnn_generic_op_impl, - "GGML_OP_ADD does not match the qnn_generic_op_impl 
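// A minimal, self-contained sketch of the pattern behind kQnnSupportedOps above
// and its accompanying static_asserts: a bool table indexed by an op enum, with
// compile-time checks that keep the table and the enum in sync. The enum toy_op
// and the table kToySupported are hypothetical stand-ins, not part of ggml or
// this patch.
#include <iterator>

enum toy_op { TOY_OP_NONE, TOY_OP_ADD, TOY_OP_MUL, TOY_OP_COUNT };

constexpr bool kToySupported[] = {
    true,   // TOY_OP_NONE (treated as a no-op)
    true,   // TOY_OP_ADD
    false,  // TOY_OP_MUL
};

// fail the build if the table and the enum drift apart
static_assert(std::size(kToySupported) == TOY_OP_COUNT, "one entry per toy_op is required");
static_assert(kToySupported[TOY_OP_NONE], "TOY_OP_NONE should stay supported");

constexpr bool toy_supports(toy_op op) { return kToySupported[op]; }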
function"); -static_assert(kQnnOpsTable[GGML_OP_MUL] == qnn_generic_op_impl, - "GGML_OP_MUL does not match the qnn_generic_op_impl function"); -static_assert(kQnnOpsTable[GGML_OP_MUL_MAT] == qnn_generic_op_impl, - "GGML_OP_MUL_MAT does not match the qnn_generic_op_impl function"); -static_assert(kQnnOpsTable[GGML_OP_RESHAPE] == qnn_nop_impl, - "GGML_OP_RESHAPE does not match the qnn_nop_impl function"); -static_assert(kQnnOpsTable[GGML_OP_VIEW] == nullptr, "GGML_OP_VIEW is not nullptr"); -static_assert(std::size(kQnnOpsTable) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), - "GGML_OP_COUNT does not match the size of the kQnnOpsTable table"); - -bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_tensor *tensor) { +static_assert(kQnnSupportedOps[GGML_OP_NONE], "GGML_OP_NONE is not true"); +static_assert(kQnnSupportedOps[GGML_OP_ADD], "GGML_OP_ADD is not true"); +static_assert(kQnnSupportedOps[GGML_OP_MUL], "GGML_OP_MUL is not true"); +static_assert(kQnnSupportedOps[GGML_OP_MUL_MAT], + "GGML_OP_MUL_MAT is not true, please check the kQnnSupportedOps table in the backend-ops.cpp file"); +static_assert(kQnnSupportedOps[GGML_OP_RESHAPE], "GGML_OP_RESHAPE is not true"); +static_assert(!kQnnSupportedOps[GGML_OP_VIEW], "GGML_OP_VIEW is not false"); +static_assert(std::size(kQnnSupportedOps) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), + "GGML_OP_COUNT does not match the size of the kQnnSupportedOps table"); + +bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context * ctx, const ggml_tensor * tensor) { if (!tensor) { - QNN_LOG_DEBUG("tensor is nullptr"); + QNN_LOG_DEBUG("tensor is nullptr\n"); return false; } #ifndef NDEBUG if (tensor->view_src) { - auto *src_tensor = tensor->view_src; - QNN_LOG_DEBUG("[%s]tensor(%s_%dx%dx%dx%d) is a view, src: %s_%dx%dx%dx%d", qnn::get_backend_name(ctx->device), - ggml_get_name(tensor), tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], - ggml_get_name(src_tensor), src_tensor->ne[0], src_tensor->ne[1], src_tensor->ne[2], - src_tensor->ne[3]); + auto * src_tensor = tensor->view_src; + QNN_LOG_DEBUG("[%s]tensor(%s_%dx%dx%dx%d) is a view, src: %s_%dx%dx%dx%d\n", qnn::get_backend_name(ctx->device), + ggml_get_name(tensor), (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2], + (int) tensor->ne[3], ggml_get_name(src_tensor), (int) src_tensor->ne[0], (int) src_tensor->ne[1], + (int) src_tensor->ne[2], (int) src_tensor->ne[3]); } #endif @@ -390,13 +335,14 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_t case GGML_TYPE_Q8_0: case GGML_TYPE_Q4_0: if (!(ctx->supported_types & (uint64_t(1) << tensor->type))) { - QNN_LOG_DEBUG("[%s]unsupported data type %s, supported_types: 0x%x", qnn::get_backend_name(ctx->device), - ggml_type_name(tensor->type), ctx->supported_types); + QNN_LOG_DEBUG("[%s]unsupported data type %s, supported_types: 0x%x\n", + qnn::get_backend_name(ctx->device), ggml_type_name(tensor->type), + (unsigned int) ctx->supported_types); return false; } break; default: - QNN_LOG_DEBUG("[%s]unsupported data type %s", qnn::get_backend_name(ctx->device), + QNN_LOG_DEBUG("[%s]unsupported data type %s\n", qnn::get_backend_name(ctx->device), ggml_type_name(tensor->type)); return false; } @@ -404,7 +350,7 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_t return true; } -bool ggnl_qnn_supports_op_tensor(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op) { +bool ggnl_qnn_supports_op_tensor(ggml_backend_qnn_device_context * ctx, const ggml_tensor * op) { if 
(op->op == GGML_OP_NONE) { return true; } @@ -423,14 +369,14 @@ bool ggnl_qnn_supports_op_tensor(ggml_backend_qnn_device_context *ctx, const ggm return true; } -bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op) { +bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context * ctx, const ggml_tensor * op) { constexpr const size_t kMaxNpuTensorSize = 8192L * 2048 + 8192 * 512 + 2048 * 512; - constexpr const auto get_tensor_size = [](const ggml_tensor *tensor) -> size_t { + constexpr const auto get_tensor_size = [](const ggml_tensor * tensor) -> size_t { return tensor->ne[0] * tensor->ne[1] * tensor->ne[2] * tensor->ne[3]; }; - auto *src0 = op->src[0]; - auto *src1 = op->src[1]; + auto * src0 = op->src[0]; + auto * src1 = op->src[1]; switch (ctx->device) { case QNN_BACKEND_NPU: if (src1->ne[2] != src0->ne[2] || src1->ne[3] != src0->ne[3]) { @@ -438,12 +384,10 @@ bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggm * TODO: remove the blocker here when NPU backend supports mul_mat like this: * [ne03, ne02, n, k] * [ne03 * x, ne02 * y, m, k] -> [ne03 * x, ne02 * y, m, n] */ - QNN_LOG_DEBUG("[qnn-npu][MUL_MAT]src0 and src1 dimensions are not equal, support/unsupported: %d/%d", - ctx->support_op_count.load(), ++(ctx->unsupported_op_count)); + QNN_LOG_DEBUG("[qnn-npu][MUL_MAT]src0 and src1 dimensions are not equal\n"); return false; } else if (get_tensor_size(src0) + get_tensor_size(src1) + get_tensor_size(op) >= kMaxNpuTensorSize) { - QNN_LOG_DEBUG("[qnn-npu][MUL_MAT]tensor size is too large, support/unsupported: %d/%d", - ctx->support_op_count.load(), ++(ctx->unsupported_op_count)); + QNN_LOG_DEBUG("[qnn-npu][MUL_MAT]tensor size is too large\n"); return false; } // fall through, from test here, the convert op is super slow on NPU: @@ -451,9 +395,8 @@ bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggm case QNN_BACKEND_GPU: if (src0->type != src1->type || src0->type != op->type) { // there's no convert op for GPU. 
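// An illustrative sketch of the two NPU-side checks ggml_qnn_supports_matmul_op
// performs above: matching batch dimensions on src0/src1 and a total element
// budget across the three tensors. toy_tensor is a hypothetical stand-in for
// ggml_tensor; the size limit mirrors the constant used above.
#include <cstddef>
#include <cstdint>

struct toy_tensor {
    int64_t ne[4];  // element counts per dimension, ggml-style
};

constexpr size_t kMaxNpuTensorSize = 8192L * 2048 + 8192 * 512 + 2048 * 512;

inline size_t element_count(const toy_tensor & t) {
    return (size_t) (t.ne[0] * t.ne[1] * t.ne[2] * t.ne[3]);
}

// returns true when a MUL_MAT with these operands passes the NPU-side checks
inline bool npu_matmul_fits(const toy_tensor & src0, const toy_tensor & src1, const toy_tensor & dst) {
    if (src1.ne[2] != src0.ne[2] || src1.ne[3] != src0.ne[3]) {
        return false;  // the NPU path requires equal batch dimensions
    }
    return element_count(src0) + element_count(src1) + element_count(dst) < kMaxNpuTensorSize;
}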
- QNN_LOG_DEBUG( - "[qnn-gpu][MUL_MAT]type src0(%d), src1(%d) and op(%d) are not equal, support/unsupported: %d/%d", - src0->type, src1->type, op->type, ctx->support_op_count.load(), ++(ctx->unsupported_op_count)); + QNN_LOG_DEBUG("[qnn-gpu][MUL_MAT]type src0(%s), src1(%s) and op(%s) are not equal\n", + ggml_type_name(src0->type), ggml_type_name(src1->type), ggml_type_name(op->type)); return false; } break; @@ -462,31 +405,31 @@ bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggm } if ((src1->ne[2] % src0->ne[2]) != 0 || (src1->ne[3] % src0->ne[3]) != 0) { - QNN_LOG_DEBUG("[%s][MUL_MAT]src0 and src1 dimensions are not equal, support/unsupported: %d/%d", - qnn::get_backend_name(ctx->device), ctx->support_op_count.load(), ++(ctx->unsupported_op_count)); + QNN_LOG_DEBUG("[%s][MUL_MAT]src0 and src1 dimensions are not equal\n", qnn::get_backend_name(ctx->device)); return false; } - QNN_LOG_DEBUG("[%s][MUL_MAT]supported matmul op, support/unsupported: %d/%d", qnn::get_backend_name(ctx->device), - ++(ctx->support_op_count), ctx->unsupported_op_count.load()); + QNN_LOG_DEBUG("[%s][MUL_MAT]supported matmul op\n", qnn::get_backend_name(ctx->device)); return true; } -} // namespace +} // namespace namespace qnn { -bool device_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op) { +bool device_supports_op(ggml_backend_qnn_device_context * ctx, const ggml_tensor * op) { // Note that this function could be called before the device context is initialized if (op->op == GGML_OP_NONE) { return true; } - if (!kQnnOpsTable[qnn::get_qnn_op_index(op)]) { + if (!kQnnSupportedOps[qnn::get_qnn_op_index(op)]) { #ifndef NDEBUG std::string op_key; get_graph_key_from_op(op, op_key); - QNN_LOG_DEBUG("[%s]unsupported op", op_key.c_str()); + ctx->unsupported_op_count++; + QNN_LOG_DEBUG("[%s][%s]op was unsupported, support/unsupported: %d/%d\n", qnn::get_backend_name(ctx->device), + op_key.c_str(), ctx->supported_op_count.load(), ctx->unsupported_op_count.load()); #endif return false; } @@ -495,48 +438,69 @@ bool device_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor #ifndef NDEBUG std::string tensor_dims; append_tensor_dimensions(op, tensor_dims); - QNN_LOG_DEBUG("[%s]unsupported tensor(%s)", ggml_op_name(op->op), tensor_dims.c_str()); + QNN_LOG_DEBUG("[%s][%s]unsupported tensor(%s), support/unsupported: %d/%d\n", + qnn::get_backend_name(ctx->device), ggml_op_name(op->op), tensor_dims.c_str(), + ctx->supported_op_count.load(), ctx->unsupported_op_count.load()); #endif return false; } + bool is_op_supported = true; if (op->op == GGML_OP_UNARY) { const auto unary_op = ggml_get_unary_op(op); if (unary_op == GGML_UNARY_OP_GELU) { // TODO: fix this - QNN_LOG_DEBUG("[GELU]unsupported unary op GGML_UNARY_OP_GELU for NPU"); - return false; + QNN_LOG_DEBUG("[GELU]unsupported unary op GGML_UNARY_OP_GELU for NPU\n"); + is_op_supported = false; } } else { - auto *src0 = op->src[0]; - auto *src1 = op->src[1]; + auto * src0 = op->src[0]; + auto * src1 = op->src[1]; switch (op->op) { case GGML_OP_ADD: + case GGML_OP_SUB: + case GGML_OP_MUL: + case GGML_OP_DIV: if (!ggml_are_same_shape(src0, src1)) { - QNN_LOG_DEBUG("[ADD] src0 and src1 dimensions are not equal"); - return false; + QNN_LOG_DEBUG("[%s][%s] src0 and src1 dimensions are not equal\n", + qnn::get_backend_name(ctx->device), ggml_op_name(op->op)); + is_op_supported = false; } break; - case GGML_OP_MUL_MAT: - return ggml_qnn_supports_matmul_op(ctx, op); + is_op_supported = ggml_qnn_supports_matmul_op(ctx, op); 
+ break; default: - return false; + // default to supported + break; } } - return true; +#ifndef NDEBUG + if (is_op_supported) { + ctx->supported_op_count++; + QNN_LOG_DEBUG("[%s][%s]op was supported, support/unsupported: %d/%d\n", qnn::get_backend_name(ctx->device), + ggml_op_name(op->op), ctx->supported_op_count.load(), ctx->unsupported_op_count.load()); + } else { + ctx->unsupported_op_count++; + QNN_LOG_DEBUG("[%s][%s]op was unsupported, support/unsupported: %d/%d\n", qnn::get_backend_name(ctx->device), + ggml_op_name(op->op), ctx->supported_op_count.load(), ctx->unsupported_op_count.load()); + } +#endif + + return is_op_supported; } -bool device_compute_graph(ggml_backend_qnn_device_context *ctx, ggml_cgraph *cgraph) { - QNN_LOG_DEBUG("[%s]compute graph start, nodes count: %d", qnn::get_backend_name(ctx->device), (int)cgraph->n_nodes); +bool device_compute_graph(ggml_backend_qnn_device_context * ctx, ggml_cgraph * cgraph) { + QNN_LOG_DEBUG("[%s]compute graph start, nodes count: %d\n", qnn::get_backend_name(ctx->device), + (int) cgraph->n_nodes); auto qnn_graph = get_qnn_graph_from_cache(ctx, cgraph); - bool success = qnn_graph && qnn_graph->execute(cgraph); + bool success = qnn_graph && qnn_graph->execute(cgraph); - QNN_LOG_DEBUG("[%s]compute graph, success: %d", qnn::get_backend_name(ctx->device), (int)success); + QNN_LOG_DEBUG("[%s]compute graph, success: %d\n", qnn::get_backend_name(ctx->device), (int) success); return success; } -} // namespace qnn +} // namespace qnn diff --git a/ggml/src/ggml-qnn/backend-ops.hpp b/ggml/src/ggml-qnn/backend-ops.hpp index c49c4d6dc19d7..64fb10f00ddfe 100644 --- a/ggml/src/ggml-qnn/backend-ops.hpp +++ b/ggml/src/ggml-qnn/backend-ops.hpp @@ -1,12 +1,11 @@ #pragma once -#include "ggml.h" - #include "backend.hpp" +#include "ggml.h" namespace qnn { -bool device_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op); -bool device_compute_graph(ggml_backend_qnn_device_context *ctx, ggml_cgraph *cgraph); +bool device_supports_op(ggml_backend_qnn_device_context * ctx, const ggml_tensor * op); +bool device_compute_graph(ggml_backend_qnn_device_context * ctx, ggml_cgraph * cgraph); -} // namespace qnn +} // namespace qnn diff --git a/ggml/src/ggml-qnn/backend.hpp b/ggml/src/ggml-qnn/backend.hpp index df5e2eb08fb8f..253b0b672383d 100644 --- a/ggml/src/ggml-qnn/backend.hpp +++ b/ggml/src/ggml-qnn/backend.hpp @@ -2,7 +2,7 @@ #pragma once #ifndef NDEBUG -#include +# include #endif #include @@ -10,39 +10,41 @@ #include #include -#include "ggml.h" - #include "ggml-backend.h" #include "ggml-qnn.h" - +#include "ggml.h" #include "graph.hpp" #include "qnn-lib.hpp" namespace qnn { typedef std::unordered_map> qnn_graph_cache_t; -} // namespace qnn +} // namespace qnn struct ggml_backend_qnn_device_context { // initialize in constructor - QNNBackend device; - size_t threads; + QNNBackend device; + size_t threads; std::string name; std::string lib_name; // initialize in qnn init - qnn::qcom_socinfo socinfo = {}; - uint64_t supported_types; - std::shared_ptr instance; + qnn::qcom_socinfo socinfo = {}; + uint64_t supported_types; + std::shared_ptr instance; std::shared_ptr qnn_interface; qnn::qnn_graph_cache_t qnn_graph_cache; #ifndef NDEBUG - std::atomic_uint32_t support_op_count = 0; + std::atomic_uint32_t supported_op_count = 0; std::atomic_uint32_t unsupported_op_count = 0; #endif - explicit ggml_backend_qnn_device_context(QNNBackend device, size_t threads, const char *name, const char *lib_name, - uint64_t supported_types) - : device(device), 
threads(threads), name(name), lib_name(lib_name), supported_types(supported_types) {} + explicit ggml_backend_qnn_device_context(QNNBackend device, size_t threads, const char * name, + const char * lib_name, uint64_t supported_types) : + device(device), + threads(threads), + name(name), + lib_name(lib_name), + supported_types(supported_types) {} }; diff --git a/ggml/src/ggml-qnn/buffer.hpp b/ggml/src/ggml-qnn/buffer.hpp index ce796cbe4df08..43c4666dd15b1 100644 --- a/ggml/src/ggml-qnn/buffer.hpp +++ b/ggml/src/ggml-qnn/buffer.hpp @@ -14,7 +14,7 @@ namespace qnn { * This abstract class defines the interface for managing generic memory buffers in a QNN context. */ class qnn_buffer_interface { -public: + public: virtual ~qnn_buffer_interface() = default; /** @@ -35,7 +35,7 @@ class qnn_buffer_interface { * * @return A pointer to the buffer. */ - virtual uint8_t *get_buffer() = 0; + virtual uint8_t * get_buffer() = 0; /** * @brief Gets the buffer pointer. @@ -68,21 +68,22 @@ using qnn_buffer_ptr = std::shared_ptr; * handles cleanup of the buffer and its associated memory handle upon destruction. */ class qnn_rpc_buffer : public qnn_buffer_interface { -public: + public: qnn_rpc_buffer(std::shared_ptr qnn_instance, const size_t size, const uint32_t rank, - uint32_t *dimensions, Qnn_DataType_t data_type) - : _size(size), _qnn_instance(qnn_instance) { - - _qnn_rpc_buffer = static_cast(qnn_instance->alloc_rpcmem(size, alignof(uint8_t *))); + uint32_t * dimensions, Qnn_DataType_t data_type) : + _size(size), + _qnn_instance(qnn_instance) { + _qnn_rpc_buffer = static_cast(qnn_instance->alloc_rpcmem(size, alignof(uint8_t *))); _qnn_rpc_mem_handle = qnn_instance->register_rpcmem(_qnn_rpc_buffer, rank, dimensions, data_type); if (!_qnn_rpc_buffer || !_qnn_rpc_mem_handle) { - QNN_LOG_WARN("Failed to register RPC memory: buffer or memory handle is null"); + QNN_LOG_WARN("Failed to register RPC memory: buffer or memory handle is null\n"); // let the destructor free the buffer return; } - QNN_LOG_DEBUG("alloc rpcmem(%p) successfully, size %d", _qnn_rpc_buffer, (int)size); + QNN_LOG_DEBUG("alloc rpcmem(%p) successfully, size %d\n", (void *) _qnn_rpc_buffer, (int) size); } + ~qnn_rpc_buffer() { if (_qnn_instance) { if (_qnn_rpc_mem_handle) { @@ -97,14 +98,16 @@ class qnn_rpc_buffer : public qnn_buffer_interface { bool is_valid() const override { return _qnn_rpc_buffer && _qnn_rpc_mem_handle; } - uint8_t *get_buffer() override { return _qnn_rpc_buffer; } + uint8_t * get_buffer() override { return _qnn_rpc_buffer; } + size_t get_size() const override { return _size; } + Qnn_MemHandle_t get_mem_handle() const override { return _qnn_rpc_mem_handle; } -private: - size_t _size = 0; - uint8_t *_qnn_rpc_buffer = nullptr; - Qnn_MemHandle_t _qnn_rpc_mem_handle = nullptr; + private: + size_t _size = 0; + uint8_t * _qnn_rpc_buffer = nullptr; + Qnn_MemHandle_t _qnn_rpc_mem_handle = nullptr; std::shared_ptr _qnn_instance; DISABLE_COPY(qnn_rpc_buffer); @@ -119,12 +122,12 @@ class qnn_rpc_buffer : public qnn_buffer_interface { * a consistent interface for buffer management. 
*/ class qnn_mem_buffer : public qnn_buffer_interface { -public: - explicit qnn_mem_buffer(const uint8_t *data, size_t size) { + public: + explicit qnn_mem_buffer(const uint8_t * data, size_t size) { _buffer = reinterpret_cast(qnn::page_align_alloc(size)); if (!_buffer) { - QNN_LOG_WARN("failed to allocate %.2f MiB", float(size / (1 << 20))); + QNN_LOG_WARN("failed to allocate %.2f MiB\n", float(size / (1 << 20))); return; } @@ -134,49 +137,51 @@ class qnn_mem_buffer : public qnn_buffer_interface { memcpy(_buffer, data, size); } - QNN_LOG_DEBUG("alloc buffer: %p, size: %ld", _buffer, size); + QNN_LOG_DEBUG("alloc buffer: %p, size: %ld\n", (void *) _buffer, (long) size); } explicit qnn_mem_buffer(size_t size) : qnn_mem_buffer(nullptr, size) {} ~qnn_mem_buffer() { - QNN_LOG_DEBUG("free buffer: %p, size: %ld", _buffer, _size); + QNN_LOG_DEBUG("free buffer: %p, size: %ld\n", (void *) _buffer, (long) _size); // the free will do nothing if the _buffer is nullptr qnn::align_free(_buffer); } bool is_valid() const override { return _buffer != nullptr; } - uint8_t *get_buffer() override { return _buffer; } + uint8_t * get_buffer() override { return _buffer; } + size_t get_size() const override { return _size; } + Qnn_MemHandle_t get_mem_handle() const override { return nullptr; } -private: - size_t _size = 0; - uint8_t *_buffer = nullptr; + private: + size_t _size = 0; + uint8_t * _buffer = nullptr; DISABLE_COPY(qnn_mem_buffer); DISABLE_MOVE(qnn_mem_buffer); }; class qnn_mem_buffer_slice : public qnn_buffer_interface { -public: - qnn_mem_buffer_slice(const uint8_t *buffer, size_t size) : _buffer(const_cast(buffer)), _size(size) {} + public: + qnn_mem_buffer_slice(const uint8_t * buffer, size_t size) : _buffer(const_cast(buffer)), _size(size) {} bool is_valid() const override { return _buffer && _size; } - uint8_t *get_buffer() override { return _buffer; } + uint8_t * get_buffer() override { return _buffer; } size_t get_size() const override { return _size; } Qnn_MemHandle_t get_mem_handle() const override { return nullptr; } -private: - uint8_t *_buffer = nullptr; - size_t _size = 0; + private: + uint8_t * _buffer = nullptr; + size_t _size = 0; DISABLE_COPY(qnn_mem_buffer_slice); DISABLE_MOVE(qnn_mem_buffer_slice); }; -} // namespace qnn +} // namespace qnn diff --git a/ggml/src/ggml-qnn/dl-loader.hpp b/ggml/src/ggml-qnn/dl-loader.hpp new file mode 100644 index 0000000000000..e183d190ce18f --- /dev/null +++ b/ggml/src/ggml-qnn/dl-loader.hpp @@ -0,0 +1,76 @@ +#pragma once + +#ifdef __linux__ +# include +# include +#elif defined(_WIN32) +# define WIN32_LEAN_AND_MEAN +# ifndef NOMINMAX +# define NOMINMAX +# endif +# include +#endif + +#include + +namespace qnn { + +#ifdef __linux__ +typedef void * dl_handler_t; + +inline qnn::dl_handler_t dl_load(const std::string & lib_path) { + return dlopen(lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); +} + +inline void * dl_sym(qnn::dl_handler_t handle, const std::string & symbol) { + return dlsym(handle, symbol.c_str()); +} + +inline bool dl_unload(qnn::dl_handler_t handle) { + return dlclose(handle) == 0; +} + +inline const char * dl_error() { + return dlerror(); +} +#elif defined(_WIN32) +using dl_handler_t = HMODULE; + +inline qnn::dl_handler_t dl_load(const std::string & lib_path) { + // suppress error dialogs for missing DLLs + auto old_mode = SetErrorMode(SEM_FAILCRITICALERRORS); + SetErrorMode(old_mode | SEM_FAILCRITICALERRORS); + + auto handle = LoadLibraryA(lib_path.c_str()); // TODO: use wstring version for unicode paths + + SetErrorMode(old_mode); + return 
handle; +} + +inline void * dl_sym(qnn::dl_handler_t handle, const std::string & symbol) { + auto old_mode = SetErrorMode(SEM_FAILCRITICALERRORS); + SetErrorMode(old_mode | SEM_FAILCRITICALERRORS); + + void * p = (void *) GetProcAddress(handle, symbol.c_str()); + + SetErrorMode(old_mode); + return p; +} + +inline bool dl_unload(qnn::dl_handler_t handle) { + FreeLibrary(handle); + return true; +} + +inline const char * dl_error() { + // TODO: implement dl_error for Windows + return nullptr; +} + +#endif + +template Fn dl_sym_typed(qnn::dl_handler_t handle, const std::string & function_name) { + return reinterpret_cast(dl_sym(handle, function_name)); +} + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/dl_loader.hpp b/ggml/src/ggml-qnn/dl_loader.hpp deleted file mode 100644 index 1beec8866ba4c..0000000000000 --- a/ggml/src/ggml-qnn/dl_loader.hpp +++ /dev/null @@ -1,71 +0,0 @@ -#pragma once - -#ifdef __linux__ -#include -#include -#elif defined(_WIN32) -#define WIN32_LEAN_AND_MEAN -#ifndef NOMINMAX -#define NOMINMAX -#endif -#include -#endif - -#include - -namespace qnn { - -#ifdef __linux__ -typedef void *dl_handler_t; - -inline qnn::dl_handler_t dl_load(const std::string &lib_path) { - return dlopen(lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); -} - -inline void *dl_sym(qnn::dl_handler_t handle, const std::string &symbol) { return dlsym(handle, symbol.c_str()); } - -inline bool dl_unload(qnn::dl_handler_t handle) { return dlclose(handle) == 0; } - -inline const char *dl_error() { return dlerror(); } -#elif defined(_WIN32) -using dl_handler_t = HMODULE; - -inline qnn::dl_handler_t dl_load(const std::string &lib_path) { - // suppress error dialogs for missing DLLs - auto old_mode = SetErrorMode(SEM_FAILCRITICALERRORS); - SetErrorMode(old_mode | SEM_FAILCRITICALERRORS); - - auto handle = LoadLibraryA(lib_path.c_str()); // TODO: use wstring version for unicode paths - - SetErrorMode(old_mode); - return handle; -} - -inline void *dl_sym(qnn::dl_handler_t handle, const std::string &symbol) { - auto old_mode = SetErrorMode(SEM_FAILCRITICALERRORS); - SetErrorMode(old_mode | SEM_FAILCRITICALERRORS); - - void *p = (void *)GetProcAddress(handle, symbol.c_str()); - - SetErrorMode(old_mode); - return p; -} - -inline bool dl_unload(qnn::dl_handler_t handle) { - FreeLibrary(handle); - return true; -} - -inline const char *dl_error() { - // TODO: implement dl_error for Windows - return nullptr; -} - -#endif - -template -Fn dl_sym_typed(qnn::dl_handler_t handle, const std::string &function_name) { - return reinterpret_cast(dl_sym(handle, function_name)); -} - -} // namespace qnn diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index 8150dcb9ea240..626ba2cce9520 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -6,7 +6,6 @@ #include "ggml-backend-impl.h" #include "ggml-impl.h" - #include "ggml-qnn/backend-ops.hpp" #include "ggml-qnn/backend.hpp" #include "ggml-qnn/logger.hpp" @@ -19,9 +18,9 @@ // // ================================================================================================= #ifdef NDEBUG -#define ENABLE_QNNBACKEND_PERF 0 // enable/disable op's perf info +# define ENABLE_QNNBACKEND_PERF 0 // enable/disable op's perf info #else -#define ENABLE_QNNBACKEND_PERF 1 // enable/disable op's perf info +# define ENABLE_QNNBACKEND_PERF 1 // enable/disable op's perf info #endif #define QNN_BACKEND_NAME "qnn" @@ -29,50 +28,42 @@ namespace { #ifdef _WIN32 -constexpr const char *kQnnCpuLibName = "QnnCpu.dll"; -constexpr const char 
*kQnnGpuLibName = "QnnGpu.dll"; -constexpr const char *kQnnNpuLibName = "QnnHtp.dll"; +constexpr const char * kQnnCpuLibName = "QnnCpu.dll"; +constexpr const char * kQnnGpuLibName = "QnnGpu.dll"; +constexpr const char * kQnnNpuLibName = "QnnHtp.dll"; #else -constexpr const char *kQnnCpuLibName = "libQnnCpu.so"; -constexpr const char *kQnnGpuLibName = "libQnnGpu.so"; -constexpr const char *kQnnNpuLibName = "libQnnHtp.so"; +constexpr const char * kQnnCpuLibName = "libQnnCpu.so"; +constexpr const char * kQnnGpuLibName = "libQnnGpu.so"; +constexpr const char * kQnnNpuLibName = "libQnnHtp.so"; #endif struct qnn_device_caps { - const char *name; - const char *description; - const char *lib_name; + const char * name; + const char * description; + const char * lib_name; enum ggml_backend_dev_type type; // TODO: should get this caps from device uint64_t supported_types; }; +// TODO: should move this to qnn-lib.cpp constexpr const qnn_device_caps kDeviceCaps[] = { { - // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/CpuOpDefSupplement.html#matmul - "qnn-cpu", - "Qualcomm Kryo CPU", - kQnnCpuLibName, - GGML_BACKEND_DEVICE_TYPE_CPU, - (1 << GGML_TYPE_I8) | (1 << GGML_TYPE_F32), - }, + "qnn-cpu", "Qualcomm Kryo CPU", + kQnnCpuLibName, GGML_BACKEND_DEVICE_TYPE_CPU, + (1 << GGML_TYPE_I8) | (1 << GGML_TYPE_F32), + }, // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/CpuOpDefSupplement.html#matmul { - // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/GpuOpDefSupplement.html#matmul - "qnn-gpu", - "Qualcomm Adreno GPU", - kQnnGpuLibName, - GGML_BACKEND_DEVICE_TYPE_GPU, - (1 << GGML_TYPE_F32) | (1 << GGML_TYPE_F16), - }, + "qnn-gpu", "Qualcomm Adreno GPU", + kQnnGpuLibName, GGML_BACKEND_DEVICE_TYPE_GPU, + (1 << GGML_TYPE_F32) | (1 << GGML_TYPE_F16), + }, // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/GpuOpDefSupplement.html#matmul { - // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/HtpOpDefSupplement.html#matmul - "qnn-npu", - "Qualcomm NPU", - kQnnNpuLibName, - GGML_BACKEND_DEVICE_TYPE_ACCEL, - (1 << GGML_TYPE_F32) | (1 << GGML_TYPE_F16) | (1 << GGML_TYPE_I16) | (1 << GGML_TYPE_I8), - }, + "qnn-npu", "Qualcomm NPU", + kQnnNpuLibName, GGML_BACKEND_DEVICE_TYPE_ACCEL, + (1 << GGML_TYPE_F32) | (1 << GGML_TYPE_F16) | (1 << GGML_TYPE_I16) | (1 << GGML_TYPE_I8), + }, // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/HtpOpDefSupplement.html#matmul }; static_assert(sizeof(kDeviceCaps) / sizeof(kDeviceCaps[0]) == GGML_QNN_MAX_DEVICES, @@ -85,11 +76,11 @@ static_assert(kDeviceCaps[QNN_BACKEND_GPU].type == GGML_BACKEND_DEVICE_TYPE_GPU, static_assert(kDeviceCaps[QNN_BACKEND_CPU].type == GGML_BACKEND_DEVICE_TYPE_CPU, "The NPU device should be an accelerator device"); -ggml_backend_qnn_device_context *get_device_context(ggml_backend_dev_t dev) { +ggml_backend_qnn_device_context * get_device_context(ggml_backend_dev_t dev) { return reinterpret_cast(dev->context); } -qnn::qnn_buffer_interface *get_buffer_context(ggml_backend_buffer_t buffer) { +qnn::qnn_buffer_interface * get_buffer_context(ggml_backend_buffer_t buffer) { return reinterpret_cast(buffer->context); } @@ -99,34 +90,34 @@ qnn::qnn_buffer_interface *get_buffer_context(ggml_backend_buffer_t buffer) { * ----------------------------------------------------------------------------------------------- */ void ggml_backend_qnn_buffer_free_buffer(ggml_backend_buffer_t buffer) { - auto *ctx = get_buffer_context(buffer); + auto * ctx = 
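// A small sketch of how the supported_types bitmask in kDeviceCaps above is
// meant to be queried: bit i is set when ggml type i is supported. The toy_type
// values below are hypothetical stand-ins for the real ggml_type constants.
#include <cstdint>

enum toy_type { TOY_TYPE_F32 = 0, TOY_TYPE_F16 = 1, TOY_TYPE_I8 = 24 };

constexpr uint64_t kToyCpuTypes = (uint64_t(1) << TOY_TYPE_F32) | (uint64_t(1) << TOY_TYPE_I8);

constexpr bool type_supported(uint64_t mask, toy_type type) {
    return (mask & (uint64_t(1) << type)) != 0;
}

static_assert(type_supported(kToyCpuTypes, TOY_TYPE_F32), "F32 is in the sketch CPU mask");
static_assert(!type_supported(kToyCpuTypes, TOY_TYPE_F16), "F16 is not in the sketch CPU mask");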
get_buffer_context(buffer); delete ctx; } -void *ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t buffer) { - auto *ctx = get_buffer_context(buffer); +void * ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t buffer) { + auto * ctx = get_buffer_context(buffer); return ctx->get_buffer(); } -void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor) { +void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { GGML_UNUSED(buffer); GGML_UNUSED(tensor); // TODO: we should create the qnn tensor along with the ggml tensor } -void ggml_backend_qnn_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor, const void *data, +void ggml_backend_qnn_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { GGML_UNUSED(buffer); - memcpy((char *)tensor->data + offset, data, size); + memcpy((char *) tensor->data + offset, data, size); } -void ggml_backend_qnn_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor *tensor, void *data, +void ggml_backend_qnn_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { GGML_UNUSED(buffer); - memcpy(data, (const char *)tensor->data + offset, size); + memcpy(data, (const char *) tensor->data + offset, size); } -bool ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor *src, ggml_tensor *dst) { +bool ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { GGML_UNUSED(buffer); if (ggml_backend_buffer_is_host(src->buffer)) { memcpy(dst->data, src->data, ggml_nbytes(src)); @@ -137,7 +128,7 @@ bool ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml } void ggml_backend_qnn_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { - auto *ctx = get_buffer_context(buffer); + auto * ctx = get_buffer_context(buffer); memset(ctx->get_buffer(), value, ctx->get_size()); } @@ -158,19 +149,19 @@ constexpr const ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = { * qnn backend object * ----------------------------------------------------------------------------------------------- */ -const char *ggml_backend_qnn_buffer_type_name(ggml_backend_buffer_type_t buft) { - auto *dev_ctx = get_device_context(buft->device); +const char * ggml_backend_qnn_buffer_type_name(ggml_backend_buffer_type_t buft) { + auto * dev_ctx = get_device_context(buft->device); return qnn::get_backend_name(dev_ctx->device); } ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - qnn::qnn_buffer_interface *ctx = new qnn::qnn_mem_buffer(size); + qnn::qnn_buffer_interface * ctx = new qnn::qnn_mem_buffer(size); if (!ctx->is_valid()) { return nullptr; } - QNN_LOG_DEBUG("[%s]alloc buffer: %p, size: %ld", qnn::get_backend_name(get_device_context(buft->device)->device), - ctx->get_buffer(), size); + QNN_LOG_DEBUG("[%s]alloc buffer: %p, size: %ld\n", qnn::get_backend_name(get_device_context(buft->device)->device), + (void *) ctx->get_buffer(), (long) size); return ggml_backend_buffer_init(buft, ggml_backend_qnn_buffer_interface, ctx, size); } @@ -192,16 +183,16 @@ bool ggml_backend_qnn_buffer_is_host(ggml_backend_buffer_type_t buft) { return true; } -const char *ggml_backend_qnn_name(ggml_backend_t backend) { - auto *device_ctx = get_device_context(backend->device); +const char * ggml_backend_qnn_name(ggml_backend_t 
backend) { + auto * device_ctx = get_device_context(backend->device); return device_ctx->name.c_str(); } void ggml_backend_qnn_free(ggml_backend_t backend) { - auto *device_ctx = get_device_context(backend->device); - QNN_LOG_INFO("idx %d, name:%s", device_ctx->device, device_ctx->name.c_str()); + auto * device_ctx = get_device_context(backend->device); + QNN_LOG_INFO("idx %d, name:%s\n", device_ctx->device, device_ctx->name.c_str()); - auto &instance = device_ctx->instance; + auto & instance = device_ctx->instance; if (instance) { device_ctx->qnn_graph_cache.clear(); device_ctx->qnn_interface.reset(); @@ -212,35 +203,33 @@ void ggml_backend_qnn_free(ggml_backend_t backend) { delete backend; } -bool ggml_backend_qnn_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor *src, - ggml_tensor *dst) { +bool ggml_backend_qnn_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, + ggml_tensor * dst) { GGML_UNUSED(backend_src); GGML_UNUSED(backend_dst); GGML_UNUSED(src); GGML_UNUSED(dst); - QNN_LOG_DEBUG("opy form %s to %s, src_is_qnn: %d, dst_is_qnn: %d", ggml_get_name(src), ggml_get_name(dst), - (int)ggml_backend_is_qnn(backend_src), (int)ggml_backend_is_qnn(backend_dst)); + QNN_LOG_DEBUG("opy form %s to %s, src_is_qnn: %d, dst_is_qnn: %d\n", ggml_get_name(src), ggml_get_name(dst), + (int) ggml_backend_is_qnn(backend_src), (int) ggml_backend_is_qnn(backend_dst)); return false; } ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(ggml_backend_dev_t dev) { static ggml_backend_buffer_type ggml_backend_qnn_buffer_types[GGML_QNN_MAX_DEVICES]; - auto *dev_ctx = get_device_context(dev); + auto * dev_ctx = get_device_context(dev); if (!ggml_backend_qnn_buffer_types[dev_ctx->device].device) { ggml_backend_qnn_buffer_types[dev_ctx->device] = { /* .iface = */ { - /* .get_name = */ ggml_backend_qnn_buffer_type_name, - /* .alloc_buffer = */ - ggml_backend_qnn_buffer_type_alloc_buffer, - /* .get_alignment = */ - ggml_backend_qnn_buffer_type_get_alignment, - /* .get_max_size = */ - ggml_backend_qnn_buffer_type_get_max_size, - /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes + /* .get_name = */ ggml_backend_qnn_buffer_type_name, + /* .alloc_buffer = */ + ggml_backend_qnn_buffer_type_alloc_buffer, /* .get_alignment = */ + ggml_backend_qnn_buffer_type_get_alignment, /* .get_max_size = */ + ggml_backend_qnn_buffer_type_get_max_size, /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes /* .is_host = */ ggml_backend_qnn_buffer_is_host, - }, - /* .device */ dev, + }, + /* .device */ + dev, /* .context = */ nullptr, }; } else { @@ -250,9 +239,9 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(ggml_backend_dev_t dev) return &ggml_backend_qnn_buffer_types[dev_ctx->device]; } -ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, ggml_cgraph *cgraph) { - return qnn::device_compute_graph(get_device_context(backend->device), cgraph) ? GGML_STATUS_SUCCESS - : GGML_STATUS_FAILED; +ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { + return qnn::device_compute_graph(get_device_context(backend->device), cgraph) ? 
GGML_STATUS_SUCCESS : + GGML_STATUS_FAILED; } constexpr const ggml_backend_i ggml_backend_qnn_interface = { @@ -276,31 +265,31 @@ constexpr const ggml_backend_i ggml_backend_qnn_interface = { * qnn backend device object * ----------------------------------------------------------------------------------------------- */ -const char *ggml_backend_qnn_device_get_name(ggml_backend_dev_t dev) { - const auto &caps = kDeviceCaps[get_device_context(dev)->device]; +const char * ggml_backend_qnn_device_get_name(ggml_backend_dev_t dev) { + const auto & caps = kDeviceCaps[get_device_context(dev)->device]; return caps.name; } -const char *ggml_backend_qnn_device_get_description(ggml_backend_dev_t dev) { - const auto &caps = kDeviceCaps[get_device_context(dev)->device]; +const char * ggml_backend_qnn_device_get_description(ggml_backend_dev_t dev) { + const auto & caps = kDeviceCaps[get_device_context(dev)->device]; return caps.description; } -void ggml_backend_qnn_device_get_memory(ggml_backend_dev_t dev, size_t *free, size_t *total) { +void ggml_backend_qnn_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { GGML_UNUSED(dev); - *free = qnn::get_system_free_memory_in_bytes(); + *free = qnn::get_system_free_memory_in_bytes(); *total = qnn::get_system_total_memory_in_bytes(); - QNN_LOG_DEBUG("free memory: %ldMB, total memory: %ldMB", (*free / 1048576), (*total) / 1048576); + QNN_LOG_DEBUG("free memory: %ldMB, total memory: %ldMB\n", (*free / 1048576), (*total) / 1048576); } enum ggml_backend_dev_type ggml_backend_qnn_device_get_type(ggml_backend_dev_t dev) { return kDeviceCaps[get_device_context(dev)->device].type; } -void ggml_backend_qnn_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props *props) { - props->name = ggml_backend_qnn_device_get_name(dev); +void ggml_backend_qnn_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) { + props->name = ggml_backend_qnn_device_get_name(dev); props->description = ggml_backend_qnn_device_get_description(dev); - props->type = ggml_backend_qnn_device_get_type(dev); + props->type = ggml_backend_qnn_device_get_type(dev); ggml_backend_qnn_device_get_memory(dev, &props->memory_free, &props->memory_total); props->caps = { /* async */ false, @@ -311,12 +300,12 @@ void ggml_backend_qnn_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_ } ggml_guid_t ggml_backend_qnn_guid() { - static ggml_guid guid = {0x1a, 0x2b, 0x3c, 0x4d, 0x5e, 0x6f, 0x70, 0x81, - 0x92, 0xa3, 0xb4, 0xc5, 0xd6, 0xe7, 0xf8, 0x09}; + static ggml_guid guid = { 0x1a, 0x2b, 0x3c, 0x4d, 0x5e, 0x6f, 0x70, 0x81, + 0x92, 0xa3, 0xb4, 0xc5, 0xd6, 0xe7, 0xf8, 0x09 }; return &guid; } -ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, const char *extend_lib_search_path) { +ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, const char * extend_lib_search_path) { if (!extend_lib_search_path) { extend_lib_search_path = GGML_QNN_DEFAULT_LIB_SEARCH_PATH; QNN_LOG_WARN( @@ -324,27 +313,27 @@ ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, "use " GGML_QNN_DEFAULT_LIB_SEARCH_PATH " as default"); } - auto *dev_ctx = get_device_context(dev); - const auto device = dev_ctx->device; - QNN_LOG_DEBUG("device %s", qnn::get_backend_name(device)); - QNN_LOG_DEBUG("extend_lib_search_path %s", extend_lib_search_path); + auto * dev_ctx = get_device_context(dev); + const auto device = dev_ctx->device; + QNN_LOG_DEBUG("device %s\n", qnn::get_backend_name(device)); + 
QNN_LOG_DEBUG("extend_lib_search_path %s\n", extend_lib_search_path); auto instance = std::make_shared(extend_lib_search_path, dev_ctx->lib_name); - auto result = instance->qnn_init(nullptr); + auto result = instance->qnn_init(nullptr); if (result != 0) { - QNN_LOG_WARN("failed to init qnn backend %s", qnn::get_backend_name(device)); + QNN_LOG_WARN("failed to init qnn backend %s\n", qnn::get_backend_name(device)); return nullptr; } auto qnn_interface = instance->get_qnn_interface(); if (!qnn_interface) { - QNN_LOG_WARN("qnn subsystem failure"); + QNN_LOG_WARN("qnn subsystem failure\n"); return nullptr; } std::string device_name = qnn::get_backend_name(device); - QNN_LOG_INFO("qnn device name %s", device_name.c_str()); - dev_ctx->instance = instance; - dev_ctx->qnn_interface = qnn_interface; - dev_ctx->socinfo = instance->get_soc_info(); + QNN_LOG_INFO("qnn device name %s\n", device_name.c_str()); + dev_ctx->instance = instance; + dev_ctx->qnn_interface = qnn_interface; + dev_ctx->socinfo = instance->get_soc_info(); dev_ctx->supported_types = kDeviceCaps[device].supported_types; ggml_backend_t qnn_backend = new ggml_backend{ @@ -357,7 +346,7 @@ ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, return qnn_backend; } -ggml_backend_t ggml_backend_qnn_device_init(ggml_backend_dev_t dev, const char *params) { +ggml_backend_t ggml_backend_qnn_device_init(ggml_backend_dev_t dev, const char * params) { return ggml_backend_qnn_init_with_device_context(dev, params); } @@ -365,7 +354,7 @@ ggml_backend_buffer_type_t ggml_backend_qnn_device_get_buffer_type(ggml_backend_ return ggml_backend_qnn_buffer_type(dev); } -ggml_backend_buffer_t ggml_backend_qnn_device_buffer_from_ptr(ggml_backend_dev_t dev, void *ptr, size_t size, +ggml_backend_buffer_t ggml_backend_qnn_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { // TODO GGML_UNUSED(dev); @@ -373,9 +362,9 @@ ggml_backend_buffer_t ggml_backend_qnn_device_buffer_from_ptr(ggml_backend_dev_t return ggml_backend_cpu_buffer_from_ptr(ptr, size); } -bool ggml_backend_qnn_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor *op) { +bool ggml_backend_qnn_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { // Note that this function could be called before the device context is initialized - auto *device_ctx = get_device_context(dev); + auto * device_ctx = get_device_context(dev); return qnn::device_supports_op(device_ctx, op); } @@ -384,13 +373,13 @@ bool ggml_backend_qnn_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_ return ggml_backend_buft_is_host(buft); } -bool ggml_backend_qnn_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor *op) { +bool ggml_backend_qnn_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) { #ifdef NDEBUG GGML_UNUSED(dev); GGML_UNUSED(op); #else - auto *device_ctx = get_device_context(dev); - QNN_LOG_DEBUG("[%s][%s]offload op", qnn::get_backend_name(device_ctx->device), ggml_op_name(op->op)); + auto * device_ctx = get_device_context(dev); + QNN_LOG_DEBUG("[%s][%s]offload op\n", qnn::get_backend_name(device_ctx->device), ggml_op_name(op->op)); #endif return false; } @@ -421,15 +410,15 @@ constexpr const ggml_backend_device_i ggml_backend_qnn_device_interface = { struct ggml_backend_qnn_reg_impl : ggml_backend_reg { std::vector> device_contexts; - std::vector devices; + std::vector devices; explicit ggml_backend_qnn_reg_impl(ggml_backend_reg_i interface) { context = this; - iface = interface; + iface = 
interface; - QNN_LOG_DEBUG("qnn backend registry init"); + QNN_LOG_DEBUG("qnn backend registry init\n"); for (size_t i = 0; i < QNN_BACKEND_COUNT; i++) { - const auto device_enum = (QNNBackend)(QNN_BACKEND_COUNT - 1 - i); // init from the last device, i.e. NPU + const auto device_enum = (QNNBackend) (QNN_BACKEND_COUNT - 1 - i); // init from the last device, i.e. NPU #ifndef GGML_QNN_ENABLE_CPU_BACKEND if (device_enum == QNN_BACKEND_CPU) { /* @@ -441,7 +430,7 @@ struct ggml_backend_qnn_reg_impl : ggml_backend_reg { #endif device_contexts.emplace_back(std::make_unique( - /* .device = */ device_enum, // init from the last device, i.e. NPU + /* .device = */ device_enum, // init from the last device, i.e. NPU /* .threads = */ 1, /* .name = */ qnn::get_backend_name(device_enum), /* .lib_name = */ kDeviceCaps[device_enum].lib_name, @@ -456,18 +445,18 @@ struct ggml_backend_qnn_reg_impl : ggml_backend_reg { } }; -const char *ggml_backend_qnn_reg_get_name(ggml_backend_reg_t reg) { +const char * ggml_backend_qnn_reg_get_name(ggml_backend_reg_t reg) { GGML_UNUSED(reg); return GGML_QNN_NAME; } size_t ggml_backend_qnn_reg_get_device_count(ggml_backend_reg_t reg) { - auto *ctx = (ggml_backend_qnn_reg_impl *)reg->context; + auto * ctx = (ggml_backend_qnn_reg_impl *) reg->context; return ctx->devices.size(); } ggml_backend_dev_t ggml_backend_qnn_reg_get_device(ggml_backend_reg_t reg, size_t index) { - auto *ctx = (ggml_backend_qnn_reg_impl *)reg->context; + auto * ctx = (ggml_backend_qnn_reg_impl *) reg->context; GGML_ASSERT(index < ctx->devices.size()); return &(ctx->devices[index]); } @@ -479,11 +468,13 @@ const ggml_backend_reg_i ggml_backend_qnn_reg_interface = { /* .get_proc_address = */ nullptr, }; -} // namespace +} // namespace -bool ggml_backend_is_qnn(ggml_backend_t backend) { return ggml_guid_matches(backend->guid, ggml_backend_qnn_guid()); } +bool ggml_backend_is_qnn(ggml_backend_t backend) { + return ggml_guid_matches(backend->guid, ggml_backend_qnn_guid()); +} ggml_backend_reg_t ggml_backend_qnn_reg() { - static ggml_backend_qnn_reg_impl reg{ggml_backend_qnn_reg_interface}; + static ggml_backend_qnn_reg_impl reg{ ggml_backend_qnn_reg_interface }; return ® } diff --git a/ggml/src/ggml-qnn/graph.cpp b/ggml/src/ggml-qnn/graph.cpp index 25ce5b8fb2754..b3ab161e9f6ca 100644 --- a/ggml/src/ggml-qnn/graph.cpp +++ b/ggml/src/ggml-qnn/graph.cpp @@ -5,7 +5,6 @@ #include #include "ggml-impl.h" - #include "logger.hpp" #include "op-config.hpp" #include "tensor.hpp" @@ -13,9 +12,9 @@ namespace { using qnn_tensor_cache_t = std::unordered_map; -int get_op_max_rank(const ggml_tensor *op) { - int max_rank = ggml_n_dims(op); - const int count = (int)qnn::get_qnn_op_input_param_count(op); +int get_op_max_rank(const ggml_tensor * op) { + int max_rank = ggml_n_dims(op); + const int count = (int) qnn::get_qnn_op_input_param_count(op); for (int i = 0; i < count; ++i) { max_rank = std::max(max_rank, ggml_n_dims(op->src[i])); } @@ -23,10 +22,10 @@ int get_op_max_rank(const ggml_tensor *op) { return max_rank; } -qnn::qnn_tensor_ptr_t create_tensor_with_cache(ggml_tensor *tensor, qnn::ggml_qnn_tensor::tensor_type_t type, int rank, +qnn::qnn_tensor_ptr_t create_tensor_with_cache(ggml_tensor * tensor, qnn::ggml_qnn_tensor::tensor_type_t type, int rank, QNNBackend device, Qnn_GraphHandle_t graph_handle, std::shared_ptr qnn_instance, - qnn_tensor_cache_t &tensor_cache) { + qnn_tensor_cache_t & tensor_cache) { GGML_ASSERT(tensor); if (tensor_cache.count(tensor)) { return tensor_cache[tensor]; @@ -38,13 +37,13 @@ 
qnn::qnn_tensor_ptr_t create_tensor_with_cache(ggml_tensor *tensor, qnn::ggml_qn return qnn_tensor; } -qnn::qnn_tensor_array_t create_tensors_with_cache(const qnn::ggml_tensor_array_t &ggml_tensors, +qnn::qnn_tensor_array_t create_tensors_with_cache(const qnn::ggml_tensor_array_t & ggml_tensors, qnn::ggml_qnn_tensor::tensor_type_t type, int rank, QNNBackend device, - Qnn_GraphHandle_t graph_handle, + Qnn_GraphHandle_t graph_handle, std::shared_ptr qnn_instance, - qnn_tensor_cache_t &tensor_cache) { + qnn_tensor_cache_t & tensor_cache) { qnn::qnn_tensor_array_t tensors; - for (auto *tensor : ggml_tensors) { + for (auto * tensor : ggml_tensors) { tensors.push_back( create_tensor_with_cache(tensor, type, rank, device, graph_handle, qnn_instance, tensor_cache)); } @@ -52,10 +51,10 @@ qnn::qnn_tensor_array_t create_tensors_with_cache(const qnn::ggml_tensor_array_t return tensors; } -qnn::qnn_op_config_ptr_t create_operation_from_op_tensor(ggml_tensor *dst, const std::string &name, int rank, +qnn::qnn_op_config_ptr_t create_operation_from_op_tensor(ggml_tensor * dst, const std::string & name, int rank, QNNBackend device, Qnn_GraphHandle_t graph_handle, std::shared_ptr qnn_instance, - bool is_intermediate, qnn_tensor_cache_t &tensor_cache) { + bool is_intermediate, qnn_tensor_cache_t & tensor_cache) { auto operation = qnn::create_op(dst, name, qnn_instance); // input tensors @@ -71,22 +70,22 @@ qnn::qnn_op_config_ptr_t create_operation_from_op_tensor(ggml_tensor *dst, const // output tensor tensor_type = is_intermediate ? qnn::ggml_qnn_tensor::INTERMEDIATE : qnn::ggml_qnn_tensor::OUTPUT; qnn::qnn_tensor_array_t output_qnn_tensors = - create_tensors_with_cache({dst}, tensor_type, rank, device, graph_handle, qnn_instance, tensor_cache); + create_tensors_with_cache({ dst }, tensor_type, rank, device, graph_handle, qnn_instance, tensor_cache); operation->set_output_tensors(output_qnn_tensors); // initialize operation if (!operation->initialize_op_nodes(device, graph_handle)) { - QNN_LOG_ERROR("[%s][%s]initialize_op_nodes failed", qnn::get_backend_name(device), name.c_str()); + QNN_LOG_ERROR("[%s][%s]initialize_op_nodes failed\n", qnn::get_backend_name(device), name.c_str()); return nullptr; } return operation; } -bool bind_src_tensors(ggml_tensor *op, qnn::qnn_tensor_array_t &tensor_wrappers, - std::vector &qnn_tensors) { +bool bind_src_tensors(ggml_tensor * op, qnn::qnn_tensor_array_t & tensor_wrappers, + std::vector & qnn_tensors) { if (op->op == GGML_OP_NONE) { - QNN_LOG_DEBUG("op %s is not a valid op", ggml_get_name(op)); + QNN_LOG_DEBUG("op %s is not a valid op\n", ggml_get_name(op)); return false; } @@ -94,9 +93,9 @@ bool bind_src_tensors(ggml_tensor *op, qnn::qnn_tensor_array_t &tensor_wrappers, GGML_ASSERT(tensor_wrappers.size() == param_count); qnn_tensors.resize(param_count); for (size_t i = 0; i < param_count; ++i) { - auto *ggml_tensor = op->src[i]; + auto * ggml_tensor = op->src[i]; if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor)) { - QNN_LOG_ERROR("bind tensor %s failed", ggml_get_name(ggml_tensor)); + QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); return false; } @@ -116,22 +115,21 @@ bool bind_src_tensors(ggml_tensor *op, qnn::qnn_tensor_array_t &tensor_wrappers, * * TODO: this algorithm is not perfect and may not work for all cases. It assumes that the tensors are * connected in a way that allows for unambiguous categorization. - * It also assumes that the tensors are connected in a way that allows for unambiguous categorization. 
*/ -int get_io_tensors_from_graph(const ggml_cgraph *cgraph, qnn::ggml_tensor_array_t &inputs, - qnn::ggml_tensor_array_t &outputs) { +int get_io_tensors_from_graph(const ggml_cgraph * cgraph, qnn::ggml_tensor_array_t & inputs, + qnn::ggml_tensor_array_t & outputs) { struct _tensor_connectivity_info { - size_t in_degree = 0; - size_t out_degree = 0; + size_t in_degree = 0; + size_t out_degree = 0; size_t insert_index = 0; }; using ggml_tensor_connectivity_map_t = std::unordered_map; ggml_tensor_connectivity_map_t connectivity_map; - int rank = 0; + int rank = 0; for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_tensor *dst = cgraph->nodes[i]; + ggml_tensor * dst = cgraph->nodes[i]; if (ggml_is_empty(dst)) { continue; } @@ -144,7 +142,7 @@ int get_io_tensors_from_graph(const ggml_cgraph *cgraph, qnn::ggml_tensor_array_ rank = std::max(rank, ggml_n_dims(dst)); if (connectivity_map.count(dst) == 0) { connectivity_map[dst] = { - 1, // in-degree, at least 1 + 1, // in-degree, at least 1 0, connectivity_map.size(), }; @@ -153,13 +151,13 @@ int get_io_tensors_from_graph(const ggml_cgraph *cgraph, qnn::ggml_tensor_array_ } for (size_t i = 0; i < GGML_MAX_DIMS && dst->src[i]; ++i) { - auto *src = dst->src[i]; - rank = std::max(rank, ggml_n_dims(src)); + auto * src = dst->src[i]; + rank = std::max(rank, ggml_n_dims(src)); if (connectivity_map.count(src) == 0) { connectivity_map[src] = { 0, - 1, // out-degree, at least 1 + 1, // out-degree, at least 1 connectivity_map.size(), }; } else { @@ -168,7 +166,7 @@ int get_io_tensors_from_graph(const ggml_cgraph *cgraph, qnn::ggml_tensor_array_ } } - for (const auto &kv : connectivity_map) { + for (const auto & kv : connectivity_map) { if (kv.second.in_degree == 0) { inputs.push_back(kv.first); } @@ -178,126 +176,103 @@ int get_io_tensors_from_graph(const ggml_cgraph *cgraph, qnn::ggml_tensor_array_ } } - std::sort(inputs.begin(), inputs.end(), [&connectivity_map](ggml_tensor *lhs, ggml_tensor *rhs) { + std::sort(inputs.begin(), inputs.end(), [&connectivity_map](ggml_tensor * lhs, ggml_tensor * rhs) { return connectivity_map[lhs].insert_index < connectivity_map[rhs].insert_index; }); - std::sort(outputs.begin(), outputs.end(), [&connectivity_map](ggml_tensor *lhs, ggml_tensor *rhs) { + std::sort(outputs.begin(), outputs.end(), [&connectivity_map](ggml_tensor * lhs, ggml_tensor * rhs) { return connectivity_map[lhs].insert_index < connectivity_map[rhs].insert_index; }); return rank; } -} // namespace +} // namespace namespace qnn { -qnn_graph::qnn_graph(const std::string &graph_name, QNNBackend device, std::shared_ptr qnn_instance, - size_t vtcm_size_in_mb) - : _graph_name(graph_name), _device(device), _qnn_instance(qnn_instance) { - QNN_LOG_DEBUG("[%s][%s]created", get_backend_name(device), graph_name.c_str()); - - auto qnn_interface = qnn_instance->get_qnn_interface(); - auto qnn_context = qnn_instance->get_qnn_context_handle(); - Qnn_ErrorHandle_t error = QNN_SUCCESS; - Qnn_GraphHandle_t graph_handle = nullptr; +qnn_graph::qnn_graph(const std::string & graph_name, QNNBackend device, std::shared_ptr qnn_instance, + size_t vtcm_size_in_mb) : + _graph_name(graph_name), + _device(device), + _qnn_instance(qnn_instance) { + QNN_LOG_DEBUG("[%s][%s]created\n", get_backend_name(device), graph_name.c_str()); + + auto qnn_interface = qnn_instance->get_qnn_interface(); + auto qnn_context = qnn_instance->get_qnn_context_handle(); + Qnn_ErrorHandle_t error = QNN_SUCCESS; + Qnn_GraphHandle_t graph_handle = nullptr; if (device == QNN_BACKEND_NPU) { // TODO: fix graph 
config here for NPU QnnHtpGraph_CustomConfig_t hvx_config; - hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; + hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; hvx_config.numHvxThreads = 8; QnnGraph_Config_t graph_hvx_config; - graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; graph_hvx_config.customConfig = &hvx_config; QnnHtpGraph_CustomConfig_t dlbc_config; - dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; - dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; - dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC + dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; + dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; + dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC QnnGraph_Config_t graph_dlbc_config; - graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; graph_dlbc_config.customConfig = &dlbc_config; QnnHtpGraph_CustomConfig_t opt_config; - opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; - opt_config.optimizationOption.floatValue = 1; // 1 / 3 + opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; + opt_config.optimizationOption.floatValue = 1; // 1 / 3 QnnGraph_Config_t graph_opt_config; - graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; graph_opt_config.customConfig = &opt_config; QnnHtpGraph_CustomConfig_t vtcm_config; - vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; - vtcm_config.vtcmSizeInMB = (uint32_t)vtcm_size_in_mb; + vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; + vtcm_config.vtcmSizeInMB = (uint32_t) vtcm_size_in_mb; QnnGraph_Config_t graph_vtcm_config; - graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; graph_vtcm_config.customConfig = &vtcm_config; - const QnnGraph_Config_t *graph_configs[] = {&graph_hvx_config, &graph_dlbc_config, &graph_vtcm_config, - &graph_opt_config, nullptr}; + const QnnGraph_Config_t * graph_configs[] = { &graph_hvx_config, &graph_dlbc_config, &graph_vtcm_config, + &graph_opt_config, nullptr }; error = qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), graph_configs, &graph_handle); } else { error = qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), nullptr, &graph_handle); } if (error != QNN_SUCCESS) { - QNN_LOG_ERROR("[%s][%s]failed to create qnn graph, error: %s", get_backend_name(device), graph_name.c_str(), + QNN_LOG_ERROR("[%s][%s]failed to create qnn graph, error: %s\n", get_backend_name(device), graph_name.c_str(), get_qnn_error_string(error)); return; } - QNN_LOG_INFO("[%s][%s]create succeed", get_backend_name(device), graph_name.c_str()); - _graph_handle = graph_handle; + QNN_LOG_INFO("[%s][%s]create succeed\n", get_backend_name(device), graph_name.c_str()); + _graph_handle = graph_handle; _qnn_interface = qnn_interface; } -qnn_graph::~qnn_graph() { QNN_LOG_DEBUG("[%s][%s]destroy", get_backend_name(_device), _graph_name.c_str()); } - -bool qnn_graph::build_graph_from_op(ggml_tensor *op) { - if (!is_valid()) { - QNN_LOG_ERROR("Invalid graph"); - return false; - } - - QNN_LOG_DEBUG("[%s][%s]build start", get_backend_name(_device), 
_graph_name.c_str()); - qnn_tensor_cache_t tensor_cache; - const auto rank = get_op_max_rank(op); - auto operation = create_operation_from_op_tensor(op, _graph_name, rank, _device, _graph_handle, _qnn_instance, - false, tensor_cache); - if (!operation) { - QNN_LOG_ERROR("[%s][%s]create_operation_from_op_tensor failed", get_backend_name(_device), _graph_name.c_str()); - return false; - } - - _tensor_inputs = operation->get_input_tensors(); - _tensor_outputs = operation->get_output_tensors(); - _operations.push_back(std::move(operation)); - if (!finalize()) { - return false; - } - - QNN_LOG_DEBUG("[%s][%s]build succeed", get_backend_name(_device), _graph_name.c_str()); - return true; +qnn_graph::~qnn_graph() { + QNN_LOG_DEBUG("[%s][%s]destroy\n", get_backend_name(_device), _graph_name.c_str()); } -bool qnn_graph::build_graph_from_ggml_graph(const ggml_cgraph *cgraph) { - QNN_LOG_DEBUG("[%s][%s]build start", get_backend_name(_device), _graph_name.c_str()); +bool qnn_graph::build_graph_from_ggml_graph(const ggml_cgraph * cgraph) { + QNN_LOG_DEBUG("[%s][%s]build start\n", get_backend_name(_device), _graph_name.c_str()); ggml_tensor_array_t inputs; ggml_tensor_array_t outputs; - int rank = get_io_tensors_from_graph(cgraph, inputs, outputs); - QNN_LOG_DEBUG("[%s]rank: %d, input_set: %d, output_set: %d", get_backend_name(_device), rank, int(inputs.size()), + int rank = get_io_tensors_from_graph(cgraph, inputs, outputs); + QNN_LOG_DEBUG("[%s]rank: %d, input_set: %d, output_set: %d\n", get_backend_name(_device), rank, int(inputs.size()), int(outputs.size())); { qnn_tensor_cache_t tensor_cache; - auto input_tensors = create_tensors_with_cache(inputs, ggml_qnn_tensor::INPUT, rank, _device, _graph_handle, - _qnn_instance, tensor_cache); + auto input_tensors = create_tensors_with_cache(inputs, ggml_qnn_tensor::INPUT, rank, _device, _graph_handle, + _qnn_instance, tensor_cache); auto output_tensors = create_tensors_with_cache(outputs, ggml_qnn_tensor::OUTPUT, rank, _device, _graph_handle, _qnn_instance, tensor_cache); qnn_op_config_array_t operations; for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_tensor *dst = cgraph->nodes[i]; + ggml_tensor * dst = cgraph->nodes[i]; if (ggml_is_empty(dst)) { continue; } @@ -307,83 +282,49 @@ bool qnn_graph::build_graph_from_ggml_graph(const ggml_cgraph *cgraph) { continue; } - QNN_LOG_DEBUG("[%s]create op: %s", get_backend_name(_device), get_qnn_op_name(dst)); + QNN_LOG_DEBUG("[%s]create op: %s\n", get_backend_name(_device), get_qnn_op_name(dst)); auto operation = create_operation_from_op_tensor(dst, dst->name, rank, _device, _graph_handle, - _qnn_instance, true, tensor_cache); // TODO: fix op name + _qnn_instance, true, tensor_cache); // TODO: fix op name operations.push_back(operation); } - _tensor_inputs = std::move(input_tensors); + _tensor_inputs = std::move(input_tensors); _tensor_outputs = std::move(output_tensors); - _operations = std::move(operations); + _operations = std::move(operations); if (!finalize()) { return false; } } - QNN_LOG_DEBUG("[%s][%s]build succeed", get_backend_name(_device), _graph_name.c_str()); - return true; -} - -bool qnn_graph::execute(ggml_tensor *op) { - if (!bind_src_tensors(op, _tensor_inputs, _qnn_tensor_inputs)) { - QNN_LOG_ERROR("[%s][%s]bind input tensors failed", get_backend_name(_device), _graph_name.c_str()); - return false; - } - - if (!qnn::bind_tensors({op}, _tensor_outputs, _qnn_tensor_outputs)) { - QNN_LOG_ERROR("[%s][%s]bind output tensors failed", get_backend_name(_device), _graph_name.c_str()); - return false; - } 
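// Note: the single-tensor entry points (build_graph_from_op and execute(ggml_tensor *)) are being
// removed in this hunk; graph construction and execution now go only through the ggml_cgraph-based
// overloads kept below.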
- - auto &qnn_tensor_inputs = _qnn_tensor_inputs; - auto &qnn_tensor_outputs = _qnn_tensor_outputs; - auto error = - _qnn_interface->qnn_graph_execute(_graph_handle, qnn_tensor_inputs.data(), qnn_tensor_inputs.size(), - qnn_tensor_outputs.data(), qnn_tensor_outputs.size(), nullptr, nullptr); - unbind_tensors(_tensor_inputs); - unbind_tensors(_tensor_outputs); - - if (error != QNN_SUCCESS) { - if (_device == QNN_BACKEND_NPU && error == QNN_COMMON_ERROR_SYSTEM_COMMUNICATION) { - QNN_LOG_WARN("[%s][%s]NPU crashed. SSR detected. Caused QNN graph execute error.", - get_backend_name(_device), _graph_name.c_str()); - } else { - QNN_LOG_ERROR("[%s][%s]error: %s", get_backend_name(_device), _graph_name.c_str(), - get_qnn_error_string(error)); - } - return false; - } - - QNN_LOG_DEBUG("[%s][%s]execute succeed", get_backend_name(_device), _graph_name.c_str()); + QNN_LOG_DEBUG("[%s][%s]build succeed\n", get_backend_name(_device), _graph_name.c_str()); return true; } -bool qnn_graph::execute(const ggml_cgraph *cgraph) { +bool qnn_graph::execute(const ggml_cgraph * cgraph) { ggml_tensor_array_t inputs; ggml_tensor_array_t outputs; #ifdef NDEBUG get_io_tensors_from_graph(cgraph, inputs, outputs); #else int rank = get_io_tensors_from_graph(cgraph, inputs, outputs); - QNN_LOG_DEBUG("[%s]rank: %d, input_set: %d, output_set: %d", get_backend_name(_device), rank, int(inputs.size()), + QNN_LOG_DEBUG("[%s]rank: %d, input_set: %d, output_set: %d\n", get_backend_name(_device), rank, int(inputs.size()), int(outputs.size())); #endif { if (!qnn::bind_tensors(inputs, _tensor_inputs, _qnn_tensor_inputs)) { - QNN_LOG_ERROR("[%s][%s]bind input tensors failed", get_backend_name(_device), _graph_name.c_str()); + QNN_LOG_ERROR("[%s][%s]bind input tensors failed\n", get_backend_name(_device), _graph_name.c_str()); return false; } if (!qnn::bind_tensors(outputs, _tensor_outputs, _qnn_tensor_outputs)) { - QNN_LOG_ERROR("[%s][%s]bind output tensors failed", get_backend_name(_device), _graph_name.c_str()); + QNN_LOG_ERROR("[%s][%s]bind output tensors failed\n", get_backend_name(_device), _graph_name.c_str()); return false; } - auto &qnn_tensor_inputs = _qnn_tensor_inputs; - auto &qnn_tensor_outputs = _qnn_tensor_outputs; - auto error = + auto & qnn_tensor_inputs = _qnn_tensor_inputs; + auto & qnn_tensor_outputs = _qnn_tensor_outputs; + auto error = _qnn_interface->qnn_graph_execute(_graph_handle, qnn_tensor_inputs.data(), qnn_tensor_inputs.size(), qnn_tensor_outputs.data(), qnn_tensor_outputs.size(), nullptr, nullptr); unbind_tensors(_tensor_inputs); @@ -391,35 +332,35 @@ bool qnn_graph::execute(const ggml_cgraph *cgraph) { if (error != QNN_SUCCESS) { if (_device == QNN_BACKEND_NPU && error == QNN_COMMON_ERROR_SYSTEM_COMMUNICATION) { - QNN_LOG_WARN("[%s][%s]NPU crashed. SSR detected. Caused QNN graph execute error.", + QNN_LOG_WARN("[%s][%s]NPU crashed. SSR detected. 
Caused QNN graph execute error.\n", get_backend_name(_device), _graph_name.c_str()); } else { - QNN_LOG_ERROR("[%s][%s]error: %s", get_backend_name(_device), _graph_name.c_str(), + QNN_LOG_ERROR("[%s][%s]error: %s\n", get_backend_name(_device), _graph_name.c_str(), get_qnn_error_string(error)); } return false; } - QNN_LOG_DEBUG("[%s][%s]execute succeed", get_backend_name(_device), _graph_name.c_str()); + QNN_LOG_DEBUG("[%s][%s]execute succeed\n", get_backend_name(_device), _graph_name.c_str()); return true; } } bool qnn_graph::finalize() { if (!qnn::add_op_to_graph(_graph_handle, _operations)) { - QNN_LOG_ERROR("[%s]add nodes failed", _graph_name.c_str()); + QNN_LOG_ERROR("[%s]add nodes failed\n", _graph_name.c_str()); return false; } auto error = _qnn_interface->qnn_graph_finalize(_graph_handle, nullptr, nullptr); if (error != QNN_SUCCESS) { - QNN_LOG_ERROR("[%s][%s]qnn_graph_finalize.error: %s", get_backend_name(_device), _graph_name.c_str(), + QNN_LOG_ERROR("[%s][%s]qnn_graph_finalize.error: %s\n", get_backend_name(_device), _graph_name.c_str(), get_qnn_error_string(error)); return false; } - QNN_LOG_DEBUG("[%s][%s]finalize succeed", get_backend_name(_device), _graph_name.c_str()); + QNN_LOG_DEBUG("[%s][%s]finalize succeed\n", get_backend_name(_device), _graph_name.c_str()); return true; } -} // namespace qnn +} // namespace qnn diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index 521186f790ee5..dc1ed0b3f8896 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -6,41 +6,51 @@ #include #include "ggml-qnn.h" - #include "op-config.hpp" #include "qnn-lib.hpp" namespace qnn { +/** + * @class qnn_graph + * @brief Manages a QNN graph, converting a GGML graph to QNN format and handling its execution. + * + * This class is responsible for building a QNN graph from a given GGML graph, + * determining its input/output tensors, finalizing the configuration, and + * executing the graph on the specified backend device. 
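+ *
+ * Typical usage (illustrative sketch only; the graph name, VTCM size and variable names are
+ * placeholders, and a valid qnn_instance plus a populated ggml_cgraph are assumed):
+ *   qnn::qnn_graph graph("example_graph", QNN_BACKEND_NPU, instance, 8);
+ *   if (graph.is_valid() && graph.build_graph_from_ggml_graph(cgraph)) {
+ *       graph.execute(cgraph);
+ *   }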
+ */ class qnn_graph { -public: - explicit qnn_graph(const std::string &graph_name, QNNBackend device, std::shared_ptr qnn_instance, + public: + explicit qnn_graph(const std::string & graph_name, QNNBackend device, std::shared_ptr qnn_instance, size_t vtcm_size_in_mb); ~qnn_graph(); - bool build_graph_from_op(ggml_tensor *op); - bool build_graph_from_ggml_graph(const ggml_cgraph *cgraph); + bool build_graph_from_ggml_graph(const ggml_cgraph * cgraph); + + bool execute(const ggml_cgraph * cgraph); - bool execute(ggml_tensor *op); - bool execute(const ggml_cgraph *cgraph); bool is_valid() const { return _graph_handle != nullptr; } + Qnn_GraphHandle_t get_graph_handler() const { return _graph_handle; } + std::shared_ptr get_qnn_instance() { return _qnn_instance; } - const std::string &get_name() const { return _graph_name; } + + const std::string & get_name() const { return _graph_name; } + QNNBackend get_device() const { return _device; } -private: + private: bool finalize(); - const std::string _graph_name; - const QNNBackend _device; - Qnn_GraphHandle_t _graph_handle = nullptr; - std::shared_ptr _qnn_instance; + const std::string _graph_name; + const QNNBackend _device; + Qnn_GraphHandle_t _graph_handle = nullptr; + std::shared_ptr _qnn_instance; std::shared_ptr _qnn_interface; - qnn_op_config_array_t _operations; + qnn_op_config_array_t _operations; - qnn_tensor_array_t _tensor_inputs; - qnn_tensor_array_t _tensor_outputs; + qnn_tensor_array_t _tensor_inputs; + qnn_tensor_array_t _tensor_outputs; std::vector _qnn_tensor_inputs; std::vector _qnn_tensor_outputs; @@ -50,4 +60,4 @@ class qnn_graph { using qnn_graph_ptr_t = std::shared_ptr; -} // namespace qnn +} // namespace qnn diff --git a/ggml/src/ggml-qnn/logger.cpp b/ggml/src/ggml-qnn/logger.cpp index 23a3f305c060f..5418d03be45a4 100644 --- a/ggml/src/ggml-qnn/logger.cpp +++ b/ggml/src/ggml-qnn/logger.cpp @@ -1,70 +1,45 @@ #include "logger.hpp" -#include -#include +#ifndef NDEBUG -#if defined(__ANDROID__) || defined(ANDROID) -#include -#endif - -void qnn::internal_log(ggml_log_level level, const char * /*file*/, const char *func, int line, const char *format, - ...) 
{ - static std::mutex qnn_internal_log_mutex; - static char s_qnn_internal_log_buf[QNN_LOGBUF_LEN]; +# include - { - std::lock_guard lock(qnn_internal_log_mutex); - va_list args; +# include "QnnInterface.h" +# include "QnnTypes.h" +# include "System/QnnSystemInterface.h" - va_start(args, format); - int len_prefix = snprintf(s_qnn_internal_log_buf, QNN_LOGBUF_LEN, "[%s, %d]: ", func, line); - int len = vsnprintf(s_qnn_internal_log_buf + len_prefix, QNN_LOGBUF_LEN - len_prefix, format, args); - if (len < (QNN_LOGBUF_LEN - len_prefix)) { -#if defined(__ANDROID__) || defined(ANDROID) - // print to android logcat - __android_log_print(level, "ggml-qnn", "%s\n", s_qnn_internal_log_buf); -#else - (void)level; -#endif - // print to stdout - printf("%s\n", s_qnn_internal_log_buf); - } - va_end(args); - } -} - -#if ENABLE_QNNSDK_LOG -void qnn::sdk_logcallback(const char *fmt, QnnLog_Level_t level, uint64_t /*timestamp*/, va_list argp) { +void qnn::sdk_logcallback(const char * fmt, QnnLog_Level_t level, uint64_t /*timestamp*/, va_list argp) { static std::mutex log_mutex; - static char s_ggml_qnn_logbuf[QNN_LOGBUF_LEN]; + static char s_ggml_qnn_logbuf[4096]; - const char *log_level_desc = ""; + char log_level_desc = 'U'; switch (level) { case QNN_LOG_LEVEL_ERROR: - log_level_desc = "ERROR"; + log_level_desc = 'E'; break; case QNN_LOG_LEVEL_WARN: - log_level_desc = "WARNING"; + log_level_desc = 'W'; break; case QNN_LOG_LEVEL_INFO: - log_level_desc = "INFO"; + log_level_desc = 'I'; break; case QNN_LOG_LEVEL_DEBUG: - log_level_desc = "DEBUG"; + log_level_desc = 'D'; break; case QNN_LOG_LEVEL_VERBOSE: - log_level_desc = "VERBOSE"; - break; - case QNN_LOG_LEVEL_MAX: - log_level_desc = "UNKNOWN"; + log_level_desc = 'V'; break; } { std::lock_guard lock(log_mutex); - vsnprintf(s_ggml_qnn_logbuf, QNN_LOGBUF_LEN, fmt, argp); - QNN_LOG_INFO("[%s]%s", log_level_desc, s_ggml_qnn_logbuf); + int size = vsnprintf(s_ggml_qnn_logbuf, sizeof(s_ggml_qnn_logbuf), fmt, argp); + if (size > 0 && s_ggml_qnn_logbuf[size - 1] != '\n') { + QNN_LOG_INFO("[%c]%s\n", log_level_desc, s_ggml_qnn_logbuf); + } else { + QNN_LOG_INFO("[%c]%s", log_level_desc, s_ggml_qnn_logbuf); + } } } #else diff --git a/ggml/src/ggml-qnn/logger.hpp b/ggml/src/ggml-qnn/logger.hpp index b4bab0c006691..cf94ce22174b6 100644 --- a/ggml/src/ggml-qnn/logger.hpp +++ b/ggml/src/ggml-qnn/logger.hpp @@ -1,43 +1,16 @@ #pragma once -#include +#include +#include "ggml-impl.h" #include "ggml.h" - -#include "QnnCommon.h" -#include "QnnInterface.h" -#include "QnnTypes.h" -#include "System/QnnSystemInterface.h" - -#define QNN_LOGBUF_LEN 4096 +#include "QnnLog.h" namespace qnn { -void internal_log(ggml_log_level level, const char *file, const char *func, int line, const char *format, ...); - -void sdk_logcallback(const char *fmt, QnnLog_Level_t level, uint64_t timestamp, va_list argp); -} // namespace qnn - -// ================================================================================================= -// -// QNN backend internal log function -// -// ================================================================================================= -#define QNN_LOG_ERROR(...) qnn::internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) - -#define QNN_LOG_WARN(...) qnn::internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) - -#define QNN_LOG_INFO(...) 
qnn::internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) - -#ifdef NDEBUG -#define ENABLE_QNNBACKEND_DEBUG 0 // for troubleshooting QNN backend -#define ENABLE_QNNSDK_LOG 0 // enable/disable QNN SDK's internal log -#else -#define ENABLE_QNNBACKEND_DEBUG 1 // for troubleshooting QNN backend -#define ENABLE_QNNSDK_LOG 1 // enable/disable QNN SDK's internal log -#endif +void sdk_logcallback(const char * fmt, QnnLog_Level_t level, uint64_t timestamp, va_list argp); +} // namespace qnn -#if ENABLE_QNNBACKEND_DEBUG -#define QNN_LOG_DEBUG(...) qnn::internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#else -#define QNN_LOG_DEBUG(...) -#endif +#define QNN_LOG_ERROR(...) (GGML_LOG_ERROR(__VA_ARGS__)) +#define QNN_LOG_WARN(...) (GGML_LOG_WARN(__VA_ARGS__)) +#define QNN_LOG_INFO(...) (GGML_LOG_INFO(__VA_ARGS__)) +#define QNN_LOG_DEBUG(...) (GGML_LOG_DEBUG(__VA_ARGS__)) diff --git a/ggml/src/ggml-qnn/op-config-base.hpp b/ggml/src/ggml-qnn/op-config-base.hpp index 274bb8318ff99..b24b53bf2a3b6 100644 --- a/ggml/src/ggml-qnn/op-config-base.hpp +++ b/ggml/src/ggml-qnn/op-config-base.hpp @@ -4,7 +4,6 @@ #include #include "ggml-qnn.h" - #include "qnn-types.hpp" #include "tensor.hpp" @@ -18,7 +17,7 @@ namespace qnn { * adding operations to a graph, and binding/unbinding input and output tensors. */ class ggml_qnn_op_config { -public: + public: virtual ~ggml_qnn_op_config() {} /** @@ -32,8 +31,8 @@ class ggml_qnn_op_config { * * @param tensor_inputs A reference to a vector of qnn_tensor_ptr_t objects representing the input tensors. */ - virtual void set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) = 0; - virtual void set_input_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) = 0; + virtual void set_input_tensors(qnn::qnn_tensor_array_t & tensor_inputs) = 0; + virtual void set_input_tensors(qnn::qnn_tensor_array_t && tensor_inputs) = 0; /** * @brief Sets custom output tensors for the operation. This method should be called before `initialize_op_nodes`. @@ -46,8 +45,8 @@ class ggml_qnn_op_config { * * @param tensor_outputs A reference to a vector of qnn_tensor_ptr_t objects representing the output tensors. */ - virtual void set_output_tensors(qnn::qnn_tensor_array_t &tensor_outputs) = 0; - virtual void set_output_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) = 0; + virtual void set_output_tensors(qnn::qnn_tensor_array_t & tensor_outputs) = 0; + virtual void set_output_tensors(qnn::qnn_tensor_array_t && tensor_inputs) = 0; /** * @brief Creates tensors and internal nodes for constructing the calculation graph. @@ -71,7 +70,7 @@ class ggml_qnn_op_config { * * @return A reference to a vector of qnn_tensor_ptr_t objects representing the input tensors. */ - virtual const qnn_tensor_array_t &get_input_tensors() = 0; + virtual const qnn_tensor_array_t & get_input_tensors() = 0; /** * @brief Pure virtual function to retrieve the output tensors of a QNN. @@ -82,7 +81,7 @@ class ggml_qnn_op_config { * * @return A reference to a vector of qnn_tensor_ptr_t objects representing the output tensors. */ - virtual const qnn_tensor_array_t &get_output_tensors() = 0; + virtual const qnn_tensor_array_t & get_output_tensors() = 0; /** * @brief Adds an operation to the given graph. @@ -109,7 +108,7 @@ class ggml_qnn_op_config { * containing the input tensors. * @return true if the input tensors were successfully bound, false otherwise. 
*/ - virtual bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) = 0; + virtual bool bind_input_tensors(const ggml_tensor_array_t & tensor_inputs) = 0; /** * @brief Binds the output tensors to the given tensor array. @@ -123,7 +122,7 @@ class ggml_qnn_op_config { * represent the output tensors to be bound. * @return true if the binding is successful, false otherwise. */ - virtual bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) = 0; + virtual bool bind_output_tensors(const ggml_tensor_array_t & tensor_outputs) = 0; /** * @brief Unbinds the input tensors from the operation. @@ -146,7 +145,7 @@ class ggml_qnn_op_config { virtual void unbind_output_tensors() = 0; }; -using qnn_op_config_ptr_t = std::shared_ptr; +using qnn_op_config_ptr_t = std::shared_ptr; using qnn_op_config_array_t = std::vector; -} // namespace qnn +} // namespace qnn diff --git a/ggml/src/ggml-qnn/op-config-caps.cpp b/ggml/src/ggml-qnn/op-config-caps.cpp index b250c214a3ad9..16b50503bea4c 100644 --- a/ggml/src/ggml-qnn/op-config-caps.cpp +++ b/ggml/src/ggml-qnn/op-config-caps.cpp @@ -3,73 +3,73 @@ namespace { -using op_constructor_t = std::shared_ptr (*)(const ggml_tensor *, const std::string &, +using op_constructor_t = std::shared_ptr (*)(const ggml_tensor *, const std::string &, std::shared_ptr); -using op_dims_calc_func_t = void (*)(const std::vector &input_dims, - qnn::ggml_dimension_array_t &output_dims); +using op_dims_calc_func_t = void (*)(const std::vector & input_dims, + qnn::ggml_dimension_array_t & output_dims); -void element_wise_op_dims(const std::vector &input_dims, - qnn::ggml_dimension_array_t &output_dims) { +void element_wise_op_dims(const std::vector & input_dims, + qnn::ggml_dimension_array_t & output_dims) { for (size_t i = 1; i < std::size(output_dims); i++) { output_dims[i] = input_dims.front()[i]; } } -void mat_mul_op_dims(const std::vector &input_dims, - qnn::ggml_dimension_array_t &output_dims) { +void mat_mul_op_dims(const std::vector & input_dims, + qnn::ggml_dimension_array_t & output_dims) { GGML_ASSERT(input_dims.size() == 2); output_dims[0] = input_dims.front()[1]; output_dims[1] = input_dims.back()[1]; } struct qnn_op_caps_t { - const char *qnn_op_name = nullptr; - const size_t input_param_count = 0; - op_dims_calc_func_t calc_dims_func = nullptr; - const char *qnn_param_name = nullptr; + const char * qnn_op_name = nullptr; + const size_t input_param_count = 0; + op_dims_calc_func_t calc_dims_func = nullptr; + const char * qnn_param_name = nullptr; }; constexpr const qnn_op_caps_t kOpCaps[] = { {}, // GGML_OP_NONE {}, // GGML_OP_DUP { - // GGML_OP_ADD - QNN_OP_ELEMENT_WISE_ADD, // qnn_op_name - 2, // input_param_count - element_wise_op_dims, // calc_dims_func + // GGML_OP_ADD + QNN_OP_ELEMENT_WISE_ADD, // qnn_op_name + 2, // input_param_count + element_wise_op_dims, // calc_dims_func }, {}, // GGML_OP_ADD1 {}, // GGML_OP_ACC { - // GGML_OP_SUB - QNN_OP_ELEMENT_WISE_SUBTRACT, // qnn_op_name - 2, // input_param_count - element_wise_op_dims, // calc_dims_func + // GGML_OP_SUB + QNN_OP_ELEMENT_WISE_SUBTRACT, // qnn_op_name + 2, // input_param_count + element_wise_op_dims, // calc_dims_func }, { - // GGML_OP_MUL - QNN_OP_ELEMENT_WISE_MULTIPLY, // qnn_op_name - 2, // input_param_count - element_wise_op_dims, // calc_dims_func + // GGML_OP_MUL + QNN_OP_ELEMENT_WISE_MULTIPLY, // qnn_op_name + 2, // input_param_count + element_wise_op_dims, // calc_dims_func }, { - // GGML_OP_DIV - QNN_OP_ELEMENT_WISE_DIVIDE, // qnn_op_name - 2, // input_param_count - 
element_wise_op_dims, // calc_dims_func + // GGML_OP_DIV + QNN_OP_ELEMENT_WISE_DIVIDE, // qnn_op_name + 2, // input_param_count + element_wise_op_dims, // calc_dims_func }, {}, // GGML_OP_SQR { - // GGML_OP_SQRT - QNN_OP_ELEMENT_WISE_SQUARE_ROOT, // qnn_op_name - 1, // input_param_count - element_wise_op_dims, // calc_dims_func + // GGML_OP_SQRT + QNN_OP_ELEMENT_WISE_SQUARE_ROOT, // qnn_op_name + 1, // input_param_count + element_wise_op_dims, // calc_dims_func }, { - // GGML_OP_LOG - QNN_OP_ELEMENT_WISE_LOG, // qnn_op_name - 1, // input_param_count - element_wise_op_dims, // calc_dims_func + // GGML_OP_LOG + QNN_OP_ELEMENT_WISE_LOG, // qnn_op_name + 1, // input_param_count + element_wise_op_dims, // calc_dims_func }, {}, // GGML_OP_SIN {}, // GGML_OP_COS @@ -84,19 +84,19 @@ constexpr const qnn_op_caps_t kOpCaps[] = { {}, // GGML_OP_SILU_BACK {}, // GGML_OP_NORM { - // GGML_OP_RMS_NORM - QNN_OP_RMS_NORM, // qnn_op_name - 1, // input_param_count - nullptr, // TODO: calc_dims_func - QNN_OP_RMS_NORM_PARAM_EPSILON, // qnn_param_name + // GGML_OP_RMS_NORM + QNN_OP_RMS_NORM, // qnn_op_name + 1, // input_param_count + nullptr, // TODO: calc_dims_func + QNN_OP_RMS_NORM_PARAM_EPSILON, // qnn_param_name }, {}, // GGML_OP_RMS_NORM_BACK {}, // GGML_OP_GROUP_NORM { - // GGML_OP_MUL_MAT - QNN_OP_MAT_MUL, // qnn_op_name - 2, // input_param_count - mat_mul_op_dims, // calc_dims_func + // GGML_OP_MUL_MAT + QNN_OP_MAT_MUL, // qnn_op_name + 2, // input_param_count + mat_mul_op_dims, // calc_dims_func }, {}, // GGML_OP_MUL_MAT_ID {}, // GGML_OP_OUT_PROD @@ -105,10 +105,10 @@ constexpr const qnn_op_caps_t kOpCaps[] = { {}, // GGML_OP_CPY {}, // GGML_OP_CONT { - // GGML_OP_RESHAPE - QNN_OP_RESHAPE, // qnn_op_name - 1, // input_param_count - nullptr, // TODO: calc_dims_func + // GGML_OP_RESHAPE + QNN_OP_RESHAPE, // qnn_op_name + 1, // input_param_count + nullptr, // TODO: calc_dims_func }, {}, // GGML_OP_VIEW {}, // GGML_OP_PERMUTE @@ -177,10 +177,10 @@ constexpr const qnn_op_caps_t kOpCaps[] = { {}, // GGML_UNARY_OP_RELU {}, // GGML_UNARY_OP_SIGMOID { - // GGML_UNARY_OP_GELU - QNN_OP_GELU, // qnn_op_name - 1, // input_param_count - nullptr, // TODO: calc_dims_func + // GGML_UNARY_OP_GELU + QNN_OP_GELU, // qnn_op_name + 1, // input_param_count + nullptr, // TODO: calc_dims_func }, {}, // GGML_UNARY_OP_GELU_QUICK {}, // GGML_UNARY_OP_SILU @@ -201,15 +201,17 @@ static_assert(kOpCaps[GGML_OP_COUNT + GGML_UNARY_OP_GELU].input_param_count == 1 static_assert(std::size(kOpCaps) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), "GGML_OP_COUNT does not match the size of the kOpCaps table"); -std::shared_ptr mat_mul_op_constructor(const ggml_tensor *op, const std::string &instance_name, +std::shared_ptr mat_mul_op_constructor(const ggml_tensor * op, + const std::string & instance_name, std::shared_ptr qnn_instance) { GGML_UNUSED(op); - QNN_LOG_DEBUG("create QNN_OP_MAT_MUL, name %s", instance_name.c_str()); + QNN_LOG_DEBUG("create QNN_OP_MAT_MUL, name %s\n", instance_name.c_str()); return std::make_shared(instance_name, qnn_instance); } template -std::shared_ptr generic_op_constructor(const ggml_tensor *op, const std::string &instance_name, +std::shared_ptr generic_op_constructor(const ggml_tensor * op, + const std::string & instance_name, std::shared_ptr qnn_instance) { GGML_UNUSED(op); static_assert(_op < std::size(kOpCaps)); @@ -218,20 +220,20 @@ std::shared_ptr generic_op_constructor(const ggml_tenso kOpCaps[_op].qnn_op_name, qnn_instance); } -void add_type_parameters(std::shared_ptr op, const char *name, float value) { +void 
add_type_parameters(std::shared_ptr op, const char * name, float value) { Qnn_Scalar_t scalar = QNN_SCALAR_INIT; - scalar.dataType = QNN_DATATYPE_FLOAT_32; - scalar.floatValue = value; + scalar.dataType = QNN_DATATYPE_FLOAT_32; + scalar.floatValue = value; op->add_scalar_param(name, scalar); } template std::shared_ptr op_constructor_with_type_param( - const ggml_tensor *op, const std::string &instance_name, std::shared_ptr qnn_instance) { + const ggml_tensor * op, const std::string & instance_name, std::shared_ptr qnn_instance) { static_assert(std::is_base_of::value); static_assert(_op < std::size(kOpCaps)); - constexpr auto &op_caps = kOpCaps[_op]; + constexpr auto & op_caps = kOpCaps[_op]; static_assert(op_caps.qnn_op_name != nullptr); _ggml_op_param_type op_param; @@ -245,113 +247,113 @@ std::shared_ptr op_constructor_with_type_param( } constexpr const op_constructor_t kOpConstructors[] = { - nullptr, // GGML_OP_NONE - nullptr, // GGML_OP_DUP - generic_op_constructor, // GGML_OP_ADD - nullptr, // GGML_OP_ADD1 - nullptr, // GGML_OP_ACC - generic_op_constructor, // GGML_OP_SUB - generic_op_constructor, // GGML_OP_MUL - generic_op_constructor, // GGML_OP_DIV - nullptr, // GGML_OP_SQR - generic_op_constructor, // GGML_OP_SQRT - generic_op_constructor, // GGML_OP_LOG - nullptr, // GGML_OP_SIN - nullptr, // GGML_OP_COS - nullptr, // GGML_OP_SUM - nullptr, // GGML_OP_SUM_ROWS - nullptr, // GGML_OP_MEAN - nullptr, // GGML_OP_ARGMAX - nullptr, // GGML_OP_COUNT_EQUAL - nullptr, // GGML_OP_REPEAT - nullptr, // GGML_OP_REPEAT_BACK - nullptr, // GGML_OP_CONCAT - nullptr, // GGML_OP_SILU_BACK - nullptr, // GGML_OP_NORM - op_constructor_with_type_param, // GGML_OP_RMS_NORM - nullptr, // GGML_OP_RMS_NORM_BACK - nullptr, // GGML_OP_GROUP_NORM - - mat_mul_op_constructor, // GGML_OP_MUL_MAT - nullptr, // GGML_OP_MUL_MAT_ID - nullptr, // GGML_OP_OUT_PROD - - nullptr, // GGML_OP_SCALE - nullptr, // GGML_OP_SET - nullptr, // GGML_OP_CPY - nullptr, // GGML_OP_CONT - generic_op_constructor, // GGML_OP_RESHAPE - nullptr, // GGML_OP_VIEW - nullptr, // GGML_OP_PERMUTE - nullptr, // GGML_OP_TRANSPOSE - nullptr, // GGML_OP_GET_ROWS - nullptr, // GGML_OP_GET_ROWS_BACK - nullptr, // GGML_OP_DIAG - nullptr, // GGML_OP_DIAG_MASK_INF - nullptr, // GGML_OP_DIAG_MASK_ZERO - nullptr, // GGML_OP_SOFT_MAX - nullptr, // GGML_OP_SOFT_MAX_BACK - nullptr, // GGML_OP_ROPE - nullptr, // GGML_OP_ROPE_BACK - nullptr, // GGML_OP_CLAMP - nullptr, // GGML_OP_CONV_TRANSPOSE_1D - nullptr, // GGML_OP_IM2COL - nullptr, // GGML_OP_IM2COL_BACK - nullptr, // GGML_OP_CONV_TRANSPOSE_2D - nullptr, // GGML_OP_POOL_1D - nullptr, // GGML_OP_POOL_2D - nullptr, // GGML_OP_POOL_2D_BACK - nullptr, // GGML_OP_UPSCALE - nullptr, // GGML_OP_PAD - nullptr, // GGML_OP_PAD_REFLECT_1D - nullptr, // GGML_OP_ARANGE - nullptr, // GGML_OP_TIMESTEP_EMBEDDING - nullptr, // GGML_OP_ARGSORT - nullptr, // GGML_OP_LEAKY_RELU - - nullptr, // GGML_OP_FLASH_ATTN_EXT - nullptr, // GGML_OP_FLASH_ATTN_BACK - nullptr, // GGML_OP_SSM_CONV - nullptr, // GGML_OP_SSM_SCAN - nullptr, // GGML_OP_WIN_PART - nullptr, // GGML_OP_WIN_UNPART - nullptr, // GGML_OP_GET_REL_POS - nullptr, // GGML_OP_ADD_REL_POS - nullptr, // GGML_OP_RWKV_WKV6 - nullptr, // GGML_OP_GATED_LINEAR_ATTN - - nullptr, // GGML_OP_UNARY - - nullptr, // GGML_OP_MAP_UNARY - nullptr, // GGML_OP_MAP_BINARY - - nullptr, // GGML_OP_MAP_CUSTOM1_F32 - nullptr, // GGML_OP_MAP_CUSTOM2_F32 - nullptr, // GGML_OP_MAP_CUSTOM3_F32 - - nullptr, // GGML_OP_MAP_CUSTOM1 - nullptr, // GGML_OP_MAP_CUSTOM2 - nullptr, // 
GGML_OP_MAP_CUSTOM3 - - nullptr, // GGML_OP_CROSS_ENTROPY_LOSS - nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK - nullptr, // GGML_OP_OPT_STEP_ADAMW + nullptr, // GGML_OP_NONE + nullptr, // GGML_OP_DUP + generic_op_constructor, // GGML_OP_ADD + nullptr, // GGML_OP_ADD1 + nullptr, // GGML_OP_ACC + generic_op_constructor, // GGML_OP_SUB + generic_op_constructor, // GGML_OP_MUL + generic_op_constructor, // GGML_OP_DIV + nullptr, // GGML_OP_SQR + generic_op_constructor, // GGML_OP_SQRT + generic_op_constructor, // GGML_OP_LOG + nullptr, // GGML_OP_SIN + nullptr, // GGML_OP_COS + nullptr, // GGML_OP_SUM + nullptr, // GGML_OP_SUM_ROWS + nullptr, // GGML_OP_MEAN + nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_COUNT_EQUAL + nullptr, // GGML_OP_REPEAT + nullptr, // GGML_OP_REPEAT_BACK + nullptr, // GGML_OP_CONCAT + nullptr, // GGML_OP_SILU_BACK + nullptr, // GGML_OP_NORM + op_constructor_with_type_param, // GGML_OP_RMS_NORM + nullptr, // GGML_OP_RMS_NORM_BACK + nullptr, // GGML_OP_GROUP_NORM + + mat_mul_op_constructor, // GGML_OP_MUL_MAT + nullptr, // GGML_OP_MUL_MAT_ID + nullptr, // GGML_OP_OUT_PROD + + nullptr, // GGML_OP_SCALE + nullptr, // GGML_OP_SET + nullptr, // GGML_OP_CPY + nullptr, // GGML_OP_CONT + generic_op_constructor, // GGML_OP_RESHAPE + nullptr, // GGML_OP_VIEW + nullptr, // GGML_OP_PERMUTE + nullptr, // GGML_OP_TRANSPOSE + nullptr, // GGML_OP_GET_ROWS + nullptr, // GGML_OP_GET_ROWS_BACK + nullptr, // GGML_OP_DIAG + nullptr, // GGML_OP_DIAG_MASK_INF + nullptr, // GGML_OP_DIAG_MASK_ZERO + nullptr, // GGML_OP_SOFT_MAX + nullptr, // GGML_OP_SOFT_MAX_BACK + nullptr, // GGML_OP_ROPE + nullptr, // GGML_OP_ROPE_BACK + nullptr, // GGML_OP_CLAMP + nullptr, // GGML_OP_CONV_TRANSPOSE_1D + nullptr, // GGML_OP_IM2COL + nullptr, // GGML_OP_IM2COL_BACK + nullptr, // GGML_OP_CONV_TRANSPOSE_2D + nullptr, // GGML_OP_POOL_1D + nullptr, // GGML_OP_POOL_2D + nullptr, // GGML_OP_POOL_2D_BACK + nullptr, // GGML_OP_UPSCALE + nullptr, // GGML_OP_PAD + nullptr, // GGML_OP_PAD_REFLECT_1D + nullptr, // GGML_OP_ARANGE + nullptr, // GGML_OP_TIMESTEP_EMBEDDING + nullptr, // GGML_OP_ARGSORT + nullptr, // GGML_OP_LEAKY_RELU + + nullptr, // GGML_OP_FLASH_ATTN_EXT + nullptr, // GGML_OP_FLASH_ATTN_BACK + nullptr, // GGML_OP_SSM_CONV + nullptr, // GGML_OP_SSM_SCAN + nullptr, // GGML_OP_WIN_PART + nullptr, // GGML_OP_WIN_UNPART + nullptr, // GGML_OP_GET_REL_POS + nullptr, // GGML_OP_ADD_REL_POS + nullptr, // GGML_OP_RWKV_WKV6 + nullptr, // GGML_OP_GATED_LINEAR_ATTN + + nullptr, // GGML_OP_UNARY + + nullptr, // GGML_OP_MAP_UNARY + nullptr, // GGML_OP_MAP_BINARY + + nullptr, // GGML_OP_MAP_CUSTOM1_F32 + nullptr, // GGML_OP_MAP_CUSTOM2_F32 + nullptr, // GGML_OP_MAP_CUSTOM3_F32 + + nullptr, // GGML_OP_MAP_CUSTOM1 + nullptr, // GGML_OP_MAP_CUSTOM2 + nullptr, // GGML_OP_MAP_CUSTOM3 + + nullptr, // GGML_OP_CROSS_ENTROPY_LOSS + nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK + nullptr, // GGML_OP_OPT_STEP_ADAMW // ggml_unary_op - nullptr, // GGML_UNARY_OP_ABS - nullptr, // GGML_UNARY_OP_SGN - nullptr, // GGML_UNARY_OP_NEG - nullptr, // GGML_UNARY_OP_STEP - nullptr, // GGML_UNARY_OP_TANH - nullptr, // GGML_UNARY_OP_ELU - nullptr, // GGML_UNARY_OP_RELU - nullptr, // GGML_UNARY_OP_SIGMOID - nullptr, // GGML_UNARY_OP_GELU - nullptr, // GGML_UNARY_OP_GELU_QUICK - nullptr, // GGML_UNARY_OP_SILU - nullptr, // GGML_UNARY_OP_HARDSWISH - nullptr, // GGML_UNARY_OP_HARDSIGMOID - nullptr, // GGML_UNARY_OP_EXP + nullptr, // GGML_UNARY_OP_ABS + nullptr, // GGML_UNARY_OP_SGN + nullptr, // GGML_UNARY_OP_NEG + nullptr, // GGML_UNARY_OP_STEP + 
nullptr, // GGML_UNARY_OP_TANH + nullptr, // GGML_UNARY_OP_ELU + nullptr, // GGML_UNARY_OP_RELU + nullptr, // GGML_UNARY_OP_SIGMOID + nullptr, // GGML_UNARY_OP_GELU + nullptr, // GGML_UNARY_OP_GELU_QUICK + nullptr, // GGML_UNARY_OP_SILU + nullptr, // GGML_UNARY_OP_HARDSWISH + nullptr, // GGML_UNARY_OP_HARDSIGMOID + nullptr, // GGML_UNARY_OP_EXP }; static_assert(kOpConstructors[GGML_OP_NONE] == nullptr, "GGML_OP_NONE does not match the nullptr function"); @@ -362,11 +364,11 @@ static_assert(kOpConstructors[GGML_OP_MUL_MAT] == mat_mul_op_constructor, static_assert(std::size(kOpConstructors) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), "GGML_OP_COUNT does not match the size of the kOpConstructors table"); -} // namespace +} // namespace namespace qnn { -size_t get_qnn_op_index(const ggml_tensor *tensor) { +size_t get_qnn_op_index(const ggml_tensor * tensor) { if (tensor->op == GGML_OP_UNARY) { return kGgmlUnaryOpStart + ggml_get_unary_op(tensor); } @@ -374,20 +376,20 @@ size_t get_qnn_op_index(const ggml_tensor *tensor) { return tensor->op; } -const char *get_qnn_op_name(const ggml_tensor *op) { +const char * get_qnn_op_name(const ggml_tensor * op) { auto op_index = get_qnn_op_index(op); GGML_ASSERT(op_index < std::size(kOpCaps)); GGML_ASSERT(kOpCaps[op_index].qnn_op_name); return kOpCaps[op_index].qnn_op_name; } -size_t get_qnn_op_input_param_count(const ggml_tensor *op) { +size_t get_qnn_op_input_param_count(const ggml_tensor * op) { auto op_index = get_qnn_op_index(op); GGML_ASSERT(op_index < std::size(kOpCaps)); return kOpCaps[op_index].input_param_count; } -std::shared_ptr create_op(const ggml_tensor *op, const std::string &name, +std::shared_ptr create_op(const ggml_tensor * op, const std::string & name, std::shared_ptr qnn_instance) { auto op_index = get_qnn_op_index(op); GGML_ASSERT(op_index < std::size(kOpCaps)); @@ -396,4 +398,4 @@ std::shared_ptr create_op(const ggml_tensor *op, const std:: return op_constructor(op, name, qnn_instance); } -} // namespace qnn +} // namespace qnn diff --git a/ggml/src/ggml-qnn/op-config-impl.cpp b/ggml/src/ggml-qnn/op-config-impl.cpp index 1b05b3581a419..14638a554e066 100644 --- a/ggml/src/ggml-qnn/op-config-impl.cpp +++ b/ggml/src/ggml-qnn/op-config-impl.cpp @@ -6,14 +6,7 @@ namespace { -constexpr const qnn::qnn_dimension_array_t kTransposeParamData[GGML_MAX_DIMS] = { - {0}, - {1, 0}, - {0, 2, 1}, - {0, 1, 3, 2}, -}; - -qnn::qnn_dimension_array_t get_transposed_dimensions(const qnn::qnn_dimension_array_t &dimensions, int rank) { +qnn::qnn_dimension_array_t get_transposed_dimensions(const qnn::qnn_dimension_array_t & dimensions, int rank) { qnn::qnn_dimension_array_t transposed_dims = dimensions; if (rank >= 2) { transposed_dims[rank - 1] = dimensions[rank - 2]; @@ -23,11 +16,11 @@ qnn::qnn_dimension_array_t get_transposed_dimensions(const qnn::qnn_dimension_ar return transposed_dims; } -int get_rank(const qnn::ggml_tensor_array_t &tensor_inputs, const qnn::ggml_tensor_array_t &tensor_outputs) { +int get_rank(const qnn::ggml_tensor_array_t & tensor_inputs, const qnn::ggml_tensor_array_t & tensor_outputs) { return std::max(qnn::get_ggml_tensors_max_rank(tensor_inputs), qnn::get_ggml_tensors_max_rank(tensor_outputs)); } -Qnn_DataType_t get_tensor_type(const qnn::qnn_tensor_array_t &tensors) { +Qnn_DataType_t get_tensor_type(const qnn::qnn_tensor_array_t & tensors) { Qnn_DataType_t type = QNN_DATATYPE_UNDEFINED; for (auto tensor : tensors) { auto tensor_type_size = qnn::qnn_datatype_size(tensor->get_data_type()); @@ -40,67 +33,67 @@ Qnn_DataType_t 
get_tensor_type(const qnn::qnn_tensor_array_t &tensors) { return type; } -} // namespace +} // namespace namespace qnn { -void ggml_qnn_op_config_base::add_scalar_param(const std::string &name, const Qnn_Scalar_t scalar) { +void ggml_qnn_op_config_base::add_scalar_param(const std::string & name, const Qnn_Scalar_t scalar) { _param_names.push_back(name); Qnn_Param_t param = QNN_PARAM_INIT; - param.paramType = QNN_PARAMTYPE_SCALAR; - param.name = _param_names.back().c_str(); + param.paramType = QNN_PARAMTYPE_SCALAR; + param.name = _param_names.back().c_str(); param.scalarParam = scalar; _qnn_parameters.push_back(param); } -bool ggml_qnn_op_config_base::add_tensor_param(const std::string &name, const qnn_dimension_array_t &dimensions, - int rank, const uint8_t *data, const Qnn_DataType_t data_type, +bool ggml_qnn_op_config_base::add_tensor_param(const std::string & name, const qnn_dimension_array_t & dimensions, + int rank, const uint8_t * data, const Qnn_DataType_t data_type, QNNBackend device, Qnn_GraphHandle_t graph_handle) { - std::string tensor_name = _name + name + std::to_string(_tensor_parameters.size()); - auto param_tensor = std::make_shared(ggml_qnn_tensor::PARAMETER, tensor_name, dimensions, - data_type, rank, device, graph_handle, _qnn_instance); - size_t data_size = ggml_type_size(ggml_datatype_from_qnn_datatype(data_type)); + std::string tensor_name = _name + name + std::to_string(_tensor_parameters.size()); + auto param_tensor = std::make_shared(ggml_qnn_tensor::PARAMETER, tensor_name, dimensions, + data_type, rank, device, graph_handle, _qnn_instance); + size_t data_size = ggml_type_size(ggml_datatype_from_qnn_datatype(data_type)); for (int i = 0; i < rank; i++) { data_size *= dimensions[i]; } GGML_ASSERT(data_size > 0); if (!param_tensor->set_data_buffer(data, data_size)) { - QNN_LOG_ERROR("parameter tensor bind_buffer failed"); + QNN_LOG_ERROR("parameter tensor bind_buffer failed\n"); return false; } if (!param_tensor->alloc_qnn_tensor_id()) { - QNN_LOG_ERROR("parameter tensor alloc_qnn_tensor_id failed"); + QNN_LOG_ERROR("parameter tensor alloc_qnn_tensor_id failed\n"); return false; } _tensor_parameters.push_back(param_tensor); _param_names.push_back(name); Qnn_Param_t param = QNN_PARAM_INIT; - param.paramType = QNN_PARAMTYPE_TENSOR; - param.name = _param_names.back().c_str(); + param.paramType = QNN_PARAMTYPE_TENSOR; + param.name = _param_names.back().c_str(); param.tensorParam = param_tensor->get_qnn_tensor(); _qnn_parameters.push_back(param); return true; } -void ggml_qnn_op_config_base::set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) { +void ggml_qnn_op_config_base::set_input_tensors(qnn::qnn_tensor_array_t & tensor_inputs) { _tensor_inputs = tensor_inputs; _qnn_tensor_inputs.resize(_tensor_inputs.size()); } -void ggml_qnn_op_config_base::set_input_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) { +void ggml_qnn_op_config_base::set_input_tensors(qnn::qnn_tensor_array_t && tensor_inputs) { _tensor_inputs = tensor_inputs; _qnn_tensor_inputs.resize(_tensor_inputs.size()); } -void ggml_qnn_op_config_base::set_output_tensors(qnn::qnn_tensor_array_t &tensor_outputs) { +void ggml_qnn_op_config_base::set_output_tensors(qnn::qnn_tensor_array_t & tensor_outputs) { _tensor_outputs = std::move(tensor_outputs); _qnn_tensor_outputs.resize(_tensor_outputs.size()); } -void ggml_qnn_op_config_base::set_output_tensors(qnn::qnn_tensor_array_t &&tensor_outputs) { +void ggml_qnn_op_config_base::set_output_tensors(qnn::qnn_tensor_array_t && tensor_outputs) { _tensor_outputs = 
std::move(tensor_outputs); _qnn_tensor_outputs.resize(_tensor_outputs.size()); } @@ -109,74 +102,80 @@ bool ggml_qnn_op_config_base::add_op_to_graph(Qnn_GraphHandle_t graph_handle) { GGML_ASSERT(_qnn_tensor_inputs.size() == _tensor_inputs.size()); GGML_ASSERT(_qnn_tensor_outputs.size() == _tensor_outputs.size()); - QNN_LOG_DEBUG("[%s]add to graph start", _name.c_str()); + QNN_LOG_DEBUG("[%s]add to graph start\n", _name.c_str()); for (size_t i = 0; i < _tensor_inputs.size(); i++) { auto tensor = _tensor_inputs[i]; if (!tensor->alloc_qnn_tensor_id()) { - QNN_LOG_ERROR("[%s]input tensor alloc_qnn_tensor_id failed", _name.c_str()); + QNN_LOG_ERROR("[%s]input tensor alloc_qnn_tensor_id failed\n", _name.c_str()); return false; } - QNN_LOG_DEBUG("[%s]input tensor id: %d", _name.c_str(), tensor->get_qnn_tensor_id()); + QNN_LOG_DEBUG("[%s]input tensor id: %d\n", _name.c_str(), tensor->get_qnn_tensor_id()); _qnn_tensor_inputs[i] = tensor->get_qnn_tensor(); } for (size_t i = 0; i < _tensor_outputs.size(); i++) { auto tensor = _tensor_outputs[i]; if (!tensor->alloc_qnn_tensor_id()) { - QNN_LOG_ERROR("[%s]output tensor alloc_qnn_tensor_id failed", _name.c_str()); + QNN_LOG_ERROR("[%s]output tensor alloc_qnn_tensor_id failed\n", _name.c_str()); return false; } - QNN_LOG_DEBUG("[%s]output tensor id: %d", _name.c_str(), tensor->get_qnn_tensor_id()); + QNN_LOG_DEBUG("[%s]output tensor id: %d\n", _name.c_str(), tensor->get_qnn_tensor_id()); _qnn_tensor_outputs[i] = tensor->get_qnn_tensor(); } auto qnn_interface = _qnn_instance->get_qnn_interface(); - auto error = qnn_interface->qnn_graph_add_node(graph_handle, get_op_config()); + auto error = qnn_interface->qnn_graph_add_node(graph_handle, get_op_config()); if (error != QNN_SUCCESS) { - QNN_LOG_ERROR("[%s]qnn_graph_add_node.error: %s", _name.c_str(), get_qnn_error_string(error)); + QNN_LOG_ERROR("[%s]qnn_graph_add_node.error: %s\n", _name.c_str(), get_qnn_error_string(error)); return false; } - QNN_LOG_DEBUG("[%s]added to graph succeed", _name.c_str()); + QNN_LOG_DEBUG("[%s]added to graph succeed\n", _name.c_str()); return true; } -bool ggml_qnn_op_config_base::bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) { +bool ggml_qnn_op_config_base::bind_input_tensors(const ggml_tensor_array_t & tensor_inputs) { GGML_ASSERT(tensor_inputs.size() == _tensor_inputs.size()); return qnn::bind_tensors(tensor_inputs, _tensor_inputs, _qnn_tensor_inputs); } -bool ggml_qnn_op_config_base::bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) { +bool ggml_qnn_op_config_base::bind_output_tensors(const ggml_tensor_array_t & tensor_outputs) { GGML_ASSERT(tensor_outputs.size() == _tensor_outputs.size()); return qnn::bind_tensors(tensor_outputs, _tensor_outputs, _qnn_tensor_outputs); } void ggml_qnn_op_config_base::unbind_input_tensors() { - for (auto &tensor : _tensor_inputs) { + for (auto & tensor : _tensor_inputs) { tensor->unbind(); } } void ggml_qnn_op_config_base::unbind_output_tensors() { - for (auto &tensor : _tensor_outputs) { + for (auto & tensor : _tensor_outputs) { tensor->unbind(); } } Qnn_OpConfig_t ggml_qnn_op_config_base::get_op_config() { - Qnn_OpConfig_t config = QNN_OPCONFIG_INIT; - config.version = QNN_OPCONFIG_VERSION_1; - auto &op_config = config.v1; - op_config.name = _name.c_str(); - op_config.packageName = _package_name.c_str(); - op_config.typeName = _op_type.c_str(); - op_config.numOfParams = (uint32_t)_qnn_parameters.size(); - op_config.params = _qnn_parameters.data(); - op_config.numOfInputs = (uint32_t)_qnn_tensor_inputs.size(); - 
op_config.inputTensors = _qnn_tensor_inputs.data(); - op_config.numOfOutputs = (uint32_t)_qnn_tensor_outputs.size(); + GGML_ASSERT(_qnn_parameters.size() == _param_names.size()); + + for (size_t i = 0; i < _qnn_parameters.size(); i++) { + _qnn_parameters[i].name = _param_names[i].c_str(); + } + + Qnn_OpConfig_t config = QNN_OPCONFIG_INIT; + config.version = QNN_OPCONFIG_VERSION_1; + auto & op_config = config.v1; + op_config.name = _name.c_str(); + op_config.packageName = _package_name.c_str(); + op_config.typeName = _op_type.c_str(); + op_config.numOfParams = (uint32_t) _qnn_parameters.size(); + op_config.params = _qnn_parameters.data(); + op_config.numOfInputs = (uint32_t) _qnn_tensor_inputs.size(); + op_config.inputTensors = _qnn_tensor_inputs.data(); + op_config.numOfOutputs = (uint32_t) _qnn_tensor_outputs.size(); op_config.outputTensors = _qnn_tensor_outputs.data(); return config; } @@ -188,33 +187,33 @@ bool ggml_qnn_single_op_config::initialize_op_nodes(QNNBackend device, Qnn_Graph } bool ggml_qnn_rmsnorm_op_config::initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) { - constexpr const uint32_t kAxes[] = {0}; - add_tensor_param(QNN_OP_RMS_NORM_PARAM_AXES, {1}, 1, reinterpret_cast(kAxes), QNN_DATATYPE_UINT_32, - device, graph_handle); + constexpr const uint32_t kAxes[] = { 0 }; + add_tensor_param(QNN_OP_RMS_NORM_PARAM_AXES, { 1 }, 1, reinterpret_cast(kAxes), + QNN_DATATYPE_UINT_32, device, graph_handle); return true; } -void ggml_qnn_aggregate_op_config::set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) { +void ggml_qnn_aggregate_op_config::set_input_tensors(qnn::qnn_tensor_array_t & tensor_inputs) { _tensor_inputs = tensor_inputs; } -void ggml_qnn_aggregate_op_config::set_input_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) { +void ggml_qnn_aggregate_op_config::set_input_tensors(qnn::qnn_tensor_array_t && tensor_inputs) { _tensor_inputs = std::move(tensor_inputs); } -void ggml_qnn_aggregate_op_config::set_output_tensors(qnn::qnn_tensor_array_t &tensor_outputs) { +void ggml_qnn_aggregate_op_config::set_output_tensors(qnn::qnn_tensor_array_t & tensor_outputs) { _tensor_outputs = tensor_outputs; } -void ggml_qnn_aggregate_op_config::set_output_tensors(qnn::qnn_tensor_array_t &&tensor_outputs) { +void ggml_qnn_aggregate_op_config::set_output_tensors(qnn::qnn_tensor_array_t && tensor_outputs) { _tensor_outputs = std::move(tensor_outputs); } -bool ggml_qnn_aggregate_op_config::bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) { +bool ggml_qnn_aggregate_op_config::bind_input_tensors(const ggml_tensor_array_t & tensor_inputs) { return qnn::bind_tensors(tensor_inputs, _tensor_inputs); } -bool ggml_qnn_aggregate_op_config::bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) { +bool ggml_qnn_aggregate_op_config::bind_output_tensors(const ggml_tensor_array_t & tensor_outputs) { return qnn::bind_tensors(tensor_outputs, _tensor_outputs); } @@ -223,18 +222,18 @@ bool ggml_qnn_matmul_op_config::initialize_op_nodes(QNNBackend device, Qnn_Graph GGML_ASSERT(_tensor_outputs.size() == 1); // create convert nodes - const auto tensor_rank = _tensor_inputs.front()->get_rank(); - qnn_tensor_array_t mat_mul_tensor_inputs = _tensor_inputs; + const auto tensor_rank = _tensor_inputs.front()->get_rank(); + qnn_tensor_array_t mat_mul_tensor_inputs = _tensor_inputs; qnn_tensor_array_t mat_mul_tensor_outputs = _tensor_outputs; if (!create_convert_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs, mat_mul_tensor_outputs)) { - QNN_LOG_ERROR("create 
convert nodes failed"); + QNN_LOG_ERROR("create convert nodes failed\n"); return false; } mat_mul_tensor_inputs.front() = create_gather_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs.front(), mat_mul_tensor_inputs.back()->get_dimensions()); - return create_mat_mul_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs, mat_mul_tensor_outputs); + return create_mat_mul_nodes(mat_mul_tensor_inputs, mat_mul_tensor_outputs); } qnn_tensor_ptr_t ggml_qnn_matmul_op_config::create_gather_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, @@ -244,9 +243,9 @@ qnn_tensor_ptr_t ggml_qnn_matmul_op_config::create_gather_nodes(QNNBackend devic return tensor_input; } - const auto &input_dimensions = tensor_input->get_dimensions(); - output_dimensions[rank - 1] = input_dimensions[rank - 1]; - output_dimensions[rank - 2] = input_dimensions[rank - 2]; + const auto & input_dimensions = tensor_input->get_dimensions(); + output_dimensions[rank - 1] = input_dimensions[rank - 1]; + output_dimensions[rank - 2] = input_dimensions[rank - 2]; const auto y = output_dimensions[rank - 3] / input_dimensions[rank - 3]; if (y == 1 && (rank == 3 || (rank == 4 && output_dimensions[rank - 4] == input_dimensions[rank - 4]))) { @@ -255,9 +254,9 @@ qnn_tensor_ptr_t ggml_qnn_matmul_op_config::create_gather_nodes(QNNBackend devic // create concat nodes, to convert tensor shape from [ne03, ne02, n, k] to [ne03 * x, ne02 * y, n, k] constexpr const auto create_node = - [](const std::string &name, const int rank, const int axis, const qnn_dimension_array_t &dimensions, + [](const std::string & name, const int rank, const int axis, const qnn_dimension_array_t & dimensions, qnn_tensor_ptr_t tensor_input, QNNBackend device, Qnn_GraphHandle_t graph_handle, - std::shared_ptr qnn_instance, qnn_tensor_ptr_t &tensor_output) -> qnn_op_config_ptr_t { + std::shared_ptr qnn_instance, qnn_tensor_ptr_t & tensor_output) -> qnn_op_config_ptr_t { auto gather_out = std::make_shared(ggml_qnn_tensor::INTERMEDIATE, name + "_out", dimensions, tensor_input->get_data_type(), rank, device, graph_handle, qnn_instance); @@ -265,32 +264,32 @@ qnn_tensor_ptr_t ggml_qnn_matmul_op_config::create_gather_nodes(QNNBackend devic qnn_instance); Qnn_Scalar_t scalar = QNN_SCALAR_INIT; - scalar.dataType = QNN_DATATYPE_INT_32; - scalar.int32Value = axis; + scalar.dataType = QNN_DATATYPE_INT_32; + scalar.int32Value = axis; gather_op->add_scalar_param(QNN_OP_GATHER_PARAM_AXIS, scalar); - gather_op->set_output_tensors({gather_out}); + gather_op->set_output_tensors({ gather_out }); // here we calculate the index mapping, will generate a 1d tensor like [0, 0, 0, 1, 1, 1, 2, 2, 2, ...], // by repeating each index [scale] times. 
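        // Worked example (hypothetical sizes): if the input has 2 entries along `axis` and the
        // output needs 6, then scale == 3 and the index buffer becomes [0, 0, 0, 1, 1, 1],
        // i.e. each input slice is selected three times by the Gather op.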
- const auto scale = dimensions[axis] / tensor_input->get_dimensions()[axis]; - auto index_buffer = std::make_shared(dimensions[axis] * sizeof(uint32_t)); + const auto scale = dimensions[axis] / tensor_input->get_dimensions()[axis]; + auto index_buffer = std::make_shared(dimensions[axis] * sizeof(uint32_t)); for (uint32_t *curr = reinterpret_cast(index_buffer->get_buffer()), *end = curr + dimensions[axis]; curr < end; curr++) { *curr = uint32_t((curr - reinterpret_cast(index_buffer->get_buffer())) / scale); } auto gather_index = std::make_shared( - ggml_qnn_tensor::PARAMETER, name + "_index", qnn_dimension_array_t{dimensions[axis]}, QNN_DATATYPE_UINT_32, - 1, device, graph_handle, qnn_instance); + ggml_qnn_tensor::PARAMETER, name + "_index", qnn_dimension_array_t{ dimensions[axis] }, + QNN_DATATYPE_UINT_32, 1, device, graph_handle, qnn_instance); gather_index->set_data_buffer(index_buffer); - gather_op->set_input_tensors({tensor_input, gather_index}); + gather_op->set_input_tensors({ tensor_input, gather_index }); tensor_output = gather_out; return gather_op; }; qnn_dimension_array_t intermediate_dimensions = input_dimensions; - intermediate_dimensions[rank - 3] = output_dimensions[rank - 3]; + intermediate_dimensions[rank - 3] = output_dimensions[rank - 3]; qnn_tensor_ptr_t gather0_out; _operations.push_back(create_node(_name + "_gather0", rank, rank - 3, intermediate_dimensions, tensor_input, device, graph_handle, _qnn_instance, gather0_out)); @@ -305,8 +304,8 @@ qnn_tensor_ptr_t ggml_qnn_matmul_op_config::create_gather_nodes(QNNBackend devic } bool ggml_qnn_matmul_op_config::create_convert_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, - qnn_tensor_array_t &tensor_inputs, - qnn_tensor_array_t &tensor_outputs) { + qnn_tensor_array_t & tensor_inputs, + qnn_tensor_array_t & tensor_outputs) { if (device == QNN_BACKEND_GPU) { // there's no convert op for GPU, so we should create matmul nodes directly. 
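        // Rough summary of the non-GPU path below: each input whose data type differs from the
        // widest input type is routed through a QNN_OP_CONVERT node, and the output gets a
        // matching convert back to the destination tensor's type, roughly
        //   src -> convert -> mat_mul -> convert -> dst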
return true; @@ -314,7 +313,7 @@ bool ggml_qnn_matmul_op_config::create_convert_nodes(QNNBackend device, Qnn_Grap // create tensors for convert node auto tensor_type = get_tensor_type(tensor_inputs); - QNN_LOG_DEBUG("input tensor type: %s", qnn_datatype_to_string(tensor_type)); + QNN_LOG_DEBUG("input tensor type: %s\n", qnn_datatype_to_string(tensor_type)); for (size_t i = 0; i < tensor_inputs.size(); ++i) { // create input convert nodes @@ -327,10 +326,10 @@ bool ggml_qnn_matmul_op_config::create_convert_nodes(QNNBackend device, Qnn_Grap auto convert_out = std::make_shared(ggml_qnn_tensor::INTERMEDIATE, convert_name + "_out", convert_in->get_dimensions(), tensor_type, rank, device, graph_handle, _qnn_instance); - auto convert = std::make_shared(convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_CONVERT, _qnn_instance); - convert->set_input_tensors({convert_in}); - convert->set_output_tensors({convert_out}); + auto convert = std::make_shared(convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_CONVERT, _qnn_instance); + convert->set_input_tensors({ convert_in }); + convert->set_output_tensors({ convert_out }); tensor_inputs[i] = convert_out; _operations.push_back(convert); } @@ -338,14 +337,14 @@ bool ggml_qnn_matmul_op_config::create_convert_nodes(QNNBackend device, Qnn_Grap if (tensor_outputs.front()->get_data_type() != tensor_type) { // create output convert node std::string convert_name("convert_dst"); - auto convert_out = tensor_outputs.front(); - auto convert_in = std::make_shared(ggml_qnn_tensor::INTERMEDIATE, convert_name + "_in", - convert_out->get_dimensions(), tensor_type, rank, device, - graph_handle, _qnn_instance); + auto convert_out = tensor_outputs.front(); + auto convert_in = std::make_shared(ggml_qnn_tensor::INTERMEDIATE, convert_name + "_in", + convert_out->get_dimensions(), tensor_type, rank, device, + graph_handle, _qnn_instance); auto output_convert = std::make_shared(convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_CONVERT, _qnn_instance); - output_convert->set_input_tensors({convert_in}); - output_convert->set_output_tensors({convert_out}); + output_convert->set_input_tensors({ convert_in }); + output_convert->set_output_tensors({ convert_out }); tensor_outputs.front() = convert_in; _operations.push_back(output_convert); } @@ -353,10 +352,8 @@ bool ggml_qnn_matmul_op_config::create_convert_nodes(QNNBackend device, Qnn_Grap return true; } -bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, - qnn_tensor_array_t &tensor_inputs, - qnn_tensor_array_t &tensor_outputs) { - +bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(qnn_tensor_array_t & tensor_inputs, + qnn_tensor_array_t & tensor_outputs) { /* * First, both the ggml and qnn tensor in memory are stored as row-major format. 
(For more details, please refer to: * https://pytorch.org/blog/tensor-memory-format-matters/#:~:text=Column%20Major%20Order:%20In%20this%20format,%20the%20matrix) @@ -395,8 +392,8 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap * So here we need to create graph like: * ```mermaid * graph TD; - * i1>ggml_tensor_in0] --src1--> mat_mul0; - * i2>ggml_tensor_in1] --src0.T--> mat_mul0; + * i1>ggml_tensor_in1] --src0--> mat_mul0; + * i2>ggml_tensor_in0] --src1.T--> mat_mul0; * mat_mul0 --dst0--> o1>ggml_tensor_out]; * ``` */ @@ -411,8 +408,8 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap std::make_shared(_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL, _qnn_instance); Qnn_Scalar_t scalar = QNN_SCALAR_INIT; - scalar.dataType = QNN_DATATYPE_BOOL_8; - scalar.bool8Value = 1; + scalar.dataType = QNN_DATATYPE_BOOL_8; + scalar.bool8Value = 1; mat_mul->add_scalar_param(QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN1, scalar); // set tensor to mat_mul @@ -424,4 +421,4 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap return true; } -} // namespace qnn +} // namespace qnn diff --git a/ggml/src/ggml-qnn/op-config-impl.hpp b/ggml/src/ggml-qnn/op-config-impl.hpp index 4a00ed2cc7ac3..8e2f107b2dae1 100644 --- a/ggml/src/ggml-qnn/op-config-impl.hpp +++ b/ggml/src/ggml-qnn/op-config-impl.hpp @@ -13,77 +13,83 @@ namespace qnn { class ggml_qnn_op_config_base : public ggml_qnn_op_config { -public: - explicit ggml_qnn_op_config_base(const std::string &name, const std::string &package_name, - const std::string &op_type, std::shared_ptr qnn_instance) - : _name(name), _package_name(package_name), _op_type(op_type), _qnn_instance(qnn_instance) {} - - void add_scalar_param(const std::string &name, const Qnn_Scalar_t scalar); - bool add_tensor_param(const std::string &name, const qnn_dimension_array_t &dimensions, int rank, - const uint8_t *data, const Qnn_DataType_t data_type, QNNBackend device, + public: + explicit ggml_qnn_op_config_base(const std::string & name, const std::string & package_name, + const std::string & op_type, std::shared_ptr qnn_instance) : + _name(name), + _package_name(package_name), + _op_type(op_type), + _qnn_instance(qnn_instance) {} + + void add_scalar_param(const std::string & name, const Qnn_Scalar_t scalar); + bool add_tensor_param(const std::string & name, const qnn_dimension_array_t & dimensions, int rank, + const uint8_t * data, const Qnn_DataType_t data_type, QNNBackend device, Qnn_GraphHandle_t graph_handle); - void set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) override; - void set_input_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) override; - void set_output_tensors(qnn::qnn_tensor_array_t &tensor_inputs) override; - void set_output_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) override; + void set_input_tensors(qnn::qnn_tensor_array_t & tensor_inputs) override; + void set_input_tensors(qnn::qnn_tensor_array_t && tensor_inputs) override; + void set_output_tensors(qnn::qnn_tensor_array_t & tensor_inputs) override; + void set_output_tensors(qnn::qnn_tensor_array_t && tensor_inputs) override; bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) override; - bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) override; - bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) override; + bool bind_input_tensors(const ggml_tensor_array_t & tensor_inputs) override; + bool bind_output_tensors(const ggml_tensor_array_t & tensor_outputs) override; void 
unbind_input_tensors() override; void unbind_output_tensors() override; - const qnn_tensor_array_t &get_input_tensors() override { return _tensor_inputs; } - const qnn_tensor_array_t &get_output_tensors() override { return _tensor_outputs; } -protected: + const qnn_tensor_array_t & get_input_tensors() override { return _tensor_inputs; } + + const qnn_tensor_array_t & get_output_tensors() override { return _tensor_outputs; } + + protected: Qnn_OpConfig_t get_op_config(); - std::string _name; - std::string _package_name; - std::string _op_type; + std::string _name; + std::string _package_name; + std::string _op_type; std::shared_ptr _qnn_instance; - qnn_tensor_array_t _tensor_inputs; - qnn_tensor_array_t _tensor_outputs; - qnn_tensor_array_t _tensor_parameters; - std::vector _qnn_tensor_inputs; - std::vector _qnn_tensor_outputs; - std::vector _qnn_parameters; - std::vector _param_names; + qnn_tensor_array_t _tensor_inputs; + qnn_tensor_array_t _tensor_outputs; + qnn_tensor_array_t _tensor_parameters; + std::vector _qnn_tensor_inputs; + std::vector _qnn_tensor_outputs; + std::vector _qnn_parameters; + std::vector _param_names; DISABLE_COPY(ggml_qnn_op_config_base); DISABLE_MOVE(ggml_qnn_op_config_base); }; class ggml_qnn_single_op_config : public ggml_qnn_op_config_base { -public: - explicit ggml_qnn_single_op_config(const std::string &name, const std::string &package_name, - const std::string &op_type, std::shared_ptr qnn_instance) - : ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {} + public: + explicit ggml_qnn_single_op_config(const std::string & name, const std::string & package_name, + const std::string & op_type, std::shared_ptr qnn_instance) : + ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {} bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) override; -private: + private: DISABLE_COPY(ggml_qnn_single_op_config); DISABLE_MOVE(ggml_qnn_single_op_config); }; class ggml_qnn_rmsnorm_op_config : public ggml_qnn_op_config_base { -public: - explicit ggml_qnn_rmsnorm_op_config(const std::string &name, const std::string &package_name, - const std::string &op_type, std::shared_ptr qnn_instance) - : ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {} + public: + explicit ggml_qnn_rmsnorm_op_config(const std::string & name, const std::string & package_name, + const std::string & op_type, std::shared_ptr qnn_instance) : + ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {} bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) override; -private: + private: DISABLE_COPY(ggml_qnn_rmsnorm_op_config); DISABLE_MOVE(ggml_qnn_rmsnorm_op_config); }; class ggml_qnn_aggregate_op_config : public ggml_qnn_op_config { -public: - explicit ggml_qnn_aggregate_op_config(const std::string &name, std::shared_ptr qnn_instance) - : _name(name), _qnn_instance(qnn_instance) {} + public: + explicit ggml_qnn_aggregate_op_config(const std::string & name, std::shared_ptr qnn_instance) : + _name(name), + _qnn_instance(qnn_instance) {} ~ggml_qnn_aggregate_op_config() { _tensor_inputs.clear(); @@ -91,61 +97,63 @@ class ggml_qnn_aggregate_op_config : public ggml_qnn_op_config { _operations.clear(); } - void set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) override; - void set_input_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) override; - void set_output_tensors(qnn::qnn_tensor_array_t &tensor_inputs) override; - void set_output_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) override; + 
void set_input_tensors(qnn::qnn_tensor_array_t & tensor_inputs) override; + void set_input_tensors(qnn::qnn_tensor_array_t && tensor_inputs) override; + void set_output_tensors(qnn::qnn_tensor_array_t & tensor_inputs) override; + void set_output_tensors(qnn::qnn_tensor_array_t && tensor_inputs) override; + bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) override { return qnn::add_op_to_graph(graph_handle, _operations); } - bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) override; - bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) override; + bool bind_input_tensors(const ggml_tensor_array_t & tensor_inputs) override; + bool bind_output_tensors(const ggml_tensor_array_t & tensor_outputs) override; + void unbind_input_tensors() override { - for (auto &tensor : _tensor_inputs) { + for (auto & tensor : _tensor_inputs) { tensor->unbind(); } } void unbind_output_tensors() override { - for (auto &tensor : _tensor_outputs) { + for (auto & tensor : _tensor_outputs) { tensor->unbind(); } } - const qnn_tensor_array_t &get_input_tensors() override { return _tensor_inputs; } - const qnn_tensor_array_t &get_output_tensors() override { return _tensor_outputs; } + const qnn_tensor_array_t & get_input_tensors() override { return _tensor_inputs; } + + const qnn_tensor_array_t & get_output_tensors() override { return _tensor_outputs; } -protected: - std::string _name; + protected: + std::string _name; std::shared_ptr _qnn_instance; std::vector _operations; - qnn_tensor_array_t _tensor_inputs; - qnn_tensor_array_t _tensor_outputs; + qnn_tensor_array_t _tensor_inputs; + qnn_tensor_array_t _tensor_outputs; -private: + private: DISABLE_COPY(ggml_qnn_aggregate_op_config); DISABLE_MOVE(ggml_qnn_aggregate_op_config); }; class ggml_qnn_matmul_op_config : public ggml_qnn_aggregate_op_config { -public: - ggml_qnn_matmul_op_config(const std::string &name, std::shared_ptr qnn_instance) - : ggml_qnn_aggregate_op_config(name, qnn_instance) {} + public: + ggml_qnn_matmul_op_config(const std::string & name, std::shared_ptr qnn_instance) : + ggml_qnn_aggregate_op_config(name, qnn_instance) {} bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) override; -private: + private: qnn_tensor_ptr_t create_gather_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, qnn_tensor_ptr_t tensor_input, qnn_dimension_array_t output_dimensions); - bool create_convert_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, - qnn_tensor_array_t &tensor_inputs, qnn_tensor_array_t &tensor_outputs); - bool create_mat_mul_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, - qnn_tensor_array_t &tensor_inputs, qnn_tensor_array_t &tensor_outputs); + bool create_convert_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, + qnn_tensor_array_t & tensor_inputs, qnn_tensor_array_t & tensor_outputs); + bool create_mat_mul_nodes(qnn_tensor_array_t & tensor_inputs, qnn_tensor_array_t & tensor_outputs); DISABLE_COPY(ggml_qnn_matmul_op_config); DISABLE_MOVE(ggml_qnn_matmul_op_config); }; -} // namespace qnn +} // namespace qnn diff --git a/ggml/src/ggml-qnn/op-config.hpp b/ggml/src/ggml-qnn/op-config.hpp index 6b8c6946b8e86..d613a2116c04a 100644 --- a/ggml/src/ggml-qnn/op-config.hpp +++ b/ggml/src/ggml-qnn/op-config.hpp @@ -14,14 +14,14 @@ namespace qnn { constexpr const size_t kGgmlUnaryOpStart = GGML_OP_COUNT; -size_t get_qnn_op_index(const ggml_tensor *tensor); -const char *get_qnn_op_name(const ggml_tensor *op); 
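// Illustrative sketch only (not part of this patch): kGgmlUnaryOpStart suggests that regular
// ggml ops and unary ops share a single index space, with unary ops placed after GGML_OP_COUNT.
// A minimal mapping along those lines is shown below; the real get_qnn_op_index() may differ.
// Assumes ggml.h is available; the function name is hypothetical.
#include "ggml.h"

static size_t example_op_index(const ggml_tensor * tensor) {
    constexpr size_t unary_op_start = GGML_OP_COUNT;  // mirrors kGgmlUnaryOpStart above
    if (tensor->op == GGML_OP_UNARY) {
        // fold GGML_UNARY_OP_* values into the range after the regular GGML_OP_* values
        return unary_op_start + static_cast<size_t>(ggml_get_unary_op(tensor));
    }
    return static_cast<size_t>(tensor->op);
}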
-size_t get_qnn_op_input_param_count(const ggml_tensor *op); -std::shared_ptr create_op(const ggml_tensor *op, const std::string &name, +size_t get_qnn_op_index(const ggml_tensor * tensor); +const char * get_qnn_op_name(const ggml_tensor * op); +size_t get_qnn_op_input_param_count(const ggml_tensor * op); +std::shared_ptr create_op(const ggml_tensor * op, const std::string & name, std::shared_ptr qnn_instance); -inline bool add_op_to_graph(Qnn_GraphHandle_t graph_handle, std::vector &operations) { - for (auto &op : operations) { +inline bool add_op_to_graph(Qnn_GraphHandle_t graph_handle, std::vector & operations) { + for (auto & op : operations) { if (!op->add_op_to_graph(graph_handle)) { return false; } @@ -30,4 +30,4 @@ inline bool add_op_to_graph(Qnn_GraphHandle_t graph_handle, std::vector #if defined(__linux__) -#include +# include #endif namespace { #ifdef _WIN32 -constexpr const char *kQnnSystemLibName = "QnnSystem.dll"; -constexpr const char *kQnnRpcLibName = "libcdsprpc.dll"; +constexpr const char * kQnnSystemLibName = "QnnSystem.dll"; +constexpr const char * kQnnRpcLibName = "libcdsprpc.dll"; #else -constexpr const char *kQnnSystemLibName = "libQnnSystem.so"; -constexpr const char *kQnnRpcLibName = "libcdsprpc.so"; +constexpr const char * kQnnSystemLibName = "libQnnSystem.so"; +constexpr const char * kQnnRpcLibName = "libcdsprpc.so"; #endif -void insert_path(std::string &path, std::string insert_path, const char separator = ':') { +void insert_path(std::string & path, std::string insert_path, const char separator = ':') { if (!insert_path.empty() && !path.empty()) { insert_path += separator; } @@ -27,10 +27,10 @@ void insert_path(std::string &path, std::string insert_path, const char separato } // TODO: Fix this for other platforms, or use a more portable way to set the library search path -bool set_qnn_lib_search_path(const std::string &custom_lib_search_path) { +bool set_qnn_lib_search_path(const std::string & custom_lib_search_path) { #if defined(__linux__) { - auto *original = getenv("LD_LIBRARY_PATH"); + auto * original = getenv("LD_LIBRARY_PATH"); std::string lib_search_path = original ? 
original : ""; insert_path(lib_search_path, "/vendor/dsp/cdsp:/vendor/lib64:" @@ -41,7 +41,7 @@ bool set_qnn_lib_search_path(const std::string &custom_lib_search_path) { } } -#if defined(__ANDROID__) || defined(ANDROID) +# if defined(__ANDROID__) || defined(ANDROID) { // See also: https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-2/dsp_runtime.html std::string adsp_lib_search_path = custom_lib_search_path + @@ -51,87 +51,89 @@ bool set_qnn_lib_search_path(const std::string &custom_lib_search_path) { return false; } - QNN_LOG_DEBUG("ADSP_LIBRARY_PATH=%s", getenv("ADSP_LIBRARY_PATH")); + QNN_LOG_DEBUG("ADSP_LIBRARY_PATH=%s", getenv("ADSP_LIBRARY_PATH\n")); } -#endif +# endif - QNN_LOG_DEBUG("LD_LIBRARY_PATH=%s", getenv("LD_LIBRARY_PATH")); + QNN_LOG_DEBUG("LD_LIBRARY_PATH=%s", getenv("LD_LIBRARY_PATH\n")); #else - (void)custom_lib_search_path; + (void) custom_lib_search_path; #endif return true; } -qnn::dl_handler_t load_lib_with_fallback(const std::string &lib_path, const std::string &load_directory) { +qnn::dl_handler_t load_lib_with_fallback(const std::string & lib_path, const std::string & load_directory) { std::filesystem::path full_path(load_directory); full_path /= std::filesystem::path(lib_path).filename(); auto handle = qnn::dl_load(full_path.string()); if (!handle) { - QNN_LOG_WARN("failed to load %s, fallback to %s", full_path.c_str(), lib_path.c_str()); + QNN_LOG_WARN("failed to load %s, fallback to %s\n", full_path.c_str(), lib_path.c_str()); handle = qnn::dl_load(lib_path); } return handle; } -} // namespace +} // namespace namespace qnn { -qnn_system_interface::qnn_system_interface(const QnnSystemInterface_t &qnn_sys_interface, dl_handler_t lib_handle) - : _qnn_sys_interface(qnn_sys_interface), _lib_handle(lib_handle) { +qnn_system_interface::qnn_system_interface(const QnnSystemInterface_t & qnn_sys_interface, dl_handler_t lib_handle) : + _qnn_sys_interface(qnn_sys_interface), + _lib_handle(lib_handle) { qnn_system_context_create(&_qnn_system_handle); if (_qnn_system_handle) { - QNN_LOG_INFO("initialize qnn system successfully"); + QNN_LOG_INFO("initialize qnn system successfully\n"); } else { - QNN_LOG_WARN("can not create QNN system contenxt"); + QNN_LOG_WARN("can not create QNN system contenxt\n"); } } qnn_system_interface::~qnn_system_interface() { if (_qnn_system_handle) { if (qnn_system_context_free(_qnn_system_handle) != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN system context"); + QNN_LOG_WARN("failed to free QNN system context\n"); } } else { - QNN_LOG_WARN("system handle is null"); + QNN_LOG_WARN("system handle is null\n"); } if (_lib_handle) { if (!dl_unload(_lib_handle)) { - QNN_LOG_WARN("failed to close QnnSystem library, error %s", dl_error()); + QNN_LOG_WARN("failed to close QnnSystem library, error %s\n", dl_error()); } } else { - QNN_LOG_WARN("system lib handle is null"); + QNN_LOG_WARN("system lib handle is null\n"); } } -qnn_instance::qnn_instance(const std::string &lib_path, const std::string &backend_lib_name) - : _additional_lib_load_path(lib_path), _backend_lib_name(std::move(backend_lib_name)) { +qnn_instance::qnn_instance(const std::string & lib_path, const std::string & backend_lib_name) : + _additional_lib_load_path(lib_path), + _backend_lib_name(std::move(backend_lib_name)) { if (set_qnn_lib_search_path(lib_path)) { - QNN_LOG_DEBUG("[%s] set_qnn_lib_search_path succeed", _backend_lib_name.c_str()); + QNN_LOG_DEBUG("[%s] set_qnn_lib_search_path succeed\n", _backend_lib_name.c_str()); } else { - QNN_LOG_ERROR("[%s] 
set_qnn_lib_search_path failed", _backend_lib_name.c_str()); + QNN_LOG_ERROR("[%s] set_qnn_lib_search_path failed\n", _backend_lib_name.c_str()); } } -int qnn_instance::qnn_init(const QnnSaver_Config_t **saver_config) { +int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { BackendIdType backend_id = QNN_BACKEND_ID_NULL; - QNN_LOG_DEBUG("enter qnn_init"); + QNN_LOG_DEBUG("enter qnn_init\n"); std::lock_guard lock(_init_mutex); if (load_system() != 0) { - QNN_LOG_WARN("failed to load QNN system lib"); + QNN_LOG_WARN("failed to load QNN system lib\n"); return 1; } else { - QNN_LOG_DEBUG("load QNN system lib successfully"); + QNN_LOG_DEBUG("load QNN system lib successfully\n"); } std::string backend_lib_path = _backend_lib_name; if (_lib_path_to_backend_id.count(backend_lib_path) == 0) { if (load_backend(backend_lib_path, saver_config) != 0) { - QNN_LOG_WARN("failed to load QNN backend"); + QNN_LOG_WARN("failed to load QNN backend\n"); return 2; } } @@ -149,119 +151,119 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t **saver_config) { _qnn_interface->qnn_log_create(qnn::sdk_logcallback, _qnn_log_level, &_qnn_log_handle); if (!_qnn_log_handle) { // NPU backend not work on Qualcomm SoC equipped low-end phone - QNN_LOG_WARN("why failed to initialize qnn log"); + QNN_LOG_WARN("why failed to initialize qnn log\n"); return 4; } else { - QNN_LOG_DEBUG("initialize qnn log successfully"); + QNN_LOG_DEBUG("initialize qnn log successfully\n"); } std::vector temp_backend_config; _qnn_interface->qnn_backend_create( _qnn_log_handle, temp_backend_config.empty() ? nullptr : temp_backend_config.data(), &_qnn_backend_handle); if (!_qnn_backend_handle) { - QNN_LOG_WARN("why failed to initialize qnn backend"); + QNN_LOG_WARN("why failed to initialize qnn backend\n"); return 5; } else { - QNN_LOG_DEBUG("initialize qnn backend successfully"); + QNN_LOG_DEBUG("initialize qnn backend successfully\n"); } auto qnn_status = _qnn_interface->qnn_property_has_capability(QNN_PROPERTY_GROUP_DEVICE); if (QNN_PROPERTY_NOT_SUPPORTED == qnn_status) { - QNN_LOG_WARN("device property is not supported"); + QNN_LOG_WARN("device property is not supported\n"); } if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnn_status) { - QNN_LOG_WARN("device property is not known to backend"); + QNN_LOG_WARN("device property is not known to backend\n"); } qnn_status = QNN_SUCCESS; if (_backend_lib_name.find("Htp") != _backend_lib_name.npos) { - const QnnDevice_PlatformInfo_t *p_info = nullptr; - qnn_status = _qnn_interface->qnn_device_get_platform_info(nullptr, &p_info); + const QnnDevice_PlatformInfo_t * p_info = nullptr; + qnn_status = _qnn_interface->qnn_device_get_platform_info(nullptr, &p_info); if (qnn_status == QNN_SUCCESS) { - QNN_LOG_INFO("device counts %d", p_info->v1.numHwDevices); - QnnDevice_HardwareDeviceInfo_t *infos = p_info->v1.hwDevices; + QNN_LOG_INFO("device counts %d\n", p_info->v1.numHwDevices); + QnnDevice_HardwareDeviceInfo_t * infos = p_info->v1.hwDevices; QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = {}; for (uint32_t i = 0; i < p_info->v1.numHwDevices; i++) { - QNN_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d", infos[i].v1.deviceId, infos[i].v1.deviceType, - infos[i].v1.numCores); + QNN_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d\n", (int) infos[i].v1.deviceId, + (int) infos[i].v1.deviceType, (int) infos[i].v1.numCores); QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; - chipinfo = devinfo->onChipDevice; - size_t htp_arch = (size_t)chipinfo.arch; - 
QNN_LOG_INFO("htp_type:%d(%s)", devinfo->devType, + chipinfo = devinfo->onChipDevice; + size_t htp_arch = (size_t) chipinfo.arch; + QNN_LOG_INFO("htp_type:%d(%s)\n", devinfo->devType, (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? "ON_CHIP" : ""); - QNN_LOG_INFO("qualcomm soc_model:%d(%s), htp_arch:%d(%s), vtcm_size:%d MB", chipinfo.socModel, - qnn::get_chipset_desc(chipinfo.socModel), htp_arch, qnn::get_htparch_desc(htp_arch), - chipinfo.vtcmSize); - _soc_info = {chipinfo.socModel, htp_arch, chipinfo.vtcmSize}; + QNN_LOG_INFO("qualcomm soc_model:%d(%s), htp_arch:%d(%s), vtcm_size:%d MB\n", (int) chipinfo.socModel, + qnn::get_chipset_desc(chipinfo.socModel), (int) htp_arch, qnn::get_htparch_desc(htp_arch), + (int) chipinfo.vtcmSize); + _soc_info = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize }; } _qnn_interface->qnn_device_free_platform_info(nullptr, p_info); } else { // For emulator, we can't get platform info - QNN_LOG_WARN("failed to get platform info, are we in emulator?"); - _soc_info = {NONE, UNKNOWN_SM, 0}; + QNN_LOG_WARN("failed to get platform info, are we in emulator?\n"); + _soc_info = { NONE, UNKNOWN_SM, 0 }; } QnnHtpDevice_CustomConfig_t soc_customconfig; - soc_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_SOC; + soc_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_SOC; soc_customconfig.socModel = _soc_info.soc_model; QnnDevice_Config_t soc_devconfig; - soc_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; + soc_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; soc_devconfig.customConfig = &soc_customconfig; QnnHtpDevice_CustomConfig_t arch_customconfig; - arch_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_ARCH; - arch_customconfig.arch.arch = (QnnHtpDevice_Arch_t)_soc_info.htp_arch; - arch_customconfig.arch.deviceId = 0; // Id of device to be used. 0 will use by default. + arch_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_ARCH; + arch_customconfig.arch.arch = (QnnHtpDevice_Arch_t) _soc_info.htp_arch; + arch_customconfig.arch.deviceId = 0; // Id of device to be used. 0 will use by default. QnnDevice_Config_t arch_devconfig; - arch_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; + arch_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; arch_devconfig.customConfig = &arch_customconfig; - const QnnDevice_Config_t *p_deviceconfig[] = {&soc_devconfig, &arch_devconfig, nullptr}; + const QnnDevice_Config_t * p_deviceconfig[] = { &soc_devconfig, &arch_devconfig, nullptr }; qnn_status = _qnn_interface->qnn_device_create(_qnn_log_handle, p_deviceconfig, &_qnn_device_handle); } else { qnn_status = _qnn_interface->qnn_device_create(_qnn_log_handle, nullptr, &_qnn_device_handle); } if (QNN_SUCCESS != qnn_status && QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnn_status) { - QNN_LOG_WARN("failed to create QNN device"); + QNN_LOG_WARN("failed to create QNN device\n"); } else { - QNN_LOG_INFO("create QNN device successfully"); + QNN_LOG_INFO("create QNN device successfully\n"); } if (_profile_level != sdk_profile_level::profile_off) { - QNN_LOG_INFO("profiling turned on; level = %d", _profile_level); + QNN_LOG_INFO("profiling turned on; level = %d\n", _profile_level); auto profile_level = _profile_level == sdk_profile_level::profile_detail ? 
QNN_PROFILE_LEVEL_DETAILED : QNN_PROFILE_LEVEL_BASIC; if (QNN_PROFILE_NO_ERROR != _qnn_interface->qnn_profile_create(_qnn_backend_handle, profile_level, &_qnn_profile_handle)) { - QNN_LOG_WARN("unable to create profile handle in the backend"); + QNN_LOG_WARN("unable to create profile handle in the backend\n"); return 6; } else { - QNN_LOG_DEBUG("initialize qnn profile successfully"); + QNN_LOG_DEBUG("initialize qnn profile successfully\n"); } } _rpc_lib_handle = load_lib_with_fallback(kQnnRpcLibName, _additional_lib_load_path); if (_rpc_lib_handle) { _pfn_rpc_mem_alloc = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_alloc")); - _pfn_rpc_mem_free = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_free")); + _pfn_rpc_mem_free = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_free")); _pfn_rpc_mem_to_fd = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_to_fd")); if (!_pfn_rpc_mem_alloc || !_pfn_rpc_mem_free || !_pfn_rpc_mem_to_fd) { - QNN_LOG_WARN("unable to access symbols in QNN RPC lib. error: %s", dl_error()); + QNN_LOG_WARN("unable to access symbols in QNN RPC lib. error: %s\n", dl_error()); dl_unload(_rpc_lib_handle); return 9; } - _pfn_rpc_mem_init = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_init")); + _pfn_rpc_mem_init = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_init")); _pfn_rpc_mem_deinit = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_deinit")); if (_pfn_rpc_mem_init) { _pfn_rpc_mem_init(); } _rpcmem_initialized = true; - QNN_LOG_DEBUG("load rpcmem lib successfully"); + QNN_LOG_DEBUG("load rpcmem lib successfully\n"); } else { - QNN_LOG_WARN("failed to load qualcomm rpc lib, skipping, error:%s", dl_error()); + QNN_LOG_WARN("failed to load qualcomm rpc lib, skipping, error:%s\n", dl_error()); } /* TODO: not used, keep it for further usage @@ -271,23 +273,23 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t **saver_config) { */ _qnn_interface->qnn_context_create(_qnn_backend_handle, _qnn_device_handle, nullptr, &_qnn_context_handle); if (nullptr == _qnn_context_handle) { - QNN_LOG_WARN("why failed to initialize qnn context"); + QNN_LOG_WARN("why failed to initialize qnn context\n"); return 10; } else { - QNN_LOG_DEBUG("initialize qnn context successfully"); + QNN_LOG_DEBUG("initialize qnn context successfully\n"); } if (_backend_lib_name.find("Htp") != _backend_lib_name.npos) { // TODO: faster approach to probe the accurate capacity of rpc ion memory - size_t candidate_size = 0; - uint8_t *rpc_buffer = nullptr; - const int size_in_mb = (1 << 20); - size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048}; - size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); + size_t candidate_size = 0; + uint8_t * rpc_buffer = nullptr; + const int size_in_mb = (1 << 20); + size_t probe_slots[] = { 1024, 1536, 2048 - 48, 2048 }; + size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); for (size_t idx = 0; idx < probe_counts; idx++) { rpc_buffer = static_cast(alloc_rpcmem(probe_slots[idx] * size_in_mb, sizeof(void *))); if (!rpc_buffer) { - QNN_LOG_DEBUG("alloc rpcmem %d (MB) failure, %s", probe_slots[idx], strerror(errno)); + QNN_LOG_DEBUG("alloc rpcmem %d (MB) failure, %s\n", (int) probe_slots[idx], strerror(errno)); break; } else { candidate_size = probe_slots[idx]; @@ -297,27 +299,27 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t **saver_config) { } _rpcmem_capacity = std::max(candidate_size, _rpcmem_capacity); - QNN_LOG_INFO("capacity of QNN rpc ion memory is about %d MB", _rpcmem_capacity); + QNN_LOG_INFO("capacity of QNN rpc ion memory is about %d MB\n", 
(int) _rpcmem_capacity); if (init_htp_perfinfra() != 0) { - QNN_LOG_WARN("initialize HTP performance failure"); + QNN_LOG_WARN("initialize HTP performance failure\n"); } if (set_rpc_polling() != 0) { - QNN_LOG_WARN("set RPC polling failure"); + QNN_LOG_WARN("set RPC polling failure\n"); } if (set_high_performance_mode() != 0) { - QNN_LOG_WARN("set HTP high performance mode failure"); + QNN_LOG_WARN("set HTP high performance mode failure\n"); } } - QNN_LOG_DEBUG("leave qnn_init"); + QNN_LOG_DEBUG("leave qnn_init\n"); return 0; } int qnn_instance::qnn_finalize() { - int ret_status = 0; - Qnn_ErrorHandle_t error = QNN_SUCCESS; + int ret_status = 0; + Qnn_ErrorHandle_t error = QNN_SUCCESS; if (_rpc_lib_handle) { if (_pfn_rpc_mem_deinit) { @@ -326,9 +328,9 @@ int qnn_instance::qnn_finalize() { } if (dl_unload(_rpc_lib_handle)) { - QNN_LOG_DEBUG("succeed to close rpcmem lib"); + QNN_LOG_DEBUG("succeed to close rpcmem lib\n"); } else { - QNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s", dl_error()); + QNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s\n", dl_error()); } } @@ -339,8 +341,8 @@ int qnn_instance::qnn_finalize() { if (_qnn_context_handle) { error = _qnn_interface->qnn_context_free(_qnn_context_handle, _qnn_profile_handle); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d", _qnn_interface->get_backend_id(), - QNN_GET_ERROR_CODE(error)); + QNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(), + (int) QNN_GET_ERROR_CODE(error)); } _qnn_context_handle = nullptr; } @@ -348,8 +350,8 @@ int qnn_instance::qnn_finalize() { if (_qnn_profile_handle) { error = _qnn_interface->qnn_profile_free(_qnn_profile_handle); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d", _qnn_interface->get_backend_id(), - QNN_GET_ERROR_CODE(error)); + QNN_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(), + (int) QNN_GET_ERROR_CODE(error)); } _qnn_profile_handle = nullptr; } @@ -357,8 +359,8 @@ int qnn_instance::qnn_finalize() { if (_qnn_device_handle) { error = _qnn_interface->qnn_device_free(_qnn_device_handle); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN device_handle: ID %u, error %d", _qnn_interface->get_backend_id(), - QNN_GET_ERROR_CODE(error)); + QNN_LOG_WARN("failed to free QNN device_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(), + (int) QNN_GET_ERROR_CODE(error)); } _qnn_device_handle = nullptr; } @@ -366,17 +368,17 @@ int qnn_instance::qnn_finalize() { if (_qnn_backend_handle) { error = _qnn_interface->qnn_backend_free(_qnn_backend_handle); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d", _qnn_interface->get_backend_id(), - QNN_GET_ERROR_CODE(error)); + QNN_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(), + (int) QNN_GET_ERROR_CODE(error)); } _qnn_backend_handle = nullptr; } - if (nullptr != _qnn_log_handle) { + if (_qnn_log_handle) { error = _qnn_interface->qnn_log_free(_qnn_log_handle); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN log_handle: ID %u, error %d", _qnn_interface->get_backend_id(), - QNN_GET_ERROR_CODE(error)); + QNN_LOG_WARN("failed to free QNN log_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(), + (int) QNN_GET_ERROR_CODE(error)); } _qnn_log_handle = nullptr; } @@ -389,60 +391,60 @@ int qnn_instance::qnn_finalize() { } int 
qnn_instance::load_system() { - QNN_LOG_DEBUG("[%s]lib: %s", _backend_lib_name.c_str(), kQnnSystemLibName); + QNN_LOG_DEBUG("[%s]lib: %s\n", _backend_lib_name.c_str(), kQnnSystemLibName); auto system_lib_handle = load_lib_with_fallback(kQnnSystemLibName, _additional_lib_load_path); if (!system_lib_handle) { - QNN_LOG_WARN("can not load QNN library %s, error: %s", kQnnSystemLibName, dl_error()); + QNN_LOG_WARN("can not load QNN library %s, error: %s\n", kQnnSystemLibName, dl_error()); return 1; } - auto *get_providers = + auto * get_providers = dl_sym_typed(system_lib_handle, "QnnSystemInterface_getProviders"); if (!get_providers) { - QNN_LOG_WARN("can not load QNN symbol QnnSystemInterface_getProviders: %s", dl_error()); + QNN_LOG_WARN("can not load QNN symbol QnnSystemInterface_getProviders: %s\n", dl_error()); return 2; } - uint32_t num_providers = 0; - const QnnSystemInterface_t **provider_list = nullptr; - Qnn_ErrorHandle_t error = get_providers(&provider_list, &num_providers); + uint32_t num_providers = 0; + const QnnSystemInterface_t ** provider_list = nullptr; + Qnn_ErrorHandle_t error = get_providers(&provider_list, &num_providers); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error)); + QNN_LOG_WARN("failed to get providers, error %d\n", (int) QNN_GET_ERROR_CODE(error)); return 3; } - QNN_LOG_DEBUG("num_providers: %d", num_providers); + QNN_LOG_DEBUG("num_providers: %d\n", num_providers); if (num_providers != _required_num_providers) { - QNN_LOG_WARN("providers is %d instead of required %d", num_providers, _required_num_providers); + QNN_LOG_WARN("providers is %d instead of required %d\n", (int) num_providers, (int) _required_num_providers); return 4; } if (!provider_list) { - QNN_LOG_WARN("can not get providers"); + QNN_LOG_WARN("can not get providers\n"); return 5; } QNN_SYSTEM_INTERFACE_VER_TYPE qnn_system_interface; - bool found_valid_system_interface = false; + bool found_valid_system_interface = false; for (size_t idx = 0; idx < num_providers; idx++) { if (QNN_SYSTEM_API_VERSION_MAJOR == provider_list[idx]->systemApiVersion.major && QNN_SYSTEM_API_VERSION_MINOR <= provider_list[idx]->systemApiVersion.minor) { found_valid_system_interface = true; - qnn_system_interface = provider_list[idx]->QNN_SYSTEM_INTERFACE_VER_NAME; + qnn_system_interface = provider_list[idx]->QNN_SYSTEM_INTERFACE_VER_NAME; break; } } if (!found_valid_system_interface) { - QNN_LOG_WARN("unable to find a valid qnn system interface"); + QNN_LOG_WARN("unable to find a valid qnn system interface\n"); return 6; } else { - QNN_LOG_DEBUG("find a valid qnn system interface"); + QNN_LOG_DEBUG("find a valid qnn system interface\n"); } auto qnn_sys_interface = std::make_shared(*provider_list[0], system_lib_handle); if (!qnn_sys_interface->is_valid()) { - QNN_LOG_WARN("failed to create QNN system interface"); + QNN_LOG_WARN("failed to create QNN system interface\n"); return 7; } @@ -450,79 +452,79 @@ int qnn_instance::load_system() { return 0; } -int qnn_instance::load_backend(std::string &lib_path, const QnnSaver_Config_t ** /*saver_config*/) { +int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t ** /*saver_config*/) { Qnn_ErrorHandle_t error = QNN_SUCCESS; - QNN_LOG_DEBUG("lib_path:%s", lib_path.c_str()); + QNN_LOG_DEBUG("lib_path:%s\n", lib_path.c_str()); auto lib_handle = load_lib_with_fallback(lib_path, _additional_lib_load_path); if (!lib_handle) { - QNN_LOG_WARN("can not open QNN library %s, with error: %s", lib_path.c_str(), 
dl_error()); + QNN_LOG_WARN("can not open QNN library %s, with error: %s\n", lib_path.c_str(), dl_error()); return 1; } auto get_providers = dl_sym_typed(lib_handle, "QnnInterface_getProviders"); if (!get_providers) { - QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", dl_error()); + QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s\n", dl_error()); return 2; } - std::uint32_t num_providers = 0; - const QnnInterface_t **provider_list = nullptr; - error = get_providers(&provider_list, &num_providers); + std::uint32_t num_providers = 0; + const QnnInterface_t ** provider_list = nullptr; + error = get_providers(&provider_list, &num_providers); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error)); + QNN_LOG_WARN("failed to get providers, error %d\n", (int) QNN_GET_ERROR_CODE(error)); return 3; } - QNN_LOG_DEBUG("num_providers=%d", num_providers); + QNN_LOG_DEBUG("num_providers=%d\n", num_providers); if (num_providers != _required_num_providers) { - QNN_LOG_WARN("providers is %d instead of required %d", num_providers, _required_num_providers); + QNN_LOG_WARN("providers is %d instead of required %d\n", num_providers, _required_num_providers); return 4; } if (!provider_list) { - QNN_LOG_WARN("failed to get qnn interface providers"); + QNN_LOG_WARN("failed to get qnn interface providers\n"); return 5; } - bool found_valid_interface = false; + bool found_valid_interface = false; QNN_INTERFACE_VER_TYPE qnn_interface; for (size_t idx = 0; idx < num_providers; idx++) { if (QNN_API_VERSION_MAJOR == provider_list[idx]->apiVersion.coreApiVersion.major && QNN_API_VERSION_MINOR <= provider_list[idx]->apiVersion.coreApiVersion.minor) { found_valid_interface = true; - qnn_interface = provider_list[idx]->QNN_INTERFACE_VER_NAME; + qnn_interface = provider_list[idx]->QNN_INTERFACE_VER_NAME; break; } } if (!found_valid_interface) { - QNN_LOG_WARN("unable to find a valid qnn interface"); + QNN_LOG_WARN("unable to find a valid qnn interface\n"); return 6; } else { - QNN_LOG_DEBUG("find a valid qnn interface"); + QNN_LOG_DEBUG("find a valid qnn interface\n"); } - BackendIdType backend_id = provider_list[0]->backendId; + BackendIdType backend_id = provider_list[0]->backendId; _lib_path_to_backend_id[lib_path] = backend_id; if (_loaded_backend.count(backend_id) > 0) { - QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists", lib_path.c_str(), backend_id); + QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists\n", lib_path.c_str(), backend_id); } _loaded_backend[backend_id] = provider_list[0]; if (_loaded_lib_handle.count(backend_id) > 0) { - QNN_LOG_WARN("closing %p", _loaded_lib_handle[backend_id]); + QNN_LOG_WARN("closing %p\n", _loaded_lib_handle[backend_id]); if (!dl_unload(_loaded_lib_handle[backend_id])) { - QNN_LOG_WARN("fail to close %p with error %s", _loaded_lib_handle[backend_id], dl_error()); + QNN_LOG_WARN("fail to close %p with error %s\n", _loaded_lib_handle[backend_id], dl_error()); } } _loaded_lib_handle[backend_id] = lib_handle; - _backend_id = backend_id; + _backend_id = backend_id; return 0; } int qnn_instance::unload_backend() { - for (auto &it : _loaded_lib_handle) { + for (auto & it : _loaded_lib_handle) { if (!dl_unload(it.second)) { - QNN_LOG_WARN("failed to close QNN backend %d, error %s", it.first, dl_error()); + QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, dl_error()); } } @@ -533,4 +535,4 @@ int qnn_instance::unload_backend() { return 0; } -} // 
namespace qnn +} // namespace qnn diff --git a/ggml/src/ggml-qnn/qnn-lib.hpp b/ggml/src/ggml-qnn/qnn-lib.hpp index 968df5bcf297d..bb6006acda19c 100644 --- a/ggml/src/ggml-qnn/qnn-lib.hpp +++ b/ggml/src/ggml-qnn/qnn-lib.hpp @@ -24,7 +24,7 @@ #include #include -#include "dl_loader.hpp" +#include "dl-loader.hpp" #include "qnn-types.hpp" #include "utils.hpp" @@ -42,16 +42,15 @@ namespace qnn { #pragma GCC diagnostic ignored "-Wpedantic" class qnn_system_interface { - #define DEFINE_SHIM_FUNCTION_SYS_INTERFACE(F, pointer_name) \ - template \ - inline auto qnn_##F(Args... args) const { \ + template inline auto qnn_##F(Args... args) const { \ return (_qnn_sys_interface.QNN_SYSTEM_INTERFACE_VER_NAME.pointer_name)(std::forward(args)...); \ } -public: - qnn_system_interface(const QnnSystemInterface_t &qnn_sys_interface, dl_handler_t lib_handle); + public: + qnn_system_interface(const QnnSystemInterface_t & qnn_sys_interface, dl_handler_t lib_handle); ~qnn_system_interface(); + bool is_valid() const { return _qnn_system_handle != nullptr; } // QnnSystem @@ -61,27 +60,25 @@ class qnn_system_interface { DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_free, systemContextFree); -private: + private: qnn_system_interface(const qnn_system_interface &) = delete; - void operator=(const qnn_system_interface &) = delete; - qnn_system_interface(qnn_system_interface &&) = delete; - void operator=(qnn_system_interface &&) = delete; + void operator=(const qnn_system_interface &) = delete; + qnn_system_interface(qnn_system_interface &&) = delete; + void operator=(qnn_system_interface &&) = delete; const QnnSystemInterface_t _qnn_sys_interface = {}; - dl_handler_t _lib_handle = nullptr; - QnnSystemContext_Handle_t _qnn_system_handle = nullptr; + dl_handler_t _lib_handle = nullptr; + QnnSystemContext_Handle_t _qnn_system_handle = nullptr; }; class qnn_interface { - #define DEFINE_SHIM_FUNCTION_INTERFACE(F, pointer_name) \ - template \ - inline auto qnn_##F(Args... args) const { \ + template inline auto qnn_##F(Args... 
args) const { \ return (_qnn_interface.QNN_INTERFACE_VER_NAME.pointer_name)(std::forward(args)...); \ } -public: - qnn_interface(const QnnInterface_t &qnn_interface) : _qnn_interface(qnn_interface) {} + public: + qnn_interface(const QnnInterface_t & qnn_interface) : _qnn_interface(qnn_interface) {} // QnnBackend DEFINE_SHIM_FUNCTION_INTERFACE(backend_create, backendCreate); @@ -161,11 +158,11 @@ class qnn_interface { uint32_t get_backend_id() const { return _qnn_interface.backendId; } -private: - qnn_interface(const qnn_interface &) = delete; + private: + qnn_interface(const qnn_interface &) = delete; void operator=(const qnn_interface &) = delete; - qnn_interface(qnn_interface &&) = delete; - void operator=(qnn_interface &&) = delete; + qnn_interface(qnn_interface &&) = delete; + void operator=(qnn_interface &&) = delete; const QnnInterface_t _qnn_interface = {}; }; @@ -173,17 +170,19 @@ class qnn_interface { #pragma GCC diagnostic pop class qnn_instance { -public: + public: using BackendIdType = decltype(QnnInterface_t{}.backendId); - explicit qnn_instance(const std::string &lib_path, const std::string &backend_lib_name); + explicit qnn_instance(const std::string & lib_path, const std::string & backend_lib_name); + ~qnn_instance() {} - int qnn_init(const QnnSaver_Config_t **saver_config); + + int qnn_init(const QnnSaver_Config_t ** saver_config); int qnn_finalize(); std::shared_ptr get_qnn_interface() { if (!_qnn_interface) { - QNN_LOG_WARN("pls check why _qnn_interface is not loaded"); + QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); } return _qnn_interface; } @@ -202,26 +201,26 @@ class qnn_instance { int init_htp_perfinfra() { QnnDevice_Infrastructure_t device_infra = nullptr; - auto error = _qnn_interface->qnn_device_get_infrastructure(&device_infra); + auto error = _qnn_interface->qnn_device_get_infrastructure(&device_infra); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to get qnn device infra"); + QNN_LOG_WARN("failed to get qnn device infra\n"); return 1; } else { - QNN_LOG_INFO("HTP backend perf_infrastructure creation ok"); + QNN_LOG_INFO("HTP backend perf_infrastructure creation ok\n"); } - QnnHtpDevice_Infrastructure_t *htp_infra = static_cast(device_infra); - QnnHtpDevice_PerfInfrastructure_t *htp_perfinfra = &htp_infra->perfInfra; - uint32_t power_configid = 1; - uint32_t device_id = 0; - uint32_t core_id = 0; + QnnHtpDevice_Infrastructure_t * htp_infra = static_cast(device_infra); + QnnHtpDevice_PerfInfrastructure_t * htp_perfinfra = &htp_infra->perfInfra; + uint32_t power_configid = 1; + uint32_t device_id = 0; + uint32_t core_id = 0; htp_perfinfra->createPowerConfigId(device_id, core_id, &power_configid); if (htp_infra->infraType != QNN_HTP_DEVICE_INFRASTRUCTURE_TYPE_PERF) { - QNN_LOG_INFO("HTP infra type = %d, which is not perf infra type", htp_infra->infraType); + QNN_LOG_INFO("HTP infra type = %d, which is not perf infra type\n", htp_infra->infraType); } else { - QNN_LOG_INFO("HTP infra type = %d, which is perf infra type", htp_infra->infraType); + QNN_LOG_INFO("HTP infra type = %d, which is perf infra type\n", htp_infra->infraType); } - _qnn_htp_perfinfra = htp_perfinfra; + _qnn_htp_perfinfra = htp_perfinfra; _qnn_power_configid = power_configid; return 0; @@ -231,7 +230,7 @@ class qnn_instance { if (_qnn_htp_perfinfra) { QnnHtpPerfInfrastructure_PowerConfig_t rpc_polling_time; memset(&rpc_polling_time, 0, sizeof(rpc_polling_time)); - rpc_polling_time.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; + 
rpc_polling_time.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; // use rpc polling time recommended 0-10000 us rpc_polling_time.rpcPollingTimeConfig = 9999; @@ -241,16 +240,16 @@ class qnn_instance { // use rpc control latency recommended 100 us, refer hexagon sdk rpc_control_latency.rpcControlLatencyConfig = 100; - const QnnHtpPerfInfrastructure_PowerConfig_t *power_configs[] = {&rpc_polling_time, &rpc_control_latency, - nullptr}; + const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = { &rpc_polling_time, &rpc_control_latency, + nullptr }; Qnn_ErrorHandle_t qnn_status = _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs); if (qnn_status != QNN_SUCCESS) { - QNN_LOG_WARN("set htp perf failed"); + QNN_LOG_WARN("set htp perf failed\n"); } else { - QNN_LOG_DEBUG("set htp perf ok"); + QNN_LOG_DEBUG("set htp perf ok\n"); } } else { - QNN_LOG_WARN("can't set htp perf"); + QNN_LOG_WARN("can't set htp perf\n"); } return 0; @@ -258,7 +257,7 @@ class qnn_instance { int set_high_performance_mode() { if (nullptr == _qnn_htp_perfinfra) { - QNN_LOG_WARN("perf intra is null"); + QNN_LOG_WARN("perf intra is null\n"); return 1; } @@ -266,83 +265,83 @@ class qnn_instance { memset(&power_config, 0, sizeof(power_config)); power_config.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3; - power_config.dcvsV3Config.setDcvsEnable = 1; - power_config.dcvsV3Config.dcvsEnable = 0; - power_config.dcvsV3Config.contextId = _qnn_power_configid; - power_config.dcvsV3Config.powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE; - power_config.dcvsV3Config.setSleepLatency = 1; // true to consider Latency parameter otherwise false - power_config.dcvsV3Config.sleepLatency = 40; - power_config.dcvsV3Config.setBusParams = 1; // true to consider Bus parameter otherwise false - power_config.dcvsV3Config.setCoreParams = 1; // true to consider Core parameter otherwise false - power_config.dcvsV3Config.sleepDisable = 1; // true to consider sleep/LPM modes, false to enable + power_config.dcvsV3Config.setDcvsEnable = 1; + power_config.dcvsV3Config.dcvsEnable = 0; + power_config.dcvsV3Config.contextId = _qnn_power_configid; + power_config.dcvsV3Config.powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE; + power_config.dcvsV3Config.setSleepLatency = 1; // true to consider Latency parameter otherwise false + power_config.dcvsV3Config.sleepLatency = 40; + power_config.dcvsV3Config.setBusParams = 1; // true to consider Bus parameter otherwise false + power_config.dcvsV3Config.setCoreParams = 1; // true to consider Core parameter otherwise false + power_config.dcvsV3Config.sleepDisable = 1; // true to consider sleep/LPM modes, false to enable power_config.dcvsV3Config.setSleepDisable = - 1; // true to consider sleep disable/enable parameter otherwise false set sleep latency parameter + 1; // true to consider sleep disable/enable parameter otherwise false set sleep latency parameter // set Bus Clock Parameters - power_config.dcvsV3Config.busVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - power_config.dcvsV3Config.busVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - power_config.dcvsV3Config.busVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + power_config.dcvsV3Config.busVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + power_config.dcvsV3Config.busVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + power_config.dcvsV3Config.busVoltageCornerMax = 
DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; // set Core Clock Parameters - power_config.dcvsV3Config.coreVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + power_config.dcvsV3Config.coreVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; power_config.dcvsV3Config.coreVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - power_config.dcvsV3Config.coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + power_config.dcvsV3Config.coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; // set power config with different performance parameters - const QnnHtpPerfInfrastructure_PowerConfig_t *power_configs[] = {&power_config, nullptr}; - Qnn_ErrorHandle_t qnn_status = QNN_SUCCESS; + const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = { &power_config, nullptr }; + Qnn_ErrorHandle_t qnn_status = QNN_SUCCESS; qnn_status = _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs); if (qnn_status != QNN_SUCCESS) { - QNN_LOG_WARN("set htp high performance mode failed"); + QNN_LOG_WARN("set htp high performance mode failed\n"); } else { - QNN_LOG_DEBUG("set htp high performance mode ok"); + QNN_LOG_DEBUG("set htp high performance mode ok\n"); } return 0; } - std::string &get_qnn_graph_name() { return _graph_name; } + std::string & get_qnn_graph_name() { return _graph_name; } bool is_rpcmem_initialized() { return _rpcmem_initialized; } size_t get_rpcmem_capacity() { return _rpcmem_capacity; } - void *alloc_rpcmem(size_t bytes, size_t alignment) { + void * alloc_rpcmem(size_t bytes, size_t alignment) { if (!_rpcmem_initialized) { - QNN_LOG_WARN("rpc memory not initialized"); + QNN_LOG_WARN("rpc memory not initialized\n"); return nullptr; } - auto allocate_bytes = static_cast(bytes + alignment); - void *buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, (int)allocate_bytes); + auto allocate_bytes = static_cast(bytes + alignment); + void * buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, (int) allocate_bytes); if (!buf) { - QNN_LOG_WARN("failed to allocate rpc memory, size: %d MB", (int)(allocate_bytes / (1 << 20))); + QNN_LOG_WARN("failed to allocate rpc memory, size: %d MB\n", (int) (allocate_bytes / (1 << 20))); return nullptr; } auto aligned_buf = reinterpret_cast(qnn::align_to(alignment, reinterpret_cast(buf))); - bool status = _rpcmem_store_map.insert(std::pair(aligned_buf, buf)).second; + bool status = _rpcmem_store_map.insert(std::pair(aligned_buf, buf)).second; if (!status) { - QNN_LOG_WARN("failed to allocate rpc memory"); + QNN_LOG_WARN("failed to allocate rpc memory\n"); _pfn_rpc_mem_free(buf); } return aligned_buf; } - void free_rpcmem(void *buf) { + void free_rpcmem(void * buf) { if (!_rpcmem_initialized) { - QNN_LOG_WARN("rpc memory not initialized"); + QNN_LOG_WARN("rpc memory not initialized\n"); } else if (_rpcmem_store_map.count(buf) == 0) { - QNN_LOG_WARN("no allocated tensor"); + QNN_LOG_WARN("no allocated tensor\n"); } else { _pfn_rpc_mem_free(_rpcmem_store_map[buf]); _rpcmem_store_map.erase(buf); } } - int32_t rpcmem_to_fd(void *buf) { + int32_t rpcmem_to_fd(void * buf) { int32_t mem_fd = -1; if (!is_rpcmem_initialized()) { - QNN_LOG_WARN("rpc memory not initialized"); + QNN_LOG_WARN("rpc memory not initialized\n"); } else { mem_fd = _pfn_rpc_mem_to_fd(buf); } @@ -350,74 +349,80 @@ class qnn_instance { return mem_fd; } - Qnn_MemHandle_t register_rpcmem(void *p_data, const uint32_t rank, uint32_t *dimensions, Qnn_DataType_t data_type) { + Qnn_MemHandle_t register_rpcmem(void * 
p_data, const uint32_t rank, uint32_t * dimensions, + Qnn_DataType_t data_type) { if (!p_data) { - QNN_LOG_WARN("invalid param"); + QNN_LOG_WARN("invalid param\n"); return nullptr; } if (!is_rpcmem_initialized()) { - QNN_LOG_WARN("rpc memory not initialized"); + QNN_LOG_WARN("rpc memory not initialized\n"); return nullptr; } if (is_rpcmem_registered(p_data)) { - QNN_LOG_WARN("rpc memory already registered"); + QNN_LOG_WARN("rpc memory already registered\n"); return _qnn_rpc_buffer_to_handles[p_data]; } auto mem_fd = rpcmem_to_fd(p_data); if (mem_fd == -1) { - QNN_LOG_WARN("failed to get file descriptor"); + QNN_LOG_WARN("failed to get file descriptor\n"); return nullptr; } - QNN_LOG_DEBUG("mem_fd %d", mem_fd); - Qnn_MemDescriptor_t descriptor = {{rank, dimensions, nullptr}, data_type, QNN_MEM_TYPE_ION, {{mem_fd}}}; + QNN_LOG_DEBUG("mem_fd %d\n", mem_fd); + Qnn_MemDescriptor_t descriptor = { + { rank, dimensions, nullptr }, + data_type, QNN_MEM_TYPE_ION, { { mem_fd } } + }; Qnn_MemHandle_t handle = nullptr; - auto error = _qnn_interface->qnn_mem_register(_qnn_context_handle, &descriptor, - /*numDescriptors=*/1, &handle); + auto error = _qnn_interface->qnn_mem_register(_qnn_context_handle, &descriptor, + /*numDescriptors=*/1, &handle); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to register shared memory, error %d, %s", QNN_GET_ERROR_CODE(error), strerror(error)); + QNN_LOG_WARN("failed to register shared memory, error %d, %s\n", (int) QNN_GET_ERROR_CODE(error), + strerror(error)); return nullptr; } - _qnn_rpc_buffer_to_handles.insert({p_data, handle}); - QNN_LOG_DEBUG("successfully register shared memory handler: %p", handle); + _qnn_rpc_buffer_to_handles.insert({ p_data, handle }); + QNN_LOG_DEBUG("successfully register shared memory handler: %p\n", handle); return handle; } void unregister_rpcmem(Qnn_MemHandle_t mem_handle) { auto error = _qnn_interface->qnn_mem_de_register(&mem_handle, 1); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to unregister shared memory, error %d", QNN_GET_ERROR_CODE(error)); + QNN_LOG_WARN("failed to unregister shared memory, error %d\n", (int) QNN_GET_ERROR_CODE(error)); } auto it = std::find_if(_qnn_rpc_buffer_to_handles.begin(), _qnn_rpc_buffer_to_handles.end(), - [mem_handle](const auto &kv) { return kv.second == mem_handle; }); + [mem_handle](const auto & kv) { return kv.second == mem_handle; }); if (it == _qnn_rpc_buffer_to_handles.end()) { - QNN_LOG_WARN("failed to find shared memory handler: %p", mem_handle); + QNN_LOG_WARN("failed to find shared memory handler: %p\n", mem_handle); return; } _qnn_rpc_buffer_to_handles.erase(it); } - bool is_rpcmem_allocated(void *buf) { return _rpcmem_store_map.count(buf) != 0; } - bool is_rpcmem_registered(void *buf) { return _qnn_rpc_buffer_to_handles.count(buf) != 0U; } + bool is_rpcmem_allocated(void * buf) { return _rpcmem_store_map.count(buf) != 0; } + + bool is_rpcmem_registered(void * buf) { return _qnn_rpc_buffer_to_handles.count(buf) != 0U; } - const qnn::qcom_socinfo &get_soc_info() { return _soc_info; } + const qnn::qcom_socinfo & get_soc_info() { return _soc_info; } -private: + private: int load_system(); - int load_backend(std::string &lib_path, const QnnSaver_Config_t ** /*saver_config*/); + int load_backend(std::string & lib_path, const QnnSaver_Config_t ** /*saver_config*/); int unload_backend(); -private: + private: static constexpr const int _required_num_providers = 1; - std::string _additional_lib_load_path; - std::string _backend_lib_name; + std::string _additional_lib_load_path; + 
std::string _backend_lib_name; BackendIdType _backend_id; QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; @@ -429,7 +434,7 @@ class qnn_instance { #endif std::shared_ptr _qnn_sys_interface; - std::shared_ptr _qnn_interface; + std::shared_ptr _qnn_interface; Qnn_GraphHandle_t _qnn_graph_handle = nullptr; @@ -443,29 +448,29 @@ class qnn_instance { Qnn_ContextHandle_t _qnn_context_handle = nullptr; - QnnHtpDevice_PerfInfrastructure_t *_qnn_htp_perfinfra = nullptr; - uint32_t _qnn_power_configid = 1; + QnnHtpDevice_PerfInfrastructure_t * _qnn_htp_perfinfra = nullptr; + uint32_t _qnn_power_configid = 1; std::unordered_map _qnn_rpc_buffer_to_handles; - std::mutex _init_mutex; - std::unordered_map _loaded_lib_handle; - std::unordered_map _lib_path_to_backend_id; + std::mutex _init_mutex; + std::unordered_map _loaded_lib_handle; + std::unordered_map _lib_path_to_backend_id; std::unordered_map _loaded_backend; - dl_handler_t _rpc_lib_handle = nullptr; - std::atomic_bool _rpcmem_initialized{false}; - qnn::pfn_rpc_mem_alloc _pfn_rpc_mem_alloc = nullptr; - qnn::pfn_rpc_mem_free _pfn_rpc_mem_free = nullptr; - qnn::pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd = nullptr; - qnn::pfn_rpc_mem_init _pfn_rpc_mem_init = nullptr; - qnn::pfn_rpc_mem_deinit _pfn_rpc_mem_deinit = nullptr; + dl_handler_t _rpc_lib_handle = nullptr; + std::atomic_bool _rpcmem_initialized{ false }; + qnn::pfn_rpc_mem_alloc _pfn_rpc_mem_alloc = nullptr; + qnn::pfn_rpc_mem_free _pfn_rpc_mem_free = nullptr; + qnn::pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd = nullptr; + qnn::pfn_rpc_mem_init _pfn_rpc_mem_init = nullptr; + qnn::pfn_rpc_mem_deinit _pfn_rpc_mem_deinit = nullptr; std::unordered_map _rpcmem_store_map; - size_t _rpcmem_capacity = 512; + size_t _rpcmem_capacity = 512; std::string _graph_name; qnn::qcom_socinfo _soc_info = {}; }; -} // namespace qnn +} // namespace qnn diff --git a/ggml/src/ggml-qnn/qnn-types.hpp b/ggml/src/ggml-qnn/qnn-types.hpp index ec30602843301..8284036bb7503 100644 --- a/ggml/src/ggml-qnn/qnn-types.hpp +++ b/ggml/src/ggml-qnn/qnn-types.hpp @@ -20,48 +20,48 @@ enum sdk_profile_level { profile_off = 0, profile_basic, profile_detail }; enum qcom_htp_arch { NONE = 0, - V68 = 68, - V69 = 69, - V73 = 73, - V75 = 75, - V79 = 79, // SD 8 Gen 4 (SM8750) + V68 = 68, + V69 = 69, + V73 = 73, + V75 = 75, + V79 = 79, // SD 8 Gen 4 (SM8750) }; enum qcom_chipset { UNKNOWN_SM = 0, - SM8450 = 36, // v69, SD 8 Gen 1 - SM8475 = 42, // v69, SD 8+ Gen 1 - SM8550 = 43, // v73, SD 8 Gen 2 - SSG2115P = 46, // v73 - SM8650 = 57, // v75, SD 8 Gen 3 - SA8295 = 39, // v68 - SM8750 = 69, // v79, SD 8 Gen 4 + SM8450 = 36, // v69, SD 8 Gen 1 + SM8475 = 42, // v69, SD 8+ Gen 1 + SM8550 = 43, // v73, SD 8 Gen 2 + SSG2115P = 46, // v73 + SM8650 = 57, // v75, SD 8 Gen 3 + SA8295 = 39, // v68 + SM8750 = 69, // v79, SD 8 Gen 4 }; struct qcom_socinfo { uint32_t soc_model; - size_t htp_arch; - size_t vtcm_size_in_mb; + size_t htp_arch; + size_t vtcm_size_in_mb; }; -using pfn_rpc_mem_init = void (*)(void); +using pfn_rpc_mem_init = void (*)(void); using pfn_rpc_mem_deinit = void (*)(void); -using pfn_rpc_mem_alloc = void *(*)(int, uint32_t, int); -using pfn_rpc_mem_free = void (*)(void *); -using pfn_rpc_mem_to_fd = int (*)(void *); +using pfn_rpc_mem_alloc = void * (*) (int, uint32_t, int); +using pfn_rpc_mem_free = void (*)(void *); +using pfn_rpc_mem_to_fd = int (*)(void *); -using pfn_qnnsaver_initialize = decltype(QnnSaver_initialize); -using pfn_qnninterface_getproviders = decltype(QnnInterface_getProviders); +using pfn_qnnsaver_initialize = 
decltype(QnnSaver_initialize); +using pfn_qnninterface_getproviders = decltype(QnnInterface_getProviders); using pfn_qnnsysteminterface_getproviders = decltype(QnnSystemInterface_getProviders); -} // namespace qnn +} // namespace qnn -#define RPCMEM_DEFAULT_FLAGS 1 +#define RPCMEM_DEFAULT_FLAGS 1 #define RPCMEM_HEAP_ID_SYSTEM 25 -#define DISABLE_COPY(class_name) \ - class_name(const class_name &) = delete; \ +#define DISABLE_COPY(class_name) \ + class_name(const class_name &) = delete; \ void operator=(const class_name &) = delete -#define DISABLE_MOVE(class_name) \ - class_name(class_name &&) = delete; \ +#define DISABLE_MOVE(class_name) \ + class_name(class_name &&) = delete; \ void operator=(class_name &&) = delete diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index 423c3ba7fa8c1..660223caf728a 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -9,9 +9,8 @@ #include #include -#include "ggml-qnn.h" - #include "buffer.hpp" +#include "ggml-qnn.h" #include "logger.hpp" #include "qnn-lib.hpp" #include "utils.hpp" @@ -21,14 +20,17 @@ namespace qnn { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS should be 4"); class ggml_qnn_tensor : public std::enable_shared_from_this { -public: + public: typedef enum _tensor_type { INPUT, OUTPUT, INTERMEDIATE, PARAMETER, BIDIRECTION } tensor_type_t; - explicit ggml_qnn_tensor(tensor_type_t tensor_type, const std::string &name, - const qnn_dimension_array_t &dimensions, Qnn_DataType_t data_type, int rank, + explicit ggml_qnn_tensor(tensor_type_t tensor_type, const std::string & name, + const qnn_dimension_array_t & dimensions, Qnn_DataType_t data_type, int rank, QNNBackend device, Qnn_GraphHandle_t graph_handle, - std::shared_ptr qnn_instance) - : _tensor_name(name), _device(device), _qnn_instance(qnn_instance), _graph_handle(graph_handle) { + std::shared_ptr qnn_instance) : + _tensor_name(name), + _device(device), + _qnn_instance(qnn_instance), + _graph_handle(graph_handle) { if (!_tensor_name.empty()) { QNN_TENSOR_SET_NAME(_qnn_tensor, _tensor_name.c_str()); } @@ -37,23 +39,24 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { QNN_TENSOR_SET_DIMENSIONS(_qnn_tensor, _dimensions.data()); QNN_TENSOR_SET_DATA_FORMAT(_qnn_tensor, QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER); update_params_from_ggml_tensor(tensor_type, data_type, rank); - QNN_LOG_DEBUG("[%s][%s]created, rank: %d, dims: [%d, %d, %d, %d], type: %s", get_backend_name(device), - _tensor_name.c_str(), rank, (int)_dimensions[0], (int)_dimensions[1], (int)_dimensions[2], - (int)_dimensions[3], qnn_datatype_to_string(data_type)); + QNN_LOG_DEBUG("[%s][%s]created, rank: %d, dims: [%d, %d, %d, %d], type: %s\n", get_backend_name(device), + _tensor_name.c_str(), rank, (int) _dimensions[0], (int) _dimensions[1], (int) _dimensions[2], + (int) _dimensions[3], qnn_datatype_to_string(data_type)); } - explicit ggml_qnn_tensor(tensor_type_t tensor_type, const std::string &name, - const ggml_dimension_array_t &dimensions, ggml_type data_type, int rank, QNNBackend device, - Qnn_GraphHandle_t graph_handle, std::shared_ptr qnn_instance) - : ggml_qnn_tensor(tensor_type, name, get_internal_dimension(dimensions, rank), - qnn_datatype_from_ggml_datatype(data_type), rank, device, graph_handle, qnn_instance) {} + explicit ggml_qnn_tensor(tensor_type_t tensor_type, const std::string & name, + const ggml_dimension_array_t & dimensions, ggml_type data_type, int rank, + QNNBackend device, Qnn_GraphHandle_t graph_handle, + std::shared_ptr qnn_instance) : + 
ggml_qnn_tensor(tensor_type, name, get_internal_dimension(dimensions, rank), + qnn_datatype_from_ggml_datatype(data_type), rank, device, graph_handle, qnn_instance) {} ~ggml_qnn_tensor() { _rpc_buffer.reset(); unbind(); } - bool set_data_buffer(const uint8_t *buffer, const size_t buffer_size) { + bool set_data_buffer(const uint8_t * buffer, const size_t buffer_size) { auto qnn_buffer = std::make_shared(buffer, buffer_size); if (bind_buffer_impl(qnn_buffer)) { return true; @@ -74,71 +77,72 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { bool alloc_qnn_tensor_id() { if (QNN_TENSOR_GET_ID(_qnn_tensor)) { - QNN_LOG_DEBUG("[%s]tensor already has a id: %d", _tensor_name.c_str(), QNN_TENSOR_GET_ID(_qnn_tensor)); + QNN_LOG_DEBUG("[%s]tensor already has a id: %d\n", _tensor_name.c_str(), QNN_TENSOR_GET_ID(_qnn_tensor)); return true; } - Qnn_Tensor_t qnn_tensor = _qnn_tensor; - auto qnn_interface = _qnn_instance->get_qnn_interface(); - auto error = qnn_interface->qnn_tensor_create_graph_tensor(_graph_handle, &qnn_tensor); + Qnn_Tensor_t qnn_tensor = _qnn_tensor; + auto qnn_interface = _qnn_instance->get_qnn_interface(); + auto error = qnn_interface->qnn_tensor_create_graph_tensor(_graph_handle, &qnn_tensor); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("[%s]allocate id failed , error: %d", _tensor_name.c_str(), error); + QNN_LOG_WARN("[%s]allocate id failed , error: %d\n", _tensor_name.c_str(), (int) error); return false; } QNN_TENSOR_SET_ID(_qnn_tensor, QNN_TENSOR_GET_ID(qnn_tensor)); - QNN_LOG_DEBUG("[%s][%s]allocated id: %d, rank: %d", get_backend_name(_device), _tensor_name.c_str(), + QNN_LOG_DEBUG("[%s][%s]allocated id: %d, rank: %d\n", get_backend_name(_device), _tensor_name.c_str(), QNN_TENSOR_GET_ID(qnn_tensor), QNN_TENSOR_GET_RANK(qnn_tensor)); return true; } - bool bind_ggml_tensor(ggml_tensor *tensor) { + bool bind_ggml_tensor(ggml_tensor * tensor) { if (!_can_unbind) { - QNN_LOG_DEBUG("[%s]already has buffer storage, skip bind", _tensor_name.c_str()); + QNN_LOG_DEBUG("[%s]already has buffer storage, skip bind\n", _tensor_name.c_str()); return true; } #ifndef NDEBUG if (tensor->view_src) { - auto *src = tensor->view_src; - QNN_LOG_DEBUG("[%s]tensor(%s_%dx%dx%dx%d) is a view, src: %s_%dx%dx%dx%d", get_backend_name(_device), - tensor->name, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], src->name, - src->ne[0], src->ne[1], src->ne[2], src->ne[3]); + auto * src = tensor->view_src; + QNN_LOG_DEBUG("[%s]tensor(%s_%dx%dx%dx%d) is a view, src: %s_%dx%dx%dx%d\n", get_backend_name(_device), + tensor->name, (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2], + (int) tensor->ne[3], src->name, (int) src->ne[0], (int) src->ne[1], (int) src->ne[2], + (int) src->ne[3]); } #endif auto buffer = std::make_shared(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); if (!bind_buffer_impl(buffer)) { - QNN_LOG_WARN("[%s]failed to bind ggml tensor(%s)", _tensor_name.c_str(), ggml_get_name(tensor)); + QNN_LOG_WARN("[%s]failed to bind ggml tensor(%s)\n", _tensor_name.c_str(), ggml_get_name(tensor)); return false; } - QNN_LOG_DEBUG("[%s][%s]bind to ggml tensor(%s)", get_backend_name(_device), _tensor_name.c_str(), + QNN_LOG_DEBUG("[%s][%s]bind to ggml tensor(%s)\n", get_backend_name(_device), _tensor_name.c_str(), ggml_get_name(tensor)); tensor->extra = this; - _ggml_tensor = tensor; + _ggml_tensor = tensor; return true; } bool unbind() { if (!_graph_handle) { - QNN_LOG_WARN("[%s]not bound to any graph", _tensor_name.c_str()); + QNN_LOG_WARN("[%s]not bound to any graph\n", 
_tensor_name.c_str()); return false; } if (!_buffer) { - QNN_LOG_DEBUG("[%s]unbind to ggml tensor", _tensor_name.c_str()); + QNN_LOG_DEBUG("[%s]unbind to ggml tensor\n", _tensor_name.c_str()); return true; } if (!read_from_qnn_tensor()) { - QNN_LOG_WARN("[%s]read from qnn tensor failed", _tensor_name.c_str()); + QNN_LOG_WARN("[%s]read from qnn tensor failed\n", _tensor_name.c_str()); return false; } if (!_can_unbind) { - QNN_LOG_DEBUG("[%s]already has buffer storage, stop unbind", _tensor_name.c_str()); + QNN_LOG_DEBUG("[%s]already has buffer storage, stop unbind\n", _tensor_name.c_str()); return true; } @@ -146,42 +150,46 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); Qnn_ClientBuffer_t client_buf = {}; QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); - QNN_LOG_DEBUG("[%s]clear client buffer", _tensor_name.c_str()); + QNN_LOG_DEBUG("[%s]clear client buffer\n", _tensor_name.c_str()); } - QNN_LOG_DEBUG("[%s][%s]unbind from buffer: %p, size: %d", get_backend_name(_device), _tensor_name.c_str(), - _buffer.get(), (int)_buffer->get_size()); + QNN_LOG_DEBUG("[%s][%s]unbind from buffer: %p, size: %d\n", get_backend_name(_device), _tensor_name.c_str(), + (void *) _buffer.get(), (int) _buffer->get_size()); _buffer.reset(); if (_ggml_tensor) { _ggml_tensor->extra = nullptr; - _ggml_tensor = nullptr; + _ggml_tensor = nullptr; } return true; } - const Qnn_Tensor_t &get_qnn_tensor() const { return _qnn_tensor; } + const Qnn_Tensor_t & get_qnn_tensor() const { return _qnn_tensor; } + Qnn_DataType_t get_data_type() const { return QNN_TENSOR_GET_DATA_TYPE(_qnn_tensor); } - const qnn_dimension_array_t &get_dimensions() const { return _dimensions; } + + const qnn_dimension_array_t & get_dimensions() const { return _dimensions; } + uint32_t get_rank() const { return QNN_TENSOR_GET_RANK(_qnn_tensor); } + uint32_t get_qnn_tensor_id() const { return QNN_TENSOR_GET_ID(_qnn_tensor); } -private: + private: bool bind_buffer_impl(qnn_buffer_ptr buffer) { if (_buffer) { if (_buffer != buffer) { - QNN_LOG_WARN("[%s]has been bound to another buffer %p", _tensor_name.c_str(), _buffer.get()); + QNN_LOG_WARN("[%s]has been bound to another buffer %p\n", _tensor_name.c_str(), (void *) _buffer.get()); return false; } - QNN_LOG_DEBUG("[%s]already bound to same ggml tensor %p", _tensor_name.c_str(), _buffer.get()); + QNN_LOG_DEBUG("[%s]already bound to same ggml tensor %p\n", _tensor_name.c_str(), (void *) _buffer.get()); return true; } if (QNN_TENSOR_GET_TYPE(_qnn_tensor) == QNN_TENSOR_TYPE_NATIVE) { - QNN_LOG_DEBUG("[%s]tensor type(%d) not READ/WRITE, skipping", _tensor_name.c_str(), - (int)QNN_TENSOR_TYPE_NATIVE); + QNN_LOG_DEBUG("[%s]tensor type(%d) not READ/WRITE, skipping\n", _tensor_name.c_str(), + (int) QNN_TENSOR_TYPE_NATIVE); return true; } @@ -191,7 +199,7 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { _qnn_instance, buffer->get_size(), QNN_TENSOR_GET_RANK(_qnn_tensor), QNN_TENSOR_GET_DIMENSIONS(_qnn_tensor), QNN_TENSOR_GET_DATA_TYPE(_qnn_tensor)); if (!rpc_buffer->is_valid()) { - QNN_LOG_WARN("[%s][%s]alloc rpc mem failed", get_backend_name(_device), _tensor_name.c_str()); + QNN_LOG_WARN("[%s][%s]alloc rpc mem failed\n", get_backend_name(_device), _tensor_name.c_str()); return false; } @@ -201,38 +209,38 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_MEMHANDLE); auto mem_handle = _rpc_buffer->get_mem_handle(); if (!mem_handle) { - 
QNN_LOG_WARN("[%s][%s]can't find rpcmem from qnn mem handle", get_backend_name(_device), + QNN_LOG_WARN("[%s][%s]can't find rpcmem from qnn mem handle\n", get_backend_name(_device), _tensor_name.c_str()); return false; } QNN_TENSOR_SET_MEM_HANDLE(_qnn_tensor, mem_handle); - QNN_LOG_DEBUG("[%s][%s]use mem handle %p", get_backend_name(_device), _tensor_name.c_str(), + QNN_LOG_DEBUG("[%s][%s]use mem handle %p\n", get_backend_name(_device), _tensor_name.c_str(), QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor)); } else { QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); - Qnn_ClientBuffer_t client_buf = {buffer->get_buffer(), (uint32_t)buffer->get_size()}; + Qnn_ClientBuffer_t client_buf = { buffer->get_buffer(), (uint32_t) buffer->get_size() }; QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); - QNN_LOG_DEBUG("[%s]use client buffer %p size %d", _tensor_name.c_str(), client_buf.data, - (int)client_buf.dataSize); + QNN_LOG_DEBUG("[%s]use client buffer %p size %d\n", _tensor_name.c_str(), client_buf.data, + (int) client_buf.dataSize); } _buffer = buffer; if (!write_to_qnn_tensor()) { - QNN_LOG_WARN("[%s]write to qnn tensor failed", _tensor_name.c_str()); + QNN_LOG_WARN("[%s]write to qnn tensor failed\n", _tensor_name.c_str()); return false; } - QNN_LOG_DEBUG("[%s][%s]bind to buffer: %p, size: %d", get_backend_name(_device), _tensor_name.c_str(), - buffer.get(), (int)buffer->get_size()); + QNN_LOG_DEBUG("[%s][%s]bind to buffer: %p, size: %d\n", get_backend_name(_device), _tensor_name.c_str(), + (void *) buffer.get(), (int) buffer->get_size()); return true; } bool write_to_qnn_tensor() { auto tensor_type = QNN_TENSOR_GET_TYPE(_qnn_tensor); if (tensor_type != QNN_TENSOR_TYPE_APP_WRITE && tensor_type != QNN_TENSOR_TYPE_APP_READWRITE) { - QNN_LOG_DEBUG("[%s]tensor type(%d) not WRITE", _tensor_name.c_str(), (int)tensor_type); + QNN_LOG_DEBUG("[%s]tensor type(%d) not WRITE\n", _tensor_name.c_str(), (int) tensor_type); return true; } @@ -241,14 +249,14 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { } // For CPU and GPU, the data is already in the tensor. - QNN_LOG_DEBUG("[%s][%s]write tensor to qnn", get_backend_name(_device), _tensor_name.c_str()); + QNN_LOG_DEBUG("[%s][%s]write tensor to qnn\n", get_backend_name(_device), _tensor_name.c_str()); return true; } bool read_from_qnn_tensor() { auto tensor_type = QNN_TENSOR_GET_TYPE(_qnn_tensor); if (tensor_type != QNN_TENSOR_TYPE_APP_READ && tensor_type != QNN_TENSOR_TYPE_APP_READWRITE) { - QNN_LOG_DEBUG("[%s]tensor type(%d) not READ", _tensor_name.c_str(), (int)tensor_type); + QNN_LOG_DEBUG("[%s]tensor type(%d) not READ\n", _tensor_name.c_str(), (int) tensor_type); return true; } @@ -257,7 +265,7 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { } // For CPU and GPU, the data is already in the tensor. 
- QNN_LOG_DEBUG("[%s][%s]read tensor from qnn", get_backend_name(_device), _tensor_name.c_str()); + QNN_LOG_DEBUG("[%s][%s]read tensor from qnn\n", get_backend_name(_device), _tensor_name.c_str()); return true; } @@ -265,7 +273,7 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { QNN_TENSOR_SET_DATA_TYPE(_qnn_tensor, data_type); // TODO: set the quantizeParams base on the tensor type - QNN_TENSOR_SET_RANK(_qnn_tensor, (uint32_t)rank); + QNN_TENSOR_SET_RANK(_qnn_tensor, (uint32_t) rank); QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); Qnn_ClientBuffer_t client_buf = {}; QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); @@ -290,7 +298,7 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { break; } QNN_TENSOR_SET_TYPE(_qnn_tensor, new_tensor_type); - QNN_LOG_DEBUG("[%s][%s]tensor changed to type %d", get_backend_name(_device), _tensor_name.c_str(), + QNN_LOG_DEBUG("[%s][%s]tensor changed to type %d\n", get_backend_name(_device), _tensor_name.c_str(), new_tensor_type); } @@ -299,31 +307,31 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { return false; } - std::string _tensor_name; - qnn_buffer_ptr _buffer; - bool _can_unbind = true; - QNNBackend _device; + std::string _tensor_name; + qnn_buffer_ptr _buffer; + bool _can_unbind = true; + QNNBackend _device; std::shared_ptr _qnn_instance; - Qnn_Tensor_t _qnn_tensor = qnn_tensor_init(kDefaultQnnTensorVersion); - qnn_dimension_array_t _dimensions = {}; - Qnn_GraphHandle_t _graph_handle = nullptr; - qnn_buffer_ptr _rpc_buffer; - ggml_tensor *_ggml_tensor = nullptr; + Qnn_Tensor_t _qnn_tensor = qnn_tensor_init(kDefaultQnnTensorVersion); + qnn_dimension_array_t _dimensions = {}; + Qnn_GraphHandle_t _graph_handle = nullptr; + qnn_buffer_ptr _rpc_buffer; + ggml_tensor * _ggml_tensor = nullptr; DISABLE_COPY(ggml_qnn_tensor); DISABLE_MOVE(ggml_qnn_tensor); }; -using qnn_tensor_ptr_t = std::shared_ptr; -using qnn_tensor_array_t = std::vector; +using qnn_tensor_ptr_t = std::shared_ptr; +using qnn_tensor_array_t = std::vector; using ggml_tensor_array_t = std::vector; -inline qnn_tensor_ptr_t get_qnn_tensor_ptr(ggml_tensor *ggml_tensor) { - return ggml_tensor->extra ? reinterpret_cast(ggml_tensor->extra)->shared_from_this() - : qnn_tensor_ptr_t(); +inline qnn_tensor_ptr_t get_qnn_tensor_ptr(ggml_tensor * ggml_tensor) { + return ggml_tensor->extra ? 
reinterpret_cast(ggml_tensor->extra)->shared_from_this() : + qnn_tensor_ptr_t(); } -inline int get_ggml_tensors_max_rank(const qnn::ggml_tensor_array_t &tensors) { +inline int get_ggml_tensors_max_rank(const qnn::ggml_tensor_array_t & tensors) { int max_rank = 0; for (auto tensor : tensors) { max_rank = std::max(max_rank, ggml_n_dims(tensor)); @@ -332,14 +340,14 @@ inline int get_ggml_tensors_max_rank(const qnn::ggml_tensor_array_t &tensors) { return max_rank; } -inline bool bind_tensors(const ggml_tensor_array_t &ggml_tensors, qnn_tensor_array_t &tensor_wrappers, - std::vector &qnn_tensors) { +inline bool bind_tensors(const ggml_tensor_array_t & ggml_tensors, qnn_tensor_array_t & tensor_wrappers, + std::vector & qnn_tensors) { GGML_ASSERT(tensor_wrappers.size() == ggml_tensors.size()); qnn_tensors.resize(ggml_tensors.size()); for (size_t i = 0; i < ggml_tensors.size(); i++) { - auto *ggml_tensor = ggml_tensors[i]; + auto * ggml_tensor = ggml_tensors[i]; if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor)) { - QNN_LOG_ERROR("bind tensor %s failed", ggml_get_name(ggml_tensor)); + QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); return false; } @@ -349,12 +357,12 @@ inline bool bind_tensors(const ggml_tensor_array_t &ggml_tensors, qnn_tensor_arr return true; } -inline bool bind_tensors(const ggml_tensor_array_t &ggml_tensors, qnn_tensor_array_t &tensor_wrappers) { +inline bool bind_tensors(const ggml_tensor_array_t & ggml_tensors, qnn_tensor_array_t & tensor_wrappers) { GGML_ASSERT(tensor_wrappers.size() == ggml_tensors.size()); for (size_t i = 0; i < ggml_tensors.size(); i++) { - auto *ggml_tensor = ggml_tensors[i]; + auto * ggml_tensor = ggml_tensors[i]; if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor)) { - QNN_LOG_ERROR("bind tensor %s failed", ggml_get_name(ggml_tensor)); + QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); return false; } } @@ -362,31 +370,31 @@ inline bool bind_tensors(const ggml_tensor_array_t &ggml_tensors, qnn_tensor_arr return true; } -inline void unbind_tensors(qnn_tensor_array_t &tensor_wrappers) { - for (auto &tensor : tensor_wrappers) { +inline void unbind_tensors(qnn_tensor_array_t & tensor_wrappers) { + for (auto & tensor : tensor_wrappers) { tensor->unbind(); } } struct tensor_create_common_params { - const char *name_prefix; - int tensor_rank; - bool is_input; - QNNBackend device; - Qnn_GraphHandle_t graph_handle; + const char * name_prefix; + int tensor_rank; + bool is_input; + QNNBackend device; + Qnn_GraphHandle_t graph_handle; std::shared_ptr qnn_instance; }; -inline void create_tensors_from_ggml_tensor(const tensor_create_common_params ¶ms, - const ggml_tensor_array_t &ggml_tensors, - qnn_tensor_array_t *tensor_wrappers, - std::vector *qnn_tensors) { +inline void create_tensors_from_ggml_tensor(const tensor_create_common_params & params, + const ggml_tensor_array_t & ggml_tensors, + qnn_tensor_array_t * tensor_wrappers, + std::vector * qnn_tensors) { if (qnn_tensors) { qnn_tensors->resize(ggml_tensors.size()); } if (!tensor_wrappers->empty()) { - QNN_LOG_DEBUG("tensor_wrappers is not empty, skip create tensors"); + QNN_LOG_DEBUG("tensor_wrappers is not empty, skip create tensors\n"); GGML_ASSERT(tensor_wrappers->size() == ggml_tensors.size()); return; } @@ -394,14 +402,14 @@ inline void create_tensors_from_ggml_tensor(const tensor_create_common_params &p tensor_wrappers->resize(ggml_tensors.size()); char buffer[GGML_MAX_NAME] = {}; - auto tensor_type = params.is_input ? 
ggml_qnn_tensor::INPUT : ggml_qnn_tensor::OUTPUT; + auto tensor_type = params.is_input ? ggml_qnn_tensor::INPUT : ggml_qnn_tensor::OUTPUT; for (size_t i = 0; i < ggml_tensors.size(); i++) { - snprintf(buffer, GGML_MAX_NAME, "%s%d", params.name_prefix, (int)i); - auto *ggml_tensor = ggml_tensors[i]; + snprintf(buffer, GGML_MAX_NAME, "%s%d", params.name_prefix, (int) i); + auto * ggml_tensor = ggml_tensors[i]; (*tensor_wrappers)[i] = std::make_shared(tensor_type, std::string(buffer), ggml_tensor->ne, ggml_tensor->type, params.tensor_rank, params.device, params.graph_handle, params.qnn_instance); } } -} // namespace qnn +} // namespace qnn diff --git a/ggml/src/ggml-qnn/utils.cpp b/ggml/src/ggml-qnn/utils.cpp index e9aa4d37374a6..f9178f90d556f 100644 --- a/ggml/src/ggml-qnn/utils.cpp +++ b/ggml/src/ggml-qnn/utils.cpp @@ -4,30 +4,28 @@ #include #include "ggml-qnn.h" - -#include "QnnGraph.h" #include "qnn-types.hpp" +#include "QnnGraph.h" #ifdef _WIN32 -#include +# include #else -#include -#include +# include +# include #endif namespace { -template -_Ty align_to_generic(size_t alignment, _Ty offset) { - return offset % alignment == 0 ? offset - : offset + (static_cast<_Ty>(alignment) - (offset % static_cast<_Ty>(alignment))); +template _Ty align_to_generic(size_t alignment, _Ty offset) { + return offset % alignment == 0 ? offset : + offset + (static_cast<_Ty>(alignment) - (offset % static_cast<_Ty>(alignment))); } -} // namespace +} // namespace namespace qnn { -qnn_dimension_array_t get_internal_dimension(const ggml_dimension_array_t &dims, uint32_t rank) { +qnn_dimension_array_t get_internal_dimension(const ggml_dimension_array_t & dims, uint32_t rank) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS should be 4"); GGML_ASSERT(rank <= GGML_MAX_DIMS && rank > 0); @@ -43,30 +41,29 @@ qnn_dimension_array_t get_internal_dimension(const ggml_dimension_array_t &dims, * The ggml tensor will have dimensions [3, 2], while the qnn tensor will have dimensions [2, 3]. 
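 * In other words, QNN dimension i is read from ggml ne[rank - 1 - i], with each value clamped to at least 1 (this is what the loop below does).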
*/ for (uint32_t i = 0; i < rank; i++) { - internal_dims[i] = std::max((uint32_t)dims[rank - 1 - i], 1); + internal_dims[i] = std::max((uint32_t) dims[rank - 1 - i], 1); } return internal_dims; } -qnn_dimension_array_t get_view_internal_dimension(const ggml_tensor *tensor, size_t &element_offset_out) { - +qnn_dimension_array_t get_view_internal_dimension(const ggml_tensor * tensor, size_t & element_offset_out) { element_offset_out = 0; - auto *parent_tensor = tensor; + auto * parent_tensor = tensor; while (parent_tensor->view_src) { element_offset_out += parent_tensor->view_offs; parent_tensor = parent_tensor->view_src; } - const auto rank = get_ggml_tensor_rank(tensor); + const auto rank = get_ggml_tensor_rank(tensor); const auto parent_rank = get_ggml_tensor_rank(parent_tensor); GGML_ASSERT(parent_tensor->type == tensor->type); GGML_ASSERT(parent_rank == rank); const auto block_size = ggml_blck_size(tensor->type); element_offset_out = - element_offset_out * block_size / tensor->nb[0]; // calculate the element offset in the view tensor + element_offset_out * block_size / tensor->nb[0]; // calculate the element offset in the view tensor return get_internal_dimension(parent_tensor->ne, parent_rank); } @@ -141,7 +138,7 @@ size_t qnn_datatype_size(Qnn_DataType_t qnn_type) { return 0; } -const char *qnn_datatype_to_string(Qnn_DataType_t qnn_type) { +const char * qnn_datatype_to_string(Qnn_DataType_t qnn_type) { switch (qnn_type) { case QNN_DATATYPE_FLOAT_32: return "QNN_DATATYPE_FLOAT_32"; @@ -166,7 +163,7 @@ const char *qnn_datatype_to_string(Qnn_DataType_t qnn_type) { return "QNN_DATATYPE_UNDEFINED"; } -uint32_t get_ggml_tensor_rank(const ggml_tensor *tensor) { +uint32_t get_ggml_tensor_rank(const ggml_tensor * tensor) { uint32_t rank = 0; for (int i = 0; i < GGML_MAX_DIMS; i++) { if ((0 != tensor->ne[i]) && (1 != tensor->ne[i])) { @@ -176,12 +173,12 @@ uint32_t get_ggml_tensor_rank(const ggml_tensor *tensor) { return rank; } -const char *get_ggml_type_name(ggml_type type) { - const auto *traits = ggml_get_type_traits(type); +const char * get_ggml_type_name(ggml_type type) { + const auto * traits = ggml_get_type_traits(type); return traits->type_name; } -const char *get_backend_name(QNNBackend device_index) { +const char * get_backend_name(QNNBackend device_index) { switch (device_index) { case QNN_BACKEND_CPU: return "qnn-cpu"; @@ -195,7 +192,7 @@ const char *get_backend_name(QNNBackend device_index) { } } -const char *get_chipset_desc(uint32_t chipset_id) { +const char * get_chipset_desc(uint32_t chipset_id) { switch (chipset_id) { case SM8450: return "SD 8 Gen 1 (SM8450)"; @@ -212,7 +209,7 @@ const char *get_chipset_desc(uint32_t chipset_id) { } } -const char *get_htparch_desc(size_t htp_arch) { +const char * get_htparch_desc(size_t htp_arch) { switch (htp_arch) { case V68: return "QCOM_HTP_V68"; @@ -229,12 +226,18 @@ const char *get_htparch_desc(size_t htp_arch) { } } -intptr_t align_to(size_t alignment, intptr_t offset) { return align_to_generic(alignment, offset); } +intptr_t align_to(size_t alignment, intptr_t offset) { + return align_to_generic(alignment, offset); +} -uint32_t get_ggml_tensor_data_size(const ggml_tensor *tensor) { return (uint32_t)ggml_nbytes(tensor); } +uint32_t get_ggml_tensor_data_size(const ggml_tensor * tensor) { + return (uint32_t) ggml_nbytes(tensor); +} #ifdef _WIN32 -static void *_align_alloc(size_t alignment, size_t size) { return _aligned_malloc(size, alignment); } +static void * _align_alloc(size_t alignment, size_t size) { + return _aligned_malloc(size, 
alignment); +} static size_t _get_page_size() { SYSTEM_INFO si; @@ -242,22 +245,31 @@ static size_t _get_page_size() { return si.dwPageSize; } -void align_free(void *ptr) { _aligned_free(ptr); } +void align_free(void * ptr) { + _aligned_free(ptr); +} #else -static void *_align_alloc(size_t alignment, size_t size) { return std::aligned_alloc(alignment, size); } +static void * _align_alloc(size_t alignment, size_t size) { + return std::aligned_alloc(alignment, size); +} -static size_t _get_page_size() { return sysconf(_SC_PAGESIZE); } +static size_t _get_page_size() { + return sysconf(_SC_PAGESIZE); +} -void align_free(void *ptr) { std::free(ptr); } +void align_free(void * ptr) { + std::free(ptr); +} #endif -void *page_align_alloc(size_t size) { - const size_t alignment = _get_page_size(); - size_t size_aligned = align_to_generic(alignment, size); - QNN_LOG_DEBUG("_align_alloc success, alignment: %ld, size: %ld, size_aligned: %ld", alignment, size, size_aligned); - void *data = _align_alloc(alignment, size_aligned); +void * page_align_alloc(size_t size) { + const size_t alignment = _get_page_size(); + size_t size_aligned = align_to_generic(alignment, size); + QNN_LOG_DEBUG("_align_alloc success, alignment: %ld, size: %ld, size_aligned: %ld\n", alignment, size, size_aligned); + void * data = _align_alloc(alignment, size_aligned); if (!data) { - QNN_LOG_WARN("_align_alloc failed, alignment: %ld, size: %ld, size_aligned: %ld", alignment, size, size_aligned); + QNN_LOG_WARN("_align_alloc failed, alignment: %ld, size: %ld, size_aligned: %ld\n", alignment, size, + size_aligned); return nullptr; } @@ -270,7 +282,7 @@ void *page_align_alloc(size_t size) { // // ================================================================================================= // TODO: only support GGML_OP_ADD/GGML_OP_MUL/GGML_OP_MUL_MAT -const char *opname_from_ggmlop(enum ggml_op ggmlop) { +const char * opname_from_ggmlop(enum ggml_op ggmlop) { switch (ggmlop) { case GGML_OP_ADD: return QNN_OP_ELEMENT_WISE_ADD; @@ -284,7 +296,7 @@ const char *opname_from_ggmlop(enum ggml_op ggmlop) { return nullptr; } -const char *get_qnn_error_string(Qnn_ErrorHandle_t error) { +const char * get_qnn_error_string(Qnn_ErrorHandle_t error) { // A complete list of error codes can be found at here: // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/api_error_codes.html thread_local static char error_code[128] = {}; @@ -377,7 +389,7 @@ const char *get_qnn_error_string(Qnn_ErrorHandle_t error) { size_t get_system_total_memory_in_bytes() { MEMORYSTATUSEX mem = {}; - mem.dwLength = sizeof(mem); + mem.dwLength = sizeof(mem); if (GlobalMemoryStatusEx(&mem)) { return mem.ullTotalPhys; } @@ -387,7 +399,7 @@ size_t get_system_total_memory_in_bytes() { size_t get_system_free_memory_in_bytes() { MEMORYSTATUSEX mem = {}; - mem.dwLength = sizeof(mem); + mem.dwLength = sizeof(mem); if (GlobalMemoryStatusEx(&mem)) { return mem.ullAvailPhys; } @@ -403,8 +415,8 @@ size_t get_system_total_memory_in_bytes() { return (info.totalram + info.totalswap) * info.mem_unit; } - auto pages = (size_t)sysconf(_SC_PHYS_PAGES); - auto page_size = (size_t)sysconf(_SC_PAGE_SIZE); + auto pages = (size_t) sysconf(_SC_PHYS_PAGES); + auto page_size = (size_t) sysconf(_SC_PAGE_SIZE); return pages * page_size; } @@ -414,11 +426,11 @@ size_t get_system_free_memory_in_bytes() { return (info.freeram + info.freeswap) * info.mem_unit; } - auto avail_pages = (size_t)sysconf(_SC_AVPHYS_PAGES); - auto page_size = (size_t)sysconf(_SC_PAGE_SIZE); + auto avail_pages = 
(size_t) sysconf(_SC_AVPHYS_PAGES); + auto page_size = (size_t) sysconf(_SC_PAGE_SIZE); return avail_pages * page_size; } #endif -} // namespace qnn +} // namespace qnn diff --git a/ggml/src/ggml-qnn/utils.hpp b/ggml/src/ggml-qnn/utils.hpp index cdff53e77314d..d6130a3df4b4e 100644 --- a/ggml/src/ggml-qnn/utils.hpp +++ b/ggml/src/ggml-qnn/utils.hpp @@ -5,38 +5,36 @@ #include #include -#include "ggml.h" - #include "ggml-qnn.h" - -#include "QnnTypes.h" +#include "ggml.h" #include "logger.hpp" +#include "QnnTypes.h" #define QNN_TENSOR_VER(x) ((x).v1) namespace qnn { using ggml_dimension_array_t = int64_t[GGML_MAX_DIMS]; -using ggml_stride_array_t = size_t[GGML_MAX_DIMS]; -using qnn_dimension_array_t = std::array; +using ggml_stride_array_t = size_t[GGML_MAX_DIMS]; +using qnn_dimension_array_t = std::array; -qnn_dimension_array_t get_internal_dimension(const ggml_dimension_array_t &dims, uint32_t rank); -qnn_dimension_array_t get_view_internal_dimension(const ggml_tensor *tensor, size_t &element_offser_out); +qnn_dimension_array_t get_internal_dimension(const ggml_dimension_array_t & dims, uint32_t rank); +qnn_dimension_array_t get_view_internal_dimension(const ggml_tensor * tensor, size_t & element_offser_out); -uint32_t get_ggml_tensor_rank(const ggml_tensor *tensor); -const char *get_ggml_type_name(ggml_type type); -const char *get_backend_name(QNNBackend device_index); -const char *get_chipset_desc(uint32_t chipset_id); -const char *get_htparch_desc(size_t htp_arch); -intptr_t align_to(size_t alignment, intptr_t offset); -uint32_t get_ggml_tensor_data_size(const ggml_tensor *tensor); +uint32_t get_ggml_tensor_rank(const ggml_tensor * tensor); +const char * get_ggml_type_name(ggml_type type); +const char * get_backend_name(QNNBackend device_index); +const char * get_chipset_desc(uint32_t chipset_id); +const char * get_htparch_desc(size_t htp_arch); +intptr_t align_to(size_t alignment, intptr_t offset); +uint32_t get_ggml_tensor_data_size(const ggml_tensor * tensor); -void *page_align_alloc(size_t size); -void align_free(void *ptr); +void * page_align_alloc(size_t size); +void align_free(void * ptr); -const char *opname_from_ggmlop(enum ggml_op ggmlop); +const char * opname_from_ggmlop(enum ggml_op ggmlop); -const char *get_qnn_error_string(Qnn_ErrorHandle_t error); +const char * get_qnn_error_string(Qnn_ErrorHandle_t error); constexpr const Qnn_TensorVersion_t kDefaultQnnTensorVersion = QNN_TENSOR_VERSION_1; @@ -51,7 +49,7 @@ inline Qnn_Tensor_t qnn_tensor_init(Qnn_TensorVersion_t version) { return tensor; } -inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t &tensor) { +inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t & tensor) { if (tensor.version == kDefaultQnnTensorVersion) { return QNN_TENSOR_VER(tensor).id; } @@ -59,156 +57,158 @@ inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t &tensor) { return 0u; } -inline const char *get_qnn_tensorname(const Qnn_Tensor_t &tensor) { +inline const char * get_qnn_tensorname(const Qnn_Tensor_t & tensor) { if (tensor.version == kDefaultQnnTensorVersion) { return QNN_TENSOR_VER(tensor).name; } return nullptr; } -inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t &tensor) { +inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t & tensor) { if (tensor.version == kDefaultQnnTensorVersion) { return QNN_TENSOR_VER(tensor).type; } return QNN_TENSOR_TYPE_UNDEFINED; } -inline Qnn_TensorDataFormat_t get_qnn_tensor_dataformat(const Qnn_Tensor_t &tensor) { +inline Qnn_TensorDataFormat_t get_qnn_tensor_dataformat(const Qnn_Tensor_t & 
tensor) { if (tensor.version == kDefaultQnnTensorVersion) { return QNN_TENSOR_VER(tensor).dataFormat; } return QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER; } -inline Qnn_DataType_t get_qnn_tensor_datatype(const Qnn_Tensor_t &tensor) { +inline Qnn_DataType_t get_qnn_tensor_datatype(const Qnn_Tensor_t & tensor) { if (tensor.version == kDefaultQnnTensorVersion) { return QNN_TENSOR_VER(tensor).dataType; } return QNN_DATATYPE_UNDEFINED; } -inline Qnn_QuantizeParams_t get_qnn_tensor_quantparams(const Qnn_Tensor_t &tensor) { +inline Qnn_QuantizeParams_t get_qnn_tensor_quantparams(const Qnn_Tensor_t & tensor) { if (tensor.version == kDefaultQnnTensorVersion) { return QNN_TENSOR_VER(tensor).quantizeParams; } return QNN_QUANTIZE_PARAMS_INIT; } -inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t &tensor) { +inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t & tensor) { if (tensor.version == kDefaultQnnTensorVersion) { return QNN_TENSOR_VER(tensor).rank; } return 0u; } -inline uint32_t *get_qnn_tensor_dimensions(const Qnn_Tensor_t &tensor) { +inline uint32_t * get_qnn_tensor_dimensions(const Qnn_Tensor_t & tensor) { if (tensor.version == kDefaultQnnTensorVersion) { return QNN_TENSOR_VER(tensor).dimensions; } return nullptr; } -inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t &tensor) { +inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t & tensor) { if (tensor.version == kDefaultQnnTensorVersion) { return QNN_TENSOR_VER(tensor).memType; } return QNN_TENSORMEMTYPE_UNDEFINED; } -inline Qnn_MemHandle_t get_qnn_tensor_memhandle(const Qnn_Tensor_t &tensor) { +inline Qnn_MemHandle_t get_qnn_tensor_memhandle(const Qnn_Tensor_t & tensor) { if (tensor.version == kDefaultQnnTensorVersion) { return QNN_TENSOR_VER(tensor).memHandle; } return nullptr; } -inline void set_qnn_tensor_id(Qnn_Tensor_t &tensor, uint32_t id) { +inline void set_qnn_tensor_id(Qnn_Tensor_t & tensor, uint32_t id) { if (tensor.version == kDefaultQnnTensorVersion) { QNN_TENSOR_VER(tensor).id = id; } } -inline void set_qnn_tensor_name(Qnn_Tensor_t &tensor, const char *name) { +inline void set_qnn_tensor_name(Qnn_Tensor_t & tensor, const char * name) { if (tensor.version == kDefaultQnnTensorVersion) { QNN_TENSOR_VER(tensor).name = name; } } -inline void set_qnn_tensor_type(Qnn_Tensor_t &tensor, Qnn_TensorType_t type) { +inline void set_qnn_tensor_type(Qnn_Tensor_t & tensor, Qnn_TensorType_t type) { if (tensor.version == kDefaultQnnTensorVersion) { QNN_TENSOR_VER(tensor).type = type; } } -inline void set_qnn_tensor_dataformat(Qnn_Tensor_t &tensor, Qnn_TensorDataFormat_t format) { +inline void set_qnn_tensor_dataformat(Qnn_Tensor_t & tensor, Qnn_TensorDataFormat_t format) { if (tensor.version == kDefaultQnnTensorVersion) { QNN_TENSOR_VER(tensor).dataFormat = format; } } -inline void set_qnn_tensor_datatype(Qnn_Tensor_t &tensor, Qnn_DataType_t dataType) { +inline void set_qnn_tensor_datatype(Qnn_Tensor_t & tensor, Qnn_DataType_t dataType) { if (tensor.version == kDefaultQnnTensorVersion) { QNN_TENSOR_VER(tensor).dataType = dataType; } } -inline void set_qnn_tensor_quantparams(Qnn_Tensor_t &tensor, Qnn_QuantizeParams_t params) { +inline void set_qnn_tensor_quantparams(Qnn_Tensor_t & tensor, Qnn_QuantizeParams_t params) { if (tensor.version == kDefaultQnnTensorVersion) { QNN_TENSOR_VER(tensor).quantizeParams = params; } } -inline void set_qnn_tensor_rank(Qnn_Tensor_t &tensor, uint32_t rank) { +inline void set_qnn_tensor_rank(Qnn_Tensor_t & tensor, uint32_t rank) { if (tensor.version == kDefaultQnnTensorVersion) { 
QNN_TENSOR_VER(tensor).rank = rank; } } -inline void set_qnn_tensor_dimensions(Qnn_Tensor_t &tensor, uint32_t *dims) { +inline void set_qnn_tensor_dimensions(Qnn_Tensor_t & tensor, uint32_t * dims) { if (tensor.version == kDefaultQnnTensorVersion) { QNN_TENSOR_VER(tensor).dimensions = dims; } } -inline void set_qnn_tensor_memtype(Qnn_Tensor_t &tensor, Qnn_TensorMemType_t mem_type) { +inline void set_qnn_tensor_memtype(Qnn_Tensor_t & tensor, Qnn_TensorMemType_t mem_type) { if (tensor.version == kDefaultQnnTensorVersion) { QNN_TENSOR_VER(tensor).memType = mem_type; } } -inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t &tensor, Qnn_ClientBuffer_t client_buf) { +inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t & tensor, Qnn_ClientBuffer_t client_buf) { if (tensor.version == kDefaultQnnTensorVersion) { QNN_TENSOR_VER(tensor).clientBuf = client_buf; } } -inline void set_qnn_tensor_memhandle(Qnn_Tensor_t &tensor, Qnn_MemHandle_t handle) { +inline void set_qnn_tensor_memhandle(Qnn_Tensor_t & tensor, Qnn_MemHandle_t handle) { if (tensor.version == kDefaultQnnTensorVersion) { QNN_TENSOR_VER(tensor).memHandle = handle; } } -inline void set_qnn_tensor_dyn_dimensions(Qnn_Tensor_t &tensor, uint8_t *isDynamicDimensions) { +inline void set_qnn_tensor_dyn_dimensions(Qnn_Tensor_t & tensor, uint8_t * isDynamicDimensions) { if (tensor.version == QNN_TENSOR_VERSION_2) { tensor.v2.isDynamicDimensions = isDynamicDimensions; } } Qnn_DataType_t qnn_datatype_from_ggml_datatype(ggml_type ggml_type); -ggml_type ggml_datatype_from_qnn_datatype(Qnn_DataType_t qnn_type); -size_t qnn_datatype_size(Qnn_DataType_t qnn_type); -const char *qnn_datatype_to_string(Qnn_DataType_t qnn_type); -size_t get_system_total_memory_in_bytes(); -size_t get_system_free_memory_in_bytes(); +ggml_type ggml_datatype_from_qnn_datatype(Qnn_DataType_t qnn_type); +size_t qnn_datatype_size(Qnn_DataType_t qnn_type); +const char * qnn_datatype_to_string(Qnn_DataType_t qnn_type); +size_t get_system_total_memory_in_bytes(); +size_t get_system_free_memory_in_bytes(); #if ENABLE_QNNBACKEND_PERF class qnn_perf { -public: - qnn_perf(const std::string &perf_name) : _perf_name(std::move(perf_name)) {}; + public: + qnn_perf(const std::string & perf_name) : _perf_name(std::move(perf_name)) {}; + ~qnn_perf() { info(); } - qnn_perf() = delete; - qnn_perf(const qnn_perf &) = delete; - qnn_perf &operator=(const qnn_perf &) = delete; + + qnn_perf() = delete; + qnn_perf(const qnn_perf &) = delete; + qnn_perf & operator=(const qnn_perf &) = delete; void start() { _begin_time = ggml_time_us(); } @@ -218,48 +218,51 @@ class qnn_perf { QNN_LOG_INFO("duration of %s : %lld microseconds\n", _perf_name.c_str(), _duration); } -private: - int64_t _begin_time = 0LL; - int64_t _end_time = 0LL; - int64_t _duration = 0LL; + private: + int64_t _begin_time = 0LL; + int64_t _end_time = 0LL; + int64_t _duration = 0LL; std::string _perf_name; }; #else class qnn_perf { -public: + public: qnn_perf(const std::string &) {} + ~qnn_perf() { info(); } - qnn_perf() = delete; - qnn_perf(const qnn_perf &) = delete; - qnn_perf &operator=(const qnn_perf &) = delete; + + qnn_perf() = delete; + qnn_perf(const qnn_perf &) = delete; + qnn_perf & operator=(const qnn_perf &) = delete; void start() {} + void info() {} }; #endif -} // namespace qnn +} // namespace qnn -#define QNN_TENSOR_GET_ID(tensor) qnn::get_qnn_tensorid(tensor) -#define QNN_TENSOR_GET_NAME(tensor) qnn::get_qnn_tensorname(tensor) -#define QNN_TENSOR_GET_TYPE(tensor) qnn::get_qnn_tensortype(tensor) -#define 
QNN_TENSOR_GET_DATA_FORMAT(tensor) qnn::get_qnn_tensor_dataformat(tensor) -#define QNN_TENSOR_GET_DATA_TYPE(tensor) qnn::get_qnn_tensor_datatype(tensor) +#define QNN_TENSOR_GET_ID(tensor) qnn::get_qnn_tensorid(tensor) +#define QNN_TENSOR_GET_NAME(tensor) qnn::get_qnn_tensorname(tensor) +#define QNN_TENSOR_GET_TYPE(tensor) qnn::get_qnn_tensortype(tensor) +#define QNN_TENSOR_GET_DATA_FORMAT(tensor) qnn::get_qnn_tensor_dataformat(tensor) +#define QNN_TENSOR_GET_DATA_TYPE(tensor) qnn::get_qnn_tensor_datatype(tensor) #define QNN_TENSOR_GET_QUANT_PARAMS(tensor) qnn::get_qnn_tensor_quantparams(tensor) -#define QNN_TENSOR_GET_RANK(tensor) qnn::get_qnn_tensor_rank(tensor) -#define QNN_TENSOR_GET_DIMENSIONS(tensor) qnn::get_qnn_tensor_dimensions(tensor) -#define QNN_TENSOR_GET_MEM_TYPE(tensor) qnn::get_qnn_tensor_memtype(tensor) -#define QNN_TENSOR_GET_MEM_HANDLE(tensor) qnn::get_qnn_tensor_memhandle(tensor) - -#define QNN_TENSOR_SET_ID(tensor, value) qnn::set_qnn_tensor_id(tensor, value) -#define QNN_TENSOR_SET_NAME(tensor, value) qnn::set_qnn_tensor_name(tensor, value) -#define QNN_TENSOR_SET_TYPE(tensor, value) qnn::set_qnn_tensor_type(tensor, value) -#define QNN_TENSOR_SET_DATA_FORMAT(tensor, value) qnn::set_qnn_tensor_dataformat(tensor, value) -#define QNN_TENSOR_SET_DATA_TYPE(tensor, value) qnn::set_qnn_tensor_datatype(tensor, value) -#define QNN_TENSOR_SET_QUANT_PARAMS(tensor, value) qnn::set_qnn_tensor_quantparams(tensor, value) -#define QNN_TENSOR_SET_RANK(tensor, value) qnn::set_qnn_tensor_rank(tensor, value) -#define QNN_TENSOR_SET_DIMENSIONS(tensor, value) qnn::set_qnn_tensor_dimensions(tensor, value) -#define QNN_TENSOR_SET_MEM_TYPE(tensor, value) qnn::set_qnn_tensor_memtype(tensor, value) -#define QNN_TENSOR_SET_CLIENT_BUF(tensor, value) qnn::set_qnn_tensor_clientbuf(tensor, value) -#define QNN_TENSOR_SET_MEM_HANDLE(tensor, value) qnn::set_qnn_tensor_memhandle(tensor, value) +#define QNN_TENSOR_GET_RANK(tensor) qnn::get_qnn_tensor_rank(tensor) +#define QNN_TENSOR_GET_DIMENSIONS(tensor) qnn::get_qnn_tensor_dimensions(tensor) +#define QNN_TENSOR_GET_MEM_TYPE(tensor) qnn::get_qnn_tensor_memtype(tensor) +#define QNN_TENSOR_GET_MEM_HANDLE(tensor) qnn::get_qnn_tensor_memhandle(tensor) + +#define QNN_TENSOR_SET_ID(tensor, value) qnn::set_qnn_tensor_id(tensor, value) +#define QNN_TENSOR_SET_NAME(tensor, value) qnn::set_qnn_tensor_name(tensor, value) +#define QNN_TENSOR_SET_TYPE(tensor, value) qnn::set_qnn_tensor_type(tensor, value) +#define QNN_TENSOR_SET_DATA_FORMAT(tensor, value) qnn::set_qnn_tensor_dataformat(tensor, value) +#define QNN_TENSOR_SET_DATA_TYPE(tensor, value) qnn::set_qnn_tensor_datatype(tensor, value) +#define QNN_TENSOR_SET_QUANT_PARAMS(tensor, value) qnn::set_qnn_tensor_quantparams(tensor, value) +#define QNN_TENSOR_SET_RANK(tensor, value) qnn::set_qnn_tensor_rank(tensor, value) +#define QNN_TENSOR_SET_DIMENSIONS(tensor, value) qnn::set_qnn_tensor_dimensions(tensor, value) +#define QNN_TENSOR_SET_MEM_TYPE(tensor, value) qnn::set_qnn_tensor_memtype(tensor, value) +#define QNN_TENSOR_SET_CLIENT_BUF(tensor, value) qnn::set_qnn_tensor_clientbuf(tensor, value) +#define QNN_TENSOR_SET_MEM_HANDLE(tensor, value) qnn::set_qnn_tensor_memhandle(tensor, value) #define QNN_TENSOR_SET_DYN_DIMENSIONS(tensor, value) qnn::set_qnn_tensor_dyn_dimensions(tensor, value) From f289752664beecc7a1a1fb214a9aa65d8e6410e6 Mon Sep 17 00:00:00 2001 From: nullname Date: Fri, 28 Feb 2025 19:18:16 +0800 Subject: [PATCH 143/166] [bugfix]make sure single node op will have the same type (#29) * debug * disable 
reshape * make sure single node op have same type * fix warning at the logger * Revert "disable reshape" This reverts commit 5aeca4ba9bec6db3f047f9da803df20f9f6612b3. --- ggml/src/ggml-qnn/backend-ops.cpp | 31 +++++++++++++++++++++++++++---- ggml/src/ggml-qnn/graph.cpp | 2 +- ggml/src/ggml-qnn/logger.cpp | 5 ++++- 3 files changed, 32 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 3a401dd037b97..95fe35b465417 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -369,6 +369,31 @@ bool ggnl_qnn_supports_op_tensor(ggml_backend_qnn_device_context * ctx, const gg return true; } +bool ggml_qnn_have_same_tensor_types(ggml_backend_qnn_device_context * ctx, const ggml_tensor * op) { + auto * src0 = op->src[0]; + auto * src1 = op->src[1]; + if (src1) { + if (src0->type != op->type || src1->type != op->type) { + QNN_LOG_DEBUG("[%s][%s]type src0(%s), src1(%s) and op(%s) are not equal\n", + qnn::get_backend_name(ctx->device), ggml_op_name(op->op), ggml_type_name(src0->type), + ggml_type_name(src1->type), ggml_type_name(op->type)); + return false; + } + } else { + if (src0->type != op->type) { + QNN_LOG_DEBUG("[%s][%s]type src0(%s) and op(%s) are not equal\n", qnn::get_backend_name(ctx->device), + ggml_op_name(op->op), ggml_type_name(src0->type), ggml_type_name(op->type)); + return false; + } + } + +#ifdef NDEBUG + GGML_UNUSED(ctx); +#endif + + return true; +} + bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context * ctx, const ggml_tensor * op) { constexpr const size_t kMaxNpuTensorSize = 8192L * 2048 + 8192 * 512 + 2048 * 512; constexpr const auto get_tensor_size = [](const ggml_tensor * tensor) -> size_t { @@ -393,10 +418,8 @@ bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context * ctx, const gg // fall through, from test here, the convert op is super slow on NPU: // https://github.com/usefulsensors/qc_npu_benchmark case QNN_BACKEND_GPU: - if (src0->type != src1->type || src0->type != op->type) { + if (ggml_qnn_have_same_tensor_types(ctx, op)) { // there's no convert op for GPU. 
- QNN_LOG_DEBUG("[qnn-gpu][MUL_MAT]type src0(%s), src1(%s) and op(%s) are not equal\n", - ggml_type_name(src0->type), ggml_type_name(src1->type), ggml_type_name(op->type)); return false; } break; @@ -472,7 +495,7 @@ bool device_supports_op(ggml_backend_qnn_device_context * ctx, const ggml_tensor break; default: - // default to supported + is_op_supported = ggml_qnn_have_same_tensor_types(ctx, op); break; } } diff --git a/ggml/src/ggml-qnn/graph.cpp b/ggml/src/ggml-qnn/graph.cpp index b3ab161e9f6ca..2a282771c2a2d 100644 --- a/ggml/src/ggml-qnn/graph.cpp +++ b/ggml/src/ggml-qnn/graph.cpp @@ -246,7 +246,7 @@ qnn_graph::qnn_graph(const std::string & graph_name, QNNBackend device, std::sha return; } - QNN_LOG_INFO("[%s][%s]create succeed\n", get_backend_name(device), graph_name.c_str()); + QNN_LOG_DEBUG("[%s][%s]create succeed\n", get_backend_name(device), graph_name.c_str()); _graph_handle = graph_handle; _qnn_interface = qnn_interface; } diff --git a/ggml/src/ggml-qnn/logger.cpp b/ggml/src/ggml-qnn/logger.cpp index 5418d03be45a4..0ffa12e7b1bb3 100644 --- a/ggml/src/ggml-qnn/logger.cpp +++ b/ggml/src/ggml-qnn/logger.cpp @@ -13,7 +13,7 @@ void qnn::sdk_logcallback(const char * fmt, QnnLog_Level_t level, uint64_t /*tim static std::mutex log_mutex; static char s_ggml_qnn_logbuf[4096]; - char log_level_desc = 'U'; + char log_level_desc; switch (level) { case QNN_LOG_LEVEL_ERROR: log_level_desc = 'E'; @@ -30,6 +30,9 @@ void qnn::sdk_logcallback(const char * fmt, QnnLog_Level_t level, uint64_t /*tim case QNN_LOG_LEVEL_VERBOSE: log_level_desc = 'V'; break; + default: + log_level_desc = 'U'; + break; } { From 8b652dd6eca7004dc27747b73b5a4bcdf3ebfa51 Mon Sep 17 00:00:00 2001 From: nullname Date: Fri, 28 Feb 2025 22:54:57 +0800 Subject: [PATCH 144/166] bug: fix benchmark debug warning (#31) * print build type * wip * print compiling flags * wip * wip --- ggml/src/ggml-qnn/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-qnn/CMakeLists.txt b/ggml/src/ggml-qnn/CMakeLists.txt index ccf51e1a55a07..520bbd1f46dbb 100644 --- a/ggml/src/ggml-qnn/CMakeLists.txt +++ b/ggml/src/ggml-qnn/CMakeLists.txt @@ -20,10 +20,10 @@ if(NOT DEFINED GGML_QNN_SDK_PATH) endif() endif() +message("CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}") +message("CMAKE_CXX_FLAGS_RELEASE: ${CMAKE_CXX_FLAGS_RELEASE}") message("QNN_SDK_PATH: ${GGML_QNN_SDK_PATH}") -set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3") - file(GLOB QNN_SOURCES "${CMAKE_CURRENT_LIST_DIR}/*.cpp") ggml_add_backend_library(ggml-qnn ${QNN_SOURCES} From 31847c8301b19e0ea4f451cb1bfd44aa1035bf98 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 5 Mar 2025 22:25:36 +0800 Subject: [PATCH 145/166] fix compiling error after merge --- ggml/src/ggml-qnn/ggml-qnn.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index 626ba2cce9520..fd18a1a623297 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -99,10 +99,10 @@ void * ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t buffer) { return ctx->get_buffer(); } -void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { +ggml_status ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { GGML_UNUSED(buffer); GGML_UNUSED(tensor); - // TODO: we should create the qnn tensor along with the ggml tensor + return GGML_STATUS_SUCCESS; } void 
ggml_backend_qnn_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, From a1ab67478fac9b71e2cf952cd9ac671d34ac1eb0 Mon Sep 17 00:00:00 2001 From: nullname Date: Sat, 22 Mar 2025 12:34:31 +0800 Subject: [PATCH 146/166] [feat] add more op (#35) * move op key generate function to kOpCaps * fix op desc print * try fix rms_norm * Revert "try fix rms_norm" This reverts commit 33b296098012909cb482fc29b52b28098dc971cd. * add quantization type support by converting them to float * enable quantization tensor for mulmat in gpu/npu * fix asan error * add log and assert * insert output convert operator after mulmat * add log * fix some error in running * disable permute again * add log * add error function * Revert "add error function" This reverts commit f92ff47798ac8053fb776c55efbb1a98469c7af1. * add log * more log * disable convert op in graph * wip * add f16 config for graph * set f16 precision for f16 graph * fix override data type * add comment * add config flag to enable quantize type * add log * more quantized type for cpu and gpu backend * enable all quant types for cpu and gpu backend * rename * wip * add log * remove unused functions * skip permute * remove get_qnn_op_input_param_count * fallback to generic_get_op_desc if no op_desc * revert 'skip permute' * Revert "revert 'skip permute'" This reverts commit 5761e31fd23c69c4cabf6fd9fac1a0d3e5a74968. * wip * add log * print qnn tensor type * add log * limit the max size of tensor * add log * fix tensor size limiter * small improve on tensor info printer * disable sqrt and div to pass test-backend-ops for 8 gen 2 * remove debug log in release build * add log * skip permute in src * wip * disable reshape * skip mul at decoder start * wip * add log * add qnn_scoped_timer * add perf tracker in graph * add cmake options GGML_QNN_ENABLE_PERFORMANCE_TRACKING * fix flag name * use milli-second * wip * fix comment string * add file for profiler * change qnn-cpu to GGML_BACKEND_DEVICE_TYPE_ACCEL, so that we can run tests on cpu * wip * profiler: refactoring * wip * add implement for print_profile_events * set-up profiler for graph * set profiler to graph execute * pretty print events * unified log print prefix * print event count * enable optrace * print duration at event end * wip * add more detailed soc information * wip * move device caps array into qnn-lib.cpp * remove lib_name in device_context * move get_graph_key_from_cgraph to graph.cpp * add override type for tensor key * use override_type instead of original data type for graph key * append op type to tensor name to fix error in qwen * remove todo * wip --- ggml/include/ggml-qnn.h | 5 +- ggml/src/ggml-qnn/CMakeLists.txt | 9 + ggml/src/ggml-qnn/backend-ops.cpp | 303 ++++++++------------- ggml/src/ggml-qnn/backend.hpp | 15 +- ggml/src/ggml-qnn/buffer.hpp | 12 +- ggml/src/ggml-qnn/convert.cpp | 155 +++++++++++ ggml/src/ggml-qnn/convert.hpp | 26 ++ ggml/src/ggml-qnn/ggml-qnn.cpp | 110 +++----- ggml/src/ggml-qnn/graph.cpp | 390 ++++++++++++++++++++------- ggml/src/ggml-qnn/graph.hpp | 50 +++- ggml/src/ggml-qnn/logger.hpp | 10 +- ggml/src/ggml-qnn/op-config-base.hpp | 4 +- ggml/src/ggml-qnn/op-config-caps.cpp | 132 +++++---- ggml/src/ggml-qnn/op-config-impl.cpp | 94 ++++--- ggml/src/ggml-qnn/op-config-impl.hpp | 56 ++-- ggml/src/ggml-qnn/op-config.hpp | 13 +- ggml/src/ggml-qnn/profiler.cpp | 170 ++++++++++++ ggml/src/ggml-qnn/profiler.hpp | 100 +++++++ ggml/src/ggml-qnn/qnn-lib.cpp | 99 ++++--- ggml/src/ggml-qnn/qnn-lib.hpp | 78 +++--- 
ggml/src/ggml-qnn/qnn-types.hpp | 14 +- ggml/src/ggml-qnn/tensor.hpp | 92 ++++--- ggml/src/ggml-qnn/utils.cpp | 103 +++++-- ggml/src/ggml-qnn/utils.hpp | 49 +--- 24 files changed, 1385 insertions(+), 704 deletions(-) create mode 100644 ggml/src/ggml-qnn/convert.cpp create mode 100644 ggml/src/ggml-qnn/convert.hpp create mode 100644 ggml/src/ggml-qnn/profiler.cpp create mode 100644 ggml/src/ggml-qnn/profiler.hpp diff --git a/ggml/include/ggml-qnn.h b/ggml/include/ggml-qnn.h index 2b25ce40d79e5..48194106cfad9 100644 --- a/ggml/include/ggml-qnn.h +++ b/ggml/include/ggml-qnn.h @@ -1,14 +1,13 @@ #pragma once -#include "ggml.h" - #include "ggml-backend.h" +#include "ggml.h" #ifdef __cplusplus extern "C" { #endif -#define GGML_QNN_NAME "QNN" +#define GGML_QNN_NAME "qnn" #define GGML_QNN_MAX_DEVICES QNN_BACKEND_COUNT enum QNNBackend { diff --git a/ggml/src/ggml-qnn/CMakeLists.txt b/ggml/src/ggml-qnn/CMakeLists.txt index 520bbd1f46dbb..b3591f903ddf9 100644 --- a/ggml/src/ggml-qnn/CMakeLists.txt +++ b/ggml/src/ggml-qnn/CMakeLists.txt @@ -42,4 +42,13 @@ target_compile_definitions(ggml-qnn PRIVATE GGML_QNN_DEFAULT_LIB_SEARCH_PATH="${ if(GGML_QNN_ENABLE_CPU_BACKEND) message("GGML_QNN_ENABLE_CPU_BACKEND is enabled") target_compile_definitions(ggml-qnn PRIVATE GGML_QNN_ENABLE_CPU_BACKEND) +else() + message("GGML_QNN_ENABLE_CPU_BACKEND is disabled") +endif() + +if(GGML_QNN_ENABLE_PERFORMANCE_TRACKING) + message("GGML_QNN_ENABLE_PERFORMANCE_TRACKING is enabled") + target_compile_definitions(ggml-qnn PRIVATE GGML_QNN_ENABLE_PERFORMANCE_TRACKING) +else() + message("GGML_QNN_ENABLE_PERFORMANCE_TRACKING is disabled") endif() diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 95fe35b465417..ecafe7096331f 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -12,156 +12,10 @@ namespace { -bool qnn_is_op_valid(ggml_backend_qnn_device_context * ctx, const ggml_tensor * dst) { - if (!ctx || !dst) { - QNN_LOG_WARN("invalid params\n"); - return false; - } - - auto instance = ctx->instance; - if (!instance) { - QNN_LOG_WARN("invalid instance\n"); - return false; - } - - const auto param_count = qnn::get_qnn_op_input_param_count(dst); - switch (param_count) { - case 1: - return dst->src[0]; - case 2: - return dst->src[0] && dst->src[1]; - default: - QNN_LOG_WARN("invalid op param count %d\n", (int) param_count); - break; - } - - return false; -} - -#ifndef NDEBUG -void print_ggml_tensor(const ggml_tensor * tensor) { - QNN_LOG_DEBUG("%s: type:%s ne: %ldx%ldx%ldx%ld, nb: %ldx%ldx%ldx%ld\n", tensor->name, ggml_type_name(tensor->type), - (long) tensor->ne[0], (long) tensor->ne[1], (long) tensor->ne[2], (long) tensor->ne[3], - (long) tensor->nb[0], (long) tensor->nb[1], (long) tensor->nb[2], (long) tensor->nb[3]); -} -#endif - -} // namespace - -namespace { - -typedef bool (*ggml_qnn_op_t)(ggml_backend_qnn_device_context * ctx, ggml_tensor * dst); - -void append_tensor_dimensions(const ggml_tensor * tensor, std::string & output) { - char buffer[256] = {}; - const auto * type_name = qnn::get_ggml_type_name(tensor->type); - int len = 0; - switch (ggml_n_dims(tensor)) { - case 1: - len = snprintf(buffer, sizeof(buffer), "%ld%s", (long) tensor->ne[0], type_name); - break; - case 2: - len = snprintf(buffer, sizeof(buffer), "%ldx%ld%s", (long) tensor->ne[0], (long) tensor->ne[1], type_name); - break; - case 3: - len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ld%s", (long) tensor->ne[0], (long) tensor->ne[1], - (long) tensor->ne[2], type_name); - break; - 
case 4: - default: - len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ldx%ld%s", (long) tensor->ne[0], (long) tensor->ne[1], - (long) tensor->ne[2], (long) tensor->ne[3], type_name); - break; - } - GGML_ASSERT(len > 0 && len < (int) sizeof(buffer)); - output.append(buffer, len); -} - -void get_graph_key_from_op(const ggml_tensor * op, std::string & output) { - GGML_ASSERT(op->op != GGML_OP_NONE); - output += ggml_op_desc(op); - output += qnn::get_ggml_type_name(op->type); - const auto param_count = qnn::get_qnn_op_input_param_count(op); - for (size_t i = 0; i < param_count; ++i) { - auto * input = op->src[i]; - if (!input) { - break; - } - - output += '_'; - append_tensor_dimensions(input, output); - } -} - -void get_op_key_with_src_op_desc(const ggml_tensor * op, std::string & output) { - output += ggml_op_desc(op); - output += '('; - if (op->src[0]) { - output += ggml_op_desc(op->src[0]); - } - for (size_t i = 1; i < GGML_MAX_DIMS && op->src[i]; ++i) { - output += ','; - output += ggml_op_desc(op->src[i]); - } - output += ')'; -} - -/** - * @brief Generates a unique key for a given computation graph (cgraph). - * - * This key is used to cache the graph, enabling efficient reuse of previously - * compiled graphs. The key is constructed by concatenating the descriptions - * of the operations and their associated tensor dimensions within the graph. - * - * Example key format: "MUL_MATf32_256x16x10f32_256x1x10f32#LOG#ADD#ADDf32_16x1x10f32" - * - * @param cgraph The computation graph for which the key is generated. - * @param output The string where the generated key will be stored. - * - * TODO: Improve the key generation logic to handle more complex graph structures and edge cases. - */ -void get_graph_key_from_cgraph(const ggml_cgraph * cgraph, std::string & output) { - if (cgraph->n_nodes == 0) { - QNN_LOG_DEBUG("empty cgraph\n"); - return; - } - - { - bool is_start = true; - for (int i = 0; i < cgraph->n_nodes; ++i) { - auto * op = cgraph->nodes[i]; - if (ggml_is_empty(op)) { - QNN_LOG_DEBUG("empty op in graph, skipping\n"); - continue; - } - - if (op->op == GGML_OP_NONE) { - QNN_LOG_DEBUG("GGML_OP_NONE in graph, skipping\n"); - continue; - } - - if (is_start) { - get_graph_key_from_op(cgraph->nodes[0], output); - is_start = false; - } else { - output += '#'; - get_op_key_with_src_op_desc(op, output); - } - } - } - - if (cgraph->n_nodes > 1) { - auto * last_op = cgraph->nodes[cgraph->n_nodes - 1]; - output += qnn::get_ggml_type_name(last_op->type); - output += '_'; - append_tensor_dimensions(last_op, output); - } -} - qnn::qnn_graph * get_qnn_graph_from_cache(ggml_backend_qnn_device_context * ctx, const ggml_cgraph * cgraph) { auto & graph_cache = ctx->qnn_graph_cache; std::string graph_key; - get_graph_key_from_cgraph(cgraph, graph_key); + auto op_data_type = qnn::qnn_graph::get_graph_key_from_cgraph(cgraph, graph_key); if (graph_key.empty()) { QNN_LOG_DEBUG("[%s]empty graph key for cgraph: %p, size: %d\n", qnn::get_backend_name(ctx->device), (const void *) cgraph, (int) cgraph->n_nodes); @@ -171,11 +25,20 @@ qnn::qnn_graph * get_qnn_graph_from_cache(ggml_backend_qnn_device_context * ctx, auto it = graph_cache.find(graph_key); qnn::qnn_graph * graph_ptr = nullptr; if (it != graph_cache.end()) { - QNN_LOG_DEBUG("[%s]found graph %s in cache\n", qnn::get_backend_name(ctx->device), graph_key.c_str()); + auto it = graph_cache.find(graph_key); + QNN_LOG_DEBUG("[%s]found graph %s in cache, cache size: %d\n", qnn::get_backend_name(ctx->device), + graph_key.c_str(), (int) graph_cache.size()); 
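As a standalone sketch (not part of the patch) of the lookup-or-create pattern this cache uses: graphs are keyed by a string built from the op descriptions and tensor shapes, e.g. the "MUL_MATf32_256x16x10f32_256x1x10f32#LOG#ADD#ADDf32_16x1x10f32" example quoted in the block comment deleted earlier in this diff. The fake_graph type below is a made-up stand-in for qnn::qnn_graph, used only to show the pattern.

#include <memory>
#include <string>
#include <unordered_map>

// Stand-in for qnn::qnn_graph, for illustration only.
struct fake_graph {
    explicit fake_graph(std::string key) : key(std::move(key)) {}
    std::string key;
};

using graph_cache_t = std::unordered_map<std::string, std::unique_ptr<fake_graph>>;

// Return the cached graph for this key, building and inserting one on a miss.
static fake_graph * lookup_or_create(graph_cache_t & cache, const std::string & graph_key) {
    auto it = cache.find(graph_key);
    if (it != cache.end()) {
        return it->second.get();    // reuse the previously finalized graph
    }
    auto graph       = std::make_unique<fake_graph>(graph_key);
    fake_graph * ptr = graph.get();
    cache[graph_key] = std::move(graph);
    return ptr;
}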
graph_ptr = it->second.get(); } else { - auto graph = - std::make_unique(graph_key, ctx->device, ctx->instance, ctx->socinfo.vtcm_size_in_mb); + auto precision = qnn::qnn_graph::kHtpDefault; + if (op_data_type == GGML_TYPE_F16) { + QNN_LOG_DEBUG("[%s][%s]set graph precision to FP16\n", qnn::get_backend_name(ctx->device), + graph_key.c_str()); + precision = qnn::qnn_graph::kHtpFp16; + } + + auto graph = std::make_unique(graph_key, ctx->device, ctx->instance, precision, + ctx->socinfo.vtcm_size_in_mb); if (!graph->is_valid()) { return nullptr; } @@ -187,6 +50,8 @@ qnn::qnn_graph * get_qnn_graph_from_cache(ggml_backend_qnn_device_context * ctx, graph_ptr = graph.get(); graph_cache[graph_key] = std::move(graph); + QNN_LOG_DEBUG("[%s]add graph %s to cache, cache size: %d\n", qnn::get_backend_name(ctx->device), + graph_key.c_str(), (int) graph_cache.size()); } return graph_ptr; @@ -201,9 +66,9 @@ constexpr const bool kQnnSupportedOps[] = { false, // GGML_OP_ACC true, // GGML_OP_SUB true, // GGML_OP_MUL - true, // GGML_OP_DIV + false, // GGML_OP_DIV, disabled for now cause failed on test-backend-ops false, // GGML_OP_SQR - true, // GGML_OP_SQRT + false, // GGML_OP_SQRT, disabled for now cause failed on test-backend-ops true, // GGML_OP_LOG false, // GGML_OP_SIN false, // GGML_OP_COS @@ -229,7 +94,7 @@ constexpr const bool kQnnSupportedOps[] = { false, // GGML_OP_SET false, // GGML_OP_CPY false, // GGML_OP_CONT - true, // GGML_OP_RESHAPE + false, // GGML_OP_RESHAPE false, // GGML_OP_VIEW false, // GGML_OP_PERMUTE false, // GGML_OP_TRANSPOSE @@ -306,14 +171,39 @@ constexpr const bool kQnnSupportedOps[] = { static_assert(kQnnSupportedOps[GGML_OP_NONE], "GGML_OP_NONE is not true"); static_assert(kQnnSupportedOps[GGML_OP_ADD], "GGML_OP_ADD is not true"); static_assert(kQnnSupportedOps[GGML_OP_MUL], "GGML_OP_MUL is not true"); -static_assert(kQnnSupportedOps[GGML_OP_MUL_MAT], - "GGML_OP_MUL_MAT is not true, please check the kQnnSupportedOps table in the backend-ops.cpp file"); -static_assert(kQnnSupportedOps[GGML_OP_RESHAPE], "GGML_OP_RESHAPE is not true"); +static_assert(kQnnSupportedOps[GGML_OP_MUL_MAT], "GGML_OP_MUL_MAT is not true"); +static_assert(!kQnnSupportedOps[GGML_OP_RESHAPE], "GGML_OP_RESHAPE should not be true"); static_assert(!kQnnSupportedOps[GGML_OP_VIEW], "GGML_OP_VIEW is not false"); static_assert(std::size(kQnnSupportedOps) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), "GGML_OP_COUNT does not match the size of the kQnnSupportedOps table"); -bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context * ctx, const ggml_tensor * tensor) { +inline bool is_type_bit_enabled(uint64_t bits, ggml_type type) { + return bits & (uint64_t(1) << type); +} + +inline bool is_tensor_size_valid(ggml_backend_qnn_device_context * ctx, const ggml_tensor * tensor) { + constexpr const auto get_tensor_size_in_bytes = [](const ggml_tensor * tensor, ggml_type type) -> size_t { + return tensor->ne[0] * tensor->ne[1] * tensor->ne[2] * tensor->ne[3] * ggml_type_size(type); + }; + + auto type = tensor->type; + if (ggml_is_quantized(type) && ctx->enable_cpu_dequantize) { + type = GGML_TYPE_F32; // TODO: [quantize] fix me if plan to dequantize to other types + } + + const auto tensor_size = get_tensor_size_in_bytes(tensor, type); + if (ctx->max_tensor_size_in_bytes && tensor_size >= ctx->max_tensor_size_in_bytes) { + QNN_LOG_DEBUG("[%s]tensor(%s_%dx%dx%dx%d) size(%lld) exceeds the limit(%lld)\n", + qnn::get_backend_name(ctx->device), ggml_get_name(tensor), (int) tensor->ne[0], + (int) tensor->ne[1], (int) 
tensor->ne[2], (int) tensor->ne[3], (long long int) tensor_size, + (long long int) ctx->max_tensor_size_in_bytes); + return false; + } + + return true; +} + +bool is_tensor_type_valid(ggml_backend_qnn_device_context * ctx, const ggml_tensor * tensor) { if (!tensor) { QNN_LOG_DEBUG("tensor is nullptr\n"); return false; @@ -332,9 +222,7 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context * ctx, const ggml_ switch (tensor->type) { case GGML_TYPE_F32: case GGML_TYPE_F16: - case GGML_TYPE_Q8_0: - case GGML_TYPE_Q4_0: - if (!(ctx->supported_types & (uint64_t(1) << tensor->type))) { + if (!is_type_bit_enabled(ctx->supported_types, tensor->type)) { QNN_LOG_DEBUG("[%s]unsupported data type %s, supported_types: 0x%x\n", qnn::get_backend_name(ctx->device), ggml_type_name(tensor->type), (unsigned int) ctx->supported_types); @@ -350,18 +238,29 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context * ctx, const ggml_ return true; } +bool is_data_reinterpretation_op(ggml_op op) { + return op == GGML_OP_VIEW || op == GGML_OP_PERMUTE; +} + bool ggnl_qnn_supports_op_tensor(ggml_backend_qnn_device_context * ctx, const ggml_tensor * op) { if (op->op == GGML_OP_NONE) { return true; } - if (!ggml_qnn_supports_tensor(ctx, op)) { + if (!is_tensor_type_valid(ctx, op) || !is_tensor_size_valid(ctx, op)) { return false; } - const auto param_count = qnn::get_qnn_op_input_param_count(op); - for (size_t i = 0; i < param_count; ++i) { - if (!ggml_qnn_supports_tensor(ctx, op->src[i])) { + // TODO: fix for other op + const bool cpu_dequant = ctx->enable_cpu_dequantize && op->op == GGML_OP_MUL_MAT; + for (size_t i = 0; i < GGML_MAX_SRC && op->src[i]; ++i) { + auto * src = op->src[i]; + if (!is_tensor_size_valid(ctx, src)) { + return false; + } + + // passthrough the quantized tensor for CPU dequantization + if (!is_tensor_type_valid(ctx, src) && (!cpu_dequant || !ggml_is_quantized(src->type))) { return false; } } @@ -394,14 +293,17 @@ bool ggml_qnn_have_same_tensor_types(ggml_backend_qnn_device_context * ctx, cons return true; } +// TODO: move to caps array? bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context * ctx, const ggml_tensor * op) { - constexpr const size_t kMaxNpuTensorSize = 8192L * 2048 + 8192 * 512 + 2048 * 512; - constexpr const auto get_tensor_size = [](const ggml_tensor * tensor) -> size_t { - return tensor->ne[0] * tensor->ne[1] * tensor->ne[2] * tensor->ne[3]; - }; - auto * src0 = op->src[0]; auto * src1 = op->src[1]; + if (is_data_reinterpretation_op(src0->op) || is_data_reinterpretation_op(src1->op)) { + // TODO: remove the blocker here when we support permute op + QNN_LOG_DEBUG("[%s][MUL_MAT]data reorganization op is not supported, (%s, %s)\n", + qnn::get_backend_name(ctx->device), ggml_op_name(src0->op), ggml_op_name(src1->op)); + return false; + } + switch (ctx->device) { case QNN_BACKEND_NPU: if (src1->ne[2] != src0->ne[2] || src1->ne[3] != src0->ne[3]) { @@ -411,15 +313,21 @@ bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context * ctx, const gg */ QNN_LOG_DEBUG("[qnn-npu][MUL_MAT]src0 and src1 dimensions are not equal\n"); return false; - } else if (get_tensor_size(src0) + get_tensor_size(src1) + get_tensor_size(op) >= kMaxNpuTensorSize) { - QNN_LOG_DEBUG("[qnn-npu][MUL_MAT]tensor size is too large\n"); - return false; } // fall through, from test here, the convert op is super slow on NPU: // https://github.com/usefulsensors/qc_npu_benchmark case QNN_BACKEND_GPU: - if (ggml_qnn_have_same_tensor_types(ctx, op)) { - // there's no convert op for GPU. 
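+            // illustration (types are examples only): MUL_MAT with src0 = F16, src1 = F32 and dst = F32 passes this
+            // check, because the non-F32 input can be converted to F32 on the CPU side (see convert.cpp in this
+            // change); mixed input types with a non-F32 dst are rejected below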
+            if (!ggml_qnn_have_same_tensor_types(ctx, op) && op->type != GGML_TYPE_F32) {
+                // mixed src/dst tensor types are only supported when dst is float32, since there is no convert op for the GPU
+                QNN_LOG_DEBUG("[%s][MUL_MAT]src0, src1 and dst types are not equal\n",
+                              qnn::get_backend_name(ctx->device));
+                return false;
+            }
+            if (op->type == GGML_TYPE_F32 && ggml_is_quantized(src0->type) &&
+                !is_type_bit_enabled(ctx->cpu_preprocess_types, src0->type)) {
+                // when src0 is quantized and dst is float32, check that the quantized type is enabled for CPU preprocessing
+                QNN_LOG_DEBUG("[%s][MUL_MAT]quantized src0 type %s is not enabled\n",
+                              qnn::get_backend_name(ctx->device), ggml_type_name(src0->type));
                 return false;
             }
             break;
@@ -436,6 +344,19 @@ bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context * ctx, const gg
     return true;
 }
 
+#ifndef NDEBUG
+
+void print_tensor_info(ggml_backend_qnn_device_context * ctx, const ggml_tensor * op, bool is_supported) {
+    const char * supported = is_supported ? "supported" : "unsupported";
+    std::string op_key;
+    qnn::get_qnn_op_desc(op, true, GGML_TYPE_COUNT, op_key);
+
+    QNN_LOG_DEBUG("[%s][%s]op was %s, support/unsupported: %d/%d\n", qnn::get_backend_name(ctx->device), op_key.c_str(),
+                  supported, ctx->supported_op_count.load(), ctx->unsupported_op_count.load());
+}
+
+#endif
+
 }  // namespace
 
 namespace qnn {
@@ -448,22 +369,16 @@ bool device_supports_op(ggml_backend_qnn_device_context * ctx, const ggml_tensor
 
     if (!kQnnSupportedOps[qnn::get_qnn_op_index(op)]) {
 #ifndef NDEBUG
-        std::string op_key;
-        get_graph_key_from_op(op, op_key);
         ctx->unsupported_op_count++;
-        QNN_LOG_DEBUG("[%s][%s]op was unsupported, support/unsupported: %d/%d\n", qnn::get_backend_name(ctx->device),
-                      op_key.c_str(), ctx->supported_op_count.load(), ctx->unsupported_op_count.load());
+        print_tensor_info(ctx, op, false);
 #endif
         return false;
     }
 
     if (!ggnl_qnn_supports_op_tensor(ctx, op)) {
 #ifndef NDEBUG
-        std::string tensor_dims;
-        append_tensor_dimensions(op, tensor_dims);
-        QNN_LOG_DEBUG("[%s][%s]unsupported tensor(%s), support/unsupported: %d/%d\n",
-                      qnn::get_backend_name(ctx->device), ggml_op_name(op->op), tensor_dims.c_str(),
-                      ctx->supported_op_count.load(), ctx->unsupported_op_count.load());
+        ctx->unsupported_op_count++;
+        print_tensor_info(ctx, op, false);
 #endif
         return false;
     }
@@ -480,13 +395,23 @@ bool device_supports_op(ggml_backend_qnn_device_context * ctx, const ggml_tensor
     auto * src0 = op->src[0];
     auto * src1 = op->src[1];
     switch (op->op) {
+        case GGML_OP_MUL:
+            // TODO: fix this when we have the support for mul with rms_norm
+            if (ctx->enable_cpu_dequantize && (src0->op == GGML_OP_RMS_NORM || src1->op == GGML_OP_RMS_NORM)) {
+                QNN_LOG_DEBUG("[%s][%s]skip unsupported mul with rms norm, (%s, %s)\n",
+                              qnn::get_backend_name(ctx->device), ggml_op_desc(op), ggml_op_desc(src0),
+                              ggml_op_desc(src1));
+                is_op_supported = false;
+                break;
+            }
+            // fall through for other MUL ops; in llama the MUL fed by RMS_NORM (skipped above) sits at the start of each decoder block
         case GGML_OP_ADD:
         case GGML_OP_SUB:
-        case GGML_OP_MUL:
         case GGML_OP_DIV:
+            // TODO: move to op caps array?
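+            // note: broadcasting is not handled; e.g. an ADD with src0 = 4096x32 and src1 = 4096x1 (illustrative
+            // shapes) fails the same-shape check below and the op is reported as unsupported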
if (!ggml_are_same_shape(src0, src1)) { QNN_LOG_DEBUG("[%s][%s] src0 and src1 dimensions are not equal\n", - qnn::get_backend_name(ctx->device), ggml_op_name(op->op)); + qnn::get_backend_name(ctx->device), ggml_op_desc(op)); is_op_supported = false; } break; @@ -503,13 +428,11 @@ bool device_supports_op(ggml_backend_qnn_device_context * ctx, const ggml_tensor #ifndef NDEBUG if (is_op_supported) { ctx->supported_op_count++; - QNN_LOG_DEBUG("[%s][%s]op was supported, support/unsupported: %d/%d\n", qnn::get_backend_name(ctx->device), - ggml_op_name(op->op), ctx->supported_op_count.load(), ctx->unsupported_op_count.load()); } else { ctx->unsupported_op_count++; - QNN_LOG_DEBUG("[%s][%s]op was unsupported, support/unsupported: %d/%d\n", qnn::get_backend_name(ctx->device), - ggml_op_name(op->op), ctx->supported_op_count.load(), ctx->unsupported_op_count.load()); } + + print_tensor_info(ctx, op, is_op_supported); #endif return is_op_supported; @@ -520,7 +443,7 @@ bool device_compute_graph(ggml_backend_qnn_device_context * ctx, ggml_cgraph * c (int) cgraph->n_nodes); auto qnn_graph = get_qnn_graph_from_cache(ctx, cgraph); - bool success = qnn_graph && qnn_graph->execute(cgraph); + bool success = qnn_graph && qnn_graph->execute(cgraph, ctx->convert_context); QNN_LOG_DEBUG("[%s]compute graph, success: %d\n", qnn::get_backend_name(ctx->device), (int) success); return success; diff --git a/ggml/src/ggml-qnn/backend.hpp b/ggml/src/ggml-qnn/backend.hpp index 253b0b672383d..f2484a7a973f6 100644 --- a/ggml/src/ggml-qnn/backend.hpp +++ b/ggml/src/ggml-qnn/backend.hpp @@ -10,6 +10,7 @@ #include #include +#include "convert.hpp" #include "ggml-backend.h" #include "ggml-qnn.h" #include "ggml.h" @@ -25,26 +26,30 @@ struct ggml_backend_qnn_device_context { QNNBackend device; size_t threads; std::string name; - std::string lib_name; + std::string description; // initialize in qnn init qnn::qcom_socinfo socinfo = {}; - uint64_t supported_types; + size_t max_tensor_size_in_bytes; std::shared_ptr instance; std::shared_ptr qnn_interface; - qnn::qnn_graph_cache_t qnn_graph_cache; + qnn::qnn_graph_cache_t qnn_graph_cache; + std::shared_ptr convert_context = std::make_shared(); #ifndef NDEBUG std::atomic_uint32_t supported_op_count = 0; std::atomic_uint32_t unsupported_op_count = 0; #endif + bool enable_cpu_dequantize = false; + uint64_t supported_types; + uint64_t cpu_preprocess_types; + explicit ggml_backend_qnn_device_context(QNNBackend device, size_t threads, const char * name, - const char * lib_name, uint64_t supported_types) : + uint64_t supported_types) : device(device), threads(threads), name(name), - lib_name(lib_name), supported_types(supported_types) {} }; diff --git a/ggml/src/ggml-qnn/buffer.hpp b/ggml/src/ggml-qnn/buffer.hpp index 43c4666dd15b1..2840f78fb51ba 100644 --- a/ggml/src/ggml-qnn/buffer.hpp +++ b/ggml/src/ggml-qnn/buffer.hpp @@ -69,8 +69,8 @@ using qnn_buffer_ptr = std::shared_ptr; */ class qnn_rpc_buffer : public qnn_buffer_interface { public: - qnn_rpc_buffer(std::shared_ptr qnn_instance, const size_t size, const uint32_t rank, - uint32_t * dimensions, Qnn_DataType_t data_type) : + qnn_rpc_buffer(qnn_instance_ptr qnn_instance, const size_t size, const uint32_t rank, uint32_t * dimensions, + Qnn_DataType_t data_type) : _size(size), _qnn_instance(qnn_instance) { _qnn_rpc_buffer = static_cast(qnn_instance->alloc_rpcmem(size, alignof(uint8_t *))); @@ -105,10 +105,10 @@ class qnn_rpc_buffer : public qnn_buffer_interface { Qnn_MemHandle_t get_mem_handle() const override { return _qnn_rpc_mem_handle; 
}

  private:
-    size_t _size = 0;
-    uint8_t * _qnn_rpc_buffer = nullptr;
-    Qnn_MemHandle_t _qnn_rpc_mem_handle = nullptr;
-    std::shared_ptr _qnn_instance;
+    size_t _size = 0;
+    uint8_t * _qnn_rpc_buffer = nullptr;
+    Qnn_MemHandle_t _qnn_rpc_mem_handle = nullptr;
+    qnn_instance_ptr _qnn_instance;

     DISABLE_COPY(qnn_rpc_buffer);
     DISABLE_MOVE(qnn_rpc_buffer);
diff --git a/ggml/src/ggml-qnn/convert.cpp b/ggml/src/ggml-qnn/convert.cpp
new file mode 100644
index 0000000000000..9719bac345eee
--- /dev/null
+++ b/ggml/src/ggml-qnn/convert.cpp
@@ -0,0 +1,155 @@
+
+#include "convert.hpp"
+
+#include "logger.hpp"
+
+namespace {
+
+size_t get_convert_buffer_size(const qnn::ggml_dimension_array_t & dimensions, ggml_type dst_type) {
+    GGML_ASSERT(ggml_blck_size(dst_type) == 1);
+    size_t nbytes = ggml_type_size(dst_type);
+    for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
+        nbytes *= dimensions[i];  // tight packing
+    }
+
+    return nbytes;
+}
+
+// adapted from ggml_backend_blas_mul_mat: use OpenMP when available, otherwise fall back to the standard library (std::async)
+// TODO: remove this once the conversion can be delegated to the blas backend
+#ifdef GGML_USE_OPENMP
+
+void convert_tensor_impl(const ggml_tensor * src, int max_threads,
+                         std::shared_ptr & output_buffer) {
+    const auto ne03 = src->ne[3];
+    const auto ne02 = src->ne[2];
+    const auto ne01 = src->ne[1];
+    const auto ne00 = src->ne[0];
+    const auto ne_plane = ne01 * ne00;
+    const auto nb03 = src->nb[3];
+    const auto nb02 = src->nb[2];
+    const auto nb01 = src->nb[1];
+    const int min_cols_per_thread = 4096;
+    void * wdata = output_buffer->get_buffer();
+    const auto to_float = ggml_get_type_traits(src->type)->to_float;
+    GGML_ASSERT(to_float);
+
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            const void * x = (char *) src->data + i02 * nb02 + i03 * nb03;
+            float * const wplane = (float *) wdata + i02 * ne_plane + i03 * ne02 * ne_plane;
+
+            const int min_rows_per_thread = std::max((int) (min_cols_per_thread / ne00), 1);
+            const int n_threads = std::max(std::min(max_threads, (int) (ne01 / min_rows_per_thread)), 1);
+
+# pragma omp parallel for num_threads(n_threads)
+            for (int64_t i01 = 0; i01 < ne01; i01++) {
+                to_float((const char *) x + i01 * nb01, wplane + i01 * ne00, ne00);
+            }
+        }
+    }
+}
+
+#else
+
+void convert_tensor_impl(const ggml_tensor * src, int max_threads, std::vector> & tasks,
+                         std::shared_ptr & output_buffer) {
+    const auto ne03 = src->ne[3];
+    const auto ne02 = src->ne[2];
+    const auto ne01 = src->ne[1];
+    const auto ne00 = src->ne[0];
+    const auto ne_plane = ne01 * ne00;
+    const auto nb03 = src->nb[3];
+    const auto nb02 = src->nb[2];
+    const auto nb01 = src->nb[1];
+    const int min_cols_per_thread = 4096;
+    void * wdata = output_buffer->get_buffer();
+    const auto to_float = ggml_get_type_traits(src->type)->to_float;
+    GGML_ASSERT(to_float);
+
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            const void * x = (char *) src->data + i02 * nb02 + i03 * nb03;
+            float * const wplane = (float *) wdata + i02 * ne_plane + i03 * ne02 * ne_plane;
+
+            const int min_rows_per_thread = std::max((int) (min_cols_per_thread / ne00), 1);
+            const int n_threads = std::max(std::min(max_threads, (int) (ne01 / min_rows_per_thread)), 1);
+
+            for (int i = 1; i < n_threads; i++) {
+                const int64_t start = i * ne01 / n_threads;
+                const int64_t end = (i + 1) * ne01 / n_threads;
+                if (start < end) {
+                    tasks.push_back(std::async(std::launch::async, [=]() {
+                        for (int64_t i01 =
start; i01 < end; i01++) { + to_float((const char *) x + i01 * nb01, wplane + i01 * ne00, ne00); + } + })); + } + } + { + // reuse the current thread for the first task + const int64_t start = 0; + const int64_t end = ne01 / n_threads; + for (int64_t i01 = start; i01 < end; i01++) { + to_float((const char *) x + i01 * nb01, wplane + i01 * ne00, ne00); + } + } + } + } + + // wait for all tasks to finish + for (auto & task : tasks) { + task.get(); + } + tasks.clear(); +} + +#endif + +} // namespace + +namespace qnn { + +std::vector convert(std::shared_ptr convert_context, + const ggml_tensor_array_t & tensors, ggml_type target_data_type) { + convert_context->buffers.resize(tensors.size()); + std::vector output_buffers(tensors.size()); + for (size_t i = 0; i < tensors.size(); ++i) { + const ggml_tensor * src = tensors[i]; + if (src->type == target_data_type) { + continue; + } + + auto & data_buffer = convert_context->buffers[i]; + const auto dst_size = get_convert_buffer_size(src->ne, target_data_type); + if (!data_buffer || data_buffer->get_size() < dst_size) { +#ifndef NDEBUG + auto old_size = data_buffer ? data_buffer->get_size() : 0; + QNN_LOG_DEBUG("create buffer[%d] for tensor %s(%s), old_size: %d, new_size: %d\n", (int) i, + ggml_get_name(src), ggml_type_name(src->type), (int) old_size, (int) dst_size); +#endif + data_buffer = std::make_shared(dst_size); + } + + // TODO: add more restrictions to the buffer slice here + std::shared_ptr output_buffer = + std::make_shared(data_buffer->get_buffer(), dst_size); + + QNN_LOG_DEBUG("convert tensor(%s) from %s to %s, size: %d, n_threads: %d\n", ggml_get_name(src), + ggml_type_name(src->type), ggml_type_name(target_data_type), (int) dst_size, + convert_context->n_threads); + +#ifdef GGML_USE_OPENMP + convert_tensor_impl(src, convert_context->n_threads, output_buffer); +#else + convert_tensor_impl(src, convert_context->n_threads, convert_context->tasks, output_buffer); +#endif + output_buffers[i] = output_buffer; + } + + return output_buffers; +} + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/convert.hpp b/ggml/src/ggml-qnn/convert.hpp new file mode 100644 index 0000000000000..818004c587ba8 --- /dev/null +++ b/ggml/src/ggml-qnn/convert.hpp @@ -0,0 +1,26 @@ +#pragma once + +#include +#include +#include + +#include "buffer.hpp" +#include "ggml-qnn.h" +#include "tensor.hpp" +#include "utils.hpp" + +namespace qnn { + +// see also: ggml_backend_blas_context +struct qnn_convert_context_t { + int n_threads = std::thread::hardware_concurrency(); + std::vector> buffers; +#ifndef GGML_USE_OPENMP + std::vector> tasks; +#endif +}; + +std::vector convert(std::shared_ptr convert_context, + const ggml_tensor_array_t & tensors, ggml_type target_data_type); + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index fd18a1a623297..1d3e45562c6ef 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -4,78 +4,16 @@ #include #include +#include "backend-ops.hpp" +#include "backend.hpp" #include "ggml-backend-impl.h" #include "ggml-impl.h" -#include "ggml-qnn/backend-ops.hpp" -#include "ggml-qnn/backend.hpp" -#include "ggml-qnn/logger.hpp" -#include "ggml-qnn/tensor.hpp" -#include "ggml-qnn/utils.hpp" - -// ================================================================================================= -// -// self-defined macro / data structure -// -// ================================================================================================= -#ifdef NDEBUG -# define 
ENABLE_QNNBACKEND_PERF 0 // enable/disable op's perf info -#else -# define ENABLE_QNNBACKEND_PERF 1 // enable/disable op's perf info -#endif - -#define QNN_BACKEND_NAME "qnn" +#include "logger.hpp" +#include "tensor.hpp" +#include "utils.hpp" namespace { -#ifdef _WIN32 -constexpr const char * kQnnCpuLibName = "QnnCpu.dll"; -constexpr const char * kQnnGpuLibName = "QnnGpu.dll"; -constexpr const char * kQnnNpuLibName = "QnnHtp.dll"; -#else -constexpr const char * kQnnCpuLibName = "libQnnCpu.so"; -constexpr const char * kQnnGpuLibName = "libQnnGpu.so"; -constexpr const char * kQnnNpuLibName = "libQnnHtp.so"; -#endif - -struct qnn_device_caps { - const char * name; - const char * description; - const char * lib_name; - enum ggml_backend_dev_type type; - - // TODO: should get this caps from device - uint64_t supported_types; -}; - -// TODO: should move this to qnn-lib.cpp -constexpr const qnn_device_caps kDeviceCaps[] = { - { - "qnn-cpu", "Qualcomm Kryo CPU", - kQnnCpuLibName, GGML_BACKEND_DEVICE_TYPE_CPU, - (1 << GGML_TYPE_I8) | (1 << GGML_TYPE_F32), - }, // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/CpuOpDefSupplement.html#matmul - { - "qnn-gpu", "Qualcomm Adreno GPU", - kQnnGpuLibName, GGML_BACKEND_DEVICE_TYPE_GPU, - (1 << GGML_TYPE_F32) | (1 << GGML_TYPE_F16), - }, // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/GpuOpDefSupplement.html#matmul - { - "qnn-npu", "Qualcomm NPU", - kQnnNpuLibName, GGML_BACKEND_DEVICE_TYPE_ACCEL, - (1 << GGML_TYPE_F32) | (1 << GGML_TYPE_F16) | (1 << GGML_TYPE_I16) | (1 << GGML_TYPE_I8), - }, // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/HtpOpDefSupplement.html#matmul -}; - -static_assert(sizeof(kDeviceCaps) / sizeof(kDeviceCaps[0]) == GGML_QNN_MAX_DEVICES, - "The number of qnn devices should be equal to GGML_QNN_MAX_DEVICES"); -static_assert(kDeviceCaps[QNN_BACKEND_NPU].type == GGML_BACKEND_DEVICE_TYPE_ACCEL, - "The NPU device should be an accelerator device"); -static_assert(kDeviceCaps[QNN_BACKEND_GPU].type == GGML_BACKEND_DEVICE_TYPE_GPU, - "The NPU device should be an accelerator device"); - -static_assert(kDeviceCaps[QNN_BACKEND_CPU].type == GGML_BACKEND_DEVICE_TYPE_CPU, - "The NPU device should be an accelerator device"); - ggml_backend_qnn_device_context * get_device_context(ggml_backend_dev_t dev) { return reinterpret_cast(dev->context); } @@ -266,13 +204,13 @@ constexpr const ggml_backend_i ggml_backend_qnn_interface = { * ----------------------------------------------------------------------------------------------- */ const char * ggml_backend_qnn_device_get_name(ggml_backend_dev_t dev) { - const auto & caps = kDeviceCaps[get_device_context(dev)->device]; - return caps.name; + auto * dev_ctx = get_device_context(dev); + return qnn::get_backend_name(dev_ctx->device); } const char * ggml_backend_qnn_device_get_description(ggml_backend_dev_t dev) { - const auto & caps = kDeviceCaps[get_device_context(dev)->device]; - return caps.description; + auto * dev_ctx = get_device_context(dev); + return dev_ctx->description.empty() ? 
qnn::get_backend_desc(dev_ctx->device) : dev_ctx->description.c_str(); } void ggml_backend_qnn_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { @@ -283,7 +221,7 @@ void ggml_backend_qnn_device_get_memory(ggml_backend_dev_t dev, size_t * free, s } enum ggml_backend_dev_type ggml_backend_qnn_device_get_type(ggml_backend_dev_t dev) { - return kDeviceCaps[get_device_context(dev)->device].type; + return qnn::get_device_caps(get_device_context(dev)->device).type; } void ggml_backend_qnn_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) { @@ -310,14 +248,14 @@ ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, extend_lib_search_path = GGML_QNN_DEFAULT_LIB_SEARCH_PATH; QNN_LOG_WARN( "extend_lib_search_path is nullptr, will " - "use " GGML_QNN_DEFAULT_LIB_SEARCH_PATH " as default"); + "use " GGML_QNN_DEFAULT_LIB_SEARCH_PATH " as default\n"); } auto * dev_ctx = get_device_context(dev); const auto device = dev_ctx->device; QNN_LOG_DEBUG("device %s\n", qnn::get_backend_name(device)); QNN_LOG_DEBUG("extend_lib_search_path %s\n", extend_lib_search_path); - auto instance = std::make_shared(extend_lib_search_path, dev_ctx->lib_name); + auto instance = std::make_shared(extend_lib_search_path, device); auto result = instance->qnn_init(nullptr); if (result != 0) { QNN_LOG_WARN("failed to init qnn backend %s\n", qnn::get_backend_name(device)); @@ -331,10 +269,21 @@ ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, std::string device_name = qnn::get_backend_name(device); QNN_LOG_INFO("qnn device name %s\n", device_name.c_str()); - dev_ctx->instance = instance; - dev_ctx->qnn_interface = qnn_interface; - dev_ctx->socinfo = instance->get_soc_info(); - dev_ctx->supported_types = kDeviceCaps[device].supported_types; + const auto & device_caps = qnn::get_device_caps(device); + dev_ctx->instance = instance; + dev_ctx->qnn_interface = qnn_interface; + dev_ctx->socinfo = instance->get_soc_info(); + dev_ctx->supported_types = device_caps.supported_types; + dev_ctx->cpu_preprocess_types = device_caps.cpu_preprocess_types; + dev_ctx->max_tensor_size_in_bytes = device_caps.max_tensor_size_in_bytes; + { + char buffer[256]; + snprintf(buffer, sizeof(buffer), "%s(%s)", qnn::get_chipset_desc(dev_ctx->socinfo.soc_model), + qnn::get_backend_desc(dev_ctx->device)); + dev_ctx->description = buffer; + } + // TODO: remove npu from here if hardware quantization is supported + dev_ctx->enable_cpu_dequantize = device == QNN_BACKEND_CPU; ggml_backend_t qnn_backend = new ggml_backend{ /* .guid = */ ggml_backend_qnn_guid(), @@ -425,16 +374,17 @@ struct ggml_backend_qnn_reg_impl : ggml_backend_reg { * here we skip the initialization of CPU device, * cause it'll block unsupported ops fallback to ggml cpu backend */ + QNN_LOG_DEBUG("qnn backend registry skip CPU device\n"); continue; } #endif + const auto & device_caps = qnn::get_device_caps(device_enum); device_contexts.emplace_back(std::make_unique( /* .device = */ device_enum, // init from the last device, i.e. 
NPU /* .threads = */ 1, /* .name = */ qnn::get_backend_name(device_enum), - /* .lib_name = */ kDeviceCaps[device_enum].lib_name, - /* .supported_types = */ kDeviceCaps[device_enum].supported_types)); + /* .supported_types = */ device_caps.supported_types)); devices.emplace_back(ggml_backend_device{ /* iface = */ ggml_backend_qnn_device_interface, diff --git a/ggml/src/ggml-qnn/graph.cpp b/ggml/src/ggml-qnn/graph.cpp index 2a282771c2a2d..3021a6f0a2fb5 100644 --- a/ggml/src/ggml-qnn/graph.cpp +++ b/ggml/src/ggml-qnn/graph.cpp @@ -7,15 +7,27 @@ #include "ggml-impl.h" #include "logger.hpp" #include "op-config.hpp" +#include "profiler.hpp" #include "tensor.hpp" +#ifdef GGML_QNN_ENABLE_PERFORMANCE_TRACKING +# define GRAPH_PROFILE_HANDLE (_event_tracer ? _event_tracer->get_handle() : nullptr) +# define GRAPH_PROFILE_PRINT() \ + if (_event_tracer) { \ + _event_tracer->print_profile_events(); \ + } \ + (void) 0 +#else +# define GRAPH_PROFILE_HANDLE (nullptr) +# define GRAPH_PROFILE_PRINT() (void) 0 +#endif + namespace { using qnn_tensor_cache_t = std::unordered_map; int get_op_max_rank(const ggml_tensor * op) { - int max_rank = ggml_n_dims(op); - const int count = (int) qnn::get_qnn_op_input_param_count(op); - for (int i = 0; i < count; ++i) { + int max_rank = ggml_n_dims(op); + for (int i = 0; i < GGML_MAX_DIMS && op->src[i]; ++i) { max_rank = std::max(max_rank, ggml_n_dims(op->src[i])); } @@ -23,7 +35,8 @@ int get_op_max_rank(const ggml_tensor * op) { } qnn::qnn_tensor_ptr_t create_tensor_with_cache(ggml_tensor * tensor, qnn::ggml_qnn_tensor::tensor_type_t type, int rank, - QNNBackend device, Qnn_GraphHandle_t graph_handle, + ggml_type override_data_type, QNNBackend device, + Qnn_GraphHandle_t graph_handle, std::shared_ptr qnn_instance, qnn_tensor_cache_t & tensor_cache) { GGML_ASSERT(tensor); @@ -31,21 +44,30 @@ qnn::qnn_tensor_ptr_t create_tensor_with_cache(ggml_tensor * tensor, qnn::ggml_q return tensor_cache[tensor]; } - auto qnn_tensor = std::make_shared(type, tensor->name, tensor->ne, tensor->type, rank, device, - graph_handle, qnn_instance); + QNN_LOG_DEBUG("[%s]create_tensor_with_cache, data_type: %s, override_data_type: %s\n", + qnn::get_backend_name(device), ggml_type_name(tensor->type), ggml_type_name(override_data_type)); + auto data_type = override_data_type != GGML_TYPE_COUNT ? override_data_type : tensor->type; + + // We've observed that some tensors have the same name with different op types will be added to the same graph + // which will cause the graph build failed. To avoid this, we append the op type to the tensor name. 
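+    // illustration (hypothetical names): two distinct tensors both named "node_12", one produced by RESHAPE and
+    // one by MUL_MAT, would otherwise both register the QNN tensor name "node_12" in the same graph; with the
+    // suffix they become "node_12_RESHAPE" and "node_12_MUL_MAT" and no longer collide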
+ char tensor_name[256]; + snprintf(tensor_name, sizeof(tensor_name), "%s_%s", ggml_get_name(tensor), ggml_op_desc(tensor)); + auto qnn_tensor = std::make_shared(type, std::string(tensor_name), tensor->ne, data_type, + rank, device, graph_handle, qnn_instance); tensor_cache[tensor] = qnn_tensor; return qnn_tensor; } qnn::qnn_tensor_array_t create_tensors_with_cache(const qnn::ggml_tensor_array_t & ggml_tensors, - qnn::ggml_qnn_tensor::tensor_type_t type, int rank, QNNBackend device, + qnn::ggml_qnn_tensor::tensor_type_t type, int rank, + ggml_type override_data_type, QNNBackend device, Qnn_GraphHandle_t graph_handle, std::shared_ptr qnn_instance, qnn_tensor_cache_t & tensor_cache) { qnn::qnn_tensor_array_t tensors; for (auto * tensor : ggml_tensors) { - tensors.push_back( - create_tensor_with_cache(tensor, type, rank, device, graph_handle, qnn_instance, tensor_cache)); + tensors.push_back(create_tensor_with_cache(tensor, type, rank, override_data_type, device, graph_handle, + qnn_instance, tensor_cache)); } return tensors; @@ -54,23 +76,23 @@ qnn::qnn_tensor_array_t create_tensors_with_cache(const qnn::ggml_tensor_array_t qnn::qnn_op_config_ptr_t create_operation_from_op_tensor(ggml_tensor * dst, const std::string & name, int rank, QNNBackend device, Qnn_GraphHandle_t graph_handle, std::shared_ptr qnn_instance, - bool is_intermediate, qnn_tensor_cache_t & tensor_cache) { + qnn_tensor_cache_t & tensor_cache) { auto operation = qnn::create_op(dst, name, qnn_instance); // input tensors qnn::qnn_tensor_array_t input_qnn_tensors; - auto tensor_type = is_intermediate ? qnn::ggml_qnn_tensor::INTERMEDIATE : qnn::ggml_qnn_tensor::INPUT; - for (size_t i = 0; i < qnn::get_qnn_op_input_param_count(dst); ++i) { - auto input_qnn_tensor = - create_tensor_with_cache(dst->src[i], tensor_type, rank, device, graph_handle, qnn_instance, tensor_cache); + for (size_t i = 0; i < GGML_MAX_DIMS && dst->src[i]; ++i) { + auto * src = dst->src[i]; + auto input_qnn_tensor = create_tensor_with_cache(src, qnn::ggml_qnn_tensor::INTERMEDIATE, rank, GGML_TYPE_COUNT, + device, graph_handle, qnn_instance, tensor_cache); input_qnn_tensors.push_back(input_qnn_tensor); } operation->set_input_tensors(input_qnn_tensors); // output tensor - tensor_type = is_intermediate ? 
qnn::ggml_qnn_tensor::INTERMEDIATE : qnn::ggml_qnn_tensor::OUTPUT; qnn::qnn_tensor_array_t output_qnn_tensors = - create_tensors_with_cache({ dst }, tensor_type, rank, device, graph_handle, qnn_instance, tensor_cache); + create_tensors_with_cache({ dst }, qnn::ggml_qnn_tensor::INTERMEDIATE, rank, GGML_TYPE_COUNT, device, + graph_handle, qnn_instance, tensor_cache); operation->set_output_tensors(output_qnn_tensors); // initialize operation @@ -82,29 +104,6 @@ qnn::qnn_op_config_ptr_t create_operation_from_op_tensor(ggml_tensor * dst, cons return operation; } -bool bind_src_tensors(ggml_tensor * op, qnn::qnn_tensor_array_t & tensor_wrappers, - std::vector & qnn_tensors) { - if (op->op == GGML_OP_NONE) { - QNN_LOG_DEBUG("op %s is not a valid op\n", ggml_get_name(op)); - return false; - } - - const auto param_count = qnn::get_qnn_op_input_param_count(op); - GGML_ASSERT(tensor_wrappers.size() == param_count); - qnn_tensors.resize(param_count); - for (size_t i = 0; i < param_count; ++i) { - auto * ggml_tensor = op->src[i]; - if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor)) { - QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); - return false; - } - - qnn_tensors[i] = tensor_wrappers[i]->get_qnn_tensor(); - } - - return true; -} - /** * @brief Extracts input and output tensors from a computational graph. * @@ -134,11 +133,15 @@ int get_io_tensors_from_graph(const ggml_cgraph * cgraph, qnn::ggml_tensor_array continue; } - if (dst->op == GGML_OP_NONE || dst->op == GGML_OP_VIEW) { + if (dst->op == GGML_OP_NONE || dst->op == GGML_OP_VIEW || dst->op == GGML_OP_PERMUTE) { // TODO: remove GGML_OP_VIEW after view op is supported + QNN_LOG_DEBUG("node[%d]%s(%s), type: %s, skipped\n", i, ggml_get_name(dst), ggml_op_desc(dst), + ggml_type_name(dst->type)); continue; } + QNN_LOG_DEBUG("node[%d]%s(%s), type: %s\n", i, ggml_get_name(dst), ggml_op_desc(dst), + ggml_type_name(dst->type)); rank = std::max(rank, ggml_n_dims(dst)); if (connectivity_map.count(dst) == 0) { connectivity_map[dst] = { @@ -150,10 +153,12 @@ int get_io_tensors_from_graph(const ggml_cgraph * cgraph, qnn::ggml_tensor_array ++(connectivity_map[dst].in_degree); } - for (size_t i = 0; i < GGML_MAX_DIMS && dst->src[i]; ++i) { - auto * src = dst->src[i]; + for (size_t j = 0; j < GGML_MAX_DIMS && dst->src[j]; ++j) { + auto * src = dst->src[j]; rank = std::max(rank, ggml_n_dims(src)); + QNN_LOG_DEBUG("node[%d]: src[%d]: %s(%s), type: %s\n", i, (int) j, ggml_get_name(src), ggml_op_desc(src), + ggml_type_name(src->type)); if (connectivity_map.count(src) == 0) { connectivity_map[src] = { 0, @@ -187,16 +192,155 @@ int get_io_tensors_from_graph(const ggml_cgraph * cgraph, qnn::ggml_tensor_array return rank; } +/* + * for src0_F32, src1_F32, dst_F32 -> GGML_TYPE_COUNT + * for src0_F16, src1_F16, dst_F16 -> GGML_TYPE_COUNT + * for src0_F16, src1_F32, dst_F32 -> GGML_TYPE_F32 + * for src0_q4, src1_F32, dst_F32 -> GGML_TYPE_F32 + * for src0_q4, src1_F16, dst_F32 -> GGML_TYPE_F32 + */ +ggml_type get_override_data_type(const qnn::ggml_tensor_array_t & inputs, const qnn::ggml_tensor_array_t & outputs) { + GGML_ASSERT(!inputs.empty()); + ggml_type override_data_type = inputs.front()->type; + bool is_same_data_type = true; + for (auto * tensor : inputs) { + QNN_LOG_DEBUG("input_tensor: %s(%s), override_data_type(%s)\n", ggml_get_name(tensor), + ggml_type_name(tensor->type), ggml_type_name(override_data_type)); + is_same_data_type = is_same_data_type && tensor->type == override_data_type; + override_data_type = 
std::min(override_data_type, tensor->type); + } + + for (auto * tensor : outputs) { + QNN_LOG_DEBUG("output_tensor: %s(%s), override_data_type(%s)\n", ggml_get_name(tensor), + ggml_type_name(tensor->type), ggml_type_name(override_data_type)); + is_same_data_type = is_same_data_type && tensor->type == override_data_type; + override_data_type = std::min(override_data_type, tensor->type); + } + + return is_same_data_type ? GGML_TYPE_COUNT : override_data_type; +} + +static const QnnHtpGraph_CustomConfig_t kDefaultHvxConfig = []() { + QnnHtpGraph_CustomConfig_t hvx_config = QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT; + hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; + hvx_config.numHvxThreads = 8; + return hvx_config; +}(); + +static const QnnHtpGraph_CustomConfig_t kDefaultDlbcConfig = []() { + QnnHtpGraph_CustomConfig_t dlbc_config = QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT; + dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; + dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; + dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC + return dlbc_config; +}(); + +/* + * 1 = Faster preparation time, less optimal graph + * 2 = Longer preparation time, more optimal graph + * 3 = Longest preparation time, most likely even more optimal graph: + * QNN_HTP_DEVICE_CONFIG_OPTION_SOC configuration will be taken into account when possible, details see HTP Backend Specific Page + */ +static const QnnHtpGraph_CustomConfig_t kDefaultOptConfig = []() { + QnnHtpGraph_CustomConfig_t opt_config = QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT; + opt_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; + opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; +#ifndef NDEBUG + opt_config.optimizationOption.floatValue = 3; +#else + opt_config.optimizationOption.floatValue = 1; +#endif + return opt_config; +}(); + +static const QnnHtpGraph_CustomConfig_t kHtpPrecisionConfigF16 = []() { + QnnHtpGraph_CustomConfig_t precision_config = QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT; + precision_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_PRECISION; + precision_config.precision = QNN_PRECISION_FLOAT16; + return precision_config; +}(); + +constexpr QnnHtpGraph_CustomConfig_t make_vtcm_config(size_t vtcm_size_in_mb) { + QnnHtpGraph_CustomConfig_t vtcm_config = QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT; + vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; + vtcm_config.vtcmSizeInMB = (uint32_t) vtcm_size_in_mb; + return vtcm_config; +} + +constexpr QnnGraph_Config_t make_graph_config(const QnnHtpGraph_CustomConfig_t * custom_config) { + QnnGraph_Config_t graph_config = QNN_GRAPH_CONFIG_INIT; + graph_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_config.customConfig = const_cast(custom_config); + return graph_config; +} + } // namespace namespace qnn { -qnn_graph::qnn_graph(const std::string & graph_name, QNNBackend device, std::shared_ptr qnn_instance, - size_t vtcm_size_in_mb) : +ggml_type qnn_graph::get_graph_key_from_cgraph(const ggml_cgraph * cgraph, std::string & output) { + if (cgraph->n_nodes == 0) { + QNN_LOG_DEBUG("empty cgraph\n"); + return GGML_TYPE_COUNT; + } + + ggml_type override_type = GGML_TYPE_COUNT; + { + // TODO: can we have a better approach to get the override_type here? 
+ // though it is O(n) + O(mlog(m)) complexity, our graph is small, so it is fine + ggml_tensor_array_t inputs; + ggml_tensor_array_t outputs; + get_io_tensors_from_graph(cgraph, inputs, outputs); + if (!inputs.empty() && !outputs.empty()) { + override_type = get_override_data_type(inputs, outputs); + QNN_LOG_DEBUG("get_graph_key, override_type: %s\n", ggml_type_name(override_type)); + } else { + QNN_LOG_DEBUG("get_graph_key, no input or output tensors\n"); + } + } + + ggml_type min_op_type = GGML_TYPE_COUNT; + { + bool is_start = true; + for (int i = 0; i < cgraph->n_nodes; ++i) { + auto * op = cgraph->nodes[i]; + if (ggml_is_empty(op)) { + QNN_LOG_DEBUG("empty op in graph, skipping\n"); + continue; + } + + if (op->op == GGML_OP_NONE || op->op == GGML_OP_VIEW || op->op == GGML_OP_PERMUTE) { + QNN_LOG_DEBUG("%s in graph, skipping\n", ggml_op_desc(op)); + continue; + } + + min_op_type = std::min(min_op_type, op->type); + if (is_start) { + qnn::get_qnn_op_desc(op, is_start, override_type, output); + is_start = false; + } else { + output += '#'; + qnn::get_qnn_op_desc(op, is_start, override_type, output); + } + } + } + + if (cgraph->n_nodes > 1) { + auto * last_op = cgraph->nodes[cgraph->n_nodes - 1]; + output += qnn::get_ggml_type_name(last_op->type); + output += '_'; + qnn::append_tensor_shape_and_type(last_op, output); + } + + return min_op_type; +} + +qnn_graph::qnn_graph(const std::string & graph_name, QNNBackend device, qnn_instance_ptr qnn_instance, + htp_precision precision, size_t vtcm_size_in_mb) : _graph_name(graph_name), _device(device), _qnn_instance(qnn_instance) { - QNN_LOG_DEBUG("[%s][%s]created\n", get_backend_name(device), graph_name.c_str()); + QNN_LOG_DEBUG("[%s][%s]creating\n", get_backend_name(device), graph_name.c_str()); auto qnn_interface = qnn_instance->get_qnn_interface(); auto qnn_context = qnn_instance->get_qnn_context_handle(); @@ -204,38 +348,29 @@ qnn_graph::qnn_graph(const std::string & graph_name, QNNBackend device, std::sha Qnn_GraphHandle_t graph_handle = nullptr; if (device == QNN_BACKEND_NPU) { // TODO: fix graph config here for NPU - QnnHtpGraph_CustomConfig_t hvx_config; - hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; - hvx_config.numHvxThreads = 8; - QnnGraph_Config_t graph_hvx_config; - graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_hvx_config.customConfig = &hvx_config; - - QnnHtpGraph_CustomConfig_t dlbc_config; - dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; - dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; - dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC - QnnGraph_Config_t graph_dlbc_config; - graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_dlbc_config.customConfig = &dlbc_config; - - QnnHtpGraph_CustomConfig_t opt_config; - opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; - opt_config.optimizationOption.floatValue = 1; // 1 / 3 - QnnGraph_Config_t graph_opt_config; - graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_opt_config.customConfig = &opt_config; - - QnnHtpGraph_CustomConfig_t vtcm_config; - vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; - vtcm_config.vtcmSizeInMB = (uint32_t) vtcm_size_in_mb; - QnnGraph_Config_t graph_vtcm_config; - graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_vtcm_config.customConfig = &vtcm_config; - - const QnnGraph_Config_t * graph_configs[] = { &graph_hvx_config, 
&graph_dlbc_config, &graph_vtcm_config, - &graph_opt_config, nullptr }; - error = qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), graph_configs, &graph_handle); + std::vector graph_configs; + + auto hvx_config = make_graph_config(&kDefaultHvxConfig); + graph_configs.push_back(&hvx_config); + + auto dlbc_config = make_graph_config(&kDefaultDlbcConfig); + graph_configs.push_back(&dlbc_config); + + auto opt_config = make_graph_config(&kDefaultOptConfig); + graph_configs.push_back(&opt_config); + + auto vctm_sub_config = make_vtcm_config(vtcm_size_in_mb); + auto vtcm_config = make_graph_config(&vctm_sub_config); + graph_configs.push_back(&vtcm_config); + + if (precision == qnn_graph::kHtpFp16) { + auto precision_config = make_graph_config(&kHtpPrecisionConfigF16); + graph_configs.push_back(&precision_config); + QNN_LOG_DEBUG("[%s][%s]set precision to F16\n", get_backend_name(device), graph_name.c_str()); + } + + graph_configs.push_back(nullptr); + error = qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), graph_configs.data(), &graph_handle); } else { error = qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), nullptr, &graph_handle); } @@ -246,9 +381,16 @@ qnn_graph::qnn_graph(const std::string & graph_name, QNNBackend device, std::sha return; } - QNN_LOG_DEBUG("[%s][%s]create succeed\n", get_backend_name(device), graph_name.c_str()); +#ifdef GGML_QNN_ENABLE_PERFORMANCE_TRACKING + if (device == QNN_BACKEND_NPU) { + _event_tracer = std::make_shared( + graph_name, qnn_interface, qnn_instance->get_qnn_backend_handle(), qnn_event_tracer::PROFILE_OP_TRACE); + } +#endif + _graph_handle = graph_handle; _qnn_interface = qnn_interface; + QNN_LOG_DEBUG("[%s][%s]create succeed\n", get_backend_name(device), graph_name.c_str()); } qnn_graph::~qnn_graph() { @@ -261,15 +403,28 @@ bool qnn_graph::build_graph_from_ggml_graph(const ggml_cgraph * cgraph) { ggml_tensor_array_t inputs; ggml_tensor_array_t outputs; int rank = get_io_tensors_from_graph(cgraph, inputs, outputs); - QNN_LOG_DEBUG("[%s]rank: %d, input_set: %d, output_set: %d\n", get_backend_name(_device), rank, int(inputs.size()), - int(outputs.size())); + QNN_LOG_DEBUG("[%s][%s]rank: %d, graph_nodes: %d, input_set: %d, output_set: %d\n", get_backend_name(_device), + _graph_name.c_str(), rank, cgraph->n_nodes, int(inputs.size()), int(outputs.size())); { + static_assert( + GGML_TYPE_COUNT > GGML_TYPE_Q8_0 && GGML_TYPE_Q8_0 > GGML_TYPE_F16 && GGML_TYPE_F16 > GGML_TYPE_F32, + "GGML_TYPE enum order is not correct"); + + QNN_SCOPED_PERFORMANCE_TRACKER("[%s][%s]build_graph_from_ggml_graph", get_backend_name(_device), + _graph_name.c_str()); + + auto override_data_type = get_override_data_type(inputs, outputs); + if (override_data_type != GGML_TYPE_COUNT) { + QNN_LOG_DEBUG("[%s][%s]set override_data_type: %s\n", get_backend_name(_device), _graph_name.c_str(), + ggml_type_name(override_data_type)); + } + qnn_tensor_cache_t tensor_cache; - auto input_tensors = create_tensors_with_cache(inputs, ggml_qnn_tensor::INPUT, rank, _device, _graph_handle, - _qnn_instance, tensor_cache); - auto output_tensors = create_tensors_with_cache(outputs, ggml_qnn_tensor::OUTPUT, rank, _device, _graph_handle, - _qnn_instance, tensor_cache); + auto input_tensors = create_tensors_with_cache(inputs, ggml_qnn_tensor::INPUT, rank, override_data_type, + _device, _graph_handle, _qnn_instance, tensor_cache); + auto output_tensors = create_tensors_with_cache(outputs, ggml_qnn_tensor::OUTPUT, rank, GGML_TYPE_COUNT, + _device, _graph_handle, 
_qnn_instance, tensor_cache); qnn_op_config_array_t operations; for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * dst = cgraph->nodes[i]; @@ -277,14 +432,21 @@ bool qnn_graph::build_graph_from_ggml_graph(const ggml_cgraph * cgraph) { continue; } - if (dst->op == GGML_OP_NONE || dst->op == GGML_OP_VIEW) { + if (dst->op == GGML_OP_NONE || dst->op == GGML_OP_VIEW || dst->op == GGML_OP_PERMUTE) { // TODO: remove GGML_OP_VIEW after view op is supported continue; } - QNN_LOG_DEBUG("[%s]create op: %s\n", get_backend_name(_device), get_qnn_op_name(dst)); +#ifndef NDEBUG + { + std::string op_desc; + get_qnn_op_desc(dst, true, GGML_TYPE_COUNT, op_desc); + QNN_LOG_DEBUG("[%s]create op(%s) with qnn op(%s)\n", get_backend_name(_device), op_desc.c_str(), + get_qnn_op_name(dst)); + } +#endif auto operation = create_operation_from_op_tensor(dst, dst->name, rank, _device, _graph_handle, - _qnn_instance, true, tensor_cache); // TODO: fix op name + _qnn_instance, tensor_cache); // TODO: fix op name operations.push_back(operation); } @@ -300,59 +462,81 @@ bool qnn_graph::build_graph_from_ggml_graph(const ggml_cgraph * cgraph) { return true; } -bool qnn_graph::execute(const ggml_cgraph * cgraph) { +bool qnn_graph::execute(const ggml_cgraph * cgraph, std::shared_ptr convert_context) { ggml_tensor_array_t inputs; ggml_tensor_array_t outputs; + { + QNN_SCOPED_PERFORMANCE_TRACKER("[%s][%s]get_io_tensors_from_graph", get_backend_name(_device), + _graph_name.c_str()); #ifdef NDEBUG - get_io_tensors_from_graph(cgraph, inputs, outputs); + get_io_tensors_from_graph(cgraph, inputs, outputs); #else - int rank = get_io_tensors_from_graph(cgraph, inputs, outputs); - QNN_LOG_DEBUG("[%s]rank: %d, input_set: %d, output_set: %d\n", get_backend_name(_device), rank, int(inputs.size()), - int(outputs.size())); + int rank = get_io_tensors_from_graph(cgraph, inputs, outputs); + QNN_LOG_DEBUG("[%s]rank: %d, input_set: %d, output_set: %d\n", get_backend_name(_device), rank, + int(inputs.size()), int(outputs.size())); #endif + } { - if (!qnn::bind_tensors(inputs, _tensor_inputs, _qnn_tensor_inputs)) { - QNN_LOG_ERROR("[%s][%s]bind input tensors failed\n", get_backend_name(_device), _graph_name.c_str()); - return false; + QNN_SCOPED_PERFORMANCE_TRACKER("[%s][%s]bind_tensors", get_backend_name(_device), _graph_name.c_str()); + auto override_data_type = get_override_data_type(inputs, outputs); + if (override_data_type != GGML_TYPE_COUNT) { + QNN_LOG_DEBUG("[%s][%s]override_data_type: %s\n", get_backend_name(_device), _graph_name.c_str(), + ggml_type_name(override_data_type)); + auto buffers = convert(convert_context, inputs, override_data_type); + if (!qnn::bind_tensors_with_custom_buffers(inputs, buffers, _tensor_inputs, _qnn_tensor_inputs)) { + QNN_LOG_ERROR("[%s][%s]bind input tensors failed\n", get_backend_name(_device), _graph_name.c_str()); + return false; + } + } else { + if (!qnn::bind_tensors(inputs, _tensor_inputs, _qnn_tensor_inputs)) { + QNN_LOG_ERROR("[%s][%s]bind input tensors failed\n", get_backend_name(_device), _graph_name.c_str()); + return false; + } } if (!qnn::bind_tensors(outputs, _tensor_outputs, _qnn_tensor_outputs)) { QNN_LOG_ERROR("[%s][%s]bind output tensors failed\n", get_backend_name(_device), _graph_name.c_str()); return false; } + } + { + QNN_SCOPED_PERFORMANCE_TRACKER("[%s][%s]execute", get_backend_name(_device), _graph_name.c_str()); auto & qnn_tensor_inputs = _qnn_tensor_inputs; auto & qnn_tensor_outputs = _qnn_tensor_outputs; - auto error = - _qnn_interface->qnn_graph_execute(_graph_handle, 
qnn_tensor_inputs.data(), qnn_tensor_inputs.size(), - qnn_tensor_outputs.data(), qnn_tensor_outputs.size(), nullptr, nullptr); + auto error = _qnn_interface->qnn_graph_execute(_graph_handle, qnn_tensor_inputs.data(), + qnn_tensor_inputs.size(), qnn_tensor_outputs.data(), + qnn_tensor_outputs.size(), GRAPH_PROFILE_HANDLE, nullptr); unbind_tensors(_tensor_inputs); unbind_tensors(_tensor_outputs); - if (error != QNN_SUCCESS) { if (_device == QNN_BACKEND_NPU && error == QNN_COMMON_ERROR_SYSTEM_COMMUNICATION) { - QNN_LOG_WARN("[%s][%s]NPU crashed. SSR detected. Caused QNN graph execute error.\n", + QNN_LOG_WARN("[%s][%s][execute]NPU crashed. SSR detected. Caused QNN graph execute error.\n", get_backend_name(_device), _graph_name.c_str()); } else { - QNN_LOG_ERROR("[%s][%s]error: %s\n", get_backend_name(_device), _graph_name.c_str(), + QNN_LOG_ERROR("[%s][%s][execute]error: %s\n", get_backend_name(_device), _graph_name.c_str(), get_qnn_error_string(error)); } return false; } QNN_LOG_DEBUG("[%s][%s]execute succeed\n", get_backend_name(_device), _graph_name.c_str()); - return true; } + + GRAPH_PROFILE_PRINT(); + return true; } bool qnn_graph::finalize() { + QNN_SCOPED_PERFORMANCE_TRACKER("[%s][%s]finalize", get_backend_name(_device), _graph_name.c_str()); + if (!qnn::add_op_to_graph(_graph_handle, _operations)) { QNN_LOG_ERROR("[%s]add nodes failed\n", _graph_name.c_str()); return false; } - auto error = _qnn_interface->qnn_graph_finalize(_graph_handle, nullptr, nullptr); + auto error = _qnn_interface->qnn_graph_finalize(_graph_handle, GRAPH_PROFILE_HANDLE, nullptr); if (error != QNN_SUCCESS) { QNN_LOG_ERROR("[%s][%s]qnn_graph_finalize.error: %s\n", get_backend_name(_device), _graph_name.c_str(), get_qnn_error_string(error)); diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index dc1ed0b3f8896..a913b8bba38b0 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -5,8 +5,10 @@ #include #include +#include "convert.hpp" #include "ggml-qnn.h" #include "op-config.hpp" +#include "profiler.hpp" #include "qnn-lib.hpp" namespace qnn { @@ -21,19 +23,42 @@ namespace qnn { */ class qnn_graph { public: - explicit qnn_graph(const std::string & graph_name, QNNBackend device, std::shared_ptr qnn_instance, - size_t vtcm_size_in_mb); + enum htp_precision { + kHtpDefault = 0, + kHtpFp16, + }; + + /** + * @brief Generates a unique key for a given computation graph (cgraph). + * + * This key is used to cache the graph, enabling efficient reuse of previously + * compiled graphs. The key is constructed by concatenating the descriptions + * of the operations and their associated tensor dimensions within the graph. + * + * Example key format: "MUL_MATf32_2048x8192q4_K_2048x2f32#MUL(SILU,MUL_MAT)#MUL_MAT(NONE,MUL)#ADD(MUL_MAT,ADD)f32_2048x2f32" + * + * @param cgraph The computation graph for which the key is generated. + * @param output The string where the generated key will be stored. + * @return The max ggml_type of all tensors in the graph. + * + * TODO: Improve the key generation logic to handle more complex graph structures and edge cases. 
+ */ + static ggml_type get_graph_key_from_cgraph(const ggml_cgraph * cgraph, std::string & output); + + explicit qnn_graph(const std::string & graph_name, QNNBackend device, qnn_instance_ptr qnn_instance, + htp_precision precision, size_t vtcm_size_in_mb); + ~qnn_graph(); bool build_graph_from_ggml_graph(const ggml_cgraph * cgraph); - bool execute(const ggml_cgraph * cgraph); + bool execute(const ggml_cgraph * cgraph, std::shared_ptr convert_context); bool is_valid() const { return _graph_handle != nullptr; } Qnn_GraphHandle_t get_graph_handler() const { return _graph_handle; } - std::shared_ptr get_qnn_instance() { return _qnn_instance; } + qnn_instance_ptr get_qnn_instance() { return _qnn_instance; } const std::string & get_name() const { return _graph_name; } @@ -42,18 +67,23 @@ class qnn_graph { private: bool finalize(); - const std::string _graph_name; - const QNNBackend _device; - Qnn_GraphHandle_t _graph_handle = nullptr; - std::shared_ptr _qnn_instance; - std::shared_ptr _qnn_interface; - qnn_op_config_array_t _operations; + const std::string _graph_name; + const QNNBackend _device; + Qnn_GraphHandle_t _graph_handle = nullptr; + qnn_instance_ptr _qnn_instance; + qnn_interface_ptr _qnn_interface; + qnn_op_config_array_t _operations; qnn_tensor_array_t _tensor_inputs; qnn_tensor_array_t _tensor_outputs; std::vector _qnn_tensor_inputs; std::vector _qnn_tensor_outputs; +#ifdef GGML_QNN_ENABLE_PERFORMANCE_TRACKING + // profiler + qnn_event_tracer_ptr _event_tracer; +#endif + DISABLE_COPY(qnn_graph); DISABLE_MOVE(qnn_graph); }; diff --git a/ggml/src/ggml-qnn/logger.hpp b/ggml/src/ggml-qnn/logger.hpp index cf94ce22174b6..309ae3e985a28 100644 --- a/ggml/src/ggml-qnn/logger.hpp +++ b/ggml/src/ggml-qnn/logger.hpp @@ -1,10 +1,11 @@ #pragma once +#include + #include #include "ggml-impl.h" #include "ggml.h" -#include "QnnLog.h" namespace qnn { void sdk_logcallback(const char * fmt, QnnLog_Level_t level, uint64_t timestamp, va_list argp); @@ -13,4 +14,9 @@ void sdk_logcallback(const char * fmt, QnnLog_Level_t level, uint64_t timestamp, #define QNN_LOG_ERROR(...) (GGML_LOG_ERROR(__VA_ARGS__)) #define QNN_LOG_WARN(...) (GGML_LOG_WARN(__VA_ARGS__)) #define QNN_LOG_INFO(...) (GGML_LOG_INFO(__VA_ARGS__)) -#define QNN_LOG_DEBUG(...) (GGML_LOG_DEBUG(__VA_ARGS__)) + +#ifndef NDEBUG +# define QNN_LOG_DEBUG(...) (GGML_LOG_DEBUG(__VA_ARGS__)) +#else +# define QNN_LOG_DEBUG(...) +#endif diff --git a/ggml/src/ggml-qnn/op-config-base.hpp b/ggml/src/ggml-qnn/op-config-base.hpp index b24b53bf2a3b6..87ca798272058 100644 --- a/ggml/src/ggml-qnn/op-config-base.hpp +++ b/ggml/src/ggml-qnn/op-config-base.hpp @@ -70,7 +70,7 @@ class ggml_qnn_op_config { * * @return A reference to a vector of qnn_tensor_ptr_t objects representing the input tensors. */ - virtual const qnn_tensor_array_t & get_input_tensors() = 0; + virtual qnn_tensor_array_t & get_input_tensors() = 0; /** * @brief Pure virtual function to retrieve the output tensors of a QNN. @@ -81,7 +81,7 @@ class ggml_qnn_op_config { * * @return A reference to a vector of qnn_tensor_ptr_t objects representing the output tensors. */ - virtual const qnn_tensor_array_t & get_output_tensors() = 0; + virtual qnn_tensor_array_t & get_output_tensors() = 0; /** * @brief Adds an operation to the given graph. 
diff --git a/ggml/src/ggml-qnn/op-config-caps.cpp b/ggml/src/ggml-qnn/op-config-caps.cpp index 16b50503bea4c..a29ea28ad6f03 100644 --- a/ggml/src/ggml-qnn/op-config-caps.cpp +++ b/ggml/src/ggml-qnn/op-config-caps.cpp @@ -3,30 +3,77 @@ namespace { -using op_constructor_t = std::shared_ptr (*)(const ggml_tensor *, const std::string &, +using op_constructor_t = std::shared_ptr (*)(const ggml_tensor *, const std::string &, std::shared_ptr); -using op_dims_calc_func_t = void (*)(const std::vector & input_dims, - qnn::ggml_dimension_array_t & output_dims); -void element_wise_op_dims(const std::vector & input_dims, - qnn::ggml_dimension_array_t & output_dims) { - for (size_t i = 1; i < std::size(output_dims); i++) { - output_dims[i] = input_dims.front()[i]; +using op_description_generator_t = void (*)(const ggml_tensor * op, bool append_dimensions, + ggml_type override_data_type, std::string & output); + +void append_tensor_shape_and_type_impl(const ggml_tensor * tensor, ggml_type override_data_type, std::string & output) { + char buffer[256] = {}; + const auto * type_name = qnn::get_ggml_type_name(std::min(tensor->type, override_data_type)); + int len = 0; + switch (ggml_n_dims(tensor)) { + case 1: + len = snprintf(buffer, sizeof(buffer), "%ld%s", (long) tensor->ne[0], type_name); + break; + case 2: + len = snprintf(buffer, sizeof(buffer), "%ldx%ld%s", (long) tensor->ne[0], (long) tensor->ne[1], type_name); + break; + case 3: + len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ld%s", (long) tensor->ne[0], (long) tensor->ne[1], + (long) tensor->ne[2], type_name); + break; + case 4: + default: + len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ldx%ld%s", (long) tensor->ne[0], (long) tensor->ne[1], + (long) tensor->ne[2], (long) tensor->ne[3], type_name); + break; } + GGML_ASSERT(len > 0 && len < (int) sizeof(buffer)); + output.append(buffer, len); } -void mat_mul_op_dims(const std::vector & input_dims, - qnn::ggml_dimension_array_t & output_dims) { - GGML_ASSERT(input_dims.size() == 2); - output_dims[0] = input_dims.front()[1]; - output_dims[1] = input_dims.back()[1]; +void get_graph_key_from_op(const ggml_tensor * op, ggml_type override_data_type, std::string & output) { + output += ggml_op_desc(op); + output += qnn::get_ggml_type_name(op->type); + for (size_t i = 0; i < GGML_MAX_SRC && op->src[i]; ++i) { + auto * src = op->src[i]; + if (!src) { + break; + } + + output += '_'; + append_tensor_shape_and_type_impl(src, override_data_type, output); + } +} + +void get_op_key_with_src_op_desc(const ggml_tensor * op, std::string & output) { + output += ggml_op_desc(op); + output += '('; + if (op->src[0]) { + output += ggml_op_desc(op->src[0]); + } + for (size_t i = 1; i < GGML_MAX_SRC && op->src[i]; ++i) { + output += ','; + output += ggml_op_desc(op->src[i]); + } + output += ')'; +} + +void generic_get_op_desc(const ggml_tensor * op, bool append_dimensions, ggml_type override_data_type, + std::string & output) { + if (append_dimensions) { + get_graph_key_from_op(op, override_data_type, output); + } else { + get_op_key_with_src_op_desc(op, output); + } } struct qnn_op_caps_t { - const char * qnn_op_name = nullptr; - const size_t input_param_count = 0; - op_dims_calc_func_t calc_dims_func = nullptr; - const char * qnn_param_name = nullptr; + const char * qnn_op_name = nullptr; + op_description_generator_t get_desc = nullptr; + const char * qnn_param_name = nullptr; }; constexpr const qnn_op_caps_t kOpCaps[] = { @@ -35,41 +82,29 @@ constexpr const qnn_op_caps_t kOpCaps[] = { { // GGML_OP_ADD 
QNN_OP_ELEMENT_WISE_ADD, // qnn_op_name - 2, // input_param_count - element_wise_op_dims, // calc_dims_func }, {}, // GGML_OP_ADD1 {}, // GGML_OP_ACC { // GGML_OP_SUB QNN_OP_ELEMENT_WISE_SUBTRACT, // qnn_op_name - 2, // input_param_count - element_wise_op_dims, // calc_dims_func }, { // GGML_OP_MUL QNN_OP_ELEMENT_WISE_MULTIPLY, // qnn_op_name - 2, // input_param_count - element_wise_op_dims, // calc_dims_func }, { // GGML_OP_DIV QNN_OP_ELEMENT_WISE_DIVIDE, // qnn_op_name - 2, // input_param_count - element_wise_op_dims, // calc_dims_func }, {}, // GGML_OP_SQR { // GGML_OP_SQRT QNN_OP_ELEMENT_WISE_SQUARE_ROOT, // qnn_op_name - 1, // input_param_count - element_wise_op_dims, // calc_dims_func }, { // GGML_OP_LOG QNN_OP_ELEMENT_WISE_LOG, // qnn_op_name - 1, // input_param_count - element_wise_op_dims, // calc_dims_func }, {}, // GGML_OP_SIN {}, // GGML_OP_COS @@ -86,17 +121,14 @@ constexpr const qnn_op_caps_t kOpCaps[] = { { // GGML_OP_RMS_NORM QNN_OP_RMS_NORM, // qnn_op_name - 1, // input_param_count - nullptr, // TODO: calc_dims_func + generic_get_op_desc, // get_desc QNN_OP_RMS_NORM_PARAM_EPSILON, // qnn_param_name }, {}, // GGML_OP_RMS_NORM_BACK {}, // GGML_OP_GROUP_NORM { // GGML_OP_MUL_MAT - QNN_OP_MAT_MUL, // qnn_op_name - 2, // input_param_count - mat_mul_op_dims, // calc_dims_func + QNN_OP_MAT_MUL, // qnn_op_name }, {}, // GGML_OP_MUL_MAT_ID {}, // GGML_OP_OUT_PROD @@ -107,8 +139,6 @@ constexpr const qnn_op_caps_t kOpCaps[] = { { // GGML_OP_RESHAPE QNN_OP_RESHAPE, // qnn_op_name - 1, // input_param_count - nullptr, // TODO: calc_dims_func }, {}, // GGML_OP_VIEW {}, // GGML_OP_PERMUTE @@ -179,8 +209,6 @@ constexpr const qnn_op_caps_t kOpCaps[] = { { // GGML_UNARY_OP_GELU QNN_OP_GELU, // qnn_op_name - 1, // input_param_count - nullptr, // TODO: calc_dims_func }, {}, // GGML_UNARY_OP_GELU_QUICK {}, // GGML_UNARY_OP_SILU @@ -189,15 +217,11 @@ constexpr const qnn_op_caps_t kOpCaps[] = { {}, // GGML_UNARY_OP_EXP }; -static_assert(kOpCaps[GGML_OP_NONE].calc_dims_func == nullptr, "GGML_OP_NONE should not have calc_dims_func function"); -static_assert(kOpCaps[GGML_OP_ADD].calc_dims_func == element_wise_op_dims, - "GGML_OP_ADD does not have element_wise_op_dims function"); -static_assert(kOpCaps[GGML_OP_MUL_MAT].calc_dims_func == mat_mul_op_dims, - "GGML_OP_ADD does not have element_wise_op_dims function"); -static_assert(kOpCaps[GGML_OP_LOG].calc_dims_func == element_wise_op_dims, - "GGML_OP_LOG does not have element_wise_op_dims function"); -static_assert(kOpCaps[GGML_OP_COUNT + GGML_UNARY_OP_GELU].input_param_count == 1, - "GGML_UNARY_OP_GELU does not have 1 input parameter"); +static_assert(kOpCaps[GGML_OP_NONE].get_desc == nullptr, "GGML_OP_NONE should not have get_desc function"); +static_assert(kOpCaps[GGML_OP_ADD].qnn_op_name, "GGML_OP_ADD does not have qnn_op_name in the kOpCaps table"); +static_assert(kOpCaps[GGML_OP_MUL_MAT].qnn_op_name, "GGML_OP_MUL_MAT does not have qnn_op_name in the kOpCaps table"); +static_assert(kOpCaps[GGML_OP_MUL].qnn_op_name, "GGML_OP_MUL does not have qnn_op_name in the kOpCaps table"); +static_assert(kOpCaps[GGML_OP_LOG].qnn_op_name, "GGML_OP_LOG does not have qnn_op_name in the kOpCaps table"); static_assert(std::size(kOpCaps) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), "GGML_OP_COUNT does not match the size of the kOpCaps table"); @@ -368,6 +392,10 @@ static_assert(std::size(kOpConstructors) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT namespace qnn { +void append_tensor_shape_and_type(const ggml_tensor * tensor, std::string & output) { + 
append_tensor_shape_and_type_impl(tensor, GGML_TYPE_COUNT, output); +} + size_t get_qnn_op_index(const ggml_tensor * tensor) { if (tensor->op == GGML_OP_UNARY) { return kGgmlUnaryOpStart + ggml_get_unary_op(tensor); @@ -383,14 +411,20 @@ const char * get_qnn_op_name(const ggml_tensor * op) { return kOpCaps[op_index].qnn_op_name; } -size_t get_qnn_op_input_param_count(const ggml_tensor * op) { +void get_qnn_op_desc(const ggml_tensor * op, bool append_dimensions, ggml_type override_data_type, + std::string & output) { auto op_index = get_qnn_op_index(op); GGML_ASSERT(op_index < std::size(kOpCaps)); - return kOpCaps[op_index].input_param_count; + auto get_desc = kOpCaps[op_index].get_desc; + if (get_desc) { + get_desc(op, append_dimensions, override_data_type, output); + } else { + generic_get_op_desc(op, append_dimensions, override_data_type, output); + } } std::shared_ptr create_op(const ggml_tensor * op, const std::string & name, - std::shared_ptr qnn_instance) { + qnn_instance_ptr qnn_instance) { auto op_index = get_qnn_op_index(op); GGML_ASSERT(op_index < std::size(kOpCaps)); auto op_constructor = kOpConstructors[op_index]; diff --git a/ggml/src/ggml-qnn/op-config-impl.cpp b/ggml/src/ggml-qnn/op-config-impl.cpp index 14638a554e066..b85f14504573a 100644 --- a/ggml/src/ggml-qnn/op-config-impl.cpp +++ b/ggml/src/ggml-qnn/op-config-impl.cpp @@ -84,12 +84,12 @@ void ggml_qnn_op_config_base::set_input_tensors(qnn::qnn_tensor_array_t & tensor } void ggml_qnn_op_config_base::set_input_tensors(qnn::qnn_tensor_array_t && tensor_inputs) { - _tensor_inputs = tensor_inputs; + _tensor_inputs = std::move(tensor_inputs); _qnn_tensor_inputs.resize(_tensor_inputs.size()); } void ggml_qnn_op_config_base::set_output_tensors(qnn::qnn_tensor_array_t & tensor_outputs) { - _tensor_outputs = std::move(tensor_outputs); + _tensor_outputs = tensor_outputs; _qnn_tensor_outputs.resize(_tensor_outputs.size()); } @@ -99,10 +99,11 @@ void ggml_qnn_op_config_base::set_output_tensors(qnn::qnn_tensor_array_t && tens } bool ggml_qnn_op_config_base::add_op_to_graph(Qnn_GraphHandle_t graph_handle) { + QNN_LOG_DEBUG("[%s]add to graph start\n", _name.c_str()); + GGML_ASSERT(_qnn_tensor_inputs.size() == _tensor_inputs.size()); GGML_ASSERT(_qnn_tensor_outputs.size() == _tensor_outputs.size()); - QNN_LOG_DEBUG("[%s]add to graph start\n", _name.c_str()); for (size_t i = 0; i < _tensor_inputs.size(); i++) { auto tensor = _tensor_inputs[i]; if (!tensor->alloc_qnn_tensor_id()) { @@ -110,7 +111,8 @@ bool ggml_qnn_op_config_base::add_op_to_graph(Qnn_GraphHandle_t graph_handle) { return false; } - QNN_LOG_DEBUG("[%s]input tensor id: %d\n", _name.c_str(), tensor->get_qnn_tensor_id()); + QNN_LOG_DEBUG("[%s]input tensor(%s), id(%d)\n", _name.c_str(), tensor->get_tensor_name().c_str(), + tensor->get_qnn_tensor_id()); _qnn_tensor_inputs[i] = tensor->get_qnn_tensor(); } @@ -121,7 +123,8 @@ bool ggml_qnn_op_config_base::add_op_to_graph(Qnn_GraphHandle_t graph_handle) { return false; } - QNN_LOG_DEBUG("[%s]output tensor id: %d\n", _name.c_str(), tensor->get_qnn_tensor_id()); + QNN_LOG_DEBUG("[%s]output tensor(%s), id(%d)\n", _name.c_str(), tensor->get_tensor_name().c_str(), + tensor->get_qnn_tensor_id()); _qnn_tensor_outputs[i] = tensor->get_qnn_tensor(); } @@ -222,18 +225,30 @@ bool ggml_qnn_matmul_op_config::initialize_op_nodes(QNNBackend device, Qnn_Graph GGML_ASSERT(_tensor_outputs.size() == 1); // create convert nodes - const auto tensor_rank = _tensor_inputs.front()->get_rank(); - qnn_tensor_array_t mat_mul_tensor_inputs = _tensor_inputs; 
- qnn_tensor_array_t mat_mul_tensor_outputs = _tensor_outputs; - if (!create_convert_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs, mat_mul_tensor_outputs)) { - QNN_LOG_ERROR("create convert nodes failed\n"); - return false; - } + const auto tensor_rank = _tensor_inputs.front()->get_rank(); + qnn_tensor_array_t mat_mul_tensor_inputs = _tensor_inputs; + auto tensor_type = create_input_convert_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs); mat_mul_tensor_inputs.front() = create_gather_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs.front(), mat_mul_tensor_inputs.back()->get_dimensions()); - return create_mat_mul_nodes(mat_mul_tensor_inputs, mat_mul_tensor_outputs); + + if (device != QNN_BACKEND_GPU && _tensor_outputs.front()->get_data_type() != tensor_type) { + auto convert_out = create_output_convert_nodes(device, graph_handle, tensor_rank, tensor_type, _tensor_outputs); + if (!create_mat_mul_nodes(mat_mul_tensor_inputs, convert_out->get_input_tensors())) { + QNN_LOG_ERROR("create mat_mul nodes failed\n"); + return false; + } + + _operations.push_back(convert_out); + } else { + if (!create_mat_mul_nodes(mat_mul_tensor_inputs, _tensor_outputs)) { + QNN_LOG_ERROR("create mat_mul nodes failed\n"); + return false; + } + } + + return true; } qnn_tensor_ptr_t ggml_qnn_matmul_op_config::create_gather_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, @@ -256,7 +271,7 @@ qnn_tensor_ptr_t ggml_qnn_matmul_op_config::create_gather_nodes(QNNBackend devic constexpr const auto create_node = [](const std::string & name, const int rank, const int axis, const qnn_dimension_array_t & dimensions, qnn_tensor_ptr_t tensor_input, QNNBackend device, Qnn_GraphHandle_t graph_handle, - std::shared_ptr qnn_instance, qnn_tensor_ptr_t & tensor_output) -> qnn_op_config_ptr_t { + qnn_instance_ptr qnn_instance, qnn_tensor_ptr_t & tensor_output) -> qnn_op_config_ptr_t { auto gather_out = std::make_shared(ggml_qnn_tensor::INTERMEDIATE, name + "_out", dimensions, tensor_input->get_data_type(), rank, device, graph_handle, qnn_instance); @@ -303,18 +318,16 @@ qnn_tensor_ptr_t ggml_qnn_matmul_op_config::create_gather_nodes(QNNBackend devic return gather1_out; } -bool ggml_qnn_matmul_op_config::create_convert_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, - qnn_tensor_array_t & tensor_inputs, - qnn_tensor_array_t & tensor_outputs) { +Qnn_DataType_t ggml_qnn_matmul_op_config::create_input_convert_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, + const int rank, + qnn_tensor_array_t & tensor_inputs) { if (device == QNN_BACKEND_GPU) { // there's no convert op for GPU, so we should create matmul nodes directly. 
- return true; + return QNN_DATATYPE_UNDEFINED; } // create tensors for convert node auto tensor_type = get_tensor_type(tensor_inputs); - QNN_LOG_DEBUG("input tensor type: %s\n", qnn_datatype_to_string(tensor_type)); - for (size_t i = 0; i < tensor_inputs.size(); ++i) { // create input convert nodes auto convert_in = tensor_inputs[i]; @@ -327,29 +340,35 @@ bool ggml_qnn_matmul_op_config::create_convert_nodes(QNNBackend device, Qnn_Grap convert_in->get_dimensions(), tensor_type, rank, device, graph_handle, _qnn_instance); auto convert = std::make_shared(convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_CONVERT, _qnn_instance); + QNN_OP_CAST, _qnn_instance); + QNN_LOG_DEBUG("[%s][MUL_MAT]create: %s, type: %s\n", get_backend_name(device), convert_name.c_str(), + qnn_datatype_to_string(tensor_type)); convert->set_input_tensors({ convert_in }); convert->set_output_tensors({ convert_out }); tensor_inputs[i] = convert_out; _operations.push_back(convert); } - if (tensor_outputs.front()->get_data_type() != tensor_type) { - // create output convert node - std::string convert_name("convert_dst"); - auto convert_out = tensor_outputs.front(); - auto convert_in = std::make_shared(ggml_qnn_tensor::INTERMEDIATE, convert_name + "_in", - convert_out->get_dimensions(), tensor_type, rank, device, - graph_handle, _qnn_instance); - auto output_convert = std::make_shared(convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_CONVERT, _qnn_instance); - output_convert->set_input_tensors({ convert_in }); - output_convert->set_output_tensors({ convert_out }); - tensor_outputs.front() = convert_in; - _operations.push_back(output_convert); - } + return tensor_type; +} - return true; +qnn_op_config_ptr_t ggml_qnn_matmul_op_config::create_output_convert_nodes(QNNBackend device, + Qnn_GraphHandle_t graph_handle, + const int rank, Qnn_DataType_t tensor_type, + qnn_tensor_array_t & tensor_outputs) { + GGML_ASSERT(tensor_outputs.size() == 1); + // create output convert node + std::string convert_name("convert_dst"); + auto convert_in = std::make_shared(ggml_qnn_tensor::INTERMEDIATE, convert_name + "_in", + tensor_outputs.front()->get_dimensions(), tensor_type, rank, + device, graph_handle, _qnn_instance); + auto output_convert = std::make_shared(convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_CAST, _qnn_instance); + QNN_LOG_DEBUG("[%s][MUL_MAT]create: %s, type: %s\n", get_backend_name(device), convert_name.c_str(), + qnn_datatype_to_string(tensor_type)); + output_convert->set_input_tensors({ convert_in }); + output_convert->set_output_tensors(tensor_outputs); + return output_convert; } bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(qnn_tensor_array_t & tensor_inputs, @@ -413,8 +432,7 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(qnn_tensor_array_t & tensor mat_mul->add_scalar_param(QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN1, scalar); // set tensor to mat_mul - std::swap(tensor_inputs[0], tensor_inputs[1]); - mat_mul->set_input_tensors(tensor_inputs); + mat_mul->set_input_tensors({ tensor_inputs[1], tensor_inputs[0] }); mat_mul->set_output_tensors(tensor_outputs); _operations.push_back(mat_mul); diff --git a/ggml/src/ggml-qnn/op-config-impl.hpp b/ggml/src/ggml-qnn/op-config-impl.hpp index 8e2f107b2dae1..558b5cafbe4cb 100644 --- a/ggml/src/ggml-qnn/op-config-impl.hpp +++ b/ggml/src/ggml-qnn/op-config-impl.hpp @@ -15,7 +15,7 @@ namespace qnn { class ggml_qnn_op_config_base : public ggml_qnn_op_config { public: explicit ggml_qnn_op_config_base(const std::string & name, const std::string & package_name, - const 
std::string & op_type, std::shared_ptr qnn_instance) : + const std::string & op_type, qnn_instance_ptr qnn_instance) : _name(name), _package_name(package_name), _op_type(op_type), @@ -36,24 +36,24 @@ class ggml_qnn_op_config_base : public ggml_qnn_op_config { void unbind_input_tensors() override; void unbind_output_tensors() override; - const qnn_tensor_array_t & get_input_tensors() override { return _tensor_inputs; } + qnn_tensor_array_t & get_input_tensors() override { return _tensor_inputs; } - const qnn_tensor_array_t & get_output_tensors() override { return _tensor_outputs; } + qnn_tensor_array_t & get_output_tensors() override { return _tensor_outputs; } protected: Qnn_OpConfig_t get_op_config(); - std::string _name; - std::string _package_name; - std::string _op_type; - std::shared_ptr _qnn_instance; - qnn_tensor_array_t _tensor_inputs; - qnn_tensor_array_t _tensor_outputs; - qnn_tensor_array_t _tensor_parameters; - std::vector _qnn_tensor_inputs; - std::vector _qnn_tensor_outputs; - std::vector _qnn_parameters; - std::vector _param_names; + std::string _name; + std::string _package_name; + std::string _op_type; + qnn_instance_ptr _qnn_instance; + qnn_tensor_array_t _tensor_inputs; + qnn_tensor_array_t _tensor_outputs; + qnn_tensor_array_t _tensor_parameters; + std::vector _qnn_tensor_inputs; + std::vector _qnn_tensor_outputs; + std::vector _qnn_parameters; + std::vector _param_names; DISABLE_COPY(ggml_qnn_op_config_base); DISABLE_MOVE(ggml_qnn_op_config_base); @@ -62,7 +62,7 @@ class ggml_qnn_op_config_base : public ggml_qnn_op_config { class ggml_qnn_single_op_config : public ggml_qnn_op_config_base { public: explicit ggml_qnn_single_op_config(const std::string & name, const std::string & package_name, - const std::string & op_type, std::shared_ptr qnn_instance) : + const std::string & op_type, qnn_instance_ptr qnn_instance) : ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {} bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) override; @@ -75,7 +75,7 @@ class ggml_qnn_single_op_config : public ggml_qnn_op_config_base { class ggml_qnn_rmsnorm_op_config : public ggml_qnn_op_config_base { public: explicit ggml_qnn_rmsnorm_op_config(const std::string & name, const std::string & package_name, - const std::string & op_type, std::shared_ptr qnn_instance) : + const std::string & op_type, qnn_instance_ptr qnn_instance) : ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {} bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) override; @@ -87,7 +87,7 @@ class ggml_qnn_rmsnorm_op_config : public ggml_qnn_op_config_base { class ggml_qnn_aggregate_op_config : public ggml_qnn_op_config { public: - explicit ggml_qnn_aggregate_op_config(const std::string & name, std::shared_ptr qnn_instance) : + explicit ggml_qnn_aggregate_op_config(const std::string & name, qnn_instance_ptr qnn_instance) : _name(name), _qnn_instance(qnn_instance) {} @@ -121,13 +121,13 @@ class ggml_qnn_aggregate_op_config : public ggml_qnn_op_config { } } - const qnn_tensor_array_t & get_input_tensors() override { return _tensor_inputs; } + qnn_tensor_array_t & get_input_tensors() override { return _tensor_inputs; } - const qnn_tensor_array_t & get_output_tensors() override { return _tensor_outputs; } + qnn_tensor_array_t & get_output_tensors() override { return _tensor_outputs; } protected: - std::string _name; - std::shared_ptr _qnn_instance; + std::string _name; + qnn_instance_ptr _qnn_instance; std::vector _operations; qnn_tensor_array_t 
_tensor_inputs; @@ -140,17 +140,19 @@ class ggml_qnn_aggregate_op_config : public ggml_qnn_op_config { class ggml_qnn_matmul_op_config : public ggml_qnn_aggregate_op_config { public: - ggml_qnn_matmul_op_config(const std::string & name, std::shared_ptr qnn_instance) : + ggml_qnn_matmul_op_config(const std::string & name, qnn_instance_ptr qnn_instance) : ggml_qnn_aggregate_op_config(name, qnn_instance) {} bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) override; private: - qnn_tensor_ptr_t create_gather_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, - qnn_tensor_ptr_t tensor_input, qnn_dimension_array_t output_dimensions); - bool create_convert_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, - qnn_tensor_array_t & tensor_inputs, qnn_tensor_array_t & tensor_outputs); - bool create_mat_mul_nodes(qnn_tensor_array_t & tensor_inputs, qnn_tensor_array_t & tensor_outputs); + qnn_tensor_ptr_t create_gather_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, + qnn_tensor_ptr_t tensor_input, qnn_dimension_array_t output_dimensions); + Qnn_DataType_t create_input_convert_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, + qnn_tensor_array_t & tensor_inputs); + qnn_op_config_ptr_t create_output_convert_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, + Qnn_DataType_t tensor_type, qnn_tensor_array_t & tensor_outputs); + bool create_mat_mul_nodes(qnn_tensor_array_t & tensor_inputs, qnn_tensor_array_t & tensor_outputs); DISABLE_COPY(ggml_qnn_matmul_op_config); DISABLE_MOVE(ggml_qnn_matmul_op_config); diff --git a/ggml/src/ggml-qnn/op-config.hpp b/ggml/src/ggml-qnn/op-config.hpp index d613a2116c04a..635a831a06c20 100644 --- a/ggml/src/ggml-qnn/op-config.hpp +++ b/ggml/src/ggml-qnn/op-config.hpp @@ -14,11 +14,16 @@ namespace qnn { constexpr const size_t kGgmlUnaryOpStart = GGML_OP_COUNT; -size_t get_qnn_op_index(const ggml_tensor * tensor); -const char * get_qnn_op_name(const ggml_tensor * op); -size_t get_qnn_op_input_param_count(const ggml_tensor * op); +// TODO: move to a better place +void append_tensor_shape_and_type(const ggml_tensor * tensor, std::string & output); + +size_t get_qnn_op_index(const ggml_tensor * tensor); +const char * get_qnn_op_name(const ggml_tensor * op); +void get_qnn_op_desc(const ggml_tensor * op, bool append_dimensions, ggml_type override_data_type, + std::string & output); + std::shared_ptr create_op(const ggml_tensor * op, const std::string & name, - std::shared_ptr qnn_instance); + qnn_instance_ptr qnn_instance); inline bool add_op_to_graph(Qnn_GraphHandle_t graph_handle, std::vector & operations) { for (auto & op : operations) { diff --git a/ggml/src/ggml-qnn/profiler.cpp b/ggml/src/ggml-qnn/profiler.cpp new file mode 100644 index 0000000000000..5625c3acf7ebb --- /dev/null +++ b/ggml/src/ggml-qnn/profiler.cpp @@ -0,0 +1,170 @@ + +#include "profiler.hpp" + +#include +#include + +#include "logger.hpp" +#include "qnn-lib.hpp" + +namespace { + +std::string get_duration_string(const QnnProfile_EventData_t & event_data) { + char time_str[128] = {}; + switch (event_data.unit) { + case QNN_PROFILE_EVENTUNIT_CYCLES: + snprintf(time_str, sizeof(time_str), "cycles: %lld", (long long int) event_data.value); + break; + case QNN_PROFILE_EVENTUNIT_COUNT: + snprintf(time_str, sizeof(time_str), "count: %lld", (long long int) event_data.value); + break; + case QNN_PROFILE_EVENTUNIT_BYTES: + snprintf(time_str, sizeof(time_str), "size: %lld bytes", 
(long long int) event_data.value); + break; + case QNN_PROFILE_EVENTUNIT_MICROSEC: + { + double duration_ms = event_data.value / 1000.0; + snprintf(time_str, sizeof(time_str), "duration: %.3f ms", duration_ms); + } + break; + default: + break; + } + + return time_str; +} + +} // namespace + +namespace qnn { + +qnn_event_tracer::qnn_event_tracer(const std::string & prefix, std::shared_ptr interface, + Qnn_BackendHandle_t backend_handle, sdk_profile_level level) : + _interface(interface), + _prefix(prefix) { + QnnProfile_Level_t qnn_profile_level = 0; + switch (level) { + case sdk_profile_level::PROFILE_BASIC: + qnn_profile_level = QNN_PROFILE_LEVEL_BASIC; + break; + case sdk_profile_level::PROFILE_OP_TRACE: + case sdk_profile_level::PROFILE_DETAIL: + qnn_profile_level = QNN_PROFILE_LEVEL_DETAILED; + break; + case sdk_profile_level::PROFILE_OFF: + default: + QNN_LOG_WARN("[profiler][%s]invalid profile level %d, using PROFILE_OFF\n", _prefix.c_str(), level); + return; + } + + auto error = _interface->qnn_profile_create(backend_handle, qnn_profile_level, &_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_ERROR("[profiler][%s]failed to create QNN profile_handle. Backend ID %u, error %ld\n", _prefix.c_str(), + _interface->get_backend_id(), (long) QNN_GET_ERROR_CODE(error)); + _handle = nullptr; + return; + } + + if (level == sdk_profile_level::PROFILE_OP_TRACE) { + QnnProfile_Config_t qnn_profile_config = QNN_PROFILE_CONFIG_INIT; + qnn_profile_config.option = QNN_PROFILE_CONFIG_OPTION_ENABLE_OPTRACE; + std::array profile_configs = { &qnn_profile_config, nullptr }; + error = _interface->qnn_profile_set_config(_handle, profile_configs.data()); + if (error != QNN_SUCCESS) { + QNN_LOG_ERROR("[profiler][%s]failed to set QNN profile event. Backend ID %u, error %ld\n", _prefix.c_str(), + _interface->get_backend_id(), (long) QNN_GET_ERROR_CODE(error)); + _interface->qnn_profile_free(_handle); + _handle = nullptr; + return; + } + } + + QNN_LOG_DEBUG("[profiler][%s]created, Backend ID %u, level %d\n", _prefix.c_str(), _interface->get_backend_id(), + level); +} + +qnn_event_tracer::~qnn_event_tracer() { + if (_handle) { + Qnn_ErrorHandle_t error = _interface->qnn_profile_free(_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_ERROR("[profiler][%s]failed to free QNN profile_handle. Backend ID %u, error %ld\n", + _prefix.c_str(), _interface->get_backend_id(), (long) QNN_GET_ERROR_CODE(error)); + } + _handle = nullptr; + } +} + +void qnn_event_tracer::print_profile_events() { + const QnnProfile_EventId_t * events_ptr = nullptr; + uint32_t num_events = 0; + auto error = _interface->qnn_profile_get_events(_handle, &events_ptr, &num_events); + if (error != QNN_SUCCESS) { + QNN_LOG_ERROR("[profiler][%s]failed to get QNN profile events. Backend ID %u, error %ld\n", _prefix.c_str(), + _interface->get_backend_id(), (long) QNN_GET_ERROR_CODE(error)); + return; + } + + if (!num_events) { + QNN_LOG_INFO("[profiler][%s]no QNN profile events\n", _prefix.c_str()); + return; + } + + QNN_LOG_INFO("[profiler][%s]print_profile_events start ----------------\n", _prefix.c_str()); + // see also: https://github.com/pytorch/executorch/blob/0ccf5093823761cf8ad98c75e5fe81f15ea42366/backends/qualcomm/runtime/backends/QnnProfiler.cpp#L73 + QnnProfile_EventData_t event_data; + for (uint32_t i = 0; i < num_events; ++i) { + error = _interface->qnn_profile_get_event_data(events_ptr[i], &event_data); + if (error != QNN_SUCCESS) { + QNN_LOG_ERROR("[profiler][%s]failed to get QNN profile event data. 
Backend ID %u, event[%d], error: %ld\n", + _prefix.c_str(), _interface->get_backend_id(), i, (long) QNN_GET_ERROR_CODE(error)); + continue; + } + + const QnnProfile_EventId_t * sub_events_ptr = nullptr; + uint32_t num_sub_events = 0; + error = _interface->qnn_profile_get_sub_events(events_ptr[i], &sub_events_ptr, &num_sub_events); + if (error != QNN_SUCCESS) { + QNN_LOG_ERROR("[profiler][%s]failed to get QNN profile sub events. Backend ID %u, event[%d], error: %ld\n", + _prefix.c_str(), _interface->get_backend_id(), i, (long) QNN_GET_ERROR_CODE(error)); + continue; + } + + auto duration = get_duration_string(event_data); + if (!num_sub_events) { + QNN_LOG_INFO("[profiler][%s]event[%d]: %s, %s\n", _prefix.c_str(), i, event_data.identifier, + duration.c_str()); + continue; + } + + QNN_LOG_INFO("[profiler][%s]event[%d]: %s, sub_count: %d, start -------------\n", _prefix.c_str(), i, + event_data.identifier, num_sub_events); + QnnProfile_EventData_t sub_event_data; + for (std::uint32_t j = 0; j < num_sub_events; ++j) { + error = _interface->qnn_profile_get_event_data(sub_events_ptr[j], &sub_event_data); + if (error != QNN_SUCCESS) { + QNN_LOG_ERROR( + "[profiler][%s]failed to get QNN profile sub event data. Backend ID %u, event[%d], sub_event[%d], " + "error: %ld\n", + _prefix.c_str(), _interface->get_backend_id(), i, j, (long) QNN_GET_ERROR_CODE(error)); + continue; + } + + if (sub_event_data.type != QNN_PROFILE_EVENTTYPE_NODE) { + QNN_LOG_DEBUG("[profiler][%s]sub_event[%d]%s, type %d, skipping\n", _prefix.c_str(), j, + sub_event_data.identifier, sub_event_data.type); + continue; + } + + auto sub_duration = get_duration_string(sub_event_data); + QNN_LOG_INFO("[profiler][%s]sub_event[%d]: %s, %s\n", _prefix.c_str(), j, sub_event_data.identifier, + sub_duration.c_str()); + } + + QNN_LOG_INFO("[profiler][%s]event[%d]: %s, %s, end --------------\n", _prefix.c_str(), i, event_data.identifier, + duration.c_str()); + } + + QNN_LOG_INFO("[profiler][%s]print_profile_events end -----------------\n", _prefix.c_str()); +} + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/profiler.hpp b/ggml/src/ggml-qnn/profiler.hpp new file mode 100644 index 0000000000000..34db09e0bf865 --- /dev/null +++ b/ggml/src/ggml-qnn/profiler.hpp @@ -0,0 +1,100 @@ +#pragma once + +#include + +#include +#include +#include +#include + +#include "logger.hpp" +#include "qnn-types.hpp" + +namespace qnn { + +#ifdef GGML_QNN_ENABLE_PERFORMANCE_TRACKING + +class qnn_scoped_timer { + public: + qnn_scoped_timer(const std::string & log_prefix) : _log_prefix(std::move(log_prefix)) { + _begin_us = ggml_time_us(); + } + + qnn_scoped_timer(qnn_scoped_timer && other) { + _begin_us = other._begin_us; + _log_prefix = std::move(other._log_prefix); + } + + ~qnn_scoped_timer() { print(); } + + void operator=(qnn_scoped_timer && other) { + _begin_us = other._begin_us; + _log_prefix = std::move(other._log_prefix); + } + + void print() const { + auto duration = (ggml_time_us() - _begin_us) / 1000.0; + QNN_LOG_INFO("[profiler]%s, duration: %.4f ms\n", _log_prefix.c_str(), duration); + } + + + private: + int64_t _begin_us = 0LL; + std::string _log_prefix; + + qnn_scoped_timer(const qnn_scoped_timer &) = delete; + void operator=(const qnn_scoped_timer &) = delete; +}; + +inline qnn_scoped_timer make_scope_perf_timer(const char * format, ...) 
{ + va_list args; + va_start(args, format); + char buffer[4096]; + vsnprintf(buffer, sizeof(buffer), format, args); + va_end(args); + return qnn_scoped_timer(buffer); +} + +#else + +inline void make_scope_perf_timer(const char *, ...) {} + +#endif + +// forward declaration of qnn_interface +class qnn_interface; + +class qnn_event_tracer { + public: + // ref: + // https://github.com/pytorch/executorch/blob/ae3d558d5e6aa04fc52a3065399fe6a773702f52/backends/qualcomm/serialization/qc_schema.py#L53 + // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/overview.html#supported-snapdragon-devices + enum sdk_profile_level { PROFILE_OFF = 0, PROFILE_BASIC, PROFILE_DETAIL, PROFILE_OP_TRACE }; + + explicit qnn_event_tracer(const std::string & prefix, std::shared_ptr interface, + Qnn_BackendHandle_t backend_handle, sdk_profile_level level); + ~qnn_event_tracer(); + + Qnn_ProfileHandle_t get_handle() const { return _handle; } + + void print_profile_events(); + + private: + std::shared_ptr _interface; + Qnn_ProfileHandle_t _handle = nullptr; + std::string _prefix; + + DISABLE_COPY(qnn_event_tracer); + DISABLE_MOVE(qnn_event_tracer); +}; + +using qnn_event_tracer_ptr = std::shared_ptr; + +} // namespace qnn + +#ifdef GGML_QNN_ENABLE_PERFORMANCE_TRACKING +# define QNN_SCOPED_PERFORMANCE_TRACKER(fmt, ...) \ + auto __qnn_timer_##__LINE__ = qnn::make_scope_perf_timer(fmt, __VA_ARGS__) +#else +# define QNN_SCOPED_PERFORMANCE_TRACKER(fmt, ...) ((void) 0) +#endif diff --git a/ggml/src/ggml-qnn/qnn-lib.cpp b/ggml/src/ggml-qnn/qnn-lib.cpp index 3e4aa7fcd4b4b..2ec76939c9e2e 100644 --- a/ggml/src/ggml-qnn/qnn-lib.cpp +++ b/ggml/src/ggml-qnn/qnn-lib.cpp @@ -12,12 +12,51 @@ namespace { #ifdef _WIN32 constexpr const char * kQnnSystemLibName = "QnnSystem.dll"; constexpr const char * kQnnRpcLibName = "libcdsprpc.dll"; +constexpr const char * kQnnCpuLibName = "QnnCpu.dll"; +constexpr const char * kQnnGpuLibName = "QnnGpu.dll"; +constexpr const char * kQnnNpuLibName = "QnnHtp.dll"; #else constexpr const char * kQnnSystemLibName = "libQnnSystem.so"; constexpr const char * kQnnRpcLibName = "libcdsprpc.so"; - +constexpr const char * kQnnCpuLibName = "libQnnCpu.so"; +constexpr const char * kQnnGpuLibName = "libQnnGpu.so"; +constexpr const char * kQnnNpuLibName = "libQnnHtp.so"; #endif +constexpr const qnn::device_caps kDeviceCaps[] = { + { + // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/CpuOpDefSupplement.html#matmul + kQnnCpuLibName, GGML_BACKEND_DEVICE_TYPE_ACCEL, (1L << GGML_TYPE_I8) | (1L << GGML_TYPE_F32), + 0xFFFFFE, // all quantized types can be offload to CPU, at current implementation, those types will be dequantized into float32 on cpu + 0, // 0 for no limitation + }, + { + // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/GpuOpDefSupplement.html#matmul + kQnnGpuLibName, GGML_BACKEND_DEVICE_TYPE_GPU, (1L << GGML_TYPE_F32) | (1L << GGML_TYPE_F16), + // all quantized types can be offload to GPU, at current implementation, those types will be dequantized into float32 on cpu + 0xFFFFFE, (128256L * 4096 * + sizeof(float)), // tested on 8 gen 2, failed to allocate tensor with size 128256x4096 and float32 + }, + { + // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/HtpOpDefSupplement.html#matmul + kQnnNpuLibName, GGML_BACKEND_DEVICE_TYPE_ACCEL, + (1L << GGML_TYPE_F32) | (1L << GGML_TYPE_F16) | (1L << GGML_TYPE_I16), + (1L << GGML_TYPE_Q2_K) | (1L << GGML_TYPE_Q3_K) | (1L << GGML_TYPE_Q4_K) | (1L << GGML_TYPE_Q8_K), + (8192L * 2048 + 8192 * 512 
+ 2048 * 512) * sizeof(float), // TODO: should have a better way to get this value + }, +}; + +static_assert(sizeof(kDeviceCaps) / sizeof(kDeviceCaps[0]) == GGML_QNN_MAX_DEVICES, + "The number of qnn devices should be equal to GGML_QNN_MAX_DEVICES"); +static_assert(kDeviceCaps[QNN_BACKEND_NPU].type == GGML_BACKEND_DEVICE_TYPE_ACCEL, + "The NPU device should be an accelerator device"); +static_assert(kDeviceCaps[QNN_BACKEND_GPU].type == GGML_BACKEND_DEVICE_TYPE_GPU, + "The GPU device should be an GPU device"); +static_assert( + kDeviceCaps[QNN_BACKEND_CPU].type == GGML_BACKEND_DEVICE_TYPE_ACCEL, + "The CPU device should be an accelerator device"); // we treat qnn-cpu as a supplementary accelerator device +static_assert(GGML_TYPE_Q4_0 == 2 && GGML_TYPE_Q8_K == 15, "The quantized type order is not correct"); + void insert_path(std::string & path, std::string insert_path, const char separator = ':') { if (!insert_path.empty() && !path.empty()) { insert_path += separator; @@ -108,9 +147,8 @@ qnn_system_interface::~qnn_system_interface() { } } -qnn_instance::qnn_instance(const std::string & lib_path, const std::string & backend_lib_name) : - _additional_lib_load_path(lib_path), - _backend_lib_name(std::move(backend_lib_name)) { +qnn_instance::qnn_instance(const std::string & lib_path, QNNBackend device) : _additional_lib_load_path(lib_path) { + _backend_lib_name = kDeviceCaps[device].lib_name; if (set_qnn_lib_search_path(lib_path)) { QNN_LOG_DEBUG("[%s] set_qnn_lib_search_path succeed\n", _backend_lib_name.c_str()); } else { @@ -181,21 +219,27 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { qnn_status = _qnn_interface->qnn_device_get_platform_info(nullptr, &p_info); if (qnn_status == QNN_SUCCESS) { QNN_LOG_INFO("device counts %d\n", p_info->v1.numHwDevices); - QnnDevice_HardwareDeviceInfo_t * infos = p_info->v1.hwDevices; - QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = {}; + QnnDevice_HardwareDeviceInfo_t * infos = p_info->v1.hwDevices; for (uint32_t i = 0; i < p_info->v1.numHwDevices; i++) { QNN_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d\n", (int) infos[i].v1.deviceId, (int) infos[i].v1.deviceType, (int) infos[i].v1.numCores); - QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; - chipinfo = devinfo->onChipDevice; - size_t htp_arch = (size_t) chipinfo.arch; + QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; + QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = devinfo->onChipDevice; + size_t htp_arch = (size_t) chipinfo.arch; QNN_LOG_INFO("htp_type:%d(%s)\n", devinfo->devType, (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? 
"ON_CHIP" : ""); - QNN_LOG_INFO("qualcomm soc_model:%d(%s), htp_arch:%d(%s), vtcm_size:%d MB\n", (int) chipinfo.socModel, - qnn::get_chipset_desc(chipinfo.socModel), (int) htp_arch, qnn::get_htparch_desc(htp_arch), - (int) chipinfo.vtcmSize); - _soc_info = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize }; + QNN_LOG_INFO("soc_model:%s(%s), htp_arch:%s(%d), vtcm_size:%d MB\n", + get_chipset_desc(chipinfo.socModel), get_chipset_model(chipinfo.socModel), + get_htparch_desc(htp_arch), (int) htp_arch, (int) chipinfo.vtcmSize); } + + if (p_info->v1.numHwDevices) { + QnnDevice_DeviceInfoExtension_t devinfo = infos[p_info->v1.numHwDevices - 1].v1.deviceInfoExtension; + QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = devinfo->onChipDevice; + size_t htp_arch = (size_t) chipinfo.arch; + _soc_info = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize }; + } + _qnn_interface->qnn_device_free_platform_info(nullptr, p_info); } else { // For emulator, we can't get platform info @@ -229,20 +273,6 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { QNN_LOG_INFO("create QNN device successfully\n"); } - if (_profile_level != sdk_profile_level::profile_off) { - QNN_LOG_INFO("profiling turned on; level = %d\n", _profile_level); - auto profile_level = - _profile_level == sdk_profile_level::profile_detail ? QNN_PROFILE_LEVEL_DETAILED : QNN_PROFILE_LEVEL_BASIC; - - if (QNN_PROFILE_NO_ERROR != - _qnn_interface->qnn_profile_create(_qnn_backend_handle, profile_level, &_qnn_profile_handle)) { - QNN_LOG_WARN("unable to create profile handle in the backend\n"); - return 6; - } else { - QNN_LOG_DEBUG("initialize qnn profile successfully\n"); - } - } - _rpc_lib_handle = load_lib_with_fallback(kQnnRpcLibName, _additional_lib_load_path); if (_rpc_lib_handle) { _pfn_rpc_mem_alloc = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_alloc")); @@ -339,7 +369,7 @@ int qnn_instance::qnn_finalize() { } if (_qnn_context_handle) { - error = _qnn_interface->qnn_context_free(_qnn_context_handle, _qnn_profile_handle); + error = _qnn_interface->qnn_context_free(_qnn_context_handle, nullptr); if (error != QNN_SUCCESS) { QNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(), (int) QNN_GET_ERROR_CODE(error)); @@ -347,15 +377,6 @@ int qnn_instance::qnn_finalize() { _qnn_context_handle = nullptr; } - if (_qnn_profile_handle) { - error = _qnn_interface->qnn_profile_free(_qnn_profile_handle); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(), - (int) QNN_GET_ERROR_CODE(error)); - } - _qnn_profile_handle = nullptr; - } - if (_qnn_device_handle) { error = _qnn_interface->qnn_device_free(_qnn_device_handle); if (error != QNN_SUCCESS) { @@ -535,4 +556,8 @@ int qnn_instance::unload_backend() { return 0; } +const device_caps & get_device_caps(QNNBackend device) { + return kDeviceCaps[device]; +} + } // namespace qnn diff --git a/ggml/src/ggml-qnn/qnn-lib.hpp b/ggml/src/ggml-qnn/qnn-lib.hpp index bb6006acda19c..3d0084b868da8 100644 --- a/ggml/src/ggml-qnn/qnn-lib.hpp +++ b/ggml/src/ggml-qnn/qnn-lib.hpp @@ -82,70 +82,48 @@ class qnn_interface { // QnnBackend DEFINE_SHIM_FUNCTION_INTERFACE(backend_create, backendCreate); - DEFINE_SHIM_FUNCTION_INTERFACE(backend_free, backendFree); - DEFINE_SHIM_FUNCTION_INTERFACE(backend_register_op_package, backendRegisterOpPackage); - DEFINE_SHIM_FUNCTION_INTERFACE(backend_validate_op_config, backendValidateOpConfig); - 
DEFINE_SHIM_FUNCTION_INTERFACE(backend_get_api_version, backendGetApiVersion); + // QnnDevice DEFINE_SHIM_FUNCTION_INTERFACE(device_create, deviceCreate); - DEFINE_SHIM_FUNCTION_INTERFACE(device_free, deviceFree); - DEFINE_SHIM_FUNCTION_INTERFACE(device_get_infrastructure, deviceGetInfrastructure); - DEFINE_SHIM_FUNCTION_INTERFACE(device_get_platform_info, deviceGetPlatformInfo); - DEFINE_SHIM_FUNCTION_INTERFACE(device_free_platform_info, deviceFreePlatformInfo); - DEFINE_SHIM_FUNCTION_INTERFACE(device_get_info, deviceGetInfo); // QnnContext DEFINE_SHIM_FUNCTION_INTERFACE(context_create, contextCreate); - DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary_size, contextGetBinarySize); - DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary, contextGetBinary); - DEFINE_SHIM_FUNCTION_INTERFACE(context_create_from_binary, contextCreateFromBinary); - DEFINE_SHIM_FUNCTION_INTERFACE(context_free, contextFree); // QnnGraph DEFINE_SHIM_FUNCTION_INTERFACE(graph_create, graphCreate); - DEFINE_SHIM_FUNCTION_INTERFACE(graph_add_node, graphAddNode); - DEFINE_SHIM_FUNCTION_INTERFACE(graph_finalize, graphFinalize); - DEFINE_SHIM_FUNCTION_INTERFACE(graph_execute, graphExecute); - DEFINE_SHIM_FUNCTION_INTERFACE(graph_retrieve, graphRetrieve); // QnnLog DEFINE_SHIM_FUNCTION_INTERFACE(log_create, logCreate); - DEFINE_SHIM_FUNCTION_INTERFACE(log_free, logFree); - DEFINE_SHIM_FUNCTION_INTERFACE(log_set_log_level, logSetLogLevel); // QnnProfile DEFINE_SHIM_FUNCTION_INTERFACE(profile_create, profileCreate); - + DEFINE_SHIM_FUNCTION_INTERFACE(profile_set_config, profileSetConfig); DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_events, profileGetEvents); - DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_sub_events, profileGetSubEvents); - DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_event_data, profileGetEventData); - DEFINE_SHIM_FUNCTION_INTERFACE(profile_free, profileFree); // QnnMem DEFINE_SHIM_FUNCTION_INTERFACE(mem_register, memRegister); - DEFINE_SHIM_FUNCTION_INTERFACE(mem_de_register, memDeRegister); // QnnProperty @@ -153,7 +131,6 @@ class qnn_interface { // QnnTensor DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_context_tensor, tensorCreateContextTensor); - DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_graph_tensor, tensorCreateGraphTensor); uint32_t get_backend_id() const { return _qnn_interface.backendId; } @@ -169,18 +146,20 @@ class qnn_interface { #pragma GCC diagnostic pop +using qnn_interface_ptr = std::shared_ptr; + class qnn_instance { public: using BackendIdType = decltype(QnnInterface_t{}.backendId); - explicit qnn_instance(const std::string & lib_path, const std::string & backend_lib_name); + explicit qnn_instance(const std::string & lib_path, QNNBackend device); ~qnn_instance() {} int qnn_init(const QnnSaver_Config_t ** saver_config); int qnn_finalize(); - std::shared_ptr get_qnn_interface() { + qnn_interface_ptr get_qnn_interface() { if (!_qnn_interface) { QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); } @@ -189,8 +168,6 @@ class qnn_instance { Qnn_LogHandle_t get_qnn_log_handle() { return _qnn_log_handle; } - Qnn_ProfileHandle_t get_qnn_profile_handle() { return _qnn_profile_handle; } - Qnn_DeviceHandle_t get_qnn_device_handle() { return _qnn_device_handle; } Qnn_BackendHandle_t get_qnn_backend_handle() { return _qnn_backend_handle; } @@ -256,7 +233,7 @@ class qnn_instance { } int set_high_performance_mode() { - if (nullptr == _qnn_htp_perfinfra) { + if (!_qnn_htp_perfinfra) { QNN_LOG_WARN("perf intra is null\n"); return 1; } @@ -425,29 +402,20 @@ class qnn_instance { std::string 
_backend_lib_name; BackendIdType _backend_id; - QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; - #ifdef NDEBUG - qnn::sdk_profile_level _profile_level = qnn::sdk_profile_level::profile_off; + QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_INFO; // TODO: should we consider changing this dynamically? #else - qnn::sdk_profile_level _profile_level = qnn::sdk_profile_level::profile_detail; + QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; #endif std::shared_ptr _qnn_sys_interface; std::shared_ptr _qnn_interface; - Qnn_GraphHandle_t _qnn_graph_handle = nullptr; - - Qnn_LogHandle_t _qnn_log_handle = nullptr; - - Qnn_ProfileHandle_t _qnn_profile_handle = nullptr; - - Qnn_DeviceHandle_t _qnn_device_handle = nullptr; - - Qnn_BackendHandle_t _qnn_backend_handle = nullptr; - - Qnn_ContextHandle_t _qnn_context_handle = nullptr; - + Qnn_GraphHandle_t _qnn_graph_handle = nullptr; + Qnn_LogHandle_t _qnn_log_handle = nullptr; + Qnn_DeviceHandle_t _qnn_device_handle = nullptr; + Qnn_BackendHandle_t _qnn_backend_handle = nullptr; + Qnn_ContextHandle_t _qnn_context_handle = nullptr; QnnHtpDevice_PerfInfrastructure_t * _qnn_htp_perfinfra = nullptr; uint32_t _qnn_power_configid = 1; @@ -473,4 +441,22 @@ class qnn_instance { qnn::qcom_socinfo _soc_info = {}; }; +using qnn_instance_ptr = std::shared_ptr; + +struct device_caps { + const char * lib_name; + enum ggml_backend_dev_type type; + + // TODO: should we get this from device? + uint64_t supported_types; + + // TODO: should we merge this with supported_types? + uint64_t cpu_preprocess_types; + + // TODO: should we get this from device? + size_t max_tensor_size_in_bytes; +}; + +const device_caps & get_device_caps(QNNBackend device); + } // namespace qnn diff --git a/ggml/src/ggml-qnn/qnn-types.hpp b/ggml/src/ggml-qnn/qnn-types.hpp index 8284036bb7503..957f8b681f3da 100644 --- a/ggml/src/ggml-qnn/qnn-types.hpp +++ b/ggml/src/ggml-qnn/qnn-types.hpp @@ -8,15 +8,6 @@ #include "System/QnnSystemInterface.h" namespace qnn { -// ================================================================================================= -// -// helper data type / data structure / macros / functions of -// Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK -// ref: -// https://github.com/pytorch/executorch/blob/ae3d558d5e6aa04fc52a3065399fe6a773702f52/backends/qualcomm/serialization/qc_schema.py#L53 -// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/overview.html#supported-snapdragon-devices -// ================================================================================================= -enum sdk_profile_level { profile_off = 0, profile_basic, profile_detail }; enum qcom_htp_arch { NONE = 0, @@ -29,12 +20,15 @@ enum qcom_htp_arch { enum qcom_chipset { UNKNOWN_SM = 0, + SM8350 = 30, // v68, SD 888/888+ SM8450 = 36, // v69, SD 8 Gen 1 + SA8295 = 39, // v68 SM8475 = 42, // v69, SD 8+ Gen 1 SM8550 = 43, // v73, SD 8 Gen 2 SSG2115P = 46, // v73 + SM7675 = 70, // V73, SD 7+ Gen 3 + SM8635 = 68, // v73, SD 8s Gen 3 SM8650 = 57, // v75, SD 8 Gen 3 - SA8295 = 39, // v68 SM8750 = 69, // v79, SD 8 Gen 4 }; diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index 660223caf728a..608a80fcf5aaa 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -25,8 +25,7 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { explicit ggml_qnn_tensor(tensor_type_t tensor_type, const std::string & name, const qnn_dimension_array_t & dimensions, Qnn_DataType_t data_type, int rank, - QNNBackend 
device, Qnn_GraphHandle_t graph_handle, - std::shared_ptr qnn_instance) : + QNNBackend device, Qnn_GraphHandle_t graph_handle, qnn_instance_ptr qnn_instance) : _tensor_name(name), _device(device), _qnn_instance(qnn_instance), @@ -46,8 +45,7 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { explicit ggml_qnn_tensor(tensor_type_t tensor_type, const std::string & name, const ggml_dimension_array_t & dimensions, ggml_type data_type, int rank, - QNNBackend device, Qnn_GraphHandle_t graph_handle, - std::shared_ptr qnn_instance) : + QNNBackend device, Qnn_GraphHandle_t graph_handle, qnn_instance_ptr qnn_instance) : ggml_qnn_tensor(tensor_type, name, get_internal_dimension(dimensions, rank), qnn_datatype_from_ggml_datatype(data_type), rank, device, graph_handle, qnn_instance) {} @@ -85,7 +83,7 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { auto qnn_interface = _qnn_instance->get_qnn_interface(); auto error = qnn_interface->qnn_tensor_create_graph_tensor(_graph_handle, &qnn_tensor); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("[%s]allocate id failed , error: %d\n", _tensor_name.c_str(), (int) error); + QNN_LOG_ERROR("[%s]allocate id failed, error: %s\n", _tensor_name.c_str(), get_qnn_error_string(error)); return false; } @@ -95,7 +93,7 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { return true; } - bool bind_ggml_tensor(ggml_tensor * tensor) { + bool bind_ggml_tensor(ggml_tensor * tensor, qnn_buffer_ptr buffer) { if (!_can_unbind) { QNN_LOG_DEBUG("[%s]already has buffer storage, skip bind\n", _tensor_name.c_str()); return true; @@ -111,8 +109,12 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { } #endif - auto buffer = - std::make_shared(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); + if (!buffer) { + buffer = + std::make_shared(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); + QNN_LOG_DEBUG("[%s][%s]attach buffer to tensor(%s), size: %d\n", get_backend_name(_device), + _tensor_name.c_str(), tensor->name, (int) buffer->get_size()); + } if (!bind_buffer_impl(buffer)) { QNN_LOG_WARN("[%s]failed to bind ggml tensor(%s)\n", _tensor_name.c_str(), ggml_get_name(tensor)); return false; @@ -154,7 +156,7 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { } QNN_LOG_DEBUG("[%s][%s]unbind from buffer: %p, size: %d\n", get_backend_name(_device), _tensor_name.c_str(), - (void *) _buffer.get(), (int) _buffer->get_size()); + (void *) _buffer->get_buffer(), (int) _buffer->get_size()); _buffer.reset(); if (_ggml_tensor) { @@ -175,15 +177,19 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { uint32_t get_qnn_tensor_id() const { return QNN_TENSOR_GET_ID(_qnn_tensor); } + const std::string & get_tensor_name() const { return _tensor_name; } + private: bool bind_buffer_impl(qnn_buffer_ptr buffer) { if (_buffer) { if (_buffer != buffer) { - QNN_LOG_WARN("[%s]has been bound to another buffer %p\n", _tensor_name.c_str(), (void *) _buffer.get()); + QNN_LOG_WARN("[%s]has been bound to another buffer %p\n", _tensor_name.c_str(), + (void *) _buffer->get_buffer()); return false; } - QNN_LOG_DEBUG("[%s]already bound to same ggml tensor %p\n", _tensor_name.c_str(), (void *) _buffer.get()); + QNN_LOG_DEBUG("[%s]already bound to same ggml tensor %p\n", _tensor_name.c_str(), + (void *) _buffer->get_buffer()); return true; } @@ -221,8 +227,8 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); Qnn_ClientBuffer_t client_buf = { buffer->get_buffer(), 
(uint32_t) buffer->get_size() }; QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); - QNN_LOG_DEBUG("[%s]use client buffer %p size %d\n", _tensor_name.c_str(), client_buf.data, - (int) client_buf.dataSize); + QNN_LOG_DEBUG("[%s][%s]use client buffer %p size %d\n", get_backend_name(_device), _tensor_name.c_str(), + client_buf.data, (int) client_buf.dataSize); } _buffer = buffer; @@ -233,7 +239,7 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { } QNN_LOG_DEBUG("[%s][%s]bind to buffer: %p, size: %d\n", get_backend_name(_device), _tensor_name.c_str(), - (void *) buffer.get(), (int) buffer->get_size()); + (void *) buffer->get_buffer(), (int) buffer->get_size()); return true; } @@ -246,10 +252,11 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { if (_rpc_buffer) { memcpy(_rpc_buffer->get_buffer(), _buffer->get_buffer(), _buffer->get_size()); + // For CPU and GPU, the data is already in the tensor. + QNN_LOG_DEBUG("[%s][%s]write buffer(%p) to rpc buffer(%p)\n", get_backend_name(_device), + _tensor_name.c_str(), (void *) _buffer->get_buffer(), (void *) _rpc_buffer->get_buffer()); } - // For CPU and GPU, the data is already in the tensor. - QNN_LOG_DEBUG("[%s][%s]write tensor to qnn\n", get_backend_name(_device), _tensor_name.c_str()); return true; } @@ -262,10 +269,11 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { if (_rpc_buffer) { memcpy(_buffer->get_buffer(), _rpc_buffer->get_buffer(), _buffer->get_size()); + // For CPU and GPU, the data is already in the tensor. + QNN_LOG_DEBUG("[%s][%s]read buffer(%p) from rpc buffer(%p)\n", get_backend_name(_device), + _tensor_name.c_str(), (void *) _buffer->get_buffer(), (void *) _rpc_buffer->get_buffer()); } - // For CPU and GPU, the data is already in the tensor. - QNN_LOG_DEBUG("[%s][%s]read tensor from qnn\n", get_backend_name(_device), _tensor_name.c_str()); return true; } @@ -298,8 +306,8 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { break; } QNN_TENSOR_SET_TYPE(_qnn_tensor, new_tensor_type); - QNN_LOG_DEBUG("[%s][%s]tensor changed to type %d\n", get_backend_name(_device), _tensor_name.c_str(), - new_tensor_type); + QNN_LOG_DEBUG("[%s][%s]new_tensor_type %s\n", get_backend_name(_device), _tensor_name.c_str(), + get_qnn_tensor_type_name(new_tensor_type)); } bool should_use_mem_handle() const { @@ -307,16 +315,16 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { return false; } - std::string _tensor_name; - qnn_buffer_ptr _buffer; - bool _can_unbind = true; - QNNBackend _device; - std::shared_ptr _qnn_instance; - Qnn_Tensor_t _qnn_tensor = qnn_tensor_init(kDefaultQnnTensorVersion); - qnn_dimension_array_t _dimensions = {}; - Qnn_GraphHandle_t _graph_handle = nullptr; - qnn_buffer_ptr _rpc_buffer; - ggml_tensor * _ggml_tensor = nullptr; + std::string _tensor_name; + qnn_buffer_ptr _buffer; + bool _can_unbind = true; + QNNBackend _device; + qnn_instance_ptr _qnn_instance; + Qnn_Tensor_t _qnn_tensor = qnn_tensor_init(kDefaultQnnTensorVersion); + qnn_dimension_array_t _dimensions = {}; + Qnn_GraphHandle_t _graph_handle = nullptr; + qnn_buffer_ptr _rpc_buffer; + ggml_tensor * _ggml_tensor = nullptr; DISABLE_COPY(ggml_qnn_tensor); DISABLE_MOVE(ggml_qnn_tensor); @@ -340,13 +348,33 @@ inline int get_ggml_tensors_max_rank(const qnn::ggml_tensor_array_t & tensors) { return max_rank; } +inline bool bind_tensors_with_custom_buffers(const ggml_tensor_array_t & ggml_tensors, + std::vector & buffers, + qnn_tensor_array_t & tensor_wrappers, + std::vector & qnn_tensors) { + 
GGML_ASSERT(tensor_wrappers.size() == ggml_tensors.size()); + GGML_ASSERT(buffers.size() == ggml_tensors.size()); + qnn_tensors.resize(ggml_tensors.size()); + for (size_t i = 0; i < ggml_tensors.size(); i++) { + auto * ggml_tensor = ggml_tensors[i]; + if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor, buffers[i])) { + QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); + return false; + } + + qnn_tensors[i] = tensor_wrappers[i]->get_qnn_tensor(); + } + + return true; +} + inline bool bind_tensors(const ggml_tensor_array_t & ggml_tensors, qnn_tensor_array_t & tensor_wrappers, std::vector & qnn_tensors) { GGML_ASSERT(tensor_wrappers.size() == ggml_tensors.size()); qnn_tensors.resize(ggml_tensors.size()); for (size_t i = 0; i < ggml_tensors.size(); i++) { auto * ggml_tensor = ggml_tensors[i]; - if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor)) { + if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor, qnn_buffer_ptr())) { QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); return false; } @@ -361,7 +389,7 @@ inline bool bind_tensors(const ggml_tensor_array_t & ggml_tensors, qnn_tensor_ar GGML_ASSERT(tensor_wrappers.size() == ggml_tensors.size()); for (size_t i = 0; i < ggml_tensors.size(); i++) { auto * ggml_tensor = ggml_tensors[i]; - if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor)) { + if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor, qnn_buffer_ptr())) { QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); return false; } diff --git a/ggml/src/ggml-qnn/utils.cpp b/ggml/src/ggml-qnn/utils.cpp index f9178f90d556f..9696101b8b6e5 100644 --- a/ggml/src/ggml-qnn/utils.cpp +++ b/ggml/src/ggml-qnn/utils.cpp @@ -178,8 +178,8 @@ const char * get_ggml_type_name(ggml_type type) { return traits->type_name; } -const char * get_backend_name(QNNBackend device_index) { - switch (device_index) { +const char * get_backend_name(QNNBackend device) { + switch (device) { case QNN_BACKEND_CPU: return "qnn-cpu"; case QNN_BACKEND_GPU: @@ -192,18 +192,65 @@ const char * get_backend_name(QNNBackend device_index) { } } -const char * get_chipset_desc(uint32_t chipset_id) { - switch (chipset_id) { +const char * get_backend_desc(QNNBackend device) { + switch (device) { + case QNN_BACKEND_CPU: + return "CPU"; + case QNN_BACKEND_GPU: + return "Adreno GPU"; + case QNN_BACKEND_NPU: + return "Hexagon NPU"; + case QNN_BACKEND_COUNT: + default: + return "unknown"; + } +} + +const char * get_chipset_desc(uint32_t soc_model) { + switch (soc_model) { + case SM8350: + return "Snapdragon 888/888+"; case SM8450: - return "SD 8 Gen 1 (SM8450)"; + return "Snapdragon 8 Gen 1"; case SM8475: - return "SD 8+ Gen 1 (SM8475)"; + return "Snapdragon 8 Gen 1+"; case SM8550: - return "SD 8 Gen 2 (SM8550)"; + return "Snapdragon 8 Gen 2"; + case SM7675: + return "Snapdragon 7+ Gen 3"; + case SM8635: + return "Snapdragon 8s Gen 3"; case SM8650: - return "SD 8 Gen 3 (SM8650)"; + return "Snapdragon 8 Gen 3"; case SM8750: - return "SD 8 Gen 4 (SM8750)"; + return "Snapdragon 8 Elite"; + default: + return "unknown"; + } +} + +const char * get_chipset_model(uint32_t soc_model) { + switch (soc_model) { + case SM8350: + return "SM8350"; + case SM8450: + return "SM8450"; + case SA8295: + return "SA8295"; + case SM8475: + return "SM8475"; + case SM8550: + return "SM8550"; + case SSG2115P: + return "SSG2115P"; + case SM7675: + return "SM7675"; + case SM8635: + return "SM8635"; + case SM8650: + return "SM8650"; + case SM8750: + return "SM8750"; default: return "unknown"; } @@ -212,15 
+259,15 @@ const char * get_chipset_desc(uint32_t chipset_id) { const char * get_htparch_desc(size_t htp_arch) { switch (htp_arch) { case V68: - return "QCOM_HTP_V68"; + return "HTP_V68"; case V69: - return "QCOM_HTP_V69"; + return "HTP_V69"; case V73: - return "QCOM_HTP_V73"; + return "HTP_V73"; case V75: - return "QCOM_HTP_V75"; + return "HTP_V75"; case V79: - return "QCOM_HTP_V79"; + return "HTP_V79"; default: return "unknown"; } @@ -234,6 +281,29 @@ uint32_t get_ggml_tensor_data_size(const ggml_tensor * tensor) { return (uint32_t) ggml_nbytes(tensor); } +const char * get_qnn_tensor_type_name(Qnn_TensorType_t type) { + switch (type) { + case QNN_TENSOR_TYPE_APP_WRITE: + return "QNN_TENSOR_TYPE_APP_WRITE"; + case QNN_TENSOR_TYPE_APP_READ: + return "QNN_TENSOR_TYPE_APP_READ"; + case QNN_TENSOR_TYPE_APP_READWRITE: + return "QNN_TENSOR_TYPE_APP_READWRITE"; + case QNN_TENSOR_TYPE_STATIC: + return "QNN_TENSOR_TYPE_STATIC"; + case QNN_TENSOR_TYPE_NATIVE: + return "QNN_TENSOR_TYPE_NATIVE"; + case QNN_TENSOR_TYPE_UNDEFINED: + return "QNN_TENSOR_TYPE_UNDEFINED"; + case QNN_TENSOR_TYPE_NULL: + return "QNN_TENSOR_TYPE_NULL"; + default: + break; + } + + return "unknown"; +} + #ifdef _WIN32 static void * _align_alloc(size_t alignment, size_t size) { return _aligned_malloc(size, alignment); @@ -265,14 +335,15 @@ void align_free(void * ptr) { void * page_align_alloc(size_t size) { const size_t alignment = _get_page_size(); size_t size_aligned = align_to_generic(alignment, size); - QNN_LOG_DEBUG("_align_alloc success, alignment: %ld, size: %ld, size_aligned: %ld\n", alignment, size, size_aligned); - void * data = _align_alloc(alignment, size_aligned); + void * data = _align_alloc(alignment, size_aligned); if (!data) { QNN_LOG_WARN("_align_alloc failed, alignment: %ld, size: %ld, size_aligned: %ld\n", alignment, size, size_aligned); return nullptr; } + QNN_LOG_DEBUG("_align_alloc success, alignment: %ld, size: %ld, size_aligned: %ld\n", alignment, size, + size_aligned); return data; } diff --git a/ggml/src/ggml-qnn/utils.hpp b/ggml/src/ggml-qnn/utils.hpp index d6130a3df4b4e..2e55e2f2d85b3 100644 --- a/ggml/src/ggml-qnn/utils.hpp +++ b/ggml/src/ggml-qnn/utils.hpp @@ -23,11 +23,14 @@ qnn_dimension_array_t get_view_internal_dimension(const ggml_tensor * tensor, si uint32_t get_ggml_tensor_rank(const ggml_tensor * tensor); const char * get_ggml_type_name(ggml_type type); -const char * get_backend_name(QNNBackend device_index); -const char * get_chipset_desc(uint32_t chipset_id); +const char * get_backend_name(QNNBackend device); +const char * get_backend_desc(QNNBackend device); +const char * get_chipset_desc(uint32_t soc_model); +const char * get_chipset_model(uint32_t soc_model); const char * get_htparch_desc(size_t htp_arch); intptr_t align_to(size_t alignment, intptr_t offset); uint32_t get_ggml_tensor_data_size(const ggml_tensor * tensor); +const char * get_qnn_tensor_type_name(Qnn_TensorType_t type); void * page_align_alloc(size_t size); void align_free(void * ptr); @@ -199,48 +202,6 @@ const char * qnn_datatype_to_string(Qnn_DataType_t qnn_type); size_t get_system_total_memory_in_bytes(); size_t get_system_free_memory_in_bytes(); -#if ENABLE_QNNBACKEND_PERF -class qnn_perf { - public: - qnn_perf(const std::string & perf_name) : _perf_name(std::move(perf_name)) {}; - - ~qnn_perf() { info(); } - - qnn_perf() = delete; - qnn_perf(const qnn_perf &) = delete; - qnn_perf & operator=(const qnn_perf &) = delete; - - void start() { _begin_time = ggml_time_us(); } - - void info() { - _end_time = ggml_time_us(); 
- _duration = (_end_time - _begin_time); - QNN_LOG_INFO("duration of %s : %lld microseconds\n", _perf_name.c_str(), _duration); - } - - private: - int64_t _begin_time = 0LL; - int64_t _end_time = 0LL; - int64_t _duration = 0LL; - std::string _perf_name; -}; -#else -class qnn_perf { - public: - qnn_perf(const std::string &) {} - - ~qnn_perf() { info(); } - - qnn_perf() = delete; - qnn_perf(const qnn_perf &) = delete; - qnn_perf & operator=(const qnn_perf &) = delete; - - void start() {} - - void info() {} -}; -#endif - } // namespace qnn #define QNN_TENSOR_GET_ID(tensor) qnn::get_qnn_tensorid(tensor) From 1caca627ea03437a256d4979993cedb15a65709f Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sat, 22 Mar 2025 12:51:01 +0800 Subject: [PATCH 147/166] fix compiling error after merge --- ggml/src/ggml-qnn/backend-ops.cpp | 2 ++ ggml/src/ggml-qnn/op-config-caps.cpp | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index ecafe7096331f..6ce6eda56cba1 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -85,6 +85,7 @@ constexpr const bool kQnnSupportedOps[] = { false, // GGML_OP_RMS_NORM false, // GGML_OP_RMS_NORM_BACK false, // GGML_OP_GROUP_NORM + false, // GGML_OP_L2_NORM true, // GGML_OP_MUL_MAT false, // GGML_OP_MUL_MAT_ID @@ -133,6 +134,7 @@ constexpr const bool kQnnSupportedOps[] = { false, // GGML_OP_ADD_REL_POS false, // GGML_OP_RWKV_WKV6 false, // GGML_OP_GATED_LINEAR_ATTN + false, // GGML_OP_RWKV_WKV7 false, // GGML_OP_UNARY diff --git a/ggml/src/ggml-qnn/op-config-caps.cpp b/ggml/src/ggml-qnn/op-config-caps.cpp index a29ea28ad6f03..1c174e2e9f26a 100644 --- a/ggml/src/ggml-qnn/op-config-caps.cpp +++ b/ggml/src/ggml-qnn/op-config-caps.cpp @@ -126,6 +126,7 @@ constexpr const qnn_op_caps_t kOpCaps[] = { }, {}, // GGML_OP_RMS_NORM_BACK {}, // GGML_OP_GROUP_NORM + {}, // GGML_OP_L2_NORM { // GGML_OP_MUL_MAT QNN_OP_MAT_MUL, // qnn_op_name @@ -179,6 +180,7 @@ constexpr const qnn_op_caps_t kOpCaps[] = { {}, // GGML_OP_ADD_REL_POS {}, // GGML_OP_RWKV_WKV6 {}, // GGML_OP_GATED_LINEAR_ATTN + {}, // GGML_OP_RWKV_WKV7 {}, // GGML_OP_UNARY @@ -222,6 +224,8 @@ static_assert(kOpCaps[GGML_OP_ADD].qnn_op_name, "GGML_OP_ADD does not have qnn_o static_assert(kOpCaps[GGML_OP_MUL_MAT].qnn_op_name, "GGML_OP_MUL_MAT does not have qnn_op_name in the kOpCaps table"); static_assert(kOpCaps[GGML_OP_MUL].qnn_op_name, "GGML_OP_MUL does not have qnn_op_name in the kOpCaps table"); static_assert(kOpCaps[GGML_OP_LOG].qnn_op_name, "GGML_OP_LOG does not have qnn_op_name in the kOpCaps table"); +static_assert(kOpCaps[GGML_OP_COUNT + GGML_UNARY_OP_GELU].qnn_op_name, + "GGML_UNARY_OP_GELU does not have qnn_op_name in the kOpCaps table"); static_assert(std::size(kOpCaps) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), "GGML_OP_COUNT does not match the size of the kOpCaps table"); @@ -297,6 +301,7 @@ constexpr const op_constructor_t kOpConstructors[] = { op_constructor_with_type_param, // GGML_OP_RMS_NORM nullptr, // GGML_OP_RMS_NORM_BACK nullptr, // GGML_OP_GROUP_NORM + nullptr, // GGML_OP_L2_NORM mat_mul_op_constructor, // GGML_OP_MUL_MAT nullptr, // GGML_OP_MUL_MAT_ID @@ -345,6 +350,7 @@ constexpr const op_constructor_t kOpConstructors[] = { nullptr, // GGML_OP_ADD_REL_POS nullptr, // GGML_OP_RWKV_WKV6 nullptr, // GGML_OP_GATED_LINEAR_ATTN + nullptr, // GGML_OP_RWKV_WKV7 nullptr, // GGML_OP_UNARY From 9e41f794035eb7ca1dc807cf2eeadcbf22beb472 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 16 Apr 2025 11:16:26 
+0800 Subject: [PATCH 148/166] fix compiling error after merge master --- ggml/src/ggml-qnn/backend-ops.cpp | 9 ++------- ggml/src/ggml-qnn/op-config-caps.cpp | 18 ++++-------------- 2 files changed, 6 insertions(+), 21 deletions(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 6ce6eda56cba1..857278bdaafbf 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -138,17 +138,12 @@ constexpr const bool kQnnSupportedOps[] = { false, // GGML_OP_UNARY - false, // GGML_OP_MAP_UNARY - false, // GGML_OP_MAP_BINARY - - false, // GGML_OP_MAP_CUSTOM1_F32 - false, // GGML_OP_MAP_CUSTOM2_F32 - false, // GGML_OP_MAP_CUSTOM3_F32 - false, // GGML_OP_MAP_CUSTOM1 false, // GGML_OP_MAP_CUSTOM2 false, // GGML_OP_MAP_CUSTOM3 + false, // GGML_OP_CUSTOM + false, // GGML_OP_CROSS_ENTROPY_LOSS false, // GGML_OP_CROSS_ENTROPY_LOSS_BACK false, // GGML_OP_OPT_STEP_ADAMW diff --git a/ggml/src/ggml-qnn/op-config-caps.cpp b/ggml/src/ggml-qnn/op-config-caps.cpp index 1c174e2e9f26a..6fd65aec08a79 100644 --- a/ggml/src/ggml-qnn/op-config-caps.cpp +++ b/ggml/src/ggml-qnn/op-config-caps.cpp @@ -184,17 +184,12 @@ constexpr const qnn_op_caps_t kOpCaps[] = { {}, // GGML_OP_UNARY - {}, // GGML_OP_MAP_UNARY - {}, // GGML_OP_MAP_BINARY - - {}, // GGML_OP_MAP_CUSTOM1_F32 - {}, // GGML_OP_MAP_CUSTOM2_F32 - {}, // GGML_OP_MAP_CUSTOM3_F32 - {}, // GGML_OP_MAP_CUSTOM1 {}, // GGML_OP_MAP_CUSTOM2 {}, // GGML_OP_MAP_CUSTOM3 + {}, // GGML_OP_CUSTOM + {}, // GGML_OP_CROSS_ENTROPY_LOSS {}, // GGML_OP_CROSS_ENTROPY_LOSS_BACK {}, // GGML_OP_OPT_STEP_ADAMW @@ -354,17 +349,12 @@ constexpr const op_constructor_t kOpConstructors[] = { nullptr, // GGML_OP_UNARY - nullptr, // GGML_OP_MAP_UNARY - nullptr, // GGML_OP_MAP_BINARY - - nullptr, // GGML_OP_MAP_CUSTOM1_F32 - nullptr, // GGML_OP_MAP_CUSTOM2_F32 - nullptr, // GGML_OP_MAP_CUSTOM3_F32 - nullptr, // GGML_OP_MAP_CUSTOM1 nullptr, // GGML_OP_MAP_CUSTOM2 nullptr, // GGML_OP_MAP_CUSTOM3 + nullptr, // GGML_OP_CUSTOM + nullptr, // GGML_OP_CROSS_ENTROPY_LOSS nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK nullptr, // GGML_OP_OPT_STEP_ADAMW From beff5c4b783f45ec9fc5d1f57e11bcdbc30ff9cd Mon Sep 17 00:00:00 2001 From: nullname Date: Mon, 21 Apr 2025 12:06:16 +0800 Subject: [PATCH 149/166] feat: op perf opt (#38) * add op define xml * copy qnn libs in cmake * fix htp skel path * add windows copy file list * wip * add generated package * remove unused params * add cmake list * set qnn sdk and hexagon sdk path * wip * wip * fix tools version * fix compiling error * fix dims calc * wip * add mulmat 2d * wip * reduction * wip * wip * fix compiling error in x64 * wip * fix device description in emulator * wip * add flag * copy necessary libs * wip * load HtpPrepare first for emulator * enable custom op for 2d matrix * verify op config before add to node * Revert "verify op config before add to node" This reverts commit 206dec826e560625e053c4c78e023994f993526e. 
* wip * wip * wip * revert tool version change * use hexagon sdk version 5.5.0 https://docs.qualcomm.com/bundle/publicresource/topics/80-77512-2/release-notes-wrapper.html?product=1601111740010422#5.5.0 * wip * move to sub dir * add hexagon npu device and server lib * fix npu lib build * refactoring: rename QNNBackend enum * fix compiling error * wip * remove qnn/backend.hpp * add hexagon dsp host layer * extract rpc_mem from qnn submodule * fix dsp compiling error * wip * wip * open and lose npu device * split objects into separated files * fix linking error * add npu_tensor * add host graph * map rpc buffer before usage * fix some todos * add shared module * split rpc_interface from rpc_mem * get get_dsp_arch from device * wip * rename host classes * fix hexagon sdk arch getter * fix device open * fix linking error * fix crash * use tensor_data_type * fix npu lib crash * fix debug log print * skip empty graph * wip * add log * fix unmap fail * fix tensor set * remove some logs * flush back memory after finished * fix nb * wip * wip * add helper function * impl add op * fix some add in test-backend-ops * add elt wise sub and mul * fix crash on some inplace op * wip * fix elt wise op calc * wip * split mul_mat into file * add caps array * wip * wip * print support/unsupport op * copy lldb-server for newer android sdk * add tensor_spec * add assert * fix crash when loading model * rename cmake option * fix name * fix device memory and description * fix compiling error on qnn only build * fix some potential UBs * fix comments --- ggml/include/ggml-qnn.h | 13 - ggml/src/ggml-qnn/CMakeLists.txt | 111 +++++- ggml/src/ggml-qnn/backend-ops.hpp | 11 - ggml/src/ggml-qnn/npu/CMakeLists.txt | 147 ++++++++ ggml/src/ggml-qnn/npu/device/device.cpp | 173 +++++++++ ggml/src/ggml-qnn/npu/device/graph.cpp | 67 ++++ ggml/src/ggml-qnn/npu/device/graph.hpp | 29 ++ ggml/src/ggml-qnn/npu/device/op_impl.cpp | 194 ++++++++++ ggml/src/ggml-qnn/npu/device/op_impl.hpp | 17 + ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp | 146 +++++++ ggml/src/ggml-qnn/npu/device/op_mul_mat.hpp | 27 ++ ggml/src/ggml-qnn/npu/device/tensor.hpp | 90 +++++ ggml/src/ggml-qnn/npu/device/util.hpp | 36 ++ ggml/src/ggml-qnn/npu/host/buffer.cpp | 246 ++++++++++++ ggml/src/ggml-qnn/npu/host/buffer.hpp | 66 ++++ ggml/src/ggml-qnn/npu/host/graph.cpp | 82 ++++ ggml/src/ggml-qnn/npu/host/graph.hpp | 32 ++ ggml/src/ggml-qnn/npu/host/host.cpp | 153 ++++++++ ggml/src/ggml-qnn/npu/host/host_device.cpp | 305 +++++++++++++++ ggml/src/ggml-qnn/npu/host/host_device.hpp | 107 ++++++ ggml/src/ggml-qnn/npu/host/tensor.hpp | 88 +++++ ggml/src/ggml-qnn/npu/host/util.cpp | 96 +++++ ggml/src/ggml-qnn/npu/host/util.hpp | 26 ++ ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl | 90 +++++ ggml/src/ggml-qnn/qnn-types.hpp | 61 --- ggml/src/ggml-qnn/{ => qnn}/backend-ops.cpp | 18 +- .../{backend.hpp => qnn/backend-ops.hpp} | 18 +- ggml/src/ggml-qnn/{ => qnn}/buffer.hpp | 0 ggml/src/ggml-qnn/{ => qnn}/convert.cpp | 0 ggml/src/ggml-qnn/{ => qnn}/convert.hpp | 0 ggml/src/ggml-qnn/{ => qnn}/ggml-qnn.cpp | 130 +++---- ggml/src/ggml-qnn/{ => qnn}/graph.cpp | 8 +- ggml/src/ggml-qnn/{ => qnn}/graph.hpp | 16 +- .../ggml-qnn/qnn/hexagon/GgmlOpPackage.xml | 88 +++++ .../qnn/hexagon/GgmlOpPackage/Makefile | 357 ++++++++++++++++++ .../GgmlOpPackage/config/GgmlOpPackage.xml | 88 +++++ .../src/GgmlOpPackageInterface.cpp | 274 ++++++++++++++ .../GgmlOpPackage/src/ops/GgmlMulMat.cpp | 213 +++++++++++ ggml/src/ggml-qnn/{ => qnn}/logger.cpp | 0 ggml/src/ggml-qnn/{ => qnn}/logger.hpp | 0 
.../src/ggml-qnn/{ => qnn}/op-config-base.hpp | 3 +- .../src/ggml-qnn/{ => qnn}/op-config-caps.cpp | 24 +- .../src/ggml-qnn/{ => qnn}/op-config-impl.cpp | 26 +- .../src/ggml-qnn/{ => qnn}/op-config-impl.hpp | 21 +- ggml/src/ggml-qnn/{ => qnn}/op-config.hpp | 0 ggml/src/ggml-qnn/{ => qnn}/profiler.cpp | 0 ggml/src/ggml-qnn/{ => qnn}/profiler.hpp | 0 ggml/src/ggml-qnn/{ => qnn}/qnn-lib.cpp | 319 ++++++++-------- ggml/src/ggml-qnn/{ => qnn}/qnn-lib.hpp | 67 ++-- ggml/src/ggml-qnn/qnn/qnn-types.hpp | 51 +++ ggml/src/ggml-qnn/{ => qnn}/tensor.hpp | 8 +- ggml/src/ggml-qnn/{ => qnn}/utils.cpp | 60 +-- ggml/src/ggml-qnn/{ => qnn}/utils.hpp | 7 +- ggml/src/ggml-qnn/shared/CMakeLists.txt | 35 ++ ggml/src/ggml-qnn/shared/common.cpp | 146 +++++++ ggml/src/ggml-qnn/shared/common.hpp | 56 +++ .../dyn-lib-loader.hpp} | 20 +- ggml/src/ggml-qnn/shared/rpc-interface.hpp | 223 +++++++++++ ggml/src/ggml-qnn/shared/rpc-mem.hpp | 129 +++++++ 59 files changed, 4334 insertions(+), 484 deletions(-) delete mode 100644 ggml/src/ggml-qnn/backend-ops.hpp create mode 100644 ggml/src/ggml-qnn/npu/CMakeLists.txt create mode 100644 ggml/src/ggml-qnn/npu/device/device.cpp create mode 100644 ggml/src/ggml-qnn/npu/device/graph.cpp create mode 100644 ggml/src/ggml-qnn/npu/device/graph.hpp create mode 100644 ggml/src/ggml-qnn/npu/device/op_impl.cpp create mode 100644 ggml/src/ggml-qnn/npu/device/op_impl.hpp create mode 100644 ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp create mode 100644 ggml/src/ggml-qnn/npu/device/op_mul_mat.hpp create mode 100644 ggml/src/ggml-qnn/npu/device/tensor.hpp create mode 100644 ggml/src/ggml-qnn/npu/device/util.hpp create mode 100644 ggml/src/ggml-qnn/npu/host/buffer.cpp create mode 100644 ggml/src/ggml-qnn/npu/host/buffer.hpp create mode 100644 ggml/src/ggml-qnn/npu/host/graph.cpp create mode 100644 ggml/src/ggml-qnn/npu/host/graph.hpp create mode 100644 ggml/src/ggml-qnn/npu/host/host.cpp create mode 100644 ggml/src/ggml-qnn/npu/host/host_device.cpp create mode 100644 ggml/src/ggml-qnn/npu/host/host_device.hpp create mode 100644 ggml/src/ggml-qnn/npu/host/tensor.hpp create mode 100644 ggml/src/ggml-qnn/npu/host/util.cpp create mode 100644 ggml/src/ggml-qnn/npu/host/util.hpp create mode 100644 ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl delete mode 100644 ggml/src/ggml-qnn/qnn-types.hpp rename ggml/src/ggml-qnn/{ => qnn}/backend-ops.cpp (94%) rename ggml/src/ggml-qnn/{backend.hpp => qnn/backend-ops.hpp} (76%) rename ggml/src/ggml-qnn/{ => qnn}/buffer.hpp (100%) rename ggml/src/ggml-qnn/{ => qnn}/convert.cpp (100%) rename ggml/src/ggml-qnn/{ => qnn}/convert.hpp (100%) rename ggml/src/ggml-qnn/{ => qnn}/ggml-qnn.cpp (81%) rename ggml/src/ggml-qnn/{ => qnn}/graph.cpp (98%) rename ggml/src/ggml-qnn/{ => qnn}/graph.hpp (85%) create mode 100644 ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage.xml create mode 100644 ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/Makefile create mode 100644 ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/config/GgmlOpPackage.xml create mode 100644 ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/src/GgmlOpPackageInterface.cpp create mode 100644 ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/src/ops/GgmlMulMat.cpp rename ggml/src/ggml-qnn/{ => qnn}/logger.cpp (100%) rename ggml/src/ggml-qnn/{ => qnn}/logger.hpp (100%) rename ggml/src/ggml-qnn/{ => qnn}/op-config-base.hpp (98%) rename ggml/src/ggml-qnn/{ => qnn}/op-config-caps.cpp (95%) rename ggml/src/ggml-qnn/{ => qnn}/op-config-impl.cpp (94%) rename ggml/src/ggml-qnn/{ => qnn}/op-config-impl.hpp (83%) rename ggml/src/ggml-qnn/{ => 
qnn}/op-config.hpp (100%) rename ggml/src/ggml-qnn/{ => qnn}/profiler.cpp (100%) rename ggml/src/ggml-qnn/{ => qnn}/profiler.hpp (100%) rename ggml/src/ggml-qnn/{ => qnn}/qnn-lib.cpp (69%) rename ggml/src/ggml-qnn/{ => qnn}/qnn-lib.hpp (91%) create mode 100644 ggml/src/ggml-qnn/qnn/qnn-types.hpp rename ggml/src/ggml-qnn/{ => qnn}/tensor.hpp (98%) rename ggml/src/ggml-qnn/{ => qnn}/utils.cpp (92%) rename ggml/src/ggml-qnn/{ => qnn}/utils.hpp (97%) create mode 100644 ggml/src/ggml-qnn/shared/CMakeLists.txt create mode 100644 ggml/src/ggml-qnn/shared/common.cpp create mode 100644 ggml/src/ggml-qnn/shared/common.hpp rename ggml/src/ggml-qnn/{dl-loader.hpp => shared/dyn-lib-loader.hpp} (67%) create mode 100644 ggml/src/ggml-qnn/shared/rpc-interface.hpp create mode 100644 ggml/src/ggml-qnn/shared/rpc-mem.hpp diff --git a/ggml/include/ggml-qnn.h b/ggml/include/ggml-qnn.h index 48194106cfad9..6d3e66d3d3cbd 100644 --- a/ggml/include/ggml-qnn.h +++ b/ggml/include/ggml-qnn.h @@ -1,24 +1,11 @@ #pragma once #include "ggml-backend.h" -#include "ggml.h" #ifdef __cplusplus extern "C" { #endif -#define GGML_QNN_NAME "qnn" -#define GGML_QNN_MAX_DEVICES QNN_BACKEND_COUNT - -enum QNNBackend { - QNN_BACKEND_CPU = 0, - QNN_BACKEND_GPU, - QNN_BACKEND_NPU, - QNN_BACKEND_COUNT, -}; - -GGML_API bool ggml_backend_is_qnn(ggml_backend_t backend); - GGML_API ggml_backend_reg_t ggml_backend_qnn_reg(void); #ifdef __cplusplus diff --git a/ggml/src/ggml-qnn/CMakeLists.txt b/ggml/src/ggml-qnn/CMakeLists.txt index b3591f903ddf9..3e8fa3a1b8117 100644 --- a/ggml/src/ggml-qnn/CMakeLists.txt +++ b/ggml/src/ggml-qnn/CMakeLists.txt @@ -1,9 +1,13 @@ message(STATUS "Using QNN backend") +option(GGML_HEXAGON_NPU_ONLY "ggml-qnn: Only use Hexagon NPU" OFF) +option(GGML_QNN_ENABLE_HEXAGON_BACKEND "ggml-qnn: Enable Hexagon custom package" ${GGML_HEXAGON_NPU_ONLY}) + if(CMAKE_SYSTEM_NAME STREQUAL "Android") find_library(LOG_LIB log) set(QNN_LINK_LIBRARIES ${LOG_LIB}) set(QNN_DEFAULT_LIB_SEARCH_PATH "/data/local/tmp/" CACHE STRING "customized library search path for QNN backend") + add_compile_options(-g -O0) elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows" OR CMAKE_SYSTEM_NAME STREQUAL "Linux") set(QNN_DEFAULT_LIB_SEARCH_PATH "" CACHE STRING "customized library search path for QNN backend") else() @@ -21,15 +25,22 @@ if(NOT DEFINED GGML_QNN_SDK_PATH) endif() message("CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}") +message("CMAKE_CXX_FLAGS_DEBUG: ${CMAKE_CXX_FLAGS_DEBUG}") message("CMAKE_CXX_FLAGS_RELEASE: ${CMAKE_CXX_FLAGS_RELEASE}") message("QNN_SDK_PATH: ${GGML_QNN_SDK_PATH}") -file(GLOB QNN_SOURCES "${CMAKE_CURRENT_LIST_DIR}/*.cpp") +file(GLOB QNN_SOURCES "${CMAKE_CURRENT_LIST_DIR}/qnn/*.cpp") +file(GLOB COMMON_SOURCES "${CMAKE_CURRENT_LIST_DIR}/*.cpp") ggml_add_backend_library(ggml-qnn ${QNN_SOURCES} + ${COMMON_SOURCES} ) -target_include_directories(ggml-qnn PRIVATE ${GGML_QNN_SDK_PATH}/include/QNN ${CMAKE_CURRENT_LIST_DIR}) +target_include_directories(ggml-qnn PRIVATE + ${GGML_QNN_SDK_PATH}/include/QNN + ${CMAKE_CURRENT_LIST_DIR}/qnn + ${CMAKE_CURRENT_LIST_DIR} +) target_link_libraries(ggml-qnn PRIVATE ${QNN_LINK_LIBRARIES}) if(NOT "${QNN_DEFAULT_LIB_SEARCH_PATH}" STREQUAL "") @@ -52,3 +63,99 @@ if(GGML_QNN_ENABLE_PERFORMANCE_TRACKING) else() message("GGML_QNN_ENABLE_PERFORMANCE_TRACKING is disabled") endif() + +add_subdirectory(shared) + +if(GGML_HEXAGON_NPU_ONLY) + message("GGML_HEXAGON_NPU_ONLY is enabled") + add_compile_definitions(GGML_HEXAGON_NPU_ONLY) + set(GGML_QNN_ENABLE_HEXAGON_BACKEND ON) +else() + message("GGML_HEXAGON_NPU_ONLY is 
disabled") +endif() + +if(GGML_QNN_ENABLE_HEXAGON_BACKEND) + message("GGML_QNN_ENABLE_HEXAGON_BACKEND is enabled") + add_subdirectory(npu) + target_link_libraries(hexagon-npu-host runtime-common) + target_link_libraries(ggml-qnn PRIVATE hexagon-npu-host) +else() + message("GGML_QNN_ENABLE_HEXAGON_BACKEND is disabled") + target_link_libraries(ggml-qnn PRIVATE runtime-common) +endif() + +# Copy QNN dynamic libraries +set(QNN_DYNAMIC_LIBS "") + +if(CMAKE_SYSTEM_NAME STREQUAL "Android" OR CMAKE_SYSTEM_NAME STREQUAL "Linux") + if(CMAKE_SYSTEM_NAME STREQUAL "Android") + # Android + set(QNN_SDK_LIB_PATH "${GGML_QNN_SDK_PATH}/lib/aarch64-android") + elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + # Linux x86_64 + set(QNN_SDK_LIB_PATH "${GGML_QNN_SDK_PATH}/lib/x86_64-linux-clang") + else() + # Linux aarch64 + set(QNN_SDK_LIB_PATH "${GGML_QNN_SDK_PATH}/lib/aarch64-oe-linux-gcc11.2") + endif() + + list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/libQnnSystem.so") + list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/libQnnCpu.so") + list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/libQnnGpu.so") + list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/libQnnHtp.so") + file(GLOB HTP_STUB_LIBS "${QNN_SDK_LIB_PATH}/libQnnHtp*.so") + list(APPEND QNN_DYNAMIC_LIBS ${HTP_STUB_LIBS}) + + if(CMAKE_SYSTEM_NAME STREQUAL "Android") + file(GLOB HTP_SKEL_LIBS "${GGML_QNN_SDK_PATH}/lib/hexagon-*/unsigned/libQnnHtp*Skel.so") + list(APPEND QNN_DYNAMIC_LIBS ${HTP_SKEL_LIBS}) + + if(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64") + if(EXISTS "${CMAKE_ANDROID_NDK}/prebuilt/android-arm64/gdbserver/gdbserver") + list(APPEND QNN_DYNAMIC_LIBS "${CMAKE_ANDROID_NDK}/prebuilt/android-arm64/gdbserver/gdbserver") + message("old ndk, copy gdbserver") + else() + file(GLOB LLDB_SERVER "${CMAKE_ANDROID_NDK}/toolchains/llvm/prebuilt/linux-x86_64/lib64/clang/*/lib/linux/aarch64/lldb-server") + list(APPEND QNN_DYNAMIC_LIBS ${LLDB_SERVER}) + message("new ndk, copy lldb-server") + endif() + + file(GLOB OMP_LIBS "${CMAKE_ANDROID_NDK}/toolchains/llvm/prebuilt/linux-x86_64/lib64/clang/*/lib/linux/aarch64/libomp.so") + file(GLOB ASAN_LIBS "${CMAKE_ANDROID_NDK}/toolchains/llvm/prebuilt/linux-x86_64/lib64/clang/*/lib/linux/libclang_rt.asan-aarch64-android.so") + list(APPEND QNN_DYNAMIC_LIBS ${OMP_LIBS}) + list(APPEND QNN_DYNAMIC_LIBS ${ASAN_LIBS}) + endif() + else() + # Linux + list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/libHtpPrepare.so") + endif() +elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows") + if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + # x86_64 + set(QNN_SDK_LIB_PATH "${GGML_QNN_SDK_PATH}/lib/x86_64-windows-msvc") + else() + # aarch64 + set(QNN_SDK_LIB_PATH "${GGML_QNN_SDK_PATH}/lib/aarch64-windows-msvc") + endif() + + list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/QnnSystem.dll") + list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/QnnCpu.dll") + list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/QnnGpu.dll") + list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/QnnHtp.dll") + file(GLOB HTP_STUB_LIBS "${QNN_SDK_LIB_PATH}/QnnHtp*.dll") + + if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/HtpPrepare.dll") + endif() + + list(APPEND QNN_DYNAMIC_LIBS ${HTP_STUB_LIBS}) +endif() + +foreach(QNN_DYNAMIC_LIB ${QNN_DYNAMIC_LIBS}) + message("Copy: ${QNN_DYNAMIC_LIB} -> ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}") + add_custom_command( + TARGET ggml-qnn POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + ${QNN_DYNAMIC_LIB} + ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) +endforeach() diff --git 
a/ggml/src/ggml-qnn/backend-ops.hpp b/ggml/src/ggml-qnn/backend-ops.hpp deleted file mode 100644 index 64fb10f00ddfe..0000000000000 --- a/ggml/src/ggml-qnn/backend-ops.hpp +++ /dev/null @@ -1,11 +0,0 @@ -#pragma once - -#include "backend.hpp" -#include "ggml.h" - -namespace qnn { - -bool device_supports_op(ggml_backend_qnn_device_context * ctx, const ggml_tensor * op); -bool device_compute_graph(ggml_backend_qnn_device_context * ctx, ggml_cgraph * cgraph); - -} // namespace qnn diff --git a/ggml/src/ggml-qnn/npu/CMakeLists.txt b/ggml/src/ggml-qnn/npu/CMakeLists.txt new file mode 100644 index 0000000000000..4c734bb098999 --- /dev/null +++ b/ggml/src/ggml-qnn/npu/CMakeLists.txt @@ -0,0 +1,147 @@ +enable_language(ASM) +cmake_policy(SET CMP0115 OLD) + +if(DEFINED ENV{HEXAGON_SDK_ROOT}) + set(HEXAGON_SDK_ROOT $ENV{HEXAGON_SDK_ROOT}) + message("HEXAGON_SDK_ROOT: ${HEXAGON_SDK_ROOT}") +else() + message(FATAL_ERROR "HEXAGON_SDK_ROOT not defined") +endif() + +if(HEXAGON_SDK_ROOT) + include(${HEXAGON_SDK_ROOT}/build/cmake/hexagon_fun.cmake) +else() + include(${HEXAGON_CMAKE_ROOT}/hexagon_fun.cmake) +endif() + +# Base Include dirs for the Project +set(common_incs + ${CMAKE_CURRENT_BINARY_DIR}/ + ${HEXAGON_SDK_ROOT}/incs/ + ${HEXAGON_SDK_ROOT}/incs/stddef/ + ${HEXAGON_SDK_ROOT}/incs/HAP/ + ${HEXAGON_SDK_ROOT}/rtos/qurt/ + ${HEXAGON_SDK_ROOT}/utils/examples/ +) + +include_directories(${common_incs}) + +if(${CMAKE_SYSTEM_NAME} MATCHES "Android|Linux|Windows") + # host build + file(GLOB common_srcs "${CMAKE_CURRENT_LIST_DIR}/common/*.cpp") + file(GLOB host_srcs "${CMAKE_CURRENT_LIST_DIR}/host/*.cpp") + set(stub_srcs "${CMAKE_CURRENT_BINARY_DIR}/npu_device_stub.c") + add_library(hexagon-npu-host STATIC + ${common_srcs} + ${host_srcs} + ${stub_srcs} + ) + + # disable warnings for the stub + set_source_files_properties( + ${stub_srcs} + PROPERTIES + COMPILE_FLAGS "-w" + ) + + build_idl(idl/hexagon_npu.idl hexagon-npu-host) + + # Add compile definitions to the target + target_compile_definitions(hexagon-npu-host PUBLIC + VERIFY_PRINT_ERROR + GGML_QNN_ENABLE_HEXAGON_BACKEND + ) + + target_include_directories(hexagon-npu-host PRIVATE + ${HEXAGON_SDK_ROOT}/ipc/fastrpc/rpcmem/inc/ + ${QNN_SDK_ROOT}/include/QNN/ + ${CMAKE_CURRENT_LIST_DIR}/host/ + ${CMAKE_CURRENT_LIST_DIR}/ + ) + + target_include_directories(hexagon-npu-host PUBLIC + ${HEXAGON_SDK_ROOT}/incs/ # TODO: this is for rpc-mem + ) + + if(NOT ${CMAKE_SYSTEM_NAME} MATCHES "Windows") + set_target_properties(hexagon-npu-host PROPERTIES OUTPUT_NAME "hexagon_npu") + endif() + + if(${CMAKE_SYSTEM_NAME} MATCHES "Android|Linux") + target_link_options(hexagon-npu-host PUBLIC -pie) + endif() + + link_options(hexagon-npu-host) + + if(${CMAKE_SYSTEM_NAME} MATCHES "Android") + set(PREBUILT_LIB_DIR "android_aarch64") + elseif(${CMAKE_SYSTEM_NAME} MATCHES "Linux") + set(PREBUILT_LIB_DIR "UbuntuARM_aarch64") + else() + # Windows + set(PREBUILT_LIB_DIR "windows_aarch64") + endif() + + choose_dsprpc("3" dsprpc) # cdsprpc + link_custom_library(hexagon-npu-host ${dsprpc}) +else() + # hexagon npu build + cmake_minimum_required(VERSION 3.14.3) + project(hexagon_npu C CXX ASM) + + # check if QNN_SDK_ROOT is set + if(NOT DEFINED ENV{QNN_SDK_ROOT}) + message(FATAL_ERROR "QNN_SDK_ROOT not defined") + endif() + + set(QNN_SDK_ROOT $ENV{QNN_SDK_ROOT}) + message("QNN_SDK_ROOT: ${QNN_SDK_ROOT}") + include_directories( + ${QNN_SDK_ROOT}/include/QNN/ + ) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17") + + file(GLOB common_srcs "${CMAKE_CURRENT_LIST_DIR}/common/*.cpp") + file(GLOB 
device_srcs "${CMAKE_CURRENT_LIST_DIR}/device/*.cpp") + set(skel_srcs "${CMAKE_CURRENT_BINARY_DIR}/npu_device_skel.c") + add_library(hexagon_npu_skel_OBJS OBJECT + ${common_srcs} + ${device_srcs} + ${skel_srcs} + ) + + if(CMAKE_BUILD_TYPE MATCHES "Debug|Dbg") + message("Debug build, enable all logging") + target_compile_definitions(hexagon_npu_skel_OBJS PUBLIC + _DEBUG + DEBUG_LOGGING + ) + else() + message("Release build, disable debug logging") + target_compile_definitions(hexagon_npu_skel_OBJS PUBLIC + NDEBUG + RELEASE_LOGGING + ) + endif() + + build_idl(idl/hexagon_npu.idl hexagon_npu_skel_OBJS) + + # disable warnings for the skel + set_source_files_properties( + ${skel_srcs} + PROPERTIES + COMPILE_FLAGS "-w" + ) + + add_library(hexagon_npu_skel SHARED $) + + target_link_libraries(hexagon_npu_skel + ${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc++abi.a + ${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc++.a + ) + set_target_properties(hexagon_npu_skel PROPERTIES OUTPUT_NAME "hexagon_npu_skel_${HEXAGON_ARCH}") + + copy_binaries(hexagon_npu_skel) +endif() + +# vim: set noet fenc=utf-8 ff=unix ft=cmake : diff --git a/ggml/src/ggml-qnn/npu/device/device.cpp b/ggml/src/ggml-qnn/npu/device/device.cpp new file mode 100644 index 0000000000000..2368d44f671ef --- /dev/null +++ b/ggml/src/ggml-qnn/npu/device/device.cpp @@ -0,0 +1,173 @@ + +#include +#include +#include + +#include + +#include "graph.hpp" +#include "hexagon_npu.h" +#include "op_impl.hpp" +#include "remote.h" +#include "tensor.hpp" +#include "util.hpp" + +#define NPU_UNUSED(x) (void) (x) + +namespace { + +struct npu_device_context { + int unused = 0; + // TODO: should we add tensor context here? +}; + +inline hexagon::tensor * tensor_from_handle(npu_device_graph_handle_t h) { + return reinterpret_cast(h); +} + +inline npu_device_graph_handle_t tensor_to_handle(hexagon::tensor * tensor) { + return reinterpret_cast(tensor); +} + +inline hexagon::graph * graph_from_handle(npu_device_tensor_handle_t h) { + return reinterpret_cast(h); +} + +inline npu_device_tensor_handle_t graph_to_handle(hexagon::graph * graph) { + return reinterpret_cast(graph); +} + +} // namespace + +int npu_device_open(const char * uri, remote_handle64 * h) { + // TODO: should we have a device context here? 
+ auto * context = new (std::nothrow) npu_device_context(); + if (!context) { + DEVICE_LOG_ERROR("Failed to allocate memory for the npu_device_context"); + return AEE_ENOMEMORY; + } + + *h = reinterpret_cast(context); + return AEE_SUCCESS; +} + +int npu_device_close(remote_handle64 h) { + auto * context = reinterpret_cast(h); + if (!context) { + DEVICE_LOG_ERROR("Invalid npu_device_context handle"); + return AEE_EINVHANDLE; + } + + delete context; + return AEE_SUCCESS; +} + +AEEResult npu_device_device_get_alignment(remote_handle64 _h, uint32_t * alignment) { + NPU_UNUSED(_h); + *alignment = sizeof(HVX_Vector); + return AEE_SUCCESS; +} + +AEEResult npu_device_device_support_op(remote_handle64 _h, const npu_device_tensor_spec * src0, + const npu_device_tensor_spec * src1, const npu_device_tensor_spec * dst, + npu_device_tensor_op op, boolean * is_supported) { + NPU_UNUSED(_h); + *is_supported = hexagon::support_op(*src0, *src1, *dst, op); + return AEE_SUCCESS; +} + +AEEResult npu_device_tensor_init(remote_handle64 _h, const npu_device_tensor_config * info, + npu_device_tensor_handle_t * tensor_handle) { + NPU_UNUSED(_h); + auto * tensor = new (std::nothrow) hexagon::tensor(*info); + if (!tensor) { + DEVICE_LOG_ERROR("Failed to allocate memory for the tensor"); + return AEE_ENOMEMORY; + } + + *tensor_handle = tensor_to_handle(tensor); + return AEE_SUCCESS; +} + +AEEResult npu_device_tensor_set_src(remote_handle64 _h, npu_device_tensor_handle_t tensor_handle, uint64_t index, + npu_device_tensor_handle_t src) { + NPU_UNUSED(_h); + auto * tensor = tensor_from_handle(tensor_handle); + if (!tensor) { + return AEE_EINVHANDLE; + } + + auto * src_tensor = tensor_from_handle(src); + tensor->set_src(index, src_tensor); + return AEE_SUCCESS; +} + +AEEResult npu_device_tensor_set_op(remote_handle64 _h, npu_device_tensor_handle_t tensor_handle, + npu_device_tensor_op op) { + NPU_UNUSED(_h); + auto * tensor = tensor_from_handle(tensor_handle); + if (!tensor) { + return AEE_EINVHANDLE; + } + + tensor->set_op(op); + return AEE_SUCCESS; +} + +AEEResult npu_device_tensor_free(remote_handle64 _h, npu_device_tensor_handle_t tensor_handle) { + NPU_UNUSED(_h); + auto * tensor = tensor_from_handle(tensor_handle); + if (!tensor) { + return AEE_EINVHANDLE; + } + + delete tensor; + return AEE_SUCCESS; +} + +AEEResult npu_device_graph_init(remote_handle64 _h, npu_device_graph_handle_t * graph_handle) { + NPU_UNUSED(_h); + auto * graph = new (std::nothrow) hexagon::graph(); + if (!graph) { + return AEE_ENOMEMORY; + } + + *graph_handle = graph_to_handle(graph); + return AEE_SUCCESS; +} + +AEEResult npu_device_graph_set_tensor(remote_handle64 _h, npu_device_graph_handle_t graph_handle, + const npu_device_tensor_handle_t * tensor_handles, int tensor_handlesLen) { + NPU_UNUSED(_h); + auto * graph = graph_from_handle(graph_handle); + if (!graph || !tensor_handles || tensor_handlesLen <= 0) { + return AEE_EINVHANDLE; + } + + graph->set_tensor(tensor_handles, tensor_handlesLen); + return AEE_SUCCESS; +} + +AEEResult npu_device_graph_compute(remote_handle64 _h, npu_device_graph_handle_t graph_handle) { + NPU_UNUSED(_h); + auto * graph = graph_from_handle(graph_handle); + if (!graph) { + return AEE_EINVHANDLE; + } + + if (!graph->compute()) { + return AEE_EFAILED; + } + + return AEE_SUCCESS; +} + +AEEResult npu_device_graph_free(remote_handle64 _h, npu_device_graph_handle_t graph_handle) { + NPU_UNUSED(_h); + auto * graph = graph_from_handle(graph_handle); + if (graph) { + delete graph; + } + + return AEE_SUCCESS; +} diff --git 
a/ggml/src/ggml-qnn/npu/device/graph.cpp b/ggml/src/ggml-qnn/npu/device/graph.cpp new file mode 100644 index 0000000000000..b21b8add2997c --- /dev/null +++ b/ggml/src/ggml-qnn/npu/device/graph.cpp @@ -0,0 +1,67 @@ + +#include "graph.hpp" + +#include + +#include "op_impl.hpp" +#include "util.hpp" + +namespace hexagon { + +graph::~graph() noexcept { + if (_tensors) { + delete[] _tensors; + } +} + +void graph::set_tensor(const npu_device_tensor_handle_t * tensors, int tensor_count) { + if (_tensor_count > 0) { + delete[] _tensors; + } + + if (tensor_count <= 0) { + _tensors = nullptr; + _tensor_count = 0; + return; + } + + _tensors = new (std::nothrow) tensor *[tensor_count]; + for (int i = 0; i < tensor_count; ++i) { + auto * tensor_obj = reinterpret_cast(tensors[i]); + _tensors[i] = tensor_obj; + DEVICE_LOG_DEBUG("graph(%p) set_tensor[%d]: %p(%p,%p), op: %d\n", (void *) this, i, (void *) tensor_obj, + (void *) tensor_obj->get_src(0), (void *) tensor_obj->get_src(1), tensor_obj->get_op()); + } + + _tensor_count = tensor_count; + DEVICE_LOG_DEBUG("graph(%p) tensor count: %zu\n", (void *) this, _tensor_count); +} + +bool graph::compute() { + if (!_tensors || !_tensor_count) { + DEVICE_LOG_DEBUG("graph(%p) no tensors to compute\n", (void *) this); + return true; // return success if no tensors to compute + } + + DEVICE_LOG_DEBUG("graph(%p) compute\n", (void *) this); + for (size_t i = 0; i < _tensor_count; ++i) { + auto * dst = _tensors[i]; + auto op = dst->get_op(); + auto * func = get_compute_func(op); + if (!func) { + DEVICE_LOG_ERROR("graph(%p) tensor[%zu] op %d not supported\n", (void *) this, i, op); + return false; + } + + if (!func(dst)) { + DEVICE_LOG_ERROR("graph(%p) tensor[%zu] op %d compute failed\n", (void *) this, i, op); + return false; + } + + dst->flush(); // TODO: optimize this + } + + return true; +} + +} // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/device/graph.hpp b/ggml/src/ggml-qnn/npu/device/graph.hpp new file mode 100644 index 0000000000000..22f6615d1435f --- /dev/null +++ b/ggml/src/ggml-qnn/npu/device/graph.hpp @@ -0,0 +1,29 @@ +#pragma once + +#include "hexagon_npu.h" +#include "tensor.hpp" + +namespace hexagon { + +class graph { + public: + // TODO: add execute direction here + explicit graph() noexcept {} + + ~graph() noexcept; + + void set_tensor(const npu_device_tensor_handle_t * tensors, int tensor_count); + + bool compute(); + + private: + tensor ** _tensors = nullptr; + size_t _tensor_count = 0; + + graph(const graph &) = delete; + void operator=(const graph &) = delete; + graph(graph &&) = delete; + void operator=(graph &&) = delete; +}; + +} // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/device/op_impl.cpp b/ggml/src/ggml-qnn/npu/device/op_impl.cpp new file mode 100644 index 0000000000000..7067a1d52bc9a --- /dev/null +++ b/ggml/src/ggml-qnn/npu/device/op_impl.cpp @@ -0,0 +1,194 @@ + + +#include "op_impl.hpp" + +#include +#include + +#include "op_mul_mat.hpp" + +namespace { + +template +inline void vec_op_f32_f32(const float * src0, const float * src1, size_t count, float * dst) { + HVX_Vector * iptr0 = ((HVX_Vector *) src0); + HVX_Vector * iptr0_end = ((HVX_Vector *) src0) + (count / hexagon::kFloatsPerVector); + HVX_Vector * iptr1 = ((HVX_Vector *) src1); + HVX_Vector * optr = ((HVX_Vector *) dst); + HVX_Vector prev0 = *iptr0++; + HVX_Vector prev1 = *iptr1++; + + // TODO: prefetch or just use VTCM? 
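+    // Main loop: consume one full HVX vector (kFloatsPerVector floats) per iteration.
+    // Q6_V_valign_VVR stitches the previous and current loads together so the arithmetic
+    // always sees the data as if src0/src1 were vector-aligned, even for unaligned row
+    // pointers; the element-wise op is supplied via the _OpIntrinsic template parameter and
+    // its qf32 result is converted back to IEEE float with Q6_Vsf_equals_Vqf32.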
+ while (iptr0 < iptr0_end) { + HVX_Vector curr0 = *iptr0++; + HVX_Vector curr1 = *iptr1++; + HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); + HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); + *optr++ = Q6_Vsf_equals_Vqf32(_OpIntrinsic(s0, s1)); + prev0 = curr0; + prev1 = curr1; + } + + if ((iptr0_end - ((HVX_Vector *) src0)) > 0) { + // handle the last vector + // see also: + // https://github.com/UbiquitousLearning/mllm/blob/babf4410352ce8730824c87699c025a0d4ce3a6f/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/src/ops/LLaMAMul.cpp#L147 + // or qualcomm sdk libs\qhl_hvx\src\qhblas_hvx\qhblas_hvx_aw_vector_add_ah.c + bool iptr0_aligned = hexagon::is_addr_aligned(iptr0); + HVX_Vector curr0 = iptr0_aligned ? prev0 : *iptr0; + iptr0 = iptr0_aligned ? iptr0 : iptr0 + 1; + bool iptr1_aligned = hexagon::is_addr_aligned(iptr1); + HVX_Vector curr1 = iptr1_aligned ? prev1 : *iptr1; + iptr1 = iptr1_aligned ? iptr1 : iptr1 + 1; + HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); + HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); + *optr++ = Q6_Vsf_equals_Vqf32(_OpIntrinsic(s0, s1)); + prev0 = curr0; + prev1 = curr1; + } + + const size_t leftover = count % hexagon::kFloatsPerVector; + const size_t leftover_bytes = leftover * sizeof(float); + if (leftover > 0) { + // handle the leftover elements + HVX_Vector curr0 = + (leftover_bytes + hexagon::unaligned_bytes(iptr0) > hexagon::kBytesPerVector) ? *iptr0 : prev0; + curr0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); + + HVX_Vector curr1 = + (leftover_bytes + hexagon::unaligned_bytes(iptr1) > hexagon::kBytesPerVector) ? *iptr1 : prev1; + curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); + + q6op_vstu_variable_ARV(optr, leftover_bytes, Q6_Vsf_equals_Vqf32(_OpIntrinsic(curr0, curr1))); + } +} + +inline HVX_Vector vadd_f32_f32(HVX_Vector a, HVX_Vector b) { + return Q6_Vqf32_vadd_VsfVsf(a, b); +} + +inline HVX_Vector vsub_f32_f32(HVX_Vector a, HVX_Vector b) { + return Q6_Vqf32_vsub_VsfVsf(a, b); +} + +inline HVX_Vector vmul_f32_f32(HVX_Vector a, HVX_Vector b) { + return Q6_Vqf32_vmpy_VsfVsf(a, b); +} + +template +bool element_wise_op(hexagon::tensor * out) { + if (!out) { + return false; + } + + auto * src0 = out->get_src(0); + auto * src1 = out->get_src(1); + if (!src0 || !src1) { + return true; // skip if no src + } + + if (src0->get_ne(0) != src1->get_ne(0)) { + // TODO: handle this case + DEVICE_LOG_ERROR("src0[0] and src1[0] not match: %ld vs %ld\n", (long) src0->get_ne(0), (long) src1->get_ne(0)); + return false; + } + + static_assert(DEVICE_TENSOR_MAX_DIMS == 4, "element_wise_op requires max dims 4"); + + const auto * src0_ptr = reinterpret_cast(src0->get_data()); + const auto * src1_ptr = reinterpret_cast(src1->get_data()); + auto * dst_ptr = reinterpret_cast(out->get_data()); + for (int64_t i3 = 0; i3 < out->get_ne(3); i3++) { + const auto * src0_cube = src0_ptr + i3 * src0->get_nb(3); + const auto * src1_cube = src1_ptr + (i3 % src1->get_ne(3)) * src1->get_nb(3); + auto * dst_cube = dst_ptr + i3 * out->get_nb(3); + for (int64_t i2 = 0; i2 < out->get_ne(2); i2++) { + const auto * src0_plane = src0_cube + i2 * src0->get_nb(2); + const auto * src1_plane = src1_cube + (i2 % src1->get_ne(2)) * src1->get_nb(2); + auto * dst_plane = dst_cube + i2 * out->get_nb(2); + for (int64_t i1 = 0; i1 < out->get_ne(1); i1++) { + // TODO: prefetch row? 
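+                // The i1/i2/i3 indices into src1 are wrapped with a modulo against src1's
+                // extents, so a smaller src1 is broadcast across dst (ggml-style); each dst
+                // row is then produced by a single call to the _RowFunc row kernel.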
+ auto * src0_row = src0_plane + i1 * src0->get_nb(1); + auto * src1_row = src1_plane + (i1 % src1->get_ne(1)) * src1->get_nb(1); + auto * dst_row = reinterpret_cast(dst_plane + i1 * out->get_nb(1)); + _RowFunc(reinterpret_cast(src0_row), reinterpret_cast(src1_row), + static_cast(out->get_ne(0)), reinterpret_cast<_TyDst *>(dst_row)); + } + } + } + + return true; +} + +bool is_element_wise_op_supported(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1, + const npu_device_tensor_spec & dst, npu_device_tensor_op op) { + if (op != NPU_OP_ADD && op != NPU_OP_SUB && op != NPU_OP_MUL) { + DEVICE_LOG_DEBUG("Unsupported element wise op: %s\n", hexagon::op_get_name(op)); + return false; + } + + if (src0.ne[0] != src1.ne[0]) { + DEVICE_LOG_DEBUG("src0.ne[0] and src1.ne[0] not match: %ld vs %ld\n", (long) src0.ne[0], (long) src1.ne[0]); + return false; + } + + for (size_t i = 0; i < DEVICE_TENSOR_MAX_DIMS; ++i) { + if (src0.ne[i] != dst.ne[i]) { + DEVICE_LOG_DEBUG("src0.ne[%zu] and dst.ne[%zu] not match: %lld vs %lld\n", i, i, (long long) src0.ne[i], + (long long) dst.ne[i]); + return false; + } + } + + return true; +} + +struct op_capabilities { + npu_device_tensor_op op; + hexagon::compute_func_type compute_func; + hexagon::op_is_supported_func_type is_supported; +}; + +constexpr const op_capabilities kOpCapabilities[] = { + { NPU_OP_MUL_MAT, hexagon::mul_mat_f32, hexagon::is_mul_mat_supported }, + { NPU_OP_ADD, element_wise_op>, is_element_wise_op_supported }, + { NPU_OP_SUB, element_wise_op>, is_element_wise_op_supported }, + { NPU_OP_MUL, element_wise_op>, is_element_wise_op_supported }, +}; + +static_assert(kOpCapabilities[NPU_OP_MUL_MAT].compute_func == hexagon::mul_mat_f32, + "kOpArray[NPU_OP_MUL_MAT] != mul_mat_f32"); + +static_assert(std::size(kOpCapabilities) == NPU_OP_COUNT); +static_assert(kOpCapabilities[NPU_OP_MUL_MAT].op == NPU_OP_MUL_MAT, "kOpArray[NPU_OP_MUL_MAT].op != NPU_OP_MUL_MAT"); +static_assert(kOpCapabilities[NPU_OP_MUL].op == NPU_OP_MUL, "kOpArray[NPU_OP_MUL].op != NPU_OP_MUL"); + +} // namespace + +namespace hexagon { + +compute_func_type get_compute_func(npu_device_tensor_op op) { + if (op >= NPU_OP_COUNT) { + return nullptr; + } + + return kOpCapabilities[op].compute_func; +} + +bool support_op(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1, + const npu_device_tensor_spec & dst, npu_device_tensor_op op) { + if (get_compute_func(op) == nullptr) { + DEVICE_LOG_ERROR("Unsupported op: %s, get_compute_func failed\n", op_get_name(op)); + return false; + } + + auto is_supported_func = kOpCapabilities[op].is_supported; + if (!is_supported_func || !is_supported_func(src0, src1, dst, op)) { + DEVICE_LOG_ERROR("Unsupported op: %s, is_supported_func failed\n", op_get_name(op)); + return false; + } + + return true; +} + +} // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/device/op_impl.hpp b/ggml/src/ggml-qnn/npu/device/op_impl.hpp new file mode 100644 index 0000000000000..1fee7769ce04c --- /dev/null +++ b/ggml/src/ggml-qnn/npu/device/op_impl.hpp @@ -0,0 +1,17 @@ +#pragma once + +#include "hexagon_npu.h" +#include "tensor.hpp" + +namespace hexagon { + +typedef bool (*compute_func_type)(tensor * dst); +typedef bool (*op_is_supported_func_type)(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1, + const npu_device_tensor_spec & dst, npu_device_tensor_op op); + +compute_func_type get_compute_func(npu_device_tensor_op op); + +bool support_op(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & 
src1, + const npu_device_tensor_spec & dst, npu_device_tensor_op op); + +} // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp new file mode 100644 index 0000000000000..fbda69d2d7cc2 --- /dev/null +++ b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp @@ -0,0 +1,146 @@ +#include "op_mul_mat.hpp" + +#include + +namespace { + +inline float vec_dot_product_f32_f32(const float * src0, const float * src1, size_t count) { + HVX_Vector * iptr0 = ((HVX_Vector *) src0); + HVX_Vector * iptr0_end = ((HVX_Vector *) src0) + (count / hexagon::kFloatsPerVector); + HVX_Vector * iptr1 = ((HVX_Vector *) src1); + HVX_Vector prev0 = *iptr0++; + HVX_Vector prev1 = *iptr1++; + HVX_Vector sum = Q6_V_vzero(); + + // TODO: prefetch or just use VTCM? + while (iptr0 < iptr0_end) { + HVX_Vector curr0 = *iptr0++; + HVX_Vector curr1 = *iptr1++; + HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); + HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); + sum = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_Vqf32_vmpy_VsfVsf(s0, s1), sum); + prev0 = curr0; + prev1 = curr1; + } + + if ((iptr0_end - ((HVX_Vector *) src0)) > 0) { + // handle the last vector + // see also: + // https://github.com/UbiquitousLearning/mllm/blob/babf4410352ce8730824c87699c025a0d4ce3a6f/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/src/ops/LLaMAMul.cpp#L147 + // or qualcomm sdk libs\qhl_hvx\src\qhblas_hvx\qhblas_hvx_aw_vector_add_ah.c + bool iptr0_aligned = hexagon::is_addr_aligned(iptr0); + HVX_Vector curr0 = iptr0_aligned ? prev0 : *iptr0; + iptr0 = iptr0_aligned ? iptr0 : iptr0 + 1; + bool iptr1_aligned = hexagon::is_addr_aligned(iptr1); + HVX_Vector curr1 = iptr1_aligned ? prev1 : *iptr1; + iptr1 = iptr1_aligned ? iptr1 : iptr1 + 1; + HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); + HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); + sum = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_Vqf32_vmpy_VsfVsf(s0, s1), sum); + prev0 = curr0; + prev1 = curr1; + } + + const size_t leftover = count % hexagon::kFloatsPerVector; + const size_t leftover_bytes = leftover * sizeof(float); + if (leftover > 0) { + // handle the leftover elements + HVX_Vector curr0 = + (leftover_bytes + hexagon::unaligned_bytes(iptr0) > hexagon::kBytesPerVector) ? *iptr0 : prev0; + curr0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); + + HVX_Vector curr1 = + (leftover_bytes + hexagon::unaligned_bytes(iptr1) > hexagon::kBytesPerVector) ? *iptr1 : prev1; + curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); + + sum = Q6_Vqf32_vadd_Vqf32Vqf32( + Q6_V_valign_VVR(Q6_Vqf32_vmpy_VsfVsf(curr0, curr1), Q6_V_vzero(), leftover_bytes), sum); + } + + // TODO: do we have a better way to do the reduction? 
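+    // Horizontal reduction: rotate the qf32 accumulator by half of the remaining span and
+    // add, halving the span each step, so after log2(kFloatsPerVector) iterations every lane
+    // holds the full sum; the scalar result is then written out below with an unaligned
+    // variable store.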
+ for (size_t i = hexagon::kFloatsPerVector / 2; i > 0; i /= 2) { + sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum, Q6_V_vror_VR(sum, i * sizeof(float))); + } + + float result; + q6op_vstu_variable_ARV(&result, sizeof(float), Q6_Vsf_equals_Vqf32(sum)); + return result; +} + +} // namespace + +namespace hexagon { + +bool mul_mat_f32(hexagon::tensor * out) { + if (!out) { + return false; + } + + auto * src0 = out->get_src(0); + auto * src1 = out->get_src(1); + if (!src0 || !src1) { + return true; // skip if no src + } + + static_assert(DEVICE_TENSOR_MAX_DIMS == 4, "mul_mat_f32 requires max dims 4"); + + const auto r02 = src1->get_ne(2) / src0->get_ne(2); + const auto r03 = src1->get_ne(3) / src0->get_ne(3); + const auto * src0_ptr = reinterpret_cast(src0->get_data()); + const auto * src1_ptr = reinterpret_cast(src1->get_data()); + auto * dst_ptr = reinterpret_cast(out->get_data()); + for (int64_t i3 = 0; i3 < out->get_ne(3); i3++) { + const auto * src0_cube = src0_ptr + i3 / r03 * src0->get_nb(3); + const auto * src1_cube = src1_ptr + i3 * src1->get_nb(3); + auto * dst_cube = dst_ptr + i3 * out->get_nb(3); + for (int64_t i2 = 0; i2 < out->get_ne(2); i2++) { + const auto * src0_plane = src0_cube + i2 / r02 * src0->get_nb(2); + const auto * src1_plane = src1_cube + i2 * src1->get_nb(2); + auto * dst_plane = dst_cube + i2 * out->get_nb(2); + for (int64_t i1 = 0; i1 < out->get_ne(1); i1++) { + // TODO: prefetch row? + auto * src1_row = src1_plane + i1 * src1->get_nb(1); + auto * dst_row = reinterpret_cast(dst_plane + i1 * out->get_nb(1)); + for (int64_t i0 = 0; i0 < out->get_ne(0); i0++) { + auto * src0_row = src0_plane + i0 * src0->get_nb(1); + // TODO: figure out how to handle a entire row + *dst_row++ = + vec_dot_product_f32_f32(reinterpret_cast(src0_row), + reinterpret_cast(src1_row), (size_t) src0->get_ne(0)); + } + } + } + } + + return true; +} + +bool is_mul_mat_supported(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1, + const npu_device_tensor_spec & dst, npu_device_tensor_op op) { + if (op != NPU_OP_MUL_MAT) { + DEVICE_LOG_DEBUG("op is not NPU_OP_MUL_MAT: %d\n", op); + return false; + } + + if (src0.ne[0] != src1.ne[0] || src0.ne[1] != dst.ne[0]) { + DEVICE_LOG_DEBUG("src0 and src1 cannot multiply: %ldx%ld vs %ldx%ld\n", (long) src0.ne[0], (long) src0.ne[1], + (long) src1.ne[0], (long) src1.ne[1]); + return false; + } + + if (src1.ne[1] != dst.ne[1] || src1.ne[2] != dst.ne[2] || src1.ne[3] != dst.ne[3]) { + DEVICE_LOG_DEBUG("src1 and dst dimensions not match: %ldx%ld vs %ldx%ld\n", (long) src1.ne[2], + (long) src1.ne[3], (long) dst.ne[2], (long) dst.ne[3]); + return false; + } + + if (src1.ne[2] % src0.ne[2] || src1.ne[3] % src0.ne[3]) { + DEVICE_LOG_DEBUG("src0 cannot broadcast to src1: %ldx%ld vs %ldx%ld\n", (long) src0.ne[2], (long) src0.ne[3], + (long) src1.ne[2], (long) src1.ne[3]); + return false; + } + + return true; +} + +} // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/device/op_mul_mat.hpp b/ggml/src/ggml-qnn/npu/device/op_mul_mat.hpp new file mode 100644 index 0000000000000..cc57d3d1fe6d4 --- /dev/null +++ b/ggml/src/ggml-qnn/npu/device/op_mul_mat.hpp @@ -0,0 +1,27 @@ +#pragma once + +#include + +#include + +#include "tensor.hpp" + +namespace hexagon { + +constexpr const size_t kBytesPerVector = sizeof(HVX_Vector); // 128 for v73 +constexpr const size_t kFloatsPerVector = kBytesPerVector / sizeof(float); +constexpr const size_t kAlignMask = kBytesPerVector - 1; + +inline size_t unaligned_bytes(const void * addr) { + return ((size_t) addr) & kAlignMask; 
+} + +inline bool is_addr_aligned(void * addr) { + return unaligned_bytes(addr) == 0; +} + +bool mul_mat_f32(tensor * out); +bool is_mul_mat_supported(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1, + const npu_device_tensor_spec & dst, npu_device_tensor_op op); + +} // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/device/tensor.hpp b/ggml/src/ggml-qnn/npu/device/tensor.hpp new file mode 100644 index 0000000000000..83aa29a609cfc --- /dev/null +++ b/ggml/src/ggml-qnn/npu/device/tensor.hpp @@ -0,0 +1,90 @@ +#pragma once + +#include +#include + +#include "hexagon_npu.h" +#include "util.hpp" + +namespace hexagon { + +constexpr const size_t kMaxTensorSrc = DEVICE_TENSOR_MAX_SRC; + +class tensor { + public: + explicit tensor(const npu_device_tensor_config & info) noexcept : _info(info) { + uint64 phy_address = 0; + void * mmap_address = nullptr; + auto ret = HAP_mmap_get(_info.buffer_fd, &mmap_address, &phy_address); + if (ret != AEE_SUCCESS) { + DEVICE_LOG_ERROR("Failed to mmap tensor buffer: %d", (int) ret); + return; + } + + _data = static_cast(mmap_address); + DEVICE_LOG_INFO("tensor(%p[%ldx%ldx%ldx%ld]), fd: %d, offset: %zu, mmap_address: %p, phy_address: 0x%lx\n", + (void *) this, (long) _info.ne[0], (long) _info.ne[1], (long) _info.ne[2], (long) _info.ne[3], + _info.buffer_fd, _info.offset, (void *) mmap_address, phy_address); + } + + ~tensor() noexcept { + auto ret = HAP_mmap_put(_info.buffer_fd); + if (ret != AEE_SUCCESS) { + DEVICE_LOG_ERROR("Failed to unmap tensor buffer: %d", (int) ret); + } + + DEVICE_LOG_INFO("~tensor(%p) fd: %d", (void *) this, _info.buffer_fd); + } + + void flush() { + if (_data) { + qurt_mem_cache_clean((qurt_addr_t) (_data + _info.offset), (qurt_size_t) _info.size, + QURT_MEM_CACHE_INVALIDATE, QURT_MEM_DCACHE); + } + } + + bool set_src(size_t index, tensor * src) { + if (index >= kMaxTensorSrc) { + return false; + } + + _src[index] = src; + return true; + } + + void set_op(npu_device_tensor_op op) { _info.op = op; } + + tensor * get_src(size_t index) const { + if (index >= kMaxTensorSrc) { + return nullptr; + } + + return _src[index]; + } + + const npu_device_tensor_config & get_info() const { return _info; } + + const int64_t get_ne(size_t index) const { return _info.ne[index]; } + + const size_t get_nb(size_t index) const { return _info.nb[index]; } + + npu_device_tensor_op get_op() const { return _info.op; } + + npu_device_tensor_data_type get_type() const { return _info.type; } + + uint8_t * get_data() const { return _data + _info.offset; } + + bool is_valid() const { return _data != nullptr; } + + private: + npu_device_tensor_config _info; + tensor * _src[kMaxTensorSrc] = {}; + uint8_t * _data = nullptr; + + tensor(const tensor &) = delete; + void operator=(const tensor &) = delete; + tensor(tensor &&) = delete; + void operator=(tensor &&) = delete; +}; + +} // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/device/util.hpp b/ggml/src/ggml-qnn/npu/device/util.hpp new file mode 100644 index 0000000000000..12b7dde81e9c4 --- /dev/null +++ b/ggml/src/ggml-qnn/npu/device/util.hpp @@ -0,0 +1,36 @@ +#pragma once + +#include + +#include "hexagon_npu.h" + +#define DEVICE_LOG_ERROR(...) FARF(FATAL, __VA_ARGS__) +#define DEVICE_LOG_WARN(...) FARF(ERROR, __VA_ARGS__) +#define DEVICE_LOG_INFO(...) FARF(HIGH, __VA_ARGS__) + +#ifdef _DEBUG +# undef FARF_LOW +# define FARF_LOW 1 +# define DEVICE_LOG_DEBUG(...) FARF(LOW, __VA_ARGS__) +#else +# define DEVICE_LOG_DEBUG(...) 
(void) 0 +#endif + +namespace hexagon { + +constexpr const char * op_get_name(npu_device_tensor_op op) { + switch (op) { + case NPU_OP_MUL_MAT: + return "MUL_MAT"; + case NPU_OP_ADD: + return "ADD"; + case NPU_OP_SUB: + return "SUB"; + case NPU_OP_MUL: + return "MUL"; + default: + return "UNKNOWN"; + } +} + +} // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/host/buffer.cpp b/ggml/src/ggml-qnn/npu/host/buffer.cpp new file mode 100644 index 0000000000000..ff5c8a320c745 --- /dev/null +++ b/ggml/src/ggml-qnn/npu/host/buffer.cpp @@ -0,0 +1,246 @@ +#include "buffer.hpp" + +#include + +#include "host_device.hpp" +#include "tensor.hpp" + +namespace { + +constexpr const int kRpcMemDefaultHeapId = RPCMEM_HEAP_ID_SYSTEM; +constexpr const uint32_t kRpcMemDefaultFlags = RPCMEM_DEFAULT_FLAGS; // TODO: should we use a different flag? + +static hexagon::host_buffer * get_buffer_object(ggml_backend_buffer_t buffer) { + return reinterpret_cast(buffer->context); +} + +static hexagon::host_buffer_type * get_buffer_type_object(ggml_backend_buffer_type_t buft) { + return reinterpret_cast(buft->context); +} + +void backend_buffer_free_buffer(ggml_backend_buffer_t buffer) { + delete get_buffer_object(buffer); +} + +void * backend_buffer_get_base(ggml_backend_buffer_t buffer) { + auto * buffer_obj = get_buffer_object(buffer); + GGML_ASSERT(buffer_obj != nullptr); + return buffer_obj->get_buffer(); +} + +ggml_status backend_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + auto * buffer_type_obj = get_buffer_type_object(buffer->buft); + GGML_ASSERT(buffer_type_obj != nullptr); + + auto * device_object = buffer_type_obj->get_device(); + GGML_ASSERT(device_object != nullptr); + + auto * buffer_obj = get_buffer_object(buffer); + GGML_ASSERT(buffer_obj != nullptr); + + auto tensor_object = buffer_obj->init_tensor(tensor, device_object->get_device_handle()); + if (!tensor_object) { + LOG_ERROR("Failed to init tensor\n"); + return GGML_STATUS_ALLOC_FAILED; + } + + return GGML_STATUS_SUCCESS; +} + +void backend_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, + size_t size) { + GGML_UNUSED(buffer); + memcpy((char *) tensor->data + offset, data, size); +} + +void backend_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, + size_t size) { + GGML_UNUSED(buffer); + memcpy(data, (const char *) tensor->data + offset, size); +} + +bool backend_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { + GGML_UNUSED(buffer); + if (ggml_backend_buffer_is_host(src->buffer)) { + memcpy(dst->data, src->data, ggml_nbytes(src)); + return true; + } + + return false; +} + +void backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + auto * buffer_obj = get_buffer_object(buffer); + GGML_ASSERT(buffer_obj != nullptr); + memset(buffer_obj->get_buffer(), value, buffer_obj->get_size()); +} + +constexpr const ggml_backend_buffer_i backend_buffer_interface = { + /* .free_buffer = */ backend_buffer_free_buffer, + /* .get_base = */ backend_buffer_get_base, + /* .init_tensor = */ backend_buffer_init_tensor, + /* .memset_tensor = */ nullptr, + /* .set_tensor = */ backend_buffer_set_tensor, + /* .get_tensor = */ backend_buffer_get_tensor, + /* .cpy_tensor = */ backend_buffer_cpy_tensor, + /* .clear = */ backend_buffer_clear, + /* .reset = */ nullptr, +}; + +const char * backend_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + auto * buffer_type_obj = 
get_buffer_type_object(buft); + GGML_ASSERT(buffer_type_obj != nullptr); + return buffer_type_obj->get_name(); +} + +ggml_backend_buffer_t backend_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + auto * buffer_type_obj = get_buffer_type_object(buft); + GGML_ASSERT(buffer_type_obj != nullptr); + return buffer_type_obj->allocate_buffer(size); +} + +size_t backend_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + auto * buffer_type_obj = get_buffer_type_object(buft); + GGML_ASSERT(buffer_type_obj != nullptr); + return buffer_type_obj->get_buffer_alignment(); +} + +size_t backend_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { + auto * buffer_type_obj = get_buffer_type_object(buft); + GGML_ASSERT(buffer_type_obj != nullptr); + return buffer_type_obj->get_max_buffer_size(); +} + +bool backend_buffer_is_host(ggml_backend_buffer_type_t buft) { + return buft->iface.get_name == backend_buffer_type_get_name; +} + +} // namespace + +namespace hexagon { + +host_buffer::host_buffer(common::rpc_mem_ptr allocator, size_t size, uint32_t domain_id) : + _allocator(allocator), + _size(size), + _domain_id(domain_id) { + if (!_allocator->is_valid()) { + LOG_ERROR("rpc memory not initialized\n"); + return; + } + + if (size > _allocator->get_max_alloc_size()) { + LOG_ERROR("rpc memory size %zu exceeds max alloc size %zu\n", size, _allocator->get_max_alloc_size()); + return; + } + + _data = _allocator->alloc(kRpcMemDefaultHeapId, kRpcMemDefaultFlags, size); + if (!_data) { + LOG_ERROR("failed to allocate rpc memory, size: %d MB\n", (int) (size / (1 << 20))); + return; + } + + LOG_DEBUG("create host_buffer(%p), size: %zu, domain_id: %d\n", (void *) _data, size, (int) domain_id); +} + +host_buffer::~host_buffer() { + LOG_DEBUG("destroy host_buffer(%p), size: %zu, domain_id: %d\n", (void *) _data, _size, (int) _domain_id); + _tensors.clear(); + if (_buffer_fd != -1) { + auto ret = _allocator->fastrpc_munmap((int) _domain_id, _buffer_fd, nullptr, 0); + if (ret != AEE_SUCCESS) { + LOG_ERROR("failed to munmap rpc memory, fd: %d, ret: %d\n", _buffer_fd, ret); + return; + } + } + + _allocator->free(_data); +} + +std::shared_ptr host_buffer::init_tensor(ggml_tensor * tensor, remote_handle64 device_handle) { + if (!_data) { + LOG_ERROR("failed to init tensor, rpc memory not initialized\n"); + return std::shared_ptr(); + } + + if (_buffer_fd == -1) { + _buffer_fd = _allocator->to_fd(_data); + if (_buffer_fd < 0) { + LOG_ERROR("failed to get fd from rpc memory\n"); + return std::shared_ptr(); + } + + auto ret = _allocator->fastrpc_mmap((int) _domain_id, _buffer_fd, _data, 0, _size, FASTRPC_MAP_FD); + if (ret != AEE_SUCCESS) { + LOG_ERROR("failed to mmap rpc memory, fd: %d, ret: %d\n", _buffer_fd, ret); + return std::shared_ptr(); + } + + LOG_DEBUG("mmap rpc memory(%p), fd: %d, addr: %p, size: %zu\n", (void *) _data, _buffer_fd, _data, _size); + } + + auto tensor_object = std::make_shared( + tensor, _buffer_fd, (uint64_t) (reinterpret_cast(tensor->data) - reinterpret_cast(_data)), + device_handle); + if (!tensor_object->is_valid()) { + LOG_ERROR("failed to init tensor, device handle: %p\n", (void *) device_handle); + return std::shared_ptr(); + } + + _tensors.push_back(tensor_object); + return tensor_object; +} + +host_buffer_type::host_buffer_type(ggml_backend_dev_t dev, const std::string & name, common::rpc_mem_ptr rpc_mem) : + _name(name), + _rpc_mem(rpc_mem) { + iface = { + /* .get_name = */ backend_buffer_type_get_name, + /* .alloc_buffer = */ 
backend_buffer_type_alloc_buffer, + /* .get_alignment = */ backend_buffer_type_get_alignment, + /* .get_max_size = */ backend_buffer_type_get_max_size, + /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes + /* .is_host = */ backend_buffer_is_host, + }; + device = dev; + context = this; + + _device = reinterpret_cast(device->context); + LOG_DEBUG("[%s]create host_buffer_type %s\n", _device->get_name(), _name.c_str()); +} + +size_t host_buffer_type::get_buffer_alignment() const { + return _device->is_device_initialized() ? _device->get_alignment() : 128; +} + +size_t host_buffer_type::get_max_buffer_size() const { + if (!_rpc_mem) { + LOG_ERROR("rpc memory not initialized\n"); + return 0; + } + + return _rpc_mem->get_max_alloc_size(); +} + +ggml_backend_buffer_t host_buffer_type::allocate_buffer(size_t size) { + if (!_rpc_mem) { + LOG_ERROR("rpc memory not initialized\n"); + return nullptr; + } + + if (!_device->is_device_initialized()) { + LOG_ERROR("device is not initialized\n"); + return nullptr; + } + + auto * buffer = new host_buffer(_rpc_mem, size, _device->get_dsp_domain_id()); + if (!buffer->is_valid()) { + delete buffer; + LOG_ERROR("Failed to allocate buffer of size %zu\n", size); + return nullptr; + } + + LOG_DEBUG("[%s]allocate buffer %p, size: %zu\n", _device->get_name(), buffer->get_buffer(), size); + return ggml_backend_buffer_init(this, backend_buffer_interface, buffer, size); +} + +} // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/host/buffer.hpp b/ggml/src/ggml-qnn/npu/host/buffer.hpp new file mode 100644 index 0000000000000..955944bb98f59 --- /dev/null +++ b/ggml/src/ggml-qnn/npu/host/buffer.hpp @@ -0,0 +1,66 @@ +#pragma once + +#include +#include + +#include "ggml-backend-impl.h" +#include "hexagon_npu.h" +#include "rpc-mem.hpp" + +namespace hexagon { + +class host_tensor; + +class host_buffer { + public: + explicit host_buffer(common::rpc_mem_ptr allocator, size_t size, uint32_t domain_id); + + ~host_buffer(); + + bool is_valid() const { return _data != nullptr; } + + void * get_buffer() { return _data; } + + size_t get_size() const { return _size; } + + std::shared_ptr init_tensor(ggml_tensor * tensor, remote_handle64 device_handle); + + private: + common::rpc_mem_ptr _allocator; + void * _data = nullptr; + size_t _size = 0; + int _buffer_fd = -1; + uint32_t _domain_id = 0; + + std::list> _tensors; + + DISABLE_COPY(host_buffer); + DISABLE_MOVE(host_buffer); +}; + +class npu_device; + +class host_buffer_type : public ggml_backend_buffer_type { + public: + explicit host_buffer_type(ggml_backend_dev_t dev, const std::string & name, common::rpc_mem_ptr rpc_mem); + + const char * get_name() const { return _name.c_str(); } + + size_t get_buffer_alignment() const; + + size_t get_max_buffer_size() const; + + ggml_backend_buffer_t allocate_buffer(size_t size); + + npu_device * get_device() const { return _device; } + + private: + npu_device * _device = nullptr; + std::string _name; + common::rpc_mem_ptr _rpc_mem; + + DISABLE_COPY(host_buffer_type); + DISABLE_MOVE(host_buffer_type); +}; + +} // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/host/graph.cpp b/ggml/src/ggml-qnn/npu/host/graph.cpp new file mode 100644 index 0000000000000..9e8cf8320408e --- /dev/null +++ b/ggml/src/ggml-qnn/npu/host/graph.cpp @@ -0,0 +1,82 @@ +#include "graph.hpp" + +#include "tensor.hpp" + +namespace hexagon { + +host_graph::host_graph(ggml_cgraph * cgraph, remote_handle64 device_handle) : _device_handle(device_handle) { + auto status = npu_device_graph_init(_device_handle, 
&_graph_handle); + if (status != AEE_SUCCESS) { + LOG_ERROR("Failed to init graph: %d", (int) status); + _graph_handle = 0; + return; + } + + update(cgraph); +} + +host_graph::~host_graph() { + if (_graph_handle) { + npu_device_graph_free(_device_handle, _graph_handle); + _graph_handle = 0; + } +} + +bool host_graph::update(ggml_cgraph * cgraph) { + if (!_graph_handle) { + LOG_ERROR("host_graph not initialized\n"); + return false; + } + + _tensor_handles.clear(); + _tensor_handles.reserve(cgraph->n_nodes); + for (int i = 0; i < cgraph->n_nodes; ++i) { + auto * node = cgraph->nodes[i]; + if (node->op == GGML_OP_NONE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE) { + // skip view liked ops + LOG_DEBUG("node[%d]%s(%s), addr: %p, type: %s, skipped\n", i, ggml_get_name(node), ggml_op_desc(node), + (void *) node, ggml_type_name(node->type)); + continue; + } + + auto * tensor_obj = host_tensor::from_ggml_tensor(node); + if (!tensor_obj) { + LOG_DEBUG("Unable to get host tensor from ggml tensor: %p\n", (void *) node); + continue; + } + + tensor_obj->set_op(node->op); + _tensor_handles.push_back(tensor_obj->get_device_tensor_handle()); + LOG_DEBUG("node[%d]%s(%s), addr: %p, type: %s, tensor_handle: %p\n", i, ggml_get_name(node), ggml_op_desc(node), + (void *) node, ggml_type_name(node->type), (void *) tensor_obj->get_device_tensor_handle()); + for (size_t j = 0; j < GGML_MAX_SRC && node->src[j]; ++j) { + auto * src = host_tensor::from_ggml_tensor(node->src[j]); + tensor_obj->set_src(j, src); + } + } + + LOG_DEBUG("host_graph::update, host_graph(%p), ggml_cgraph(%p), tensor count(%zu)\n", (void *) this, + (void *) cgraph, _tensor_handles.size()); + if (!_tensor_handles.empty()) { + npu_device_graph_set_tensor(_device_handle, _graph_handle, _tensor_handles.data(), + (int) _tensor_handles.size()); + } + return true; +} + +bool host_graph::compute() { + if (!_graph_handle) { + LOG_ERROR("host_graph not initialized\n"); + return false; + } + + auto status = npu_device_graph_compute(_device_handle, _graph_handle); + if (status != AEE_SUCCESS) { + LOG_ERROR("Failed to compute host_graph: 0x%x\n", (int) status); + return false; + } + + return true; +} + +} // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/host/graph.hpp b/ggml/src/ggml-qnn/npu/host/graph.hpp new file mode 100644 index 0000000000000..20c917e1203ca --- /dev/null +++ b/ggml/src/ggml-qnn/npu/host/graph.hpp @@ -0,0 +1,32 @@ +#pragma once + +#include + +#include "common.hpp" +#include "ggml-backend-impl.h" +#include "hexagon_npu.h" + +namespace hexagon { + +class host_graph { + public: + host_graph(ggml_cgraph * cgraph, remote_handle64 device_handle); + + ~host_graph(); + + bool is_valid() const { return _graph_handle != 0; } + + bool update(ggml_cgraph * cgraph); + + bool compute(); + + private: + remote_handle64 _device_handle = 0; + npu_device_graph_handle_t _graph_handle = 0; + std::vector _tensor_handles; + + DISABLE_COPY(host_graph); + DISABLE_MOVE(host_graph); +}; + +} // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/host/host.cpp b/ggml/src/ggml-qnn/npu/host/host.cpp new file mode 100644 index 0000000000000..90c4cd29e8e20 --- /dev/null +++ b/ggml/src/ggml-qnn/npu/host/host.cpp @@ -0,0 +1,153 @@ + +#include +#include + +#include "buffer.hpp" +#include "common.hpp" +#include "ggml-backend-impl.h" +#include "ggml-impl.h" +#include "host_device.hpp" + +namespace { + +hexagon::npu_device * get_device_object(ggml_backend_dev_t device) { + return reinterpret_cast(device->context); +} + +hexagon::npu_device * 
get_device_object(ggml_backend_t backend) { + return get_device_object(backend->device); +} + +const char * backend_dev_get_name(ggml_backend_dev_t dev) { + auto * dev_obj = get_device_object(dev); + GGML_ASSERT(dev_obj != nullptr); + return dev_obj->get_name(); +} + +const char * backend_dev_get_description(ggml_backend_dev_t dev) { + auto * dev_obj = get_device_object(dev); + GGML_ASSERT(dev_obj != nullptr); + return dev_obj->get_description(); +} + +bool backend_dev_is_npu_device(ggml_backend_dev_t dev) { + return dev->iface.get_name == backend_dev_get_name; +} + +void backend_dev_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { + GGML_UNUSED(dev); + *free = common::get_system_free_memory_in_bytes(); + *total = common::get_system_total_memory_in_bytes(); +} + +enum ggml_backend_dev_type backend_dev_get_type(ggml_backend_dev_t dev) { + GGML_UNUSED(dev); + return GGML_BACKEND_DEVICE_TYPE_ACCEL; +} + +void backend_dev_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { + GGML_ASSERT(get_device_object(dev) != nullptr); + props->name = backend_dev_get_name(dev); + props->description = backend_dev_get_description(dev); + props->type = backend_dev_get_type(dev); + backend_dev_get_memory(dev, &props->memory_free, &props->memory_total); + props->caps = {}; +} + +ggml_backend_t backend_dev_init_backend(ggml_backend_dev_t dev, const char * params) { + auto * dev_obj = get_device_object(dev); + GGML_ASSERT(dev_obj != nullptr); + if (!dev_obj->init_device(dev, params)) { + LOG_ERROR("[%s]Failed to init device\n", backend_dev_get_name(dev)); + return nullptr; + } + + return new hexagon::npu_backend(dev); +} + +ggml_backend_buffer_type_t backend_dev_get_buffer_type(ggml_backend_dev_t dev) { + auto * dev_obj = get_device_object(dev); + GGML_ASSERT(dev_obj != nullptr); + return dev_obj->get_default_buffer_type(dev); +} + +ggml_backend_buffer_t backend_dev_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, + size_t max_tensor_size) { + // TODO: should we use the device memory here? 
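+    // For now the host pointer is simply wrapped as a CPU buffer; rpc/ion device memory is not used on this path.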
+ GGML_UNUSED(dev); + GGML_UNUSED(max_tensor_size); + return ggml_backend_cpu_buffer_from_ptr(ptr, size); +} + +bool backend_dev_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { + if (!backend_dev_is_npu_device(dev)) { + return false; + } + + auto * dev_obj = get_device_object(dev); + GGML_ASSERT(dev_obj != nullptr); + return dev_obj->supports_op(op); +} + +bool backend_dev_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { + if (!backend_dev_is_npu_device(dev)) { + return false; + } + + auto * dev_obj = get_device_object(dev); + GGML_ASSERT(dev_obj != nullptr); + return dev_obj->supports_buft(buft); +} + +bool backend_dev_offload_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { + if (!backend_dev_is_npu_device(dev)) { + return false; + } + + auto * dev_obj = get_device_object(dev); + GGML_ASSERT(dev_obj != nullptr); + return dev_obj->offload_op(op); +} + +constexpr const ggml_backend_device_i npu_device_interface = { + /* .get_name = */ backend_dev_get_name, + /* .get_description = */ backend_dev_get_description, + /* .get_memory = */ backend_dev_get_memory, + /* .get_type = */ backend_dev_get_type, + /* .get_props = */ backend_dev_get_props, + /* .init_backend = */ backend_dev_init_backend, + /* .get_buffer_type = */ backend_dev_get_buffer_type, + /* .get_host_buffer_type = */ nullptr, + /* .buffer_from_host_ptr = */ backend_dev_buffer_from_host_ptr, + /* .supports_op = */ backend_dev_supports_op, + /* .supports_buft = */ backend_dev_supports_buft, + /* .offload_op = */ backend_dev_offload_op, + /* .event_new = */ nullptr, + /* .event_free = */ nullptr, + /* .event_synchronize = */ nullptr, +}; + +class npu_device_proxy : public backend_device_proxy { + public: + explicit npu_device_proxy(backend_index_type device) { _device = std::make_unique(device); } + + const ggml_backend_device_i & get_iface() const { return npu_device_interface; } + + void * get_context() { return _device.get(); } + + private: + std::unique_ptr _device; + + DISABLE_COPY(npu_device_proxy); + DISABLE_MOVE(npu_device_proxy); +}; + +} // namespace + +backend_device_proxy_ptr create_hexagon_backend_context(backend_index_type device) { + if (device < QNN_BACKEND_COUNT || device >= TOTAL_BACKEND_COUNT) { + return backend_device_proxy_ptr(); + } + + return std::make_shared(device); +} diff --git a/ggml/src/ggml-qnn/npu/host/host_device.cpp b/ggml/src/ggml-qnn/npu/host/host_device.cpp new file mode 100644 index 0000000000000..aa90cfa8bc8f1 --- /dev/null +++ b/ggml/src/ggml-qnn/npu/host/host_device.cpp @@ -0,0 +1,305 @@ +#include "host_device.hpp" + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wmissing-prototypes" +#include +#pragma GCC diagnostic pop + +#include + +#include "graph.hpp" +#include "util.hpp" + +#define SKEL_URI_DEFINE(arch) ("file:///libhexagon_npu_skel_" arch ".so?npu_device_skel_handle_invoke&_modver=1.0") + +namespace { + +struct device_library_info { + hexagon::hexagon_dsp_arch arch; + const char * device_lib_uri; +}; + +constexpr const device_library_info kDeviceLibraryInfo[] = { + { hexagon::NONE, SKEL_URI_DEFINE("") }, + { hexagon::V68, SKEL_URI_DEFINE("v68") }, + { hexagon::V69, SKEL_URI_DEFINE("v69") }, + { hexagon::V73, SKEL_URI_DEFINE("v73") }, + { hexagon::V75, SKEL_URI_DEFINE("v75") }, + { hexagon::V79, SKEL_URI_DEFINE("v79") }, +}; + +const device_library_info & get_device_library_info(hexagon::hexagon_dsp_arch arch) { + for (const auto & info : kDeviceLibraryInfo) { + if (info.arch == arch) { + return info; + } + } + + 
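+    // No matching entry: fall back to the arch-less skeleton library (hexagon::NONE).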
LOG_ERROR("Unknown DSP arch: %d, using hexagon::NONE\n", arch); + return kDeviceLibraryInfo[0]; +} + +const char * get_domain_param(uint32_t domain_id) { + for (const auto & domain : supported_domains) { + if ((uint32_t) domain.id == domain_id) { + return domain.uri; + } + } + + return ""; +} + +constexpr const ggml_guid kBackendNpuGuid = { 0x7a, 0xd7, 0x59, 0x7d, 0x8f, 0x66, 0x4f, 0x35, + 0x84, 0x8e, 0xf5, 0x9a, 0x9b, 0x83, 0x7d, 0x0a }; + +hexagon::npu_backend * get_backend_object(ggml_backend_t backend) { + return reinterpret_cast(backend); +} + +const char * backend_get_name(ggml_backend_t backend) { + auto * backend_obj = get_backend_object(backend); + GGML_ASSERT(backend_obj != nullptr); + return backend_obj->get_name(); +} + +void backend_free(ggml_backend_t backend) { + delete get_backend_object(backend); +} + +bool backend_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, + ggml_tensor * dst) { + // TODO: implement this + return false; +} + +ggml_status backend_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { + auto * backend_obj = get_backend_object(backend); + GGML_ASSERT(backend_obj != nullptr); + return backend_obj->graph_compute(cgraph); +} + +} // namespace + +namespace hexagon { + +// TODO: should we use another domain? +npu_device::npu_device(backend_index_type device) : _dsp_domain_id(CDSP_DOMAIN_ID) { + GGML_UNUSED(device); + LOG_DEBUG("[%s]NPU device created\n", _name.c_str()); +} + +npu_device::~npu_device() { + if (_device_handle) { + npu_device_close(_device_handle); + } +} + +size_t npu_device::get_alignment() const { + uint32_t alignment = 0; + npu_device_device_get_alignment(_device_handle, &alignment); + return alignment; +} + +bool npu_device::is_device_initialized() const { + if (!_device_handle) { + LOG_ERROR("[%s]NPU device not opened\n", get_name()); + return false; + } + + if (!_rpc_mem) { + LOG_ERROR("[%s]rpc memory not initialized\n", get_name()); + return false; + } + + return true; +} + +bool npu_device::init_device(ggml_backend_dev_t dev, const char * params) { + if (!init_rpc_mem()) { + return false; + } + + if (!_device_handle) { + auto arch = get_dsp_arch(_rpc_interface, _dsp_domain_id); + const auto & device_lib_info = get_device_library_info(arch); + std::string device_lib_uri = device_lib_info.device_lib_uri; + device_lib_uri += get_domain_param(_dsp_domain_id); + LOG_DEBUG("[%s]NPU device arch: %s, uri: %s\n", get_name(), get_dsp_arch_desc(arch), device_lib_uri.c_str()); + auto err = npu_device_open(device_lib_uri.c_str(), &_device_handle); + if (err != AEE_SUCCESS) { + if (err == AEE_ECONNREFUSED) { + LOG_DEBUG("[%s]NPU device is not available, trying to enable unsigned DSP module and reopen\n", + get_name()); + enable_unsigned_dsp_module(_rpc_interface, _dsp_domain_id); + err = npu_device_open(device_lib_uri.c_str(), &_device_handle); + } + + if (err != AEE_SUCCESS) { + LOG_ERROR("[%s]Unable to open NPU device, err: 0x%x, uri %s\n", get_name(), err, + device_lib_uri.c_str()); + _device_handle = 0; + return false; + } + } + + _description += ' '; + _description += get_dsp_arch_desc(arch); + LOG_DEBUG("[%s]NPU device opened successfully\n", get_name()); + } else { + LOG_DEBUG("[%s]NPU device is already opened\n", get_name()); + } + + return true; +} + +bool npu_device::supports_buft(ggml_backend_buffer_type_t buft) const { + return buft && buft->device && buft->device->context == this; +} + +bool npu_device::supports_op_impl(const ggml_tensor * op) { + if (op->op == GGML_OP_NONE) { + 
return true; + } + + if (type_to_npu_type(op->type) == NPU_DATA_TYPE_COUNT) { + LOG_DEBUG("[%s]Unsupported op tensor type: %s\n", get_name(), ggml_type_name(op->type)); + return false; + } + + auto * src0 = op->src[0]; + if (!src0) { + LOG_DEBUG("[%s]Unsupported inplace op: %s\n", get_name(), ggml_op_name(op->op)); + return false; + } + + if (type_to_npu_type(src0->type) == NPU_DATA_TYPE_COUNT) { + LOG_DEBUG("[%s]Unsupported src0 tensor type: %s\n", get_name(), ggml_type_name(src0->type)); + return false; + } + + auto * src1 = op->src[1]; + if (src1 && type_to_npu_type(src1->type) == NPU_DATA_TYPE_COUNT) { + LOG_DEBUG("[%s]Unsupported src1 tensor type: %s\n", get_name(), ggml_type_name(src1->type)); + return false; + } + + auto npu_op = op_to_npu_op(op->op); + if (npu_op == NPU_OP_COUNT) { + LOG_DEBUG("[%s]Unsupported op: %s\n", get_name(), ggml_op_name(op->op)); + return false; + } + + constexpr const auto get_spec = [](const ggml_tensor * tensor) -> npu_device_tensor_spec { + if (!tensor) { + return npu_device_tensor_spec{}; + } + + static_assert(DEVICE_TENSOR_MAX_DIMS == GGML_MAX_DIMS, "tensor dimensions mismatch"); + npu_device_tensor_spec spec{}; + spec.ne[0] = tensor->ne[0]; + spec.ne[1] = tensor->ne[1]; + spec.ne[2] = tensor->ne[2]; + spec.ne[3] = tensor->ne[3]; + spec.type = type_to_npu_type(tensor->type); + return spec; + }; + + boolean supported = false; + auto src0_spec = get_spec(src0); + auto src1_spec = get_spec(src1); + auto dst_spec = get_spec(op); + auto ret = npu_device_device_support_op(_device_handle, &src0_spec, &src1_spec, &dst_spec, npu_op, &supported); + if (ret != AEE_SUCCESS || !supported) { + LOG_DEBUG("[%s]Unsupported op: %s, ret: 0x%x, supported: %d\n", get_name(), ggml_op_name(op->op), ret, + supported); + return false; + } + + LOG_DEBUG("[%s]Supported op: %s\n", get_name(), ggml_op_name(op->op)); + return true; +} + +bool npu_device::init_rpc_mem() { + if (!_rpc_mem) { + auto rpc_interface = std::make_shared(); + if (!rpc_interface->is_valid()) { + LOG_ERROR("[%s]Failed to load rpc memory library\n", get_name()); + return false; + } + + auto rpc_mem = std::make_shared(rpc_interface); + _rpc_interface = rpc_interface; + _rpc_mem = rpc_mem; + LOG_DEBUG("[%s]rpc memory initialized\n", get_name()); + } else { + LOG_DEBUG("[%s]rpc memory already initialized\n", get_name()); + } + + return true; +} + +bool npu_device::offload_op(const ggml_tensor * op) { + // TODO: implement this + return false; +} + +ggml_backend_buffer_type_t npu_device::get_default_buffer_type(ggml_backend_dev_t dev) { + // Note that this function will be called before the npu_device::init_device + if (!init_rpc_mem()) { + return nullptr; + } + + if (!_default_buffer_type) { + LOG_DEBUG("[%s]Creating default buffer type\n", get_name()); + _default_buffer_type = std::make_unique(dev, _name + "_buffer_type", _rpc_mem); + if (!_default_buffer_type) { + LOG_ERROR("[%s]Default buffer type not initialized\n", get_name()); + return nullptr; + } + } else { + LOG_DEBUG("[%s]Default buffer type already created\n", get_name()); + } + + return _default_buffer_type.get(); +} + +npu_backend::npu_backend(ggml_backend_dev_t dev) : ggml_backend{} { + memccpy(&_guid, &kBackendNpuGuid, 0, sizeof(ggml_guid)); + device = dev; + guid = &_guid; + iface.get_name = backend_get_name; + iface.free = backend_free; + iface.cpy_tensor_async = backend_cpy_tensor_async; + iface.graph_compute = backend_graph_compute; + _device = reinterpret_cast(dev->context); +} + +ggml_status npu_backend::graph_compute(ggml_cgraph * cgraph) { 
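+    // Compiled graphs are cached per ggml_cgraph pointer; a cache hit only refreshes the tensor handles via update().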
+ if (!cgraph || !cgraph->n_nodes) { + LOG_DEBUG("[%s]Graph is empty, nothing to compute\n", get_name()); + return GGML_STATUS_SUCCESS; + } + + std::shared_ptr graph; + if (_graph_cache.count(cgraph) == 0) { + LOG_DEBUG("[%s]graph(%p) not found in cache, creating new graph\n", get_name(), (void *) cgraph); + graph = std::make_shared(cgraph, _device->get_device_handle()); + if (!graph->is_valid()) { + LOG_ERROR("Failed to create graph\n"); + return GGML_STATUS_FAILED; + } + + _graph_cache[cgraph] = graph; + } else { + graph = _graph_cache[cgraph]; + LOG_DEBUG("[%s]graph(%p) found in cache, using existing graph\n", get_name(), (void *) cgraph); + if (!graph->update(cgraph)) { + LOG_ERROR("[%s]Failed to update graph(%p)\n", get_name(), (void *) cgraph); + return GGML_STATUS_FAILED; + } + } + + return graph->compute() ? GGML_STATUS_SUCCESS : GGML_STATUS_FAILED; +} + +} // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/host/host_device.hpp b/ggml/src/ggml-qnn/npu/host/host_device.hpp new file mode 100644 index 0000000000000..efc7914f18615 --- /dev/null +++ b/ggml/src/ggml-qnn/npu/host/host_device.hpp @@ -0,0 +1,107 @@ +#pragma once + +#include +#include +#ifndef NDEBUG +# include +#endif + +#include "buffer.hpp" +#include "common.hpp" +#include "ggml-backend-impl.h" +#include "hexagon_npu.h" +#include "rpc-mem.hpp" + +namespace hexagon { + +class npu_device { + public: + explicit npu_device(backend_index_type device); + + ~npu_device(); + + const char * get_name() const { return _name.c_str(); } + + const char * get_description() const { return _description.c_str(); } + + size_t get_alignment() const; + + uint32_t get_dsp_domain_id() const { return _dsp_domain_id; } + + ggml_backend_buffer_type_t get_default_buffer_type(ggml_backend_dev_t dev); + + bool is_device_initialized() const; + bool init_device(ggml_backend_dev_t dev, const char * params); + + bool supports_buft(ggml_backend_buffer_type_t buft) const; + bool offload_op(const ggml_tensor * op); + +#ifndef NDEBUG + bool supports_op(const ggml_tensor * op) { + if (supports_op_impl(op)) { + if (op->op != GGML_OP_NONE) { + _supported_op++; + LOG_DEBUG("[%s]Supported op: %s, supported/unsupported: %u/%u\n", get_name(), ggml_op_name(op->op), + _supported_op.load(), _unsupported_op.load()); + } + + return true; + } + + _unsupported_op++; + LOG_DEBUG("[%s]Unsupported op: %s, supported/unsupported: %u/%u\n", get_name(), ggml_op_name(op->op), + _supported_op.load(), _unsupported_op.load()); + return false; + } +#else + bool supports_op(const ggml_tensor * op) { return supports_op_impl(op); } +#endif + + remote_handle64 get_device_handle() const { return _device_handle; } + + private: + bool supports_op_impl(const ggml_tensor * op); + bool init_rpc_mem(); + + std::string _name = "hexagon-npu"; + std::string _description = "Hexagon NPU"; + common::rpc_interface_ptr _rpc_interface; + common::rpc_mem_ptr _rpc_mem; + remote_handle64 _device_handle = 0; + std::unique_ptr _default_buffer_type; + uint32_t _dsp_domain_id = 0; + +#ifndef NDEBUG + std::atomic_uint32_t _supported_op = 0; + std::atomic_uint32_t _unsupported_op = 0; +#endif + + DISABLE_COPY(npu_device); + DISABLE_MOVE(npu_device); +}; + +class host_graph; + +class npu_backend : public ggml_backend { + public: + explicit npu_backend(ggml_backend_dev_t dev); + + ~npu_backend() {} + + const char * get_name() const { + // TODO: should we use the device name here? 
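+        // Currently this reports the underlying device's name ("hexagon-npu").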
+ return _device->get_name(); + } + + ggml_status graph_compute(ggml_cgraph * cgraph); + + private: + ggml_guid _guid = {}; + npu_device * _device = nullptr; + std::unordered_map> _graph_cache; + + DISABLE_COPY(npu_backend); + DISABLE_MOVE(npu_backend); +}; + +} // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/host/tensor.hpp b/ggml/src/ggml-qnn/npu/host/tensor.hpp new file mode 100644 index 0000000000000..e7d5f7a88aeb4 --- /dev/null +++ b/ggml/src/ggml-qnn/npu/host/tensor.hpp @@ -0,0 +1,88 @@ +#pragma once + +#include "common.hpp" +#include "ggml-impl.h" +#include "hexagon_npu.h" +#include "util.hpp" + +namespace hexagon { + +// TODO: merge this with device tensor? +class host_tensor { + public: + static host_tensor * from_ggml_tensor(ggml_tensor * tensor) { + if (!tensor || !tensor->extra) { + return nullptr; + } + return static_cast(tensor->extra); + } + + explicit host_tensor(ggml_tensor * tensor, int buffer_fd, uint64_t offset, remote_handle64 device_handle) : + _device_handle(device_handle) { + _info.buffer_fd = buffer_fd; + _info.offset = offset; + _info.type = type_to_npu_type(tensor->type); + _info.op = op_to_npu_op(tensor->op); + _info.size = ggml_nbytes(tensor); + + static_assert(DEVICE_TENSOR_MAX_DIMS == GGML_MAX_DIMS, "tensor dimensions mismatch"); + static_assert(sizeof(_info.ne) == sizeof(tensor->ne), "tensor ne size mismatch"); + static_assert(sizeof(_info.nb) == sizeof(tensor->nb), "tensor nb size mismatch"); + memcpy(_info.ne, tensor->ne, sizeof(_info.ne)); + memcpy(_info.nb, tensor->nb, sizeof(_info.nb)); + + auto status = npu_device_tensor_init(_device_handle, &_info, &_device_tensor_handle); + if (status != AEE_SUCCESS) { + LOG_ERROR("Failed to init tensor: %d", (int) status); + _device_tensor_handle = 0; + return; + } + + tensor->extra = this; + _ggml_tensor = tensor; + LOG_DEBUG( + "host_tensor(%p) created, ggml_tensor(%p[%ldx%ldx%ldx%ld], nb[%ld][%ld][%ld][%ld]), " + "device_tensor_handle(%p)\n", + (void *) this, (void *) tensor, (long) tensor->ne[0], (long) tensor->ne[1], (long) tensor->ne[2], + (long) tensor->ne[3], (long) tensor->nb[0], (long) tensor->nb[1], (long) tensor->nb[2], + (long) tensor->nb[3], (void *) _device_tensor_handle); + } + + ~host_tensor() { + LOG_DEBUG("host_tensor(%p) destroy, device_tensor_handle: %p\n", (void *) this, (void *) _device_tensor_handle); + if (_device_tensor_handle) { + npu_device_tensor_free(_device_handle, _device_tensor_handle); + _ggml_tensor->extra = nullptr; + } + } + + npu_device_tensor_handle_t get_device_tensor_handle() const { return _device_tensor_handle; } + + void set_src(size_t index, host_tensor * src) { + if (index >= DEVICE_TENSOR_MAX_SRC) { + LOG_ERROR("host_tensor(%p) set_src[%zu] out of range\n", (void *) this, index); + return; + } + + LOG_DEBUG("host_tensor(%p) set_src[%zu]: %p\n", (void *) this, index, (void *) src); + npu_device_tensor_set_src(_device_handle, _device_tensor_handle, index, src->get_device_tensor_handle()); + } + + void set_op(ggml_op op) { + _info.op = op_to_npu_op(op); + npu_device_tensor_set_op(_device_handle, _device_tensor_handle, _info.op); + } + + bool is_valid() const { return _device_tensor_handle != 0; } + + private: + remote_handle64 _device_handle = 0; + npu_device_tensor_handle_t _device_tensor_handle = 0; + npu_device_tensor_config _info = {}; + ggml_tensor * _ggml_tensor = nullptr; + + DISABLE_COPY(host_tensor); + DISABLE_MOVE(host_tensor); +}; + +} // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/host/util.cpp b/ggml/src/ggml-qnn/npu/host/util.cpp new file 
mode 100644 index 0000000000000..5db54b661ebde --- /dev/null +++ b/ggml/src/ggml-qnn/npu/host/util.cpp @@ -0,0 +1,96 @@ +#include "util.hpp" + +#include + +namespace hexagon { + +enum npu_device_tensor_op op_to_npu_op(ggml_op op) { + switch (op) { + case GGML_OP_MUL_MAT: + return NPU_OP_MUL_MAT; + case GGML_OP_ADD: + return NPU_OP_ADD; + case GGML_OP_SUB: + return NPU_OP_SUB; + case GGML_OP_MUL: + return NPU_OP_MUL; + default: + return NPU_OP_COUNT; + } +} + +enum npu_device_tensor_data_type type_to_npu_type(ggml_type type) { + switch (type) { + case GGML_TYPE_F32: + return NPU_DATA_TYPE_F32; + default: + return NPU_DATA_TYPE_COUNT; + } +} + +hexagon_dsp_arch get_dsp_arch(common::rpc_interface_ptr rpc_interface, uint32_t domain_id) { + if (!rpc_interface || !rpc_interface->is_valid()) { + return NONE; + } + + remote_dsp_capability dsp_caps = {}; + dsp_caps.domain = domain_id; + dsp_caps.attribute_ID = ARCH_VER; + auto ret = rpc_interface->remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_caps, sizeof(dsp_caps)); + if (ret != AEE_SUCCESS) { + LOG_ERROR("failed to get DSP arch: %d\n", ret); + return NONE; + } + + LOG_DEBUG("get DSP arch: 0x%x\n", (int) dsp_caps.capability); + auto arch = dsp_caps.capability & 0xFF; + switch (arch) { + case 0x68: + return V68; + case 0x69: + return V69; + case 0x73: + return V73; + case 0x75: + return V75; + case 0x79: + return V79; + default: + LOG_ERROR("unknown DSP arch: %x\n", arch); + return NONE; + } +} + +const char * get_dsp_arch_desc(hexagon_dsp_arch arch) { + switch (arch) { + case V68: + return "V68"; + case V69: + return "V69"; + case V73: + return "V73"; + case V75: + return "V75"; + case V79: + return "V79"; + case NONE: + default: + return "UnknownArch"; + } +} + +void enable_unsigned_dsp_module(common::rpc_interface_ptr rpc_interface, uint32_t domain_id) { + if (!rpc_interface || !rpc_interface->is_valid()) { + return; + } + + remote_rpc_control_unsigned_module data = {}; + data.domain = domain_id; + data.enable = 1; + auto ret = rpc_interface->remote_session_control(DSPRPC_CONTROL_UNSIGNED_MODULE, &data, sizeof(data)); + if (ret != AEE_SUCCESS) { + LOG_ERROR("failed to enable unsigned DSP module: 0x%x\n", ret); + } +} + +} // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/host/util.hpp b/ggml/src/ggml-qnn/npu/host/util.hpp new file mode 100644 index 0000000000000..c001272d4cf7f --- /dev/null +++ b/ggml/src/ggml-qnn/npu/host/util.hpp @@ -0,0 +1,26 @@ +#include "ggml-impl.h" +#include "hexagon_npu.h" +#include "rpc-interface.hpp" + +namespace hexagon { + +enum npu_device_tensor_op op_to_npu_op(ggml_op op); +enum npu_device_tensor_data_type type_to_npu_type(ggml_type type); + +// TODO: merge with qcom_htp_arch +enum hexagon_dsp_arch { + NONE = 0, + V68, + V69, + V73, + V75, + V79, // SD 8 Gen 4 (SM8750) +}; + +hexagon_dsp_arch get_dsp_arch(common::rpc_interface_ptr rpc_interface, uint32_t domain_id); + +const char * get_dsp_arch_desc(hexagon_dsp_arch arch); + +void enable_unsigned_dsp_module(common::rpc_interface_ptr rpc_interface, uint32_t domain_id); + +} // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl b/ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl new file mode 100644 index 0000000000000..d62e65b3bd877 --- /dev/null +++ b/ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl @@ -0,0 +1,90 @@ +#include "AEEStdDef.idl" +#include "AEEStdErr.idl" +#include "remote.idl" + +const uint32_t DEVICE_TENSOR_MAX_DIMS = 4; +const uint32_t DEVICE_TENSOR_MAX_SRC = 2; + +interface npu_device : remote_handle64{ + + typedef int64_t 
ne_type[DEVICE_TENSOR_MAX_DIMS]; + typedef uint64_t tensor_handle_t; + typedef uint64_t graph_handle_t; + + enum tensor_op { + NPU_OP_MUL_MAT, + NPU_OP_ADD, + NPU_OP_SUB, + NPU_OP_MUL, + NPU_OP_COUNT + }; + + enum tensor_data_type { + NPU_DATA_TYPE_F32, + NPU_DATA_TYPE_COUNT + }; + + struct tensor_spec { + ne_type ne; + tensor_data_type type; + }; + + struct tensor_config { + ne_type ne; + uint64_t nb[DEVICE_TENSOR_MAX_DIMS]; + long buffer_fd; + uint64_t offset; + uint64_t size; + tensor_data_type type; + tensor_op op; + }; + + AEEResult device_get_alignment( + rout uint32_t alignment + ); + + AEEResult device_support_op( + in tensor_spec src0, + in tensor_spec src1, + in tensor_spec dst, + in tensor_op op, + rout boolean is_supported + ); + + AEEResult tensor_init( + in tensor_config info, + rout tensor_handle_t tensor_handle + ); + + AEEResult tensor_set_src( + in tensor_handle_t tensor_handle, + in uint64_t index, + in tensor_handle_t src + ); + + AEEResult tensor_set_op( + in tensor_handle_t tensor_handle, + in tensor_op op + ); + + AEEResult tensor_free( + in tensor_handle_t tensor_handle + ); + + AEEResult graph_init( + rout graph_handle_t graph_handle + ); + + AEEResult graph_set_tensor( + in graph_handle_t graph_handle, + in sequence tensor_handles + ); + + AEEResult graph_compute( + in graph_handle_t graph_handle + ); + + AEEResult graph_free( + in graph_handle_t graph_handle + ); +}; diff --git a/ggml/src/ggml-qnn/qnn-types.hpp b/ggml/src/ggml-qnn/qnn-types.hpp deleted file mode 100644 index 957f8b681f3da..0000000000000 --- a/ggml/src/ggml-qnn/qnn-types.hpp +++ /dev/null @@ -1,61 +0,0 @@ - -#pragma once - -#include "QnnCommon.h" -#include "QnnInterface.h" -#include "QnnTypes.h" -#include "Saver/QnnSaver.h" -#include "System/QnnSystemInterface.h" - -namespace qnn { - -enum qcom_htp_arch { - NONE = 0, - V68 = 68, - V69 = 69, - V73 = 73, - V75 = 75, - V79 = 79, // SD 8 Gen 4 (SM8750) -}; - -enum qcom_chipset { - UNKNOWN_SM = 0, - SM8350 = 30, // v68, SD 888/888+ - SM8450 = 36, // v69, SD 8 Gen 1 - SA8295 = 39, // v68 - SM8475 = 42, // v69, SD 8+ Gen 1 - SM8550 = 43, // v73, SD 8 Gen 2 - SSG2115P = 46, // v73 - SM7675 = 70, // V73, SD 7+ Gen 3 - SM8635 = 68, // v73, SD 8s Gen 3 - SM8650 = 57, // v75, SD 8 Gen 3 - SM8750 = 69, // v79, SD 8 Gen 4 -}; - -struct qcom_socinfo { - uint32_t soc_model; - size_t htp_arch; - size_t vtcm_size_in_mb; -}; - -using pfn_rpc_mem_init = void (*)(void); -using pfn_rpc_mem_deinit = void (*)(void); -using pfn_rpc_mem_alloc = void * (*) (int, uint32_t, int); -using pfn_rpc_mem_free = void (*)(void *); -using pfn_rpc_mem_to_fd = int (*)(void *); - -using pfn_qnnsaver_initialize = decltype(QnnSaver_initialize); -using pfn_qnninterface_getproviders = decltype(QnnInterface_getProviders); -using pfn_qnnsysteminterface_getproviders = decltype(QnnSystemInterface_getProviders); -} // namespace qnn - -#define RPCMEM_DEFAULT_FLAGS 1 -#define RPCMEM_HEAP_ID_SYSTEM 25 - -#define DISABLE_COPY(class_name) \ - class_name(const class_name &) = delete; \ - void operator=(const class_name &) = delete - -#define DISABLE_MOVE(class_name) \ - class_name(class_name &&) = delete; \ - void operator=(class_name &&) = delete diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/qnn/backend-ops.cpp similarity index 94% rename from ggml/src/ggml-qnn/backend-ops.cpp rename to ggml/src/ggml-qnn/qnn/backend-ops.cpp index 857278bdaafbf..d4d2c57cbf4fe 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/qnn/backend-ops.cpp @@ -12,7 +12,7 @@ namespace { 
-qnn::qnn_graph * get_qnn_graph_from_cache(ggml_backend_qnn_device_context * ctx, const ggml_cgraph * cgraph) { +qnn::qnn_graph * get_qnn_graph_from_cache(qnn::ggml_backend_qnn_device_context * ctx, const ggml_cgraph * cgraph) { auto & graph_cache = ctx->qnn_graph_cache; std::string graph_key; auto op_data_type = qnn::qnn_graph::get_graph_key_from_cgraph(cgraph, graph_key); @@ -178,7 +178,7 @@ inline bool is_type_bit_enabled(uint64_t bits, ggml_type type) { return bits & (uint64_t(1) << type); } -inline bool is_tensor_size_valid(ggml_backend_qnn_device_context * ctx, const ggml_tensor * tensor) { +inline bool is_tensor_size_valid(qnn::ggml_backend_qnn_device_context * ctx, const ggml_tensor * tensor) { constexpr const auto get_tensor_size_in_bytes = [](const ggml_tensor * tensor, ggml_type type) -> size_t { return tensor->ne[0] * tensor->ne[1] * tensor->ne[2] * tensor->ne[3] * ggml_type_size(type); }; @@ -200,7 +200,7 @@ inline bool is_tensor_size_valid(ggml_backend_qnn_device_context * ctx, const gg return true; } -bool is_tensor_type_valid(ggml_backend_qnn_device_context * ctx, const ggml_tensor * tensor) { +bool is_tensor_type_valid(qnn::ggml_backend_qnn_device_context * ctx, const ggml_tensor * tensor) { if (!tensor) { QNN_LOG_DEBUG("tensor is nullptr\n"); return false; @@ -239,7 +239,7 @@ bool is_data_reinterpretation_op(ggml_op op) { return op == GGML_OP_VIEW || op == GGML_OP_PERMUTE; } -bool ggnl_qnn_supports_op_tensor(ggml_backend_qnn_device_context * ctx, const ggml_tensor * op) { +bool ggnl_qnn_supports_op_tensor(qnn::ggml_backend_qnn_device_context * ctx, const ggml_tensor * op) { if (op->op == GGML_OP_NONE) { return true; } @@ -265,7 +265,7 @@ bool ggnl_qnn_supports_op_tensor(ggml_backend_qnn_device_context * ctx, const gg return true; } -bool ggml_qnn_have_same_tensor_types(ggml_backend_qnn_device_context * ctx, const ggml_tensor * op) { +bool ggml_qnn_have_same_tensor_types(qnn::ggml_backend_qnn_device_context * ctx, const ggml_tensor * op) { auto * src0 = op->src[0]; auto * src1 = op->src[1]; if (src1) { @@ -291,7 +291,7 @@ bool ggml_qnn_have_same_tensor_types(ggml_backend_qnn_device_context * ctx, cons } // TODO: move to caps array? -bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context * ctx, const ggml_tensor * op) { +bool ggml_qnn_supports_matmul_op(qnn::ggml_backend_qnn_device_context * ctx, const ggml_tensor * op) { auto * src0 = op->src[0]; auto * src1 = op->src[1]; if (is_data_reinterpretation_op(src0->op) || is_data_reinterpretation_op(src1->op)) { @@ -343,7 +343,7 @@ bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context * ctx, const gg #ifndef NDEBUG -void print_tensor_info(ggml_backend_qnn_device_context * ctx, const ggml_tensor * op, bool is_supported) { +void print_tensor_info(qnn::ggml_backend_qnn_device_context * ctx, const ggml_tensor * op, bool is_supported) { const char * supported = is_supported ? 
"supported" : "unsupported"; std::string op_key; qnn::get_qnn_op_desc(op, true, GGML_TYPE_COUNT, op_key); @@ -358,7 +358,7 @@ void print_tensor_info(ggml_backend_qnn_device_context * ctx, const ggml_tensor namespace qnn { -bool device_supports_op(ggml_backend_qnn_device_context * ctx, const ggml_tensor * op) { +bool device_supports_op(qnn::ggml_backend_qnn_device_context * ctx, const ggml_tensor * op) { // Note that this function could be called before the device context is initialized if (op->op == GGML_OP_NONE) { return true; @@ -435,7 +435,7 @@ bool device_supports_op(ggml_backend_qnn_device_context * ctx, const ggml_tensor return is_op_supported; } -bool device_compute_graph(ggml_backend_qnn_device_context * ctx, ggml_cgraph * cgraph) { +bool device_compute_graph(qnn::ggml_backend_qnn_device_context * ctx, ggml_cgraph * cgraph) { QNN_LOG_DEBUG("[%s]compute graph start, nodes count: %d\n", qnn::get_backend_name(ctx->device), (int) cgraph->n_nodes); diff --git a/ggml/src/ggml-qnn/backend.hpp b/ggml/src/ggml-qnn/qnn/backend-ops.hpp similarity index 76% rename from ggml/src/ggml-qnn/backend.hpp rename to ggml/src/ggml-qnn/qnn/backend-ops.hpp index f2484a7a973f6..564a64a40e654 100644 --- a/ggml/src/ggml-qnn/backend.hpp +++ b/ggml/src/ggml-qnn/qnn/backend-ops.hpp @@ -1,4 +1,3 @@ - #pragma once #ifndef NDEBUG @@ -18,15 +17,15 @@ #include "qnn-lib.hpp" namespace qnn { + typedef std::unordered_map> qnn_graph_cache_t; -} // namespace qnn struct ggml_backend_qnn_device_context { // initialize in constructor - QNNBackend device; - size_t threads; - std::string name; - std::string description; + backend_index_type device; + size_t threads; + std::string name; + std::string description; // initialize in qnn init qnn::qcom_socinfo socinfo = {}; @@ -46,10 +45,15 @@ struct ggml_backend_qnn_device_context { uint64_t supported_types; uint64_t cpu_preprocess_types; - explicit ggml_backend_qnn_device_context(QNNBackend device, size_t threads, const char * name, + explicit ggml_backend_qnn_device_context(backend_index_type device, size_t threads, const char * name, uint64_t supported_types) : device(device), threads(threads), name(name), supported_types(supported_types) {} }; + +bool device_supports_op(ggml_backend_qnn_device_context * ctx, const ggml_tensor * op); +bool device_compute_graph(ggml_backend_qnn_device_context * ctx, ggml_cgraph * cgraph); + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/buffer.hpp b/ggml/src/ggml-qnn/qnn/buffer.hpp similarity index 100% rename from ggml/src/ggml-qnn/buffer.hpp rename to ggml/src/ggml-qnn/qnn/buffer.hpp diff --git a/ggml/src/ggml-qnn/convert.cpp b/ggml/src/ggml-qnn/qnn/convert.cpp similarity index 100% rename from ggml/src/ggml-qnn/convert.cpp rename to ggml/src/ggml-qnn/qnn/convert.cpp diff --git a/ggml/src/ggml-qnn/convert.hpp b/ggml/src/ggml-qnn/qnn/convert.hpp similarity index 100% rename from ggml/src/ggml-qnn/convert.hpp rename to ggml/src/ggml-qnn/qnn/convert.hpp diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/qnn/ggml-qnn.cpp similarity index 81% rename from ggml/src/ggml-qnn/ggml-qnn.cpp rename to ggml/src/ggml-qnn/qnn/ggml-qnn.cpp index 1d3e45562c6ef..e559cfdb28627 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/qnn/ggml-qnn.cpp @@ -1,11 +1,9 @@ -#include "ggml-qnn.h" - #include #include #include #include "backend-ops.hpp" -#include "backend.hpp" +#include "common.hpp" #include "ggml-backend-impl.h" #include "ggml-impl.h" #include "logger.hpp" @@ -14,8 +12,8 @@ namespace { -ggml_backend_qnn_device_context * 
get_device_context(ggml_backend_dev_t dev) { - return reinterpret_cast(dev->context); +qnn::ggml_backend_qnn_device_context * get_device_context(ggml_backend_dev_t dev) { + return reinterpret_cast(dev->context); } qnn::qnn_buffer_interface * get_buffer_context(ggml_backend_buffer_t buffer) { @@ -141,6 +139,16 @@ void ggml_backend_qnn_free(ggml_backend_t backend) { delete backend; } +ggml_guid_t ggml_backend_qnn_guid() { + static ggml_guid guid = { 0x1a, 0x2b, 0x3c, 0x4d, 0x5e, 0x6f, 0x70, 0x81, + 0x92, 0xa3, 0xb4, 0xc5, 0xd6, 0xe7, 0xf8, 0x09 }; + return &guid; +} + +bool ggml_backend_is_qnn(ggml_backend_t backend) { + return ggml_guid_matches(backend->guid, ggml_backend_qnn_guid()); +} + bool ggml_backend_qnn_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) { GGML_UNUSED(backend_src); @@ -154,7 +162,7 @@ bool ggml_backend_qnn_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_ } ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(ggml_backend_dev_t dev) { - static ggml_backend_buffer_type ggml_backend_qnn_buffer_types[GGML_QNN_MAX_DEVICES]; + static ggml_backend_buffer_type ggml_backend_qnn_buffer_types[QNN_BACKEND_COUNT]; auto * dev_ctx = get_device_context(dev); if (!ggml_backend_qnn_buffer_types[dev_ctx->device].device) { ggml_backend_qnn_buffer_types[dev_ctx->device] = { @@ -215,8 +223,8 @@ const char * ggml_backend_qnn_device_get_description(ggml_backend_dev_t dev) { void ggml_backend_qnn_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { GGML_UNUSED(dev); - *free = qnn::get_system_free_memory_in_bytes(); - *total = qnn::get_system_total_memory_in_bytes(); + *free = common::get_system_free_memory_in_bytes(); + *total = common::get_system_total_memory_in_bytes(); QNN_LOG_DEBUG("free memory: %ldMB, total memory: %ldMB\n", (*free / 1048576), (*total) / 1048576); } @@ -237,12 +245,6 @@ void ggml_backend_qnn_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_ }; } -ggml_guid_t ggml_backend_qnn_guid() { - static ggml_guid guid = { 0x1a, 0x2b, 0x3c, 0x4d, 0x5e, 0x6f, 0x70, 0x81, - 0x92, 0xa3, 0xb4, 0xc5, 0xd6, 0xe7, 0xf8, 0x09 }; - return &guid; -} - ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, const char * extend_lib_search_path) { if (!extend_lib_search_path) { extend_lib_search_path = GGML_QNN_DEFAULT_LIB_SEARCH_PATH; @@ -256,8 +258,7 @@ ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, QNN_LOG_DEBUG("device %s\n", qnn::get_backend_name(device)); QNN_LOG_DEBUG("extend_lib_search_path %s\n", extend_lib_search_path); auto instance = std::make_shared(extend_lib_search_path, device); - auto result = instance->qnn_init(nullptr); - if (result != 0) { + if (!instance->qnn_init(nullptr)) { QNN_LOG_WARN("failed to init qnn backend %s\n", qnn::get_backend_name(device)); return nullptr; } @@ -351,80 +352,43 @@ constexpr const ggml_backend_device_i ggml_backend_qnn_device_interface = { /* .event_synchronize = */ nullptr, }; -/* - * ----------------------------------------------------------------------------------------------- - * qnn backend registry object - * ----------------------------------------------------------------------------------------------- - */ - -struct ggml_backend_qnn_reg_impl : ggml_backend_reg { - std::vector> device_contexts; - std::vector devices; - - explicit ggml_backend_qnn_reg_impl(ggml_backend_reg_i interface) { - context = this; - iface = interface; - - QNN_LOG_DEBUG("qnn backend registry 
init\n"); - for (size_t i = 0; i < QNN_BACKEND_COUNT; i++) { - const auto device_enum = (QNNBackend) (QNN_BACKEND_COUNT - 1 - i); // init from the last device, i.e. NPU -#ifndef GGML_QNN_ENABLE_CPU_BACKEND - if (device_enum == QNN_BACKEND_CPU) { - /* - * here we skip the initialization of CPU device, - * cause it'll block unsupported ops fallback to ggml cpu backend - */ - QNN_LOG_DEBUG("qnn backend registry skip CPU device\n"); - continue; - } -#endif - - const auto & device_caps = qnn::get_device_caps(device_enum); - device_contexts.emplace_back(std::make_unique( - /* .device = */ device_enum, // init from the last device, i.e. NPU - /* .threads = */ 1, - /* .name = */ qnn::get_backend_name(device_enum), - /* .supported_types = */ device_caps.supported_types)); - - devices.emplace_back(ggml_backend_device{ - /* iface = */ ggml_backend_qnn_device_interface, - /* reg = */ this, - /* context = */ device_contexts.back().get(), - }); - } +class qnn_device_proxy : public backend_device_proxy { + public: + explicit qnn_device_proxy(backend_index_type device) { + const auto & device_caps = qnn::get_device_caps(device); + _device_context = std::make_unique( + /* .device = */ device, // init from the last device, i.e. NPU + /* .threads = */ 1, // TODO: fix this + /* .name = */ qnn::get_backend_name(device), + /* .supported_types = */ device_caps.supported_types); } -}; - -const char * ggml_backend_qnn_reg_get_name(ggml_backend_reg_t reg) { - GGML_UNUSED(reg); - return GGML_QNN_NAME; -} -size_t ggml_backend_qnn_reg_get_device_count(ggml_backend_reg_t reg) { - auto * ctx = (ggml_backend_qnn_reg_impl *) reg->context; - return ctx->devices.size(); -} + const ggml_backend_device_i & get_iface() const { return ggml_backend_qnn_device_interface; } -ggml_backend_dev_t ggml_backend_qnn_reg_get_device(ggml_backend_reg_t reg, size_t index) { - auto * ctx = (ggml_backend_qnn_reg_impl *) reg->context; - GGML_ASSERT(index < ctx->devices.size()); - return &(ctx->devices[index]); -} + void * get_context() { return _device_context.get(); } -const ggml_backend_reg_i ggml_backend_qnn_reg_interface = { - /* .get_name = */ ggml_backend_qnn_reg_get_name, - /* .get_device_count = */ ggml_backend_qnn_reg_get_device_count, - /* .get_device_get = */ ggml_backend_qnn_reg_get_device, - /* .get_proc_address = */ nullptr, + private: + std::unique_ptr _device_context; }; } // namespace -bool ggml_backend_is_qnn(ggml_backend_t backend) { - return ggml_guid_matches(backend->guid, ggml_backend_qnn_guid()); -} +backend_device_proxy_ptr create_qnn_backend_context(backend_index_type device) { + if (device >= QNN_BACKEND_COUNT) { + QNN_LOG_ERROR("[qnn]invalid device %d\n", device); + return backend_device_proxy_ptr(); + } + +#ifndef GGML_QNN_ENABLE_CPU_BACKEND + if (device == QNN_BACKEND_CPU) { + /* + * here we skip the initialization of CPU device, + * cause it'll block unsupported ops fallback to ggml cpu backend + */ + GGML_LOG_DEBUG("qnn backend registry skip CPU device\n"); + return backend_device_proxy_ptr(); + } +#endif -ggml_backend_reg_t ggml_backend_qnn_reg() { - static ggml_backend_qnn_reg_impl reg{ ggml_backend_qnn_reg_interface }; - return ® + return std::make_unique(device); } diff --git a/ggml/src/ggml-qnn/graph.cpp b/ggml/src/ggml-qnn/qnn/graph.cpp similarity index 98% rename from ggml/src/ggml-qnn/graph.cpp rename to ggml/src/ggml-qnn/qnn/graph.cpp index 3021a6f0a2fb5..70fc71c211c14 100644 --- a/ggml/src/ggml-qnn/graph.cpp +++ b/ggml/src/ggml-qnn/qnn/graph.cpp @@ -35,7 +35,7 @@ int get_op_max_rank(const ggml_tensor * 
op) { } qnn::qnn_tensor_ptr_t create_tensor_with_cache(ggml_tensor * tensor, qnn::ggml_qnn_tensor::tensor_type_t type, int rank, - ggml_type override_data_type, QNNBackend device, + ggml_type override_data_type, backend_index_type device, Qnn_GraphHandle_t graph_handle, std::shared_ptr qnn_instance, qnn_tensor_cache_t & tensor_cache) { @@ -60,7 +60,7 @@ qnn::qnn_tensor_ptr_t create_tensor_with_cache(ggml_tensor * tensor, qnn::ggml_q qnn::qnn_tensor_array_t create_tensors_with_cache(const qnn::ggml_tensor_array_t & ggml_tensors, qnn::ggml_qnn_tensor::tensor_type_t type, int rank, - ggml_type override_data_type, QNNBackend device, + ggml_type override_data_type, backend_index_type device, Qnn_GraphHandle_t graph_handle, std::shared_ptr qnn_instance, qnn_tensor_cache_t & tensor_cache) { @@ -74,7 +74,7 @@ qnn::qnn_tensor_array_t create_tensors_with_cache(const qnn::ggml_tensor_array_t } qnn::qnn_op_config_ptr_t create_operation_from_op_tensor(ggml_tensor * dst, const std::string & name, int rank, - QNNBackend device, Qnn_GraphHandle_t graph_handle, + backend_index_type device, Qnn_GraphHandle_t graph_handle, std::shared_ptr qnn_instance, qnn_tensor_cache_t & tensor_cache) { auto operation = qnn::create_op(dst, name, qnn_instance); @@ -335,7 +335,7 @@ ggml_type qnn_graph::get_graph_key_from_cgraph(const ggml_cgraph * cgraph, std:: return min_op_type; } -qnn_graph::qnn_graph(const std::string & graph_name, QNNBackend device, qnn_instance_ptr qnn_instance, +qnn_graph::qnn_graph(const std::string & graph_name, backend_index_type device, qnn_instance_ptr qnn_instance, htp_precision precision, size_t vtcm_size_in_mb) : _graph_name(graph_name), _device(device), diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/qnn/graph.hpp similarity index 85% rename from ggml/src/ggml-qnn/graph.hpp rename to ggml/src/ggml-qnn/qnn/graph.hpp index a913b8bba38b0..5e862112fbd1e 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/qnn/graph.hpp @@ -45,7 +45,7 @@ class qnn_graph { */ static ggml_type get_graph_key_from_cgraph(const ggml_cgraph * cgraph, std::string & output); - explicit qnn_graph(const std::string & graph_name, QNNBackend device, qnn_instance_ptr qnn_instance, + explicit qnn_graph(const std::string & graph_name, backend_index_type device, qnn_instance_ptr qnn_instance, htp_precision precision, size_t vtcm_size_in_mb); ~qnn_graph(); @@ -62,17 +62,17 @@ class qnn_graph { const std::string & get_name() const { return _graph_name; } - QNNBackend get_device() const { return _device; } + backend_index_type get_device() const { return _device; } private: bool finalize(); - const std::string _graph_name; - const QNNBackend _device; - Qnn_GraphHandle_t _graph_handle = nullptr; - qnn_instance_ptr _qnn_instance; - qnn_interface_ptr _qnn_interface; - qnn_op_config_array_t _operations; + const std::string _graph_name; + const backend_index_type _device; + Qnn_GraphHandle_t _graph_handle = nullptr; + qnn_instance_ptr _qnn_instance; + qnn_interface_ptr _qnn_interface; + qnn_op_config_array_t _operations; qnn_tensor_array_t _tensor_inputs; qnn_tensor_array_t _tensor_outputs; diff --git a/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage.xml b/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage.xml new file mode 100644 index 0000000000000..f4c6575902948 --- /dev/null +++ b/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage.xml @@ -0,0 +1,88 @@ + + + + + GgmlMulMat + + + GGML MulMat operator + + + + + in[0] + + src0 + + true + BACKEND_SPECIFIC + + 4D + NHWC + [N, C, H , W] + + + + + in[1] + + src1 + + true + 
BACKEND_SPECIFIC + + 4D + NHWC + [N, C, H , W] + + + + + out[0] + + dst + + true + BACKEND_SPECIFIC + + 4D + [N, C, H , W] + + + + + HTP + + + + + + + GgmlMulMat + + + + + GgmlMulMat + + + in[0] + QNN_DATATYPE_FLOAT_16 + QNN_DATATYPE_FLOAT_32 + + + in[1] + QNN_DATATYPE_FLOAT_16 + QNN_DATATYPE_FLOAT_32 + + + + out[0] + QNN_DATATYPE_FLOAT_16 + QNN_DATATYPE_FLOAT_32 + + + + + + diff --git a/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/Makefile b/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/Makefile new file mode 100644 index 0000000000000..f177822d35a06 --- /dev/null +++ b/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/Makefile @@ -0,0 +1,357 @@ +# check all setup prerequisites if the command goal is not clean +ifneq ($(MAKECMDGOALS),clean) +ifndef QNN_INCLUDE +$(info "INFO: Qnn include not explicitly defined, attempting to use QNN_SDK_ROOT if it is valid") +QNN_INCLUDE := $(QNN_SDK_ROOT)/include/QNN +endif +ifeq ($(wildcard $(QNN_INCLUDE)),) +$(error "ERROR: QNN_INCLUDE path is not set. QNN include paths must be set to obtain BE headers necessary to compile the package") +endif +ifndef QNN_TARGET_LIB +$(info "INFO: Qnn target not explicitly defined, attempting to use QNN_SDK_ROOT if it is valid") +QNN_TARGET_LIB := $(QNN_SDK_ROOT)/lib/aarch64-android +endif +ifeq ($(wildcard $(QNN_TARGET_LIB)),) +ifeq ($(MAKECMDGOALS),htp_aarch64) +$(error "ERROR: QNN_TARGET_LIB is needed to compile package for aarch64") +else ifeq ($(MAKECMDGOALS),all) +$(info "WARNING:QNN_TARGET_LIB may need to be defined to compile packages") +endif +endif + +ifndef HEXAGON_SDK_ROOT +$(error "ERROR: HEXAGON_SDK_ROOT is not set. Hexagon-SDK path must be set to the latest hexagon-sdk-x.y.z") +endif + +ifeq ($(wildcard $(HEXAGON_SDK_ROOT)),) +$(error "ERROR: HEXAGON_SDK_ROOT is not set correctly. Please set HEXAGON_SDK_ROOT to latest hexagon-sdk-X.Y.Z path") +endif + +HEXAGON_SDK_BASE := $(dir $(HEXAGON_SDK_ROOT)) + +$(info "HEXAGON_SDK_ROOT is [${HEXAGON_SDK_ROOT}]") +# Users should note that the tools version may change between hexagon sdk versions +# Following combination of SDK and Tool version is supported +# fix the sdk root for new versions +HEXAGON_SDK_ROOT_V68 := $(HEXAGON_SDK_ROOT) +HEXAGON_SDK_ROOT_V69 := $(HEXAGON_SDK_ROOT) +HEXAGON_SDK_ROOT_V73 := $(HEXAGON_SDK_ROOT) +HEXAGON_SDK_ROOT_V75 := $(HEXAGON_SDK_ROOT) +HEXAGON_SDK_ROOT_V79 := $(HEXAGON_SDK_ROOT) + +#Updated to point to latest sdk to match with libQnnHtp.so +HEXAGON_SDK_ROOT_X86 := $(HEXAGON_SDK_ROOT) +HEXAGON_TOOLS_VERSION_V68 := 8.7.06 +HEXAGON_TOOLS_VERSION_V69 := 8.7.06 +HEXAGON_TOOLS_VERSION_V73 := 8.7.06 +HEXAGON_TOOLS_VERSION_V75 := 8.7.06 +HEXAGON_TOOLS_VERSION_V79 := 8.7.06 + +#Updated to point to latest sdk to match with libQnnHtp.so +HEXAGON_TOOLS_VERSION_X86 := 8.7.06 + +ifndef ANDROID_NDK_ROOT +ifeq ($(MAKECMDGOALS),htp_aarch64) +$(error "ERROR: ANDROID_NDK_ROOT is not set. Android NDK path must be set to compile package for aarch64") +else ifeq ($(MAKECMDGOALS),all) +$(info "WARNING: ANDROID_NDK_ROOT is not set. Android NDK path must be set to compile package for aarch64") +endif +endif + +ifndef PACKAGE_NAME +export +PACKAGE_NAME := $(notdir $(shell pwd)) +$(info "INFO: No package name defined. 
Using current directory name: $(PACKAGE_NAME) as the package name") +endif + +WORK := build +SRC_DIR := src +OP_SRC_DIR := src/ops +OP_INCLUDE_DIR := ./include +OP_INCLUDES = #$(wildcard $(OP_INCLUDE_DIR)/*.h) user defined if any op specific headers are needed, add -I to common flags +LIBRARY_NAME := libQnn$(PACKAGE_NAME).so +SUPPORTED_TARGETS = x86_64-linux-clang hexagon-v68 hexagon-v69 hexagon-v73 hexagon-v75 hexagon-v79 aarch64-android + + +COMMON_CXX_FLAGS = -std=c++17 -I$(QNN_INCLUDE) -fPIC -Wall -Wreorder -Wno-missing-braces -Wno-unused-function +COMMON_CXX_FLAGS += -Werror -Wno-format -Wno-unused-command-line-argument -fvisibility=default -stdlib=libc++ +COMMON_CXX_FLAGS += -DQNN_API="__attribute__((visibility(\"default\")))" -D__QAIC_HEADER_EXPORT="__attribute__((visibility(\"default\")))" + +X86_LIBNATIVE_RELEASE_DIR := $(HEXAGON_SDK_ROOT_X86)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_X86)/Tools + +# Ensure hexagon sdk tool version can be retrieved +ifeq ($(wildcard $(X86_LIBNATIVE_RELEASE_DIR)/.),) +$(error "Cannot retrieve hexagon tools from: $(X86_LIBNATIVE_RELEASE_DIR). \ + \ + Please check that hexagon tools version is correct. Expected: $(HEXAGON_TOOLS_VERSION_X86)") +endif + +#Check tools for hexagon_v68 are present. +ifeq ($(MAKECMDGOALS),htp_v68) +ifeq ($(wildcard $(HEXAGON_SDK_ROOT_V68)),) +$(error "ERROR: HEXAGON_SDK_ROOT_V68 is set incorrectly. Cannot retrieve $(HEXAGON_SDK_ROOT_V68)") +endif +endif + +ifeq ($(MAKECMDGOALS),htp_v69) +ifeq ($(wildcard $(HEXAGON_SDK_ROOT_V69)),) +$(error "ERROR: HEXAGON_SDK_ROOT_V69 is set incorrectly. Cannot retrieve $(HEXAGON_SDK_ROOT_V69)") +endif +endif + +ifeq ($(MAKECMDGOALS),htp_v73) +ifeq ($(wildcard $(HEXAGON_SDK_ROOT_V73)),) +$(error "ERROR: HEXAGON_SDK_ROOT_V73 is set incorrectly. Cannot retrieve $(HEXAGON_SDK_ROOT_V73)") +endif +endif + +ifeq ($(MAKECMDGOALS),htp_v75) +ifeq ($(wildcard $(HEXAGON_SDK_ROOT_V75)),) +$(error "ERROR: HEXAGON_SDK_ROOT_V75 is set incorrectly. Cannot retrieve $(HEXAGON_SDK_ROOT_V75)") +endif +endif + +#Check tools for hexagon_v79 are present. +ifeq ($(MAKECMDGOALS),htp_v79) +ifeq ($(wildcard $(HEXAGON_SDK_ROOT_V79)),) +$(error "ERROR: HEXAGON_SDK_ROOT_V79 is set incorrectly. 
Cannot retrieve $(HEXAGON_SDK_ROOT_V79)") +endif +endif + + + +endif +OP_SOURCES = $(wildcard $(OP_SRC_DIR)/*.cpp) +OTHER_SOURCES = $(wildcard $(SRC_DIR)/*.cpp) +HFILES = $(wildcard $(QNN_INCLUDE)/*.h) +HFILES += $(wildcard $(QNN_INCLUDE)/HTP/*.h) +HFILES += $(wildcard $(QNN_INCLUDE)/HTP/core/*.h) +OP_OBJS = $(patsubst $(SRC_DIR)/%,%,$(patsubst %.cpp,%.o,$(OP_SOURCES))) +OTHER_OBJS = $(patsubst $(SRC_DIR)/%,%,$(patsubst %.cpp,%.o,$(OTHER_SOURCES))) + +#======= Assembly ======== +OP_SOURCES_ASM_X86 += $(wildcard $(OP_SRC_DIR)/x86_asm/*.S) +OP_OBJS_ASM_X86 += $(subst /x86_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_X86)))) +OP_SOURCES_ASM_V68 += $(wildcard $(OP_SRC_DIR)/v68_asm/*.S) +OP_OBJS_ASM_V68 += $(subst /v68_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V68)))) +OP_SOURCES_ASM_V69 += $(wildcard $(OP_SRC_DIR)/v69_asm/*.S) +OP_OBJS_ASM_V69 += $(subst /v69_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V69)))) +OP_SOURCES_ASM_V73 += $(wildcard $(OP_SRC_DIR)/v73_asm/*.S) +OP_OBJS_ASM_V73 += $(subst /v73_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V73)))) +OP_SOURCES_ASM_V75 += $(wildcard $(OP_SRC_DIR)/v75_asm/*.S) +OP_OBJS_ASM_V75 += $(subst /v75_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V75)))) +OP_SOURCES_ASM_V79 += $(wildcard $(OP_SRC_DIR)/v79_asm/*.S) +OP_OBJS_ASM_V79 += $(subst /v79_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V79)))) + +OP_SOURCES_ASM_ANDROID += $(wildcard $(OP_SRC_DIR)/android_asm/*.S) +OP_OBJS_ASM_ANDROID += $(subst /android_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_ANDROID)))) + + +all: htp_v68 htp_x86 htp_aarch64 + +#============================================================================================================ +# Setup compiler, compiler instructions and linker for x86 +X86_CXX ?= clang++-9 +# Checking if clang++-9 is present. 
If not switch to clang++ +ifeq ($(shell $(X86_CXX) -v 2>&1 | grep -c "clang version"), 0) + X86_CXX := clang++ +endif +X86_LDFLAGS:= -Wl,--whole-archive -L$(X86_LIBNATIVE_RELEASE_DIR)/libnative/lib -lnative -Wl,--no-whole-archive -lpthread -L$(QNN_SDK_ROOT)/lib/x86_64-linux-clang -lHtpPrepare +X86_C_FLAGS := -D__HVXDBL__ -I$(X86_LIBNATIVE_RELEASE_DIR)/libnative/include -ffast-math -DUSE_OS_LINUX +X86_CXX_FLAGS = $(COMMON_CXX_FLAGS) $(X86_C_FLAGS) -fomit-frame-pointer -Wno-invalid-offsetof +linux_objs = +#============================================================================================================ +# Setup compiler, compiler instructions and linker for hexagon +HEXAGON_CXX_FLAGS := $(COMMON_CXX_FLAGS) -mhvx -mhvx-length=128B -mhmx -DUSE_OS_QURT -O2 -Wno-reorder -DPREPARE_DISABLED + +HEXAGON_CXX_FLAGS_V68 := $(HEXAGON_CXX_FLAGS) -mv68 -I$(HEXAGON_SDK_ROOT_V68)/rtos/qurt/computev68/include/qurt -I$(HEXAGON_SDK_ROOT_V68)/rtos/qurt/computev68/include/posix -I$(HEXAGON_SDK_ROOT_V68)/incs -I$(HEXAGON_SDK_ROOT_V68)/incs/stddef +HEXAGON_CXX_FLAGS_V69 := $(HEXAGON_CXX_FLAGS) -mv69 -I$(HEXAGON_SDK_ROOT_V69)/rtos/qurt/computev69/include/qurt -I$(HEXAGON_SDK_ROOT_V69)/rtos/qurt/computev69/include/posix -I$(HEXAGON_SDK_ROOT_V69)/incs -I$(HEXAGON_SDK_ROOT_V69)/incs/stddef +HEXAGON_CXX_FLAGS_V73 := $(HEXAGON_CXX_FLAGS) -mv73 -I$(HEXAGON_SDK_ROOT_V73)/rtos/qurt/computev73/include/qurt -I$(HEXAGON_SDK_ROOT_V73)/rtos/qurt/computev73/include/posix -I$(HEXAGON_SDK_ROOT_V73)/incs -I$(HEXAGON_SDK_ROOT_V73)/incs/stddef +HEXAGON_CXX_FLAGS_V75 := $(HEXAGON_CXX_FLAGS) -mv75 -I$(HEXAGON_SDK_ROOT_V75)/rtos/qurt/computev75/include/qurt -I$(HEXAGON_SDK_ROOT_V75)/rtos/qurt/computev75/include/posix -I$(HEXAGON_SDK_ROOT_V75)/incs -I$(HEXAGON_SDK_ROOT_V75)/incs/stddef +HEXAGON_CXX_FLAGS_V79 := $(HEXAGON_CXX_FLAGS) -mv79 -I$(HEXAGON_SDK_ROOT_V79)/rtos/qurt/computev79/include/qurt -I$(HEXAGON_SDK_ROOT_V79)/rtos/qurt/computev79/include/posix -I$(HEXAGON_SDK_ROOT_V79)/incs -I$(HEXAGON_SDK_ROOT_V79)/incs/stddef + + +HEXAGON_CXX_V68 := $(HEXAGON_SDK_ROOT_V68)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V68)/Tools/bin/hexagon-clang++ +HEXAGON_CXX_V69 := $(HEXAGON_SDK_ROOT_V69)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V69)/Tools/bin/hexagon-clang++ +HEXAGON_CXX_V73 := $(HEXAGON_SDK_ROOT_V73)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V73)/Tools/bin/hexagon-clang++ +HEXAGON_CXX_V75 := $(HEXAGON_SDK_ROOT_V75)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V75)/Tools/bin/hexagon-clang++ +HEXAGON_CXX_V79 := $(HEXAGON_SDK_ROOT_V79)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V79)/Tools/bin/hexagon-clang++ + + +HEX_LDFLAGS = +hexagon_objs = +#============================================================================================================ +# Setup compiler, compiler instructions and linker for aarch64 +AARCH64_C__FLAGS = -D__HVXDBL__ -I$(X86_LIBNATIVE_RELEASE_DIR)/libnative/include -ffast-math -DUSE_OS_LINUX -DANDROID +AARCH64_CXX_FLAGS = $(COMMON_CXX_FLAGS) $(AARCH64_C__FLAGS) -fomit-frame-pointer -Wno-invalid-offsetof -Wno-unused-variable -Wno-unused-parameter -Wno-missing-braces -Wno-sign-compare -Wno-unused-private-field -Wno-unused-variable -Wno-ignored-qualifiers -Wno-missing-field-initializers +ARM_CLANG_OPTS =--target=aarch64-none-linux-android21 --sysroot=$(ANDROID_NDK_ROOT)/toolchains/llvm/prebuilt/linux-x86_64/sysroot -stdlib=libc++ -static-libstdc++ +AARCH64_CXX = $(ANDROID_NDK_ROOT)/toolchains/llvm/prebuilt/linux-x86_64/bin/clang++ $(ARM_CLANG_OPTS) +AARCH64_LDFLAGS = -L$(QNN_TARGET_LIB) -lQnnHtp 
-lQnnHtpPrepare +aarch64_objs = +#============================================================================================================ +# Setup targets and goals + +htp_x86: X86_BUILD + +htp_v68: HEXAGON_BUILD_V68 + +htp_v69: HEXAGON_BUILD_V69 + +htp_v73: HEXAGON_BUILD_V73 + +htp_v75: HEXAGON_BUILD_V75 + +htp_v79: HEXAGON_BUILD_V79 + + + +htp_aarch64: AARCH64_BUILD + +AARCH64_BUILD: $(WORK)/aarch64-android/$(LIBRARY_NAME) + +HEXAGON_BUILD_V68: $(WORK)/hexagon-v68/$(LIBRARY_NAME) + +HEXAGON_BUILD_V69: $(WORK)/hexagon-v69/$(LIBRARY_NAME) + +HEXAGON_BUILD_V73: $(WORK)/hexagon-v73/$(LIBRARY_NAME) + +HEXAGON_BUILD_V75: $(WORK)/hexagon-v75/$(LIBRARY_NAME) + +HEXAGON_BUILD_V79: $(WORK)/hexagon-v79/$(LIBRARY_NAME) + + + +X86_BUILD: $(WORK)/x86_64-linux-clang/$(LIBRARY_NAME) + + +define build_objs = +ifneq ($(filter $(2),$(SUPPORTED_TARGETS)),) +$(2)_objs += $(foreach x,$(1),$(WORK)/$(2)/$(x)) +else +$$(error "Unknown target option provided: $(2): Supported targets are: $(SUPPORTED_TARGETS)") +endif +endef + +$(eval $(call build_objs,$(OTHER_OBJS),x86_64-linux-clang)) +$(eval $(call build_objs,$(OP_OBJS),x86_64-linux-clang)) +$(eval $(call build_objs,$(OP_OBJS_ASM_X86),x86_64-linux-clang)) +$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v68)) +$(eval $(call build_objs,$(OP_OBJS),hexagon-v68)) +$(eval $(call build_objs,$(OP_OBJS_ASM_V68),hexagon-v68)) +$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v69)) +$(eval $(call build_objs,$(OP_OBJS),hexagon-v69)) +$(eval $(call build_objs,$(OP_OBJS_ASM_V69),hexagon-v69)) +$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v73)) +$(eval $(call build_objs,$(OP_OBJS),hexagon-v73)) +$(eval $(call build_objs,$(OP_OBJS_ASM_V73),hexagon-v73)) +$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v75)) +$(eval $(call build_objs,$(OP_OBJS),hexagon-v75)) +$(eval $(call build_objs,$(OP_OBJS_ASM_V75),hexagon-v75)) +$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v79)) +$(eval $(call build_objs,$(OP_OBJS),hexagon-v79)) +$(eval $(call build_objs,$(OP_OBJS_ASM_V75),hexagon-v79)) + +$(eval $(call build_objs,$(OTHER_OBJS),aarch64-android)) +$(eval $(call build_objs,$(OP_OBJS),aarch64-android)) +$(eval $(call build_objs,$(OP_OBJS_ASM_ANDROID),aarch64-android)) + +# x86 +$(WORK)/x86_64-linux-clang $(WORK)/hexagon-v68 $(WORK)/hexagon-v69 $(WORK)/hexagon-v73 $(WORK)/hexagon-v75 $(WORK)/hexagon-v79 $(WORK)/aarch64-android: + @mkdir -p $@/ops + +$(WORK)/x86_64-linux-clang/%.o: $(SRC_DIR)/%.cpp | $(WORK)/x86_64-linux-clang + $(X86_CXX) $(X86_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/x86_64-linux-clang/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/x86_64-linux-clang + $(X86_CXX) $(X86_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/x86_64-linux-clang/ops/%.o: $(OP_SRC_DIR)/x86_asm/%.S | $(WORK)/x86_64-linux-clang + $(X86_CXX) $(X86_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/x86_64-linux-clang/$(LIBRARY_NAME): $(x86_64-linux-clang_objs) | $(HFILES) + $(X86_CXX) -fPIC -std=c++17 -g -shared -o $@ $^ $(X86_LDFLAGS) + +# v68 +$(WORK)/hexagon-v68/%.o: $(SRC_DIR)/%.cpp | $(WORK)/hexagon-v68 + $(HEXAGON_CXX_V68) $(HEXAGON_CXX_FLAGS_V68) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/hexagon-v68/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/hexagon-v68 + $(HEXAGON_CXX_V68) $(HEXAGON_CXX_FLAGS_V68) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/hexagon-v68/ops/%.o: $(OP_SRC_DIR)/v68_asm/%.S | $(WORK)/hexagon-v68 + $(HEXAGON_CXX_V68) $(HEXAGON_CXX_FLAGS_V68) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ 
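+# Note: the v68 rules above and the link rule below follow the layout that every target in
+# SUPPORTED_TARGETS repeats -- three pattern rules (C++ in $(SRC_DIR), op sources in
+# $(OP_SRC_DIR), arch-specific assembly) plus a single shared-library link producing
+# $(LIBRARY_NAME). Assuming QNN_INCLUDE, the relevant HEXAGON_SDK_ROOT_V*/HEXAGON_TOOLS_VERSION_V*
+# variables and ANDROID_NDK_ROOT are exported, a typical invocation might be:
+#   make htp_v73 htp_aarch64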
+ +$(WORK)/hexagon-v68/$(LIBRARY_NAME): $(hexagon-v68_objs) | $(HFILES) + $(HEXAGON_CXX_V68) -fPIC -std=c++17 -g -shared -o $@ $^ $(HEX_LDFLAGS) + +# v69 +$(WORK)/hexagon-v69/%.o: $(SRC_DIR)/%.cpp | $(WORK)/hexagon-v69 + $(HEXAGON_CXX_V69) $(HEXAGON_CXX_FLAGS_V69) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/hexagon-v69/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/hexagon-v69 + $(HEXAGON_CXX_V69) $(HEXAGON_CXX_FLAGS_V69) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/hexagon-v69/ops/%.o: $(OP_SRC_DIR)/v69_asm/%.S | $(WORK)/hexagon-v69 + $(HEXAGON_CXX_V69) $(HEXAGON_CXX_FLAGS_V69) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/hexagon-v69/$(LIBRARY_NAME): $(hexagon-v69_objs) | $(HFILES) + $(HEXAGON_CXX_V69) -fPIC -std=c++17 -g -shared -o $@ $^ $(HEX_LDFLAGS) + +# v73 +$(WORK)/hexagon-v73/%.o: $(SRC_DIR)/%.cpp | $(WORK)/hexagon-v73 + $(HEXAGON_CXX_V73) $(HEXAGON_CXX_FLAGS_V73) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/hexagon-v73/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/hexagon-v73 + $(HEXAGON_CXX_V73) $(HEXAGON_CXX_FLAGS_V73) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/hexagon-v73/ops/%.o: $(OP_SRC_DIR)/v73_asm/%.S | $(WORK)/hexagon-v73 + $(HEXAGON_CXX_V73) $(HEXAGON_CXX_FLAGS_V73) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/hexagon-v73/$(LIBRARY_NAME): $(hexagon-v73_objs) | $(HFILES) + $(HEXAGON_CXX_V73) -fPIC -std=c++17 -g -shared -o $@ $^ $(HEX_LDFLAGS) + +#v75 +$(WORK)/hexagon-v75/%.o: $(SRC_DIR)/%.cpp | $(WORK)/hexagon-v75 + $(HEXAGON_CXX_V75) $(HEXAGON_CXX_FLAGS_V75) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/hexagon-v75/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/hexagon-v75 + $(HEXAGON_CXX_V75) $(HEXAGON_CXX_FLAGS_V75) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/hexagon-v75/ops/%.o: $(OP_SRC_DIR)/v75_asm/%.S | $(WORK)/hexagon-v75 + $(HEXAGON_CXX_V75) $(HEXAGON_CXX_FLAGS_V75) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/hexagon-v75/$(LIBRARY_NAME): $(hexagon-v75_objs) | $(HFILES) + $(HEXAGON_CXX_V75) -fPIC -std=c++17 -g -shared -o $@ $^ $(HEX_LDFLAGS) + +#v79 +$(WORK)/hexagon-v79/%.o: $(SRC_DIR)/%.cpp | $(WORK)/hexagon-v79 + $(HEXAGON_CXX_V79) $(HEXAGON_CXX_FLAGS_V79) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/hexagon-v79/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/hexagon-v79 + $(HEXAGON_CXX_V79) $(HEXAGON_CXX_FLAGS_V79) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/hexagon-v79/ops/%.o: $(OP_SRC_DIR)/v79_asm/%.S | $(WORK)/hexagon-v79 + $(HEXAGON_CXX_V79) $(HEXAGON_CXX_FLAGS_V79) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/hexagon-v79/$(LIBRARY_NAME): $(hexagon-v79_objs) | $(HFILES) + $(HEXAGON_CXX_V79) -fPIC -std=c++17 -g -shared -o $@ $^ $(HEX_LDFLAGS) + + + +# aarch64 +$(WORK)/aarch64-android/%.o: $(SRC_DIR)/%.cpp | $(WORK)/aarch64-android + $(AARCH64_CXX) $(AARCH64_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/aarch64-android/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/aarch64-android + $(AARCH64_CXX) $(AARCH64_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/aarch64-android/ops/%.o: $(OP_SRC_DIR)/android_asm/%.S | $(WORK)/aarch64-android + $(AARCH64_CXX) $(AARCH64_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/aarch64-android/$(LIBRARY_NAME): $(aarch64-android_objs) | $(HFILES) + $(AARCH64_CXX) -fPIC -std=c++17 -g -shared -o $@ $^ $(AARCH64_LDFLAGS) + +clean: + -rm -rf $(WORK) + +.PHONY: all clean diff --git 
a/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/config/GgmlOpPackage.xml b/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/config/GgmlOpPackage.xml new file mode 100644 index 0000000000000..f4c6575902948 --- /dev/null +++ b/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/config/GgmlOpPackage.xml @@ -0,0 +1,88 @@ + + + + + GgmlMulMat + + + GGML MulMat operator + + + + + in[0] + + src0 + + true + BACKEND_SPECIFIC + + 4D + NHWC + [N, C, H , W] + + + + + in[1] + + src1 + + true + BACKEND_SPECIFIC + + 4D + NHWC + [N, C, H , W] + + + + + out[0] + + dst + + true + BACKEND_SPECIFIC + + 4D + [N, C, H , W] + + + + + HTP + + + + + + + GgmlMulMat + + + + + GgmlMulMat + + + in[0] + QNN_DATATYPE_FLOAT_16 + QNN_DATATYPE_FLOAT_32 + + + in[1] + QNN_DATATYPE_FLOAT_16 + QNN_DATATYPE_FLOAT_32 + + + + out[0] + QNN_DATATYPE_FLOAT_16 + QNN_DATATYPE_FLOAT_32 + + + + + + diff --git a/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/src/GgmlOpPackageInterface.cpp b/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/src/GgmlOpPackageInterface.cpp new file mode 100644 index 0000000000000..df9ab364209b5 --- /dev/null +++ b/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/src/GgmlOpPackageInterface.cpp @@ -0,0 +1,274 @@ +//============================================================================== +// Auto Generated Code for GgmlOpPackage +//============================================================================== + +#include "HTP/QnnHtpCommon.h" +#include "HTP/core/constraints.h" +#include "HTP/core/op_package_feature_support.h" +#include "HTP/core/op_register_ext.h" +#include "HTP/core/optimize.h" +#include "HTP/core/simple_reg.h" +#include "HTP/core/unique_types.h" +#include "QnnOpPackage.h" +#include "QnnSdkBuildId.h" + +DEFINE_UNIQ_TY() +BEGIN_PKG_OPS_OPTS_LIST() + +/** Note that the order of declarations given here defines the order in which ops and graph optimizations are + * registered to the HTP Core. + * Append the latest OpName at the bottom + */ +DECLARE_PKG_OPS_OPTS_LIST(PKG_GgmlMulMat) + +END_PKG_OPS_OPTS_LIST() + +// op package info +static constexpr auto sg_packageName = THIS_PKG_NAME_STR; // package name passed in as compile flag + +static std::array sg_opNames{{"GgmlMulMat"}}; + +static Qnn_ApiVersion_t sg_sdkApiVersion = QNN_HTP_API_VERSION_INIT; +static QnnOpPackage_Info_t sg_packageInfo = QNN_OP_PACKAGE_INFO_INIT; + +// global data +static QnnOpPackage_GlobalInfrastructure_t sg_globalInfra = +nullptr; // global infrastructure not in use for now +static bool sg_packageInitialized = false; + +/* + * user provided logging call back function + * currently only supported on linux x86-64 and nonrpc versions + * typedef void (*QnnLog_Callback_t)(const char* fmt, + * QnnLog_Level_t level, + * uint64_t timestamp, + * va_list args); + * usage: if(sg_logInitialized && level <= sg_maxLogLevel) + * sg_logCallback(fmt, level, timestamp, args); + * + * for cross rpc versions, skel side user provided logging call back function + * can be defined as part of op packages. 
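+ * example (sketch; the callback name is illustrative only): a callback matching the
+ * typedef above could simply forward to vprintf, e.g.
+ *   void my_log_cb(const char * fmt, QnnLog_Level_t level, uint64_t timestamp, va_list args) {
+ *       (void) level; (void) timestamp;
+ *       vprintf(fmt, args);
+ *   }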
+ * maximal log level sg_maxLogLevel
+ * can be set by Qnn_ErrorHandle_t GgmlOpPackageLogSetLevel(QnnLog_Level_t maxLogLevel)
+ */
+/*
+ * for alternative logging method provided by HTP core, please refer to log.h
+ */
+static QnnLog_Callback_t sg_logCallback =
+    nullptr;  // user provided call back function pointer for logging
+static QnnLog_Level_t sg_maxLogLevel =
+    (QnnLog_Level_t)0;  // maximal log level used in user provided logging
+static bool sg_logInitialized =
+    false;  // tracks whether user provided logging method has been initialized
+
+
+/*
+* op initialization
+* needs to be global in the package
+* one initialization per package before any op definitions
+* syntax: INIT_PACKAGE_OP_DEF()
+*/
+INIT_PACKAGE_OP_DEF()
+
+/*
+* optimization initialization
+* needs to be global in the package
+* one initialization per package before any optimization definitions
+* syntax: INIT_PACKAGE_OPTIMIZATION_DEF()
+*/
+INIT_PACKAGE_OPTIMIZATION_DEF()
+
+/*
+ * op parameter order initialization
+ * needs to be global in the package
+ * one initialization per package before any op parameter order definitions
+ * syntax: INIT_PACKAGE_PARAM_ORDER_DEF()
+ */
+INIT_PACKAGE_PARAM_ORDER_DEF()
+
+/*
+ * axis parameter name list
+ * optional
+ * needs to be global in the package
+ * one list per package
+ * for listing axis parameter names passed into Qnn_AddNode API
+ * HTP backend auto-adjusts values in axis parameters based on HTP backfilling
+ * note: HTP backend backfills tensor dimensions to 4 dimensions
+ * syntax: LIST_PACKAGE_AXIS_PARAMS(...)
+ * e.g. LIST_PACKAGE_AXIS_PARAMS("Axis", "AXIS", "axis")
+ */
+// LIST_PACKAGE_AXIS_PARAMS()
+
+/*
+ * per-channel quantized op name list
+ * optional
+ * needs to be global in the package
+ * one list per package
+ * for listing op names which support per-channel quantization
+ * per-axis quantization info of an op is embedded in axisScaleOffsetEncoding
+ * inside Qnn_Tensor_t types
+ * HTP backend only supports per-channel scale ops
+ * i.e. along last dimension, offset is always zero
+ * if an op name is marked as having per-channel scale support, and in
+ * QNN_AddNode, at least one input, parameter, or output has
+ * QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET type:
+ * then:
+ * HTP backend will pass to op implementation function the following:
+ * output(s), input(s), parameter(s),
+ * outputPerChannelScale(s), inputPerChannelScale(s), paramPerChannelScale(s)
+ *
+ * optimization rules can be used to remove extra perChannelScale tensors
+ *
+ * syntax: LIST_PACKAGE_PER_CHANNEL_QUANTIZED_OPS(...)
+ * e.g. LIST_PACKAGE_PER_CHANNEL_QUANTIZED_OPS(sg_op1Name, sg_op2Name)
+ */
+
+// LIST_PACKAGE_PER_CHANNEL_QUANTIZED_OPS()
+
+/*
+* Declare and define the special initialize function for HTP Backend to load
+*/
+INIT_PKG_CORE_INIT_FUNC()
+
+/* op package APIs */
+
+Qnn_ErrorHandle_t GgmlOpPackageInit(QnnOpPackage_GlobalInfrastructure_t infrastructure) {
+    if (sg_packageInitialized) return QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED;
+
+    /*
+     * op parameter order registration
+     * registers all defined op parameter orders in the package
+     * syntax: REGISTER_PACKAGE_PARAM_ORDERS()
+     */
+    REGISTER_PACKAGE_PARAM_ORDERS()
+
+    /*
+     * op axis parameter name registration
+     * registers all axis parameter names in the package
+     * used with LIST_PACKAGE_AXIS_PARAMS(...)
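+     * note: this package lists no axis parameters (LIST_PACKAGE_AXIS_PARAMS is left commented
+     * out above), so this registration should be a no-op here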
+ * syntax: REGISTER_PACKAGE_AXIS_PARAMS() + */ + REGISTER_PACKAGE_AXIS_PARAMS() + + /* + * per-channel scale op name registration + * registers all per-channel scale op names in the package + * used with LIST_PACKAGE_PER_CHANNEL_QUANTIZED_OPS(...) + * syntax: REGISTER_PACKAGE_PER_CHANNEL_QUANTIZED_OPS() + */ + REGISTER_PACKAGE_PER_CHANNEL_QUANTIZED_OPS() + + sg_globalInfra = infrastructure; + sg_packageInitialized = true; + return QNN_SUCCESS; +} + +Qnn_ErrorHandle_t GgmlOpPackageGetInfo(const QnnOpPackage_Info_t** info) { + if (!sg_packageInitialized) return QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED; + if (!info) return QNN_OP_PACKAGE_ERROR_INVALID_INFO; + + sg_packageInfo = QNN_OP_PACKAGE_INFO_INIT; + sg_packageInfo.packageName = sg_packageName; + sg_packageInfo.operationNames = sg_opNames.data(); + sg_packageInfo.numOperations = sg_opNames.size(); + sg_packageInfo.sdkBuildId = QNN_SDK_BUILD_ID; + sg_packageInfo.sdkApiVersion = &sg_sdkApiVersion; + + *info = &sg_packageInfo; + return QNN_SUCCESS; +} + +Qnn_ErrorHandle_t GgmlOpPackageLogInitialize(QnnLog_Callback_t callback, QnnLog_Level_t maxLogLevel) { + if (sg_logInitialized) return QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED; + if (!callback) return QNN_LOG_ERROR_INVALID_ARGUMENT; + if (maxLogLevel < QNN_LOG_LEVEL_ERROR) return QNN_LOG_ERROR_INVALID_ARGUMENT; + sg_logCallback = callback; + sg_maxLogLevel = maxLogLevel; + sg_logInitialized = true; + return QNN_SUCCESS; +} + +Qnn_ErrorHandle_t GgmlOpPackageLogSetLevel(QnnLog_Level_t maxLogLevel) { + if (maxLogLevel < QNN_LOG_LEVEL_ERROR) return QNN_LOG_ERROR_INVALID_ARGUMENT; + sg_maxLogLevel = maxLogLevel; + return QNN_SUCCESS; +} + +Qnn_ErrorHandle_t GgmlOpPackageLogTerminate() { + if (!sg_logInitialized) return QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED; + sg_logCallback = nullptr; + sg_maxLogLevel = (QnnLog_Level_t)0; + sg_logInitialized = false; + return QNN_SUCCESS; +} + +Qnn_ErrorHandle_t GgmlOpPackageValidateOpConfig (Qnn_OpConfig_t opConfig){ + if (std::string(sg_packageName) != opConfig.v1.packageName) { + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + + /* auto-generated validation code below + * Check if op config type matches any registered ops + * If a match is found, check number of inputs, outputs and params + */ + if (std::string(opConfig.v1.typeName) == "GgmlMulMat"){ + if (opConfig.v1.numOfParams != 0 || opConfig.v1.numOfInputs != 2 || opConfig.v1.numOfOutputs != 1){ + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + } + else{ + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + + /* + * additional validation code here + * */ + + return QNN_SUCCESS; +} + +/* The following three functions in this comment are not called by HTP backend for now, + * no auto-generated implementations are created. Users should see example for full function signatures. 
+ * (version 1.3.0) Qnn_ErrorHandle_t GgmlOpPackageCreateKernels (QnnOpPackage_GraphInfrastructure_t + * graphInfrastructure, QnnOpPackage_Node_t node, QnnOpPackage_Kernel_t** kernels, uint32_t* + * numKernels) + * (version 1.3.0) Qnn_ErrorHandle_t GgmlOpPackageFreeKernels (QnnOpPackage_Kernel_t* kernels) + * + * (version 1.4.0) Qnn_ErrorHandle_t GgmlOpPackageCreateOpImpl (QnnOpPackage_GraphInfrastructure_t + * graphInfrastructure, QnnOpPackage_Node_t node, QnnOpPackage_OpImpl_t* opImpl) + *(version 1.4.0) Qnn_ErrorHandle_t GgmlOpPackageFreeOpImpl (QnnOpPackage_OpImpl_t opImpl) + */ + +Qnn_ErrorHandle_t GgmlOpPackageTerminate() { +if (!sg_packageInitialized) return QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED; + +sg_globalInfra = nullptr; +sg_packageInitialized = false; +return QNN_SUCCESS; +} + +#ifdef __cplusplus +extern "C" { +#endif + + +/* latest version */ +Qnn_ErrorHandle_t GgmlOpPackageInterfaceProvider(QnnOpPackage_Interface_t* interface) { + if (!interface) return QNN_OP_PACKAGE_ERROR_INVALID_ARGUMENT; + interface->interfaceVersion = {1, 4, 0}; + interface->v1_4.init = GgmlOpPackageInit; + interface->v1_4.terminate = GgmlOpPackageTerminate; + interface->v1_4.getInfo = GgmlOpPackageGetInfo; + interface->v1_4.validateOpConfig = GgmlOpPackageValidateOpConfig; + interface->v1_4.createOpImpl = nullptr; + interface->v1_4.freeOpImpl = nullptr; + interface->v1_4.logInitialize = GgmlOpPackageLogInitialize; + interface->v1_4.logSetLevel = GgmlOpPackageLogSetLevel; + interface->v1_4.logTerminate = GgmlOpPackageLogTerminate; + return QNN_SUCCESS; +} + +#ifdef __cplusplus +} +#endif + + diff --git a/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/src/ops/GgmlMulMat.cpp b/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/src/ops/GgmlMulMat.cpp new file mode 100644 index 0000000000000..137522cc80773 --- /dev/null +++ b/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/src/ops/GgmlMulMat.cpp @@ -0,0 +1,213 @@ +//============================================================================== +// Auto Generated Code for GgmlOpPackage +//============================================================================== + +#include "HTP/core/constraints.h" +#include "HTP/core/op_package_feature_support.h" +#include "HTP/core/op_register_ext.h" +#include "HTP/core/optimize.h" +#include "HTP/core/simple_reg.h" +#include "QnnOpPackage.h" + +BEGIN_PKG_OP_DEFINITION(PKG_GgmlMulMat); + +// op execute function declarations +template +GraphStatus ggmlmulmatImpl(TensorType & out_0, const TensorType & in_0, const TensorType & in_1); + +// forward declaration of sample cost function +static float ggmlmulmatCostFunc(const Op * op); + +/* + * method 1 for defining op, using default cost value (i.e. GLACIAL) and default flag (Flags::RESOURCE_HVX) + * syntax: DEF_PACKAGE_OP(F,OP) + * e.g. DEF_PACKAGE_OP((ggmlmulmatImpl), "GgmlMulMat") + */ +DEF_PACKAGE_OP((ggmlmulmatImpl), "GgmlMulMat") + +/* + * method 2 for defining op with specified cost value (one of GLACIAL, SNAIL, FAST, FREE) + * and provided flags + * syntax: DEF_PACKAGE_OP_AND_COST_AND_FLAGS(F,OP,COST,...) + * can use zero or more flags, FLAG options are IS_CONST, INHIBIT_CONST_PROP, + * RESOURCE_HVX, RESOURCE_HMX(not supported in external op packages) + * e.g. DEF_PACKAGE_OP_AND_COST_AND_FLAGS((ggmlmulmatImpl), "GgmlMulMat", SNAIL) + */ + +/* + * method 3 for defining op with cost function pointer and provided flags + * cost function pointer type: typedef float (*cost_function) (const Op * op); + * syntax: DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS(F,OP,COST_F,...) + * e.g. 
DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS((ggmlmulmatImpl), + * "GgmlMulMat", ggmlmulmatCostFunc, Flags::RESOURCE_HVX) + */ + +/* + * optimization definitions + * need to be global in the package + * one definition per optimization + * syntax: DEF_PACKAGE_OPTIMIZATION(PRIORITY,MATCHCODE,CONSTRAINTCODE,REPLACECODE) + * PRIORITY predefined values include EARLY(2000), MIDDLE(3000), LATE(4000) + * HTP core provides some replacement functions for op package to use + * for more information about optimization rules, please refer to HTP core documentations + */ + +/* + * op parameter order definitions + * need to be global in the package + * one definition per op, and this is optional + * syntax: DEF_PACKAGE_PARAM_ORDER(OP,PARAM1,MANDATORY1,DEFAULT1,PARAM2,MANDATORY2,DEFAULT2...) + * one or more parameters can be specified for each op + * order of parameters listed determines the order of parameters passed into op execution functions + * if an op does not have a parameter order definition, parameter order passed into Qnn_addNode + * will be passed into op execution functions + * if an op has a parameter order definition, any parameter passed into Qnn_addNode with unlisted + * name will be abandoned + * if two or more op packages with the same package name will be registered, they cannot list + * conflicting parameter orders + * PARAM refers to parameter name as a string literal + * MANDATORY refers to whether this parameter is required to be provided at Qnn_addNode + * DEFAULT is used when MANDATORY is false + * if provided as Qnn_Param_t*, + * DEFAULT will be used for graph construction when this parameter is not provided at + * Qnn_addNode + * if provided as nullptr, + * graph construction will skip this parameter when this parameter is not provided at + * Qnn_addNode + */ + +namespace { + +constexpr const size_t kBytesPerVector = sizeof(HVX_Vector); // 128 for v73 +constexpr const size_t kFloatsPerVector = kBytesPerVector / sizeof(float); +constexpr const size_t kAlignMask = kBytesPerVector - 1; + +inline size_t unaligned_bytes(const void * addr) { + return ((size_t) addr) & kAlignMask; +} + +inline bool is_addr_aligned(void * addr) { + return unaligned_bytes(addr) == 0; +} + +inline float vec_dot_product_f32(const float * src0, const float * src1, size_t count) { + HVX_Vector * iptr0 = ((HVX_Vector *) src0); + HVX_Vector * iptr0_end = ((HVX_Vector *) src0) + (count / kFloatsPerVector); + HVX_Vector * iptr1 = ((HVX_Vector *) src1); + HVX_Vector prev0 = *iptr0++; + HVX_Vector prev1 = *iptr1++; + HVX_Vector sum = Q6_V_vzero(); + + // TODO: prefetch? + while (iptr0 < iptr0_end) { + HVX_Vector curr0 = *iptr0++; + HVX_Vector curr1 = *iptr1++; + HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); + HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); + sum = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_Vqf32_vmpy_VsfVsf(s0, s1), sum); + prev0 = curr0; + prev1 = curr1; + } + + if ((iptr0_end - ((HVX_Vector *) src0)) > 0) { + // handle the last vector + // see also: https://github.com/UbiquitousLearning/mllm/blob/babf4410352ce8730824c87699c025a0d4ce3a6f/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/src/ops/LLaMAMul.cpp#L147 + HVX_Vector curr0 = is_addr_aligned(iptr0) ? prev0 : *iptr0++; + HVX_Vector curr1 = is_addr_aligned(iptr1) ? 
prev1 : *iptr1++; + HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); + HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); + sum = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_Vqf32_vmpy_VsfVsf(s0, s1), sum); + prev0 = curr0; + prev1 = curr1; + } + + const size_t leftover = count % kFloatsPerVector; + const size_t leftover_bytes = leftover * sizeof(float); + if (leftover > 0) { + // handle the leftover elements + HVX_Vector curr0 = (leftover_bytes + unaligned_bytes(iptr0) > kBytesPerVector) ? *iptr0 : prev0; + curr0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); + + HVX_Vector curr1 = (leftover_bytes + unaligned_bytes(iptr1) > kBytesPerVector) ? *iptr1 : prev1; + curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); + + sum = Q6_Vqf32_vadd_Vqf32Vqf32( + Q6_V_valign_VVR(Q6_Vqf32_vmpy_VsfVsf(curr0, curr1), Q6_V_vzero(), leftover_bytes), sum); + } + + // TODO: do we have a better way to do the reduction? + for (size_t i = kFloatsPerVector / 2; i > 0; i /= 2) { + sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum, Q6_V_vror_VR(sum, i * sizeof(float))); + } + + float result; + q6op_vstu_variable_ARV(&result, sizeof(float), Q6_Vsf_equals_Vqf32(sum)); + return result; +} + +template +inline GraphStatus mul_mat_2d_f32(TensorType & out_0, const TensorType & in_0, const TensorType & in_1) { + // TODO: handle strides? + if (in_1.dim(1) != in_0.dim(1)) { + return GraphStatus::ErrorDimensions; + } + + size_t dims[4] = { in_1.dim(0), in_0.dim(0) }; + out_0.set_dims(dims); + + auto in0_ptr = (float *) in_0.raw_data_const(); + auto in1_ptr = (float *) in_1.raw_data_const(); + auto out_ptr = (float *) out_0.raw_data(); + + for (size_t i = 0; i < dims[0]; i++) { + // TODO: prefetch? + auto * in1_row = in1_ptr + i * in_1.dim(1); + auto * out_row = out_ptr + i * dims[1]; + for (size_t j = 0; j < dims[1]; j++) { + *out_row++ = vec_dot_product_f32(in0_ptr + j * in_0.dim(1), in1_row, in_0.dim(1)); + } + } + + return GraphStatus::Success; +} + +} // namespace + +/* execute functions for ops */ + +template +GraphStatus ggmlmulmatImpl(TensorType & out_0, const TensorType & in_0, const TensorType & in_1) { + if (!in_0.raw_data_const() || !in_1.raw_data_const() || !out_0.raw_data()) { + return GraphStatus::ErrorBadInput; + } + + if (in_0.rank() != in_1.rank()) { + return GraphStatus::ErrorRank; + } + + auto rank = in_0.rank(); + switch (rank) { + case 4: + case 3: + // TODO: add implementation + return GraphStatus::ErrorUnsupported; + case 2: + return mul_mat_2d_f32(out_0, in_0, in_1); + } + + return GraphStatus::ErrorRank; +} + +__attribute__((unused)) static float ggmlmulmatCostFunc(const Op * op) { + /* + * add code here + * */ + + float cost = 0.0; // add cost computation here + return cost; +} + +/* At the bottom of the op file, call END_PKG_OP_DEFINITION(), + where is as BEGIN_PKG_OP_DEFINITION +*/ +END_PKG_OP_DEFINITION(PKG_GgmlMulMat); diff --git a/ggml/src/ggml-qnn/logger.cpp b/ggml/src/ggml-qnn/qnn/logger.cpp similarity index 100% rename from ggml/src/ggml-qnn/logger.cpp rename to ggml/src/ggml-qnn/qnn/logger.cpp diff --git a/ggml/src/ggml-qnn/logger.hpp b/ggml/src/ggml-qnn/qnn/logger.hpp similarity index 100% rename from ggml/src/ggml-qnn/logger.hpp rename to ggml/src/ggml-qnn/qnn/logger.hpp diff --git a/ggml/src/ggml-qnn/op-config-base.hpp b/ggml/src/ggml-qnn/qnn/op-config-base.hpp similarity index 98% rename from ggml/src/ggml-qnn/op-config-base.hpp rename to ggml/src/ggml-qnn/qnn/op-config-base.hpp index 87ca798272058..c2370000b235d 100644 --- a/ggml/src/ggml-qnn/op-config-base.hpp +++ 
b/ggml/src/ggml-qnn/qnn/op-config-base.hpp @@ -3,6 +3,7 @@ #include #include +#include "common.hpp" #include "ggml-qnn.h" #include "qnn-types.hpp" #include "tensor.hpp" @@ -60,7 +61,7 @@ class ggml_qnn_op_config { * @param graph_handle * @return true if tensors and nodes are successfully created, false otherwise. */ - virtual bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) = 0; + virtual bool initialize_op_nodes(backend_index_type device, Qnn_GraphHandle_t graph_handle) = 0; /** * @brief Pure virtual function to retrieve the input tensors. diff --git a/ggml/src/ggml-qnn/op-config-caps.cpp b/ggml/src/ggml-qnn/qnn/op-config-caps.cpp similarity index 95% rename from ggml/src/ggml-qnn/op-config-caps.cpp rename to ggml/src/ggml-qnn/qnn/op-config-caps.cpp index 6fd65aec08a79..d5b55eff970c9 100644 --- a/ggml/src/ggml-qnn/op-config-caps.cpp +++ b/ggml/src/ggml-qnn/qnn/op-config-caps.cpp @@ -224,18 +224,23 @@ static_assert(kOpCaps[GGML_OP_COUNT + GGML_UNARY_OP_GELU].qnn_op_name, static_assert(std::size(kOpCaps) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), "GGML_OP_COUNT does not match the size of the kOpCaps table"); -std::shared_ptr mat_mul_op_constructor(const ggml_tensor * op, - const std::string & instance_name, - std::shared_ptr qnn_instance) { - GGML_UNUSED(op); +std::shared_ptr mat_mul_op_constructor(const ggml_tensor * op, + const std::string & instance_name, + qnn::qnn_instance_ptr qnn_instance) { + if (qnn_instance->has_custom_op_package() && ggml_n_dims(op) == 2) { + QNN_LOG_DEBUG("create GgmlMulMat, name %s, use GgmlOpPackage\n", instance_name.c_str()); + return std::make_shared(instance_name, "GgmlOpPackage", "GgmlMulMat", + qnn_instance); + } + QNN_LOG_DEBUG("create QNN_OP_MAT_MUL, name %s\n", instance_name.c_str()); return std::make_shared(instance_name, qnn_instance); } template -std::shared_ptr generic_op_constructor(const ggml_tensor * op, - const std::string & instance_name, - std::shared_ptr qnn_instance) { +std::shared_ptr generic_op_constructor(const ggml_tensor * op, + const std::string & instance_name, + qnn::qnn_instance_ptr qnn_instance) { GGML_UNUSED(op); static_assert(_op < std::size(kOpCaps)); static_assert(kOpCaps[_op].qnn_op_name != nullptr); @@ -251,8 +256,9 @@ void add_type_parameters(std::shared_ptr op, const } template -std::shared_ptr op_constructor_with_type_param( - const ggml_tensor * op, const std::string & instance_name, std::shared_ptr qnn_instance) { +std::shared_ptr op_constructor_with_type_param(const ggml_tensor * op, + const std::string & instance_name, + qnn::qnn_instance_ptr qnn_instance) { static_assert(std::is_base_of::value); static_assert(_op < std::size(kOpCaps)); diff --git a/ggml/src/ggml-qnn/op-config-impl.cpp b/ggml/src/ggml-qnn/qnn/op-config-impl.cpp similarity index 94% rename from ggml/src/ggml-qnn/op-config-impl.cpp rename to ggml/src/ggml-qnn/qnn/op-config-impl.cpp index b85f14504573a..e546da4929c77 100644 --- a/ggml/src/ggml-qnn/op-config-impl.cpp +++ b/ggml/src/ggml-qnn/qnn/op-config-impl.cpp @@ -48,7 +48,7 @@ void ggml_qnn_op_config_base::add_scalar_param(const std::string & name, const Q bool ggml_qnn_op_config_base::add_tensor_param(const std::string & name, const qnn_dimension_array_t & dimensions, int rank, const uint8_t * data, const Qnn_DataType_t data_type, - QNNBackend device, Qnn_GraphHandle_t graph_handle) { + backend_index_type device, Qnn_GraphHandle_t graph_handle) { std::string tensor_name = _name + name + std::to_string(_tensor_parameters.size()); auto param_tensor = 
std::make_shared(ggml_qnn_tensor::PARAMETER, tensor_name, dimensions, data_type, rank, device, graph_handle, _qnn_instance); @@ -131,7 +131,8 @@ bool ggml_qnn_op_config_base::add_op_to_graph(Qnn_GraphHandle_t graph_handle) { auto qnn_interface = _qnn_instance->get_qnn_interface(); auto error = qnn_interface->qnn_graph_add_node(graph_handle, get_op_config()); if (error != QNN_SUCCESS) { - QNN_LOG_ERROR("[%s]qnn_graph_add_node.error: %s\n", _name.c_str(), get_qnn_error_string(error)); + QNN_LOG_ERROR("[%s][%s][%s]qnn_graph_add_node.error: %s\n", _name.c_str(), _package_name.c_str(), + _op_type.c_str(), get_qnn_error_string(error)); return false; } @@ -183,13 +184,13 @@ Qnn_OpConfig_t ggml_qnn_op_config_base::get_op_config() { return config; } -bool ggml_qnn_single_op_config::initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) { +bool ggml_qnn_single_op_config::initialize_op_nodes(backend_index_type device, Qnn_GraphHandle_t graph_handle) { GGML_UNUSED(device); GGML_UNUSED(graph_handle); return true; } -bool ggml_qnn_rmsnorm_op_config::initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) { +bool ggml_qnn_rmsnorm_op_config::initialize_op_nodes(backend_index_type device, Qnn_GraphHandle_t graph_handle) { constexpr const uint32_t kAxes[] = { 0 }; add_tensor_param(QNN_OP_RMS_NORM_PARAM_AXES, { 1 }, 1, reinterpret_cast(kAxes), QNN_DATATYPE_UINT_32, device, graph_handle); @@ -220,7 +221,7 @@ bool ggml_qnn_aggregate_op_config::bind_output_tensors(const ggml_tensor_array_t return qnn::bind_tensors(tensor_outputs, _tensor_outputs); } -bool ggml_qnn_matmul_op_config::initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) { +bool ggml_qnn_matmul_op_config::initialize_op_nodes(backend_index_type device, Qnn_GraphHandle_t graph_handle) { GGML_ASSERT(_tensor_inputs.size() == 2); GGML_ASSERT(_tensor_outputs.size() == 1); @@ -251,8 +252,9 @@ bool ggml_qnn_matmul_op_config::initialize_op_nodes(QNNBackend device, Qnn_Graph return true; } -qnn_tensor_ptr_t ggml_qnn_matmul_op_config::create_gather_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, - const int rank, qnn_tensor_ptr_t tensor_input, +qnn_tensor_ptr_t ggml_qnn_matmul_op_config::create_gather_nodes(backend_index_type device, + Qnn_GraphHandle_t graph_handle, const int rank, + qnn_tensor_ptr_t tensor_input, qnn_dimension_array_t output_dimensions) { if (rank <= 2) { return tensor_input; @@ -270,7 +272,7 @@ qnn_tensor_ptr_t ggml_qnn_matmul_op_config::create_gather_nodes(QNNBackend devic // create concat nodes, to convert tensor shape from [ne03, ne02, n, k] to [ne03 * x, ne02 * y, n, k] constexpr const auto create_node = [](const std::string & name, const int rank, const int axis, const qnn_dimension_array_t & dimensions, - qnn_tensor_ptr_t tensor_input, QNNBackend device, Qnn_GraphHandle_t graph_handle, + qnn_tensor_ptr_t tensor_input, backend_index_type device, Qnn_GraphHandle_t graph_handle, qnn_instance_ptr qnn_instance, qnn_tensor_ptr_t & tensor_output) -> qnn_op_config_ptr_t { auto gather_out = std::make_shared(ggml_qnn_tensor::INTERMEDIATE, name + "_out", dimensions, @@ -318,8 +320,8 @@ qnn_tensor_ptr_t ggml_qnn_matmul_op_config::create_gather_nodes(QNNBackend devic return gather1_out; } -Qnn_DataType_t ggml_qnn_matmul_op_config::create_input_convert_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, - const int rank, +Qnn_DataType_t ggml_qnn_matmul_op_config::create_input_convert_nodes(backend_index_type device, + Qnn_GraphHandle_t graph_handle, const int rank, qnn_tensor_array_t & 
tensor_inputs) { if (device == QNN_BACKEND_GPU) { // there's no convert op for GPU, so we should create matmul nodes directly. @@ -352,8 +354,8 @@ Qnn_DataType_t ggml_qnn_matmul_op_config::create_input_convert_nodes(QNNBackend return tensor_type; } -qnn_op_config_ptr_t ggml_qnn_matmul_op_config::create_output_convert_nodes(QNNBackend device, - Qnn_GraphHandle_t graph_handle, +qnn_op_config_ptr_t ggml_qnn_matmul_op_config::create_output_convert_nodes(backend_index_type device, + Qnn_GraphHandle_t graph_handle, const int rank, Qnn_DataType_t tensor_type, qnn_tensor_array_t & tensor_outputs) { GGML_ASSERT(tensor_outputs.size() == 1); diff --git a/ggml/src/ggml-qnn/op-config-impl.hpp b/ggml/src/ggml-qnn/qnn/op-config-impl.hpp similarity index 83% rename from ggml/src/ggml-qnn/op-config-impl.hpp rename to ggml/src/ggml-qnn/qnn/op-config-impl.hpp index 558b5cafbe4cb..36de66858acb6 100644 --- a/ggml/src/ggml-qnn/op-config-impl.hpp +++ b/ggml/src/ggml-qnn/qnn/op-config-impl.hpp @@ -23,7 +23,7 @@ class ggml_qnn_op_config_base : public ggml_qnn_op_config { void add_scalar_param(const std::string & name, const Qnn_Scalar_t scalar); bool add_tensor_param(const std::string & name, const qnn_dimension_array_t & dimensions, int rank, - const uint8_t * data, const Qnn_DataType_t data_type, QNNBackend device, + const uint8_t * data, const Qnn_DataType_t data_type, backend_index_type device, Qnn_GraphHandle_t graph_handle); void set_input_tensors(qnn::qnn_tensor_array_t & tensor_inputs) override; @@ -65,7 +65,7 @@ class ggml_qnn_single_op_config : public ggml_qnn_op_config_base { const std::string & op_type, qnn_instance_ptr qnn_instance) : ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {} - bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) override; + bool initialize_op_nodes(backend_index_type device, Qnn_GraphHandle_t graph_handle) override; private: DISABLE_COPY(ggml_qnn_single_op_config); @@ -78,7 +78,7 @@ class ggml_qnn_rmsnorm_op_config : public ggml_qnn_op_config_base { const std::string & op_type, qnn_instance_ptr qnn_instance) : ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {} - bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) override; + bool initialize_op_nodes(backend_index_type device, Qnn_GraphHandle_t graph_handle) override; private: DISABLE_COPY(ggml_qnn_rmsnorm_op_config); @@ -143,15 +143,16 @@ class ggml_qnn_matmul_op_config : public ggml_qnn_aggregate_op_config { ggml_qnn_matmul_op_config(const std::string & name, qnn_instance_ptr qnn_instance) : ggml_qnn_aggregate_op_config(name, qnn_instance) {} - bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) override; + bool initialize_op_nodes(backend_index_type device, Qnn_GraphHandle_t graph_handle) override; private: - qnn_tensor_ptr_t create_gather_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, - qnn_tensor_ptr_t tensor_input, qnn_dimension_array_t output_dimensions); - Qnn_DataType_t create_input_convert_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, - qnn_tensor_array_t & tensor_inputs); - qnn_op_config_ptr_t create_output_convert_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, - Qnn_DataType_t tensor_type, qnn_tensor_array_t & tensor_outputs); + qnn_tensor_ptr_t create_gather_nodes(backend_index_type device, Qnn_GraphHandle_t graph_handle, const int rank, + qnn_tensor_ptr_t tensor_input, qnn_dimension_array_t output_dimensions); + Qnn_DataType_t 
create_input_convert_nodes(backend_index_type device, Qnn_GraphHandle_t graph_handle, const int rank, + qnn_tensor_array_t & tensor_inputs); + qnn_op_config_ptr_t create_output_convert_nodes(backend_index_type device, Qnn_GraphHandle_t graph_handle, + const int rank, Qnn_DataType_t tensor_type, + qnn_tensor_array_t & tensor_outputs); bool create_mat_mul_nodes(qnn_tensor_array_t & tensor_inputs, qnn_tensor_array_t & tensor_outputs); DISABLE_COPY(ggml_qnn_matmul_op_config); diff --git a/ggml/src/ggml-qnn/op-config.hpp b/ggml/src/ggml-qnn/qnn/op-config.hpp similarity index 100% rename from ggml/src/ggml-qnn/op-config.hpp rename to ggml/src/ggml-qnn/qnn/op-config.hpp diff --git a/ggml/src/ggml-qnn/profiler.cpp b/ggml/src/ggml-qnn/qnn/profiler.cpp similarity index 100% rename from ggml/src/ggml-qnn/profiler.cpp rename to ggml/src/ggml-qnn/qnn/profiler.cpp diff --git a/ggml/src/ggml-qnn/profiler.hpp b/ggml/src/ggml-qnn/qnn/profiler.hpp similarity index 100% rename from ggml/src/ggml-qnn/profiler.hpp rename to ggml/src/ggml-qnn/qnn/profiler.hpp diff --git a/ggml/src/ggml-qnn/qnn-lib.cpp b/ggml/src/ggml-qnn/qnn/qnn-lib.cpp similarity index 69% rename from ggml/src/ggml-qnn/qnn-lib.cpp rename to ggml/src/ggml-qnn/qnn/qnn-lib.cpp index 2ec76939c9e2e..12e94aaac747c 100644 --- a/ggml/src/ggml-qnn/qnn-lib.cpp +++ b/ggml/src/ggml-qnn/qnn/qnn-lib.cpp @@ -3,6 +3,9 @@ #include +#include "common.hpp" +#include "rpc-mem.hpp" + #if defined(__linux__) # include #endif @@ -10,19 +13,23 @@ namespace { #ifdef _WIN32 -constexpr const char * kQnnSystemLibName = "QnnSystem.dll"; -constexpr const char * kQnnRpcLibName = "libcdsprpc.dll"; -constexpr const char * kQnnCpuLibName = "QnnCpu.dll"; -constexpr const char * kQnnGpuLibName = "QnnGpu.dll"; -constexpr const char * kQnnNpuLibName = "QnnHtp.dll"; +# define PLATFORM_LIB_FILENAME(name) (name ".dll") +#else +# define PLATFORM_LIB_FILENAME(name) ("lib" name ".so") +#endif + +#if defined(__aarch64__) || defined(_M_ARM64) // TODO: check for other platforms +# define PLATFORM_LIB_POSFIX "_aarch64" #else -constexpr const char * kQnnSystemLibName = "libQnnSystem.so"; -constexpr const char * kQnnRpcLibName = "libcdsprpc.so"; -constexpr const char * kQnnCpuLibName = "libQnnCpu.so"; -constexpr const char * kQnnGpuLibName = "libQnnGpu.so"; -constexpr const char * kQnnNpuLibName = "libQnnHtp.so"; +# define PLATFORM_LIB_POSFIX "_x64" #endif +constexpr const char * kQnnSystemLibName = PLATFORM_LIB_FILENAME("QnnSystem"); +constexpr const char * kQnnCpuLibName = PLATFORM_LIB_FILENAME("QnnCpu"); +constexpr const char * kQnnGpuLibName = PLATFORM_LIB_FILENAME("QnnGpu"); +constexpr const char * kQnnNpuLibName = PLATFORM_LIB_FILENAME("QnnHtp"); +constexpr const char * kQnnCpuPackageLibName = PLATFORM_LIB_FILENAME("QnnGgmlOpPackage" PLATFORM_LIB_POSFIX); + constexpr const qnn::device_caps kDeviceCaps[] = { { // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/CpuOpDefSupplement.html#matmul @@ -46,8 +53,8 @@ constexpr const qnn::device_caps kDeviceCaps[] = { }, }; -static_assert(sizeof(kDeviceCaps) / sizeof(kDeviceCaps[0]) == GGML_QNN_MAX_DEVICES, - "The number of qnn devices should be equal to GGML_QNN_MAX_DEVICES"); +static_assert(sizeof(kDeviceCaps) / sizeof(kDeviceCaps[0]) == QNN_BACKEND_COUNT, + "The number of qnn devices should be equal to QNN_BACKEND_COUNT"); static_assert(kDeviceCaps[QNN_BACKEND_NPU].type == GGML_BACKEND_DEVICE_TYPE_ACCEL, "The NPU device should be an accelerator device"); static_assert(kDeviceCaps[QNN_BACKEND_GPU].type == 
GGML_BACKEND_DEVICE_TYPE_GPU, @@ -102,23 +109,67 @@ bool set_qnn_lib_search_path(const std::string & custom_lib_search_path) { return true; } -qnn::dl_handler_t load_lib_with_fallback(const std::string & lib_path, const std::string & load_directory) { +common::dl_handler_t load_lib_with_fallback(const std::string & lib_path, const std::string & load_directory) { std::filesystem::path full_path(load_directory); full_path /= std::filesystem::path(lib_path).filename(); - auto handle = qnn::dl_load(full_path.string()); + auto handle = common::dl_load(full_path.string()); if (!handle) { QNN_LOG_WARN("failed to load %s, fallback to %s\n", full_path.c_str(), lib_path.c_str()); - handle = qnn::dl_load(lib_path); + handle = common::dl_load(lib_path); } return handle; } +struct op_package_lib_info { + const char * lib_name; + const char * interface; + const char * type; + size_t htp_arch; + const char * extra_lib_name = nullptr; +}; + +const op_package_lib_info & get_op_package_lib_info(uint32_t soc_model, size_t htp_arch) { + constexpr static const op_package_lib_info kOpPackageLibInfo[] = { + { kQnnCpuPackageLibName, "GgmlOpPackageInterfaceProvider", "CPU", qnn::NONE, + PLATFORM_LIB_FILENAME("HtpPrepare") }, + { PLATFORM_LIB_FILENAME("QnnGgmlOpPackage_v68"), "GgmlOpPackageInterfaceProvider", "HTP", qnn::V68 }, + { PLATFORM_LIB_FILENAME("QnnGgmlOpPackage_v69"), "GgmlOpPackageInterfaceProvider", "HTP", qnn::V69 }, + { PLATFORM_LIB_FILENAME("QnnGgmlOpPackage_v73"), "GgmlOpPackageInterfaceProvider", "HTP", qnn::V73 }, + { PLATFORM_LIB_FILENAME("QnnGgmlOpPackage_v75"), "GgmlOpPackageInterfaceProvider", "HTP", qnn::V75 }, + { PLATFORM_LIB_FILENAME("QnnGgmlOpPackage_v79"), "GgmlOpPackageInterfaceProvider", "HTP", qnn::V79 }, + }; + + if (soc_model == qnn::UNKNOWN || soc_model == qnn::EMULATOR_X64 || soc_model == qnn::EMULATOR_AARCH64) { + return kOpPackageLibInfo[0]; + } + + switch (htp_arch) { + case qnn::V68: + static_assert(kOpPackageLibInfo[1].htp_arch == qnn::V68); + return kOpPackageLibInfo[1]; + case qnn::V69: + static_assert(kOpPackageLibInfo[2].htp_arch == qnn::V69); + return kOpPackageLibInfo[2]; + case qnn::V73: + static_assert(kOpPackageLibInfo[3].htp_arch == qnn::V73); + return kOpPackageLibInfo[3]; + case qnn::V75: + static_assert(kOpPackageLibInfo[4].htp_arch == qnn::V75); + return kOpPackageLibInfo[4]; + case qnn::V79: + default: + static_assert(kOpPackageLibInfo[5].htp_arch == qnn::V79); + return kOpPackageLibInfo[5]; + } +} + } // namespace namespace qnn { -qnn_system_interface::qnn_system_interface(const QnnSystemInterface_t & qnn_sys_interface, dl_handler_t lib_handle) : +qnn_system_interface::qnn_system_interface(const QnnSystemInterface_t & qnn_sys_interface, + common::dl_handler_t lib_handle) : _qnn_sys_interface(qnn_sys_interface), _lib_handle(lib_handle) { qnn_system_context_create(&_qnn_system_handle); @@ -139,15 +190,16 @@ qnn_system_interface::~qnn_system_interface() { } if (_lib_handle) { - if (!dl_unload(_lib_handle)) { - QNN_LOG_WARN("failed to close QnnSystem library, error %s\n", dl_error()); + if (!common::dl_unload(_lib_handle)) { + QNN_LOG_WARN("failed to close QnnSystem library, error %s\n", common::dl_error()); } } else { QNN_LOG_WARN("system lib handle is null\n"); } } -qnn_instance::qnn_instance(const std::string & lib_path, QNNBackend device) : _additional_lib_load_path(lib_path) { +qnn_instance::qnn_instance(const std::string & lib_path, backend_index_type device) : + _additional_lib_load_path(lib_path) { _backend_lib_name = kDeviceCaps[device].lib_name; if 
(set_qnn_lib_search_path(lib_path)) { QNN_LOG_DEBUG("[%s] set_qnn_lib_search_path succeed\n", _backend_lib_name.c_str()); @@ -156,23 +208,23 @@ qnn_instance::qnn_instance(const std::string & lib_path, QNNBackend device) : _a } } -int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { +bool qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { BackendIdType backend_id = QNN_BACKEND_ID_NULL; QNN_LOG_DEBUG("enter qnn_init\n"); std::lock_guard lock(_init_mutex); if (load_system() != 0) { QNN_LOG_WARN("failed to load QNN system lib\n"); - return 1; + return false; } else { QNN_LOG_DEBUG("load QNN system lib successfully\n"); } std::string backend_lib_path = _backend_lib_name; if (_lib_path_to_backend_id.count(backend_lib_path) == 0) { - if (load_backend(backend_lib_path, saver_config) != 0) { + if (!load_backend(backend_lib_path, saver_config)) { QNN_LOG_WARN("failed to load QNN backend\n"); - return 2; + return false; } } @@ -182,15 +234,15 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { "library %s is loaded but loaded backend count=%zu, " "loaded lib_handle count=%zu", backend_lib_path.c_str(), _loaded_backend.count(backend_id), _loaded_lib_handle.count(backend_id)); - return 3; + return false; } _qnn_interface = std::make_shared(*_loaded_backend[backend_id]); _qnn_interface->qnn_log_create(qnn::sdk_logcallback, _qnn_log_level, &_qnn_log_handle); if (!_qnn_log_handle) { // NPU backend not work on Qualcomm SoC equipped low-end phone - QNN_LOG_WARN("why failed to initialize qnn log\n"); - return 4; + QNN_LOG_WARN("failed to initialize qnn log\n"); + return false; } else { QNN_LOG_DEBUG("initialize qnn log successfully\n"); } @@ -199,22 +251,23 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { _qnn_interface->qnn_backend_create( _qnn_log_handle, temp_backend_config.empty() ? 
nullptr : temp_backend_config.data(), &_qnn_backend_handle); if (!_qnn_backend_handle) { - QNN_LOG_WARN("why failed to initialize qnn backend\n"); - return 5; + QNN_LOG_WARN("failed to initialize qnn backend\n"); + return false; } else { QNN_LOG_DEBUG("initialize qnn backend successfully\n"); } auto qnn_status = _qnn_interface->qnn_property_has_capability(QNN_PROPERTY_GROUP_DEVICE); - if (QNN_PROPERTY_NOT_SUPPORTED == qnn_status) { - QNN_LOG_WARN("device property is not supported\n"); - } - if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnn_status) { - QNN_LOG_WARN("device property is not known to backend\n"); + switch (qnn_status) { + case QNN_PROPERTY_NOT_SUPPORTED: + QNN_LOG_WARN("device property is not supported\n"); + break; + case QNN_PROPERTY_ERROR_UNKNOWN_KEY: + QNN_LOG_WARN("device property is unknown\n"); + break; } - qnn_status = QNN_SUCCESS; - if (_backend_lib_name.find("Htp") != _backend_lib_name.npos) { + { const QnnDevice_PlatformInfo_t * p_info = nullptr; qnn_status = _qnn_interface->qnn_device_get_platform_info(nullptr, &p_info); if (qnn_status == QNN_SUCCESS) { @@ -243,57 +296,50 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { _qnn_interface->qnn_device_free_platform_info(nullptr, p_info); } else { // For emulator, we can't get platform info - QNN_LOG_WARN("failed to get platform info, are we in emulator?\n"); - _soc_info = { NONE, UNKNOWN_SM, 0 }; + QNN_LOG_INFO("failed to get platform info, emulator or cpu backend?\n"); +#if defined(__aarch64__) || defined(_M_ARM64) + _soc_info = { EMULATOR_AARCH64, NONE, 0 }; +#elif defined(__x86_64__) || defined(__amd64__) || defined(_M_X64) + _soc_info = { EMULATOR_X64, NONE, 0 }; +#else + _soc_info = { UNKNOWN, NONE, 0 }; +#endif } + } - QnnHtpDevice_CustomConfig_t soc_customconfig; - soc_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_SOC; - soc_customconfig.socModel = _soc_info.soc_model; - QnnDevice_Config_t soc_devconfig; - soc_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; - soc_devconfig.customConfig = &soc_customconfig; - - QnnHtpDevice_CustomConfig_t arch_customconfig; - arch_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_ARCH; - arch_customconfig.arch.arch = (QnnHtpDevice_Arch_t) _soc_info.htp_arch; - arch_customconfig.arch.deviceId = 0; // Id of device to be used. 0 will use by default. 
- QnnDevice_Config_t arch_devconfig; - arch_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; - arch_devconfig.customConfig = &arch_customconfig; - - const QnnDevice_Config_t * p_deviceconfig[] = { &soc_devconfig, &arch_devconfig, nullptr }; - qnn_status = _qnn_interface->qnn_device_create(_qnn_log_handle, p_deviceconfig, &_qnn_device_handle); - } else { + { qnn_status = _qnn_interface->qnn_device_create(_qnn_log_handle, nullptr, &_qnn_device_handle); + if (QNN_SUCCESS != qnn_status && QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnn_status) { + QNN_LOG_WARN("failed to create QNN device\n"); + } else { + QNN_LOG_INFO("create QNN device successfully\n"); + } } - if (QNN_SUCCESS != qnn_status && QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnn_status) { - QNN_LOG_WARN("failed to create QNN device\n"); - } else { - QNN_LOG_INFO("create QNN device successfully\n"); - } - - _rpc_lib_handle = load_lib_with_fallback(kQnnRpcLibName, _additional_lib_load_path); - if (_rpc_lib_handle) { - _pfn_rpc_mem_alloc = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_alloc")); - _pfn_rpc_mem_free = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_free")); - _pfn_rpc_mem_to_fd = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_to_fd")); - if (!_pfn_rpc_mem_alloc || !_pfn_rpc_mem_free || !_pfn_rpc_mem_to_fd) { - QNN_LOG_WARN("unable to access symbols in QNN RPC lib. error: %s\n", dl_error()); - dl_unload(_rpc_lib_handle); - return 9; + + { + auto rpc_mem = std::make_unique(); + if (rpc_mem->is_valid()) { + _rpc_mem = std::move(rpc_mem); } + } - _pfn_rpc_mem_init = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_init")); - _pfn_rpc_mem_deinit = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_deinit")); - if (_pfn_rpc_mem_init) { - _pfn_rpc_mem_init(); + { + auto & op_package_info = get_op_package_lib_info(_soc_info.soc_model, _soc_info.htp_arch); + if (op_package_info.extra_lib_name) { + _custom_op_extra_lib_handle = + load_lib_with_fallback(op_package_info.extra_lib_name, _additional_lib_load_path); } - _rpcmem_initialized = true; - QNN_LOG_DEBUG("load rpcmem lib successfully\n"); - } else { - QNN_LOG_WARN("failed to load qualcomm rpc lib, skipping, error:%s\n", dl_error()); + qnn_status = _qnn_interface->qnn_backend_register_op_package(_qnn_backend_handle, op_package_info.lib_name, + op_package_info.interface, op_package_info.type); + if (qnn_status != QNN_SUCCESS) { + QNN_LOG_WARN("failed to register op package %s, interface: %s, error: %s\n", op_package_info.lib_name, + op_package_info.interface, qnn::get_qnn_error_string(qnn_status)); + } else { + QNN_LOG_DEBUG("register op package %s successfully, ID %u\n", op_package_info.lib_name, + _qnn_interface->get_backend_id()); + _has_custom_op_package = true; + } } /* TODO: not used, keep it for further usage @@ -302,35 +348,14 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { const QnnContext_Config_t * context_configs[] = {&qnn_context_config, nullptr}; */ _qnn_interface->qnn_context_create(_qnn_backend_handle, _qnn_device_handle, nullptr, &_qnn_context_handle); - if (nullptr == _qnn_context_handle) { - QNN_LOG_WARN("why failed to initialize qnn context\n"); - return 10; + if (!_qnn_context_handle) { + QNN_LOG_WARN("failed to initialize qnn context\n"); + return false; } else { QNN_LOG_DEBUG("initialize qnn context successfully\n"); } if (_backend_lib_name.find("Htp") != _backend_lib_name.npos) { - // TODO: faster approach to probe the accurate capacity of rpc ion memory - size_t candidate_size = 0; - uint8_t * rpc_buffer = nullptr; - const int 
size_in_mb = (1 << 20); - size_t probe_slots[] = { 1024, 1536, 2048 - 48, 2048 }; - size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); - for (size_t idx = 0; idx < probe_counts; idx++) { - rpc_buffer = static_cast(alloc_rpcmem(probe_slots[idx] * size_in_mb, sizeof(void *))); - if (!rpc_buffer) { - QNN_LOG_DEBUG("alloc rpcmem %d (MB) failure, %s\n", (int) probe_slots[idx], strerror(errno)); - break; - } else { - candidate_size = probe_slots[idx]; - free_rpcmem(rpc_buffer); - rpc_buffer = nullptr; - } - } - - _rpcmem_capacity = std::max(candidate_size, _rpcmem_capacity); - QNN_LOG_INFO("capacity of QNN rpc ion memory is about %d MB\n", (int) _rpcmem_capacity); - if (init_htp_perfinfra() != 0) { QNN_LOG_WARN("initialize HTP performance failure\n"); } @@ -343,33 +368,16 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { } QNN_LOG_DEBUG("leave qnn_init\n"); - - return 0; + return true; } -int qnn_instance::qnn_finalize() { - int ret_status = 0; - Qnn_ErrorHandle_t error = QNN_SUCCESS; - - if (_rpc_lib_handle) { - if (_pfn_rpc_mem_deinit) { - _pfn_rpc_mem_deinit(); - _pfn_rpc_mem_deinit = nullptr; - } - - if (dl_unload(_rpc_lib_handle)) { - QNN_LOG_DEBUG("succeed to close rpcmem lib\n"); - } else { - QNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s\n", dl_error()); - } - } - +bool qnn_instance::qnn_finalize() { if (_backend_lib_name.find("Htp") != _backend_lib_name.npos) { _qnn_htp_perfinfra->destroyPowerConfigId(_qnn_power_configid); } if (_qnn_context_handle) { - error = _qnn_interface->qnn_context_free(_qnn_context_handle, nullptr); + auto error = _qnn_interface->qnn_context_free(_qnn_context_handle, nullptr); if (error != QNN_SUCCESS) { QNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(), (int) QNN_GET_ERROR_CODE(error)); @@ -378,7 +386,7 @@ int qnn_instance::qnn_finalize() { } if (_qnn_device_handle) { - error = _qnn_interface->qnn_device_free(_qnn_device_handle); + auto error = _qnn_interface->qnn_device_free(_qnn_device_handle); if (error != QNN_SUCCESS) { QNN_LOG_WARN("failed to free QNN device_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(), (int) QNN_GET_ERROR_CODE(error)); @@ -387,7 +395,7 @@ int qnn_instance::qnn_finalize() { } if (_qnn_backend_handle) { - error = _qnn_interface->qnn_backend_free(_qnn_backend_handle); + auto error = _qnn_interface->qnn_backend_free(_qnn_backend_handle); if (error != QNN_SUCCESS) { QNN_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(), (int) QNN_GET_ERROR_CODE(error)); @@ -396,7 +404,7 @@ int qnn_instance::qnn_finalize() { } if (_qnn_log_handle) { - error = _qnn_interface->qnn_log_free(_qnn_log_handle); + auto error = _qnn_interface->qnn_log_free(_qnn_log_handle); if (error != QNN_SUCCESS) { QNN_LOG_WARN("failed to free QNN log_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(), (int) QNN_GET_ERROR_CODE(error)); @@ -404,25 +412,31 @@ int qnn_instance::qnn_finalize() { _qnn_log_handle = nullptr; } + if (_custom_op_extra_lib_handle) { + common::dl_unload(_custom_op_extra_lib_handle); + } + unload_backend(); _qnn_sys_interface.reset(); - return ret_status; + _rpc_mem.reset(); + + return true; } int qnn_instance::load_system() { QNN_LOG_DEBUG("[%s]lib: %s\n", _backend_lib_name.c_str(), kQnnSystemLibName); auto system_lib_handle = load_lib_with_fallback(kQnnSystemLibName, _additional_lib_load_path); if (!system_lib_handle) { - QNN_LOG_WARN("can not load QNN library %s, error: %s\n", 
kQnnSystemLibName, dl_error()); + QNN_LOG_WARN("can not load QNN library %s, error: %s\n", kQnnSystemLibName, common::dl_error()); return 1; } - auto * get_providers = - dl_sym_typed(system_lib_handle, "QnnSystemInterface_getProviders"); + auto * get_providers = common::dl_sym_typed( + system_lib_handle, "QnnSystemInterface_getProviders"); if (!get_providers) { - QNN_LOG_WARN("can not load QNN symbol QnnSystemInterface_getProviders: %s\n", dl_error()); + QNN_LOG_WARN("can not load QNN symbol QnnSystemInterface_getProviders: %s\n", common::dl_error()); return 2; } @@ -473,38 +487,42 @@ int qnn_instance::load_system() { return 0; } -int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t ** /*saver_config*/) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; +bool qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t ** /*saver_config*/) { QNN_LOG_DEBUG("lib_path:%s\n", lib_path.c_str()); auto lib_handle = load_lib_with_fallback(lib_path, _additional_lib_load_path); if (!lib_handle) { - QNN_LOG_WARN("can not open QNN library %s, with error: %s\n", lib_path.c_str(), dl_error()); - return 1; + QNN_LOG_WARN("can not open QNN library %s, with error: %s\n", lib_path.c_str(), common::dl_error()); + return false; } - auto get_providers = dl_sym_typed(lib_handle, "QnnInterface_getProviders"); + auto get_providers = + common::dl_sym_typed(lib_handle, "QnnInterface_getProviders"); if (!get_providers) { - QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s\n", dl_error()); - return 2; + QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s\n", common::dl_error()); + common::dl_unload(lib_handle); + return false; } std::uint32_t num_providers = 0; const QnnInterface_t ** provider_list = nullptr; - error = get_providers(&provider_list, &num_providers); + auto error = get_providers(&provider_list, &num_providers); if (error != QNN_SUCCESS) { QNN_LOG_WARN("failed to get providers, error %d\n", (int) QNN_GET_ERROR_CODE(error)); - return 3; + common::dl_unload(lib_handle); + return false; } QNN_LOG_DEBUG("num_providers=%d\n", num_providers); if (num_providers != _required_num_providers) { QNN_LOG_WARN("providers is %d instead of required %d\n", num_providers, _required_num_providers); - return 4; + common::dl_unload(lib_handle); + return false; } if (!provider_list) { QNN_LOG_WARN("failed to get qnn interface providers\n"); - return 5; + common::dl_unload(lib_handle); + return false; } bool found_valid_interface = false; QNN_INTERFACE_VER_TYPE qnn_interface; @@ -519,7 +537,8 @@ int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t * if (!found_valid_interface) { QNN_LOG_WARN("unable to find a valid qnn interface\n"); - return 6; + common::dl_unload(lib_handle); + return false; } else { QNN_LOG_DEBUG("find a valid qnn interface\n"); } @@ -532,31 +551,29 @@ int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t * _loaded_backend[backend_id] = provider_list[0]; if (_loaded_lib_handle.count(backend_id) > 0) { QNN_LOG_WARN("closing %p\n", _loaded_lib_handle[backend_id]); - if (!dl_unload(_loaded_lib_handle[backend_id])) { - QNN_LOG_WARN("fail to close %p with error %s\n", _loaded_lib_handle[backend_id], dl_error()); + if (!common::dl_unload(_loaded_lib_handle[backend_id])) { + QNN_LOG_WARN("fail to close %p with error %s\n", _loaded_lib_handle[backend_id], common::dl_error()); } } _loaded_lib_handle[backend_id] = lib_handle; _backend_id = backend_id; - return 0; + return true; } -int 
qnn_instance::unload_backend() { +void qnn_instance::unload_backend() { for (auto & it : _loaded_lib_handle) { - if (!dl_unload(it.second)) { - QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, dl_error()); + if (!common::dl_unload(it.second)) { + QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, common::dl_error()); } } _loaded_lib_handle.clear(); _lib_path_to_backend_id.clear(); _loaded_backend.clear(); - - return 0; } -const device_caps & get_device_caps(QNNBackend device) { +const device_caps & get_device_caps(backend_index_type device) { return kDeviceCaps[device]; } diff --git a/ggml/src/ggml-qnn/qnn-lib.hpp b/ggml/src/ggml-qnn/qnn/qnn-lib.hpp similarity index 91% rename from ggml/src/ggml-qnn/qnn-lib.hpp rename to ggml/src/ggml-qnn/qnn/qnn-lib.hpp index 3d0084b868da8..2e7c9339aa60c 100644 --- a/ggml/src/ggml-qnn/qnn-lib.hpp +++ b/ggml/src/ggml-qnn/qnn/qnn-lib.hpp @@ -24,8 +24,9 @@ #include #include -#include "dl-loader.hpp" +#include "dyn-lib-loader.hpp" #include "qnn-types.hpp" +#include "rpc-mem.hpp" #include "utils.hpp" namespace qnn { @@ -48,7 +49,7 @@ class qnn_system_interface { } public: - qnn_system_interface(const QnnSystemInterface_t & qnn_sys_interface, dl_handler_t lib_handle); + qnn_system_interface(const QnnSystemInterface_t & qnn_sys_interface, common::dl_handler_t lib_handle); ~qnn_system_interface(); bool is_valid() const { return _qnn_system_handle != nullptr; } @@ -67,7 +68,7 @@ class qnn_system_interface { void operator=(qnn_system_interface &&) = delete; const QnnSystemInterface_t _qnn_sys_interface = {}; - dl_handler_t _lib_handle = nullptr; + common::dl_handler_t _lib_handle = nullptr; QnnSystemContext_Handle_t _qnn_system_handle = nullptr; }; @@ -152,12 +153,12 @@ class qnn_instance { public: using BackendIdType = decltype(QnnInterface_t{}.backendId); - explicit qnn_instance(const std::string & lib_path, QNNBackend device); + explicit qnn_instance(const std::string & lib_path, backend_index_type device); ~qnn_instance() {} - int qnn_init(const QnnSaver_Config_t ** saver_config); - int qnn_finalize(); + bool qnn_init(const QnnSaver_Config_t ** saver_config); + bool qnn_finalize(); qnn_interface_ptr get_qnn_interface() { if (!_qnn_interface) { @@ -277,18 +278,14 @@ class qnn_instance { std::string & get_qnn_graph_name() { return _graph_name; } - bool is_rpcmem_initialized() { return _rpcmem_initialized; } - - size_t get_rpcmem_capacity() { return _rpcmem_capacity; } - void * alloc_rpcmem(size_t bytes, size_t alignment) { - if (!_rpcmem_initialized) { + if (!_rpc_mem) { QNN_LOG_WARN("rpc memory not initialized\n"); return nullptr; } auto allocate_bytes = static_cast(bytes + alignment); - void * buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, (int) allocate_bytes); + void * buf = _rpc_mem->alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, (int) allocate_bytes); if (!buf) { QNN_LOG_WARN("failed to allocate rpc memory, size: %d MB\n", (int) (allocate_bytes / (1 << 20))); return nullptr; @@ -298,32 +295,34 @@ class qnn_instance { bool status = _rpcmem_store_map.insert(std::pair(aligned_buf, buf)).second; if (!status) { QNN_LOG_WARN("failed to allocate rpc memory\n"); - _pfn_rpc_mem_free(buf); + _rpc_mem->free(buf); } return aligned_buf; } void free_rpcmem(void * buf) { - if (!_rpcmem_initialized) { + if (!_rpc_mem) { QNN_LOG_WARN("rpc memory not initialized\n"); } else if (_rpcmem_store_map.count(buf) == 0) { QNN_LOG_WARN("no allocated tensor\n"); } else { - _pfn_rpc_mem_free(_rpcmem_store_map[buf]); + 
_rpc_mem->free(_rpcmem_store_map[buf]); _rpcmem_store_map.erase(buf); } } - int32_t rpcmem_to_fd(void * buf) { - int32_t mem_fd = -1; - if (!is_rpcmem_initialized()) { + int rpcmem_to_fd(void * buf) { + int fd = -1; + if (!_rpc_mem) { QNN_LOG_WARN("rpc memory not initialized\n"); + } else if (_rpcmem_store_map.count(buf) == 0) { + QNN_LOG_WARN("no allocated tensor\n"); } else { - mem_fd = _pfn_rpc_mem_to_fd(buf); + buf = _rpcmem_store_map[buf]; + fd = _rpc_mem->to_fd(buf); } - - return mem_fd; + return fd; } Qnn_MemHandle_t register_rpcmem(void * p_data, const uint32_t rank, uint32_t * dimensions, @@ -333,7 +332,7 @@ class qnn_instance { return nullptr; } - if (!is_rpcmem_initialized()) { + if (!_rpc_mem) { QNN_LOG_WARN("rpc memory not initialized\n"); return nullptr; } @@ -390,10 +389,12 @@ class qnn_instance { const qnn::qcom_socinfo & get_soc_info() { return _soc_info; } + bool has_custom_op_package() const { return _has_custom_op_package; } + private: - int load_system(); - int load_backend(std::string & lib_path, const QnnSaver_Config_t ** /*saver_config*/); - int unload_backend(); + int load_system(); + bool load_backend(std::string & lib_path, const QnnSaver_Config_t ** /*saver_config*/); + void unload_backend(); private: static constexpr const int _required_num_providers = 1; @@ -422,23 +423,19 @@ class qnn_instance { std::unordered_map _qnn_rpc_buffer_to_handles; std::mutex _init_mutex; - std::unordered_map _loaded_lib_handle; + std::unordered_map _loaded_lib_handle; std::unordered_map _lib_path_to_backend_id; std::unordered_map _loaded_backend; - dl_handler_t _rpc_lib_handle = nullptr; - std::atomic_bool _rpcmem_initialized{ false }; - qnn::pfn_rpc_mem_alloc _pfn_rpc_mem_alloc = nullptr; - qnn::pfn_rpc_mem_free _pfn_rpc_mem_free = nullptr; - qnn::pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd = nullptr; - qnn::pfn_rpc_mem_init _pfn_rpc_mem_init = nullptr; - qnn::pfn_rpc_mem_deinit _pfn_rpc_mem_deinit = nullptr; + std::unique_ptr _rpc_mem; std::unordered_map _rpcmem_store_map; - size_t _rpcmem_capacity = 512; std::string _graph_name; qnn::qcom_socinfo _soc_info = {}; + + bool _has_custom_op_package = false; + common::dl_handler_t _custom_op_extra_lib_handle = nullptr; }; using qnn_instance_ptr = std::shared_ptr; @@ -457,6 +454,6 @@ struct device_caps { size_t max_tensor_size_in_bytes; }; -const device_caps & get_device_caps(QNNBackend device); +const device_caps & get_device_caps(backend_index_type device); } // namespace qnn diff --git a/ggml/src/ggml-qnn/qnn/qnn-types.hpp b/ggml/src/ggml-qnn/qnn/qnn-types.hpp new file mode 100644 index 0000000000000..4fe3e9155b185 --- /dev/null +++ b/ggml/src/ggml-qnn/qnn/qnn-types.hpp @@ -0,0 +1,51 @@ + +#pragma once + +#include +#include +#include +#include +#include + +#include "common.hpp" + +namespace qnn { + +enum qcom_htp_arch { + NONE = 0, + V68 = 68, + V69 = 69, + V73 = 73, + V75 = 75, + V79 = 79, // SD 8 Gen 4 (SM8750) +}; + +enum qcom_chipset { + UNKNOWN = 0, + EMULATOR_X64 = 0xFF00, // x86_64 emulator + EMULATOR_AARCH64 = 0xFF01, // ARM64 emulator + SM8350 = 30, // v68, SD 888/888+ + SM8450 = 36, // v69, SD 8 Gen 1 + SA8295 = 39, // v68 + SM8475 = 42, // v69, SD 8+ Gen 1 + SM8550 = 43, // v73, SD 8 Gen 2 + SSG2115P = 46, // v73 + SM7675 = 70, // V73, SD 7+ Gen 3 + SM8635 = 68, // v73, SD 8s Gen 3 + SM8650 = 57, // v75, SD 8 Gen 3 + SM8750 = 69, // v79, SD 8 Gen 4 +}; + +struct qcom_socinfo { + uint32_t soc_model; + size_t htp_arch; + size_t vtcm_size_in_mb; +}; + +using pfn_qnnsaver_initialize = decltype(QnnSaver_initialize); +using 
pfn_qnninterface_getproviders = decltype(QnnInterface_getProviders); +using pfn_qnnsysteminterface_getproviders = decltype(QnnSystemInterface_getProviders); +} // namespace qnn + +#define RPCMEM_DEFAULT_FLAGS 1 +#define RPCMEM_HEAP_ID_SYSTEM 25 diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/qnn/tensor.hpp similarity index 98% rename from ggml/src/ggml-qnn/tensor.hpp rename to ggml/src/ggml-qnn/qnn/tensor.hpp index 608a80fcf5aaa..ef501135b5d86 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/qnn/tensor.hpp @@ -25,7 +25,7 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { explicit ggml_qnn_tensor(tensor_type_t tensor_type, const std::string & name, const qnn_dimension_array_t & dimensions, Qnn_DataType_t data_type, int rank, - QNNBackend device, Qnn_GraphHandle_t graph_handle, qnn_instance_ptr qnn_instance) : + backend_index_type device, Qnn_GraphHandle_t graph_handle, qnn_instance_ptr qnn_instance) : _tensor_name(name), _device(device), _qnn_instance(qnn_instance), @@ -45,7 +45,7 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { explicit ggml_qnn_tensor(tensor_type_t tensor_type, const std::string & name, const ggml_dimension_array_t & dimensions, ggml_type data_type, int rank, - QNNBackend device, Qnn_GraphHandle_t graph_handle, qnn_instance_ptr qnn_instance) : + backend_index_type device, Qnn_GraphHandle_t graph_handle, qnn_instance_ptr qnn_instance) : ggml_qnn_tensor(tensor_type, name, get_internal_dimension(dimensions, rank), qnn_datatype_from_ggml_datatype(data_type), rank, device, graph_handle, qnn_instance) {} @@ -318,7 +318,7 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { std::string _tensor_name; qnn_buffer_ptr _buffer; bool _can_unbind = true; - QNNBackend _device; + backend_index_type _device; qnn_instance_ptr _qnn_instance; Qnn_Tensor_t _qnn_tensor = qnn_tensor_init(kDefaultQnnTensorVersion); qnn_dimension_array_t _dimensions = {}; @@ -408,7 +408,7 @@ struct tensor_create_common_params { const char * name_prefix; int tensor_rank; bool is_input; - QNNBackend device; + backend_index_type device; Qnn_GraphHandle_t graph_handle; std::shared_ptr qnn_instance; }; diff --git a/ggml/src/ggml-qnn/utils.cpp b/ggml/src/ggml-qnn/qnn/utils.cpp similarity index 92% rename from ggml/src/ggml-qnn/utils.cpp rename to ggml/src/ggml-qnn/qnn/utils.cpp index 9696101b8b6e5..8f3878aa03115 100644 --- a/ggml/src/ggml-qnn/utils.cpp +++ b/ggml/src/ggml-qnn/qnn/utils.cpp @@ -178,7 +178,7 @@ const char * get_ggml_type_name(ggml_type type) { return traits->type_name; } -const char * get_backend_name(QNNBackend device) { +const char * get_backend_name(backend_index_type device) { switch (device) { case QNN_BACKEND_CPU: return "qnn-cpu"; @@ -192,7 +192,7 @@ const char * get_backend_name(QNNBackend device) { } } -const char * get_backend_desc(QNNBackend device) { +const char * get_backend_desc(backend_index_type device) { switch (device) { case QNN_BACKEND_CPU: return "CPU"; @@ -224,6 +224,10 @@ const char * get_chipset_desc(uint32_t soc_model) { return "Snapdragon 8 Gen 3"; case SM8750: return "Snapdragon 8 Elite"; + case EMULATOR_AARCH64: + return "AArch64 Emulator"; + case EMULATOR_X64: + return "x86_64 Emulator"; default: return "unknown"; } @@ -251,6 +255,10 @@ const char * get_chipset_model(uint32_t soc_model) { return "SM8650"; case SM8750: return "SM8750"; + case EMULATOR_AARCH64: + return "AARCH64EMU"; + case EMULATOR_X64: + return "X64EMU"; default: return "unknown"; } @@ -456,52 +464,4 @@ const char * 
get_qnn_error_string(Qnn_ErrorHandle_t error) { } } -#ifdef _WIN32 - -size_t get_system_total_memory_in_bytes() { - MEMORYSTATUSEX mem = {}; - mem.dwLength = sizeof(mem); - if (GlobalMemoryStatusEx(&mem)) { - return mem.ullTotalPhys; - } - - return 0; -} - -size_t get_system_free_memory_in_bytes() { - MEMORYSTATUSEX mem = {}; - mem.dwLength = sizeof(mem); - if (GlobalMemoryStatusEx(&mem)) { - return mem.ullAvailPhys; - } - - return 0; -} - -#else - -size_t get_system_total_memory_in_bytes() { - struct sysinfo info = {}; - if (sysinfo(&info) == 0) { - return (info.totalram + info.totalswap) * info.mem_unit; - } - - auto pages = (size_t) sysconf(_SC_PHYS_PAGES); - auto page_size = (size_t) sysconf(_SC_PAGE_SIZE); - return pages * page_size; -} - -size_t get_system_free_memory_in_bytes() { - struct sysinfo info = {}; - if (sysinfo(&info) == 0) { - return (info.freeram + info.freeswap) * info.mem_unit; - } - - auto avail_pages = (size_t) sysconf(_SC_AVPHYS_PAGES); - auto page_size = (size_t) sysconf(_SC_PAGE_SIZE); - return avail_pages * page_size; -} - -#endif - } // namespace qnn diff --git a/ggml/src/ggml-qnn/utils.hpp b/ggml/src/ggml-qnn/qnn/utils.hpp similarity index 97% rename from ggml/src/ggml-qnn/utils.hpp rename to ggml/src/ggml-qnn/qnn/utils.hpp index 2e55e2f2d85b3..09596c4e6f6a4 100644 --- a/ggml/src/ggml-qnn/utils.hpp +++ b/ggml/src/ggml-qnn/qnn/utils.hpp @@ -5,6 +5,7 @@ #include #include +#include "common.hpp" #include "ggml-qnn.h" #include "ggml.h" #include "logger.hpp" @@ -23,8 +24,8 @@ qnn_dimension_array_t get_view_internal_dimension(const ggml_tensor * tensor, si uint32_t get_ggml_tensor_rank(const ggml_tensor * tensor); const char * get_ggml_type_name(ggml_type type); -const char * get_backend_name(QNNBackend device); -const char * get_backend_desc(QNNBackend device); +const char * get_backend_name(backend_index_type device); +const char * get_backend_desc(backend_index_type device); const char * get_chipset_desc(uint32_t soc_model); const char * get_chipset_model(uint32_t soc_model); const char * get_htparch_desc(size_t htp_arch); @@ -199,8 +200,6 @@ Qnn_DataType_t qnn_datatype_from_ggml_datatype(ggml_type ggml_type); ggml_type ggml_datatype_from_qnn_datatype(Qnn_DataType_t qnn_type); size_t qnn_datatype_size(Qnn_DataType_t qnn_type); const char * qnn_datatype_to_string(Qnn_DataType_t qnn_type); -size_t get_system_total_memory_in_bytes(); -size_t get_system_free_memory_in_bytes(); } // namespace qnn diff --git a/ggml/src/ggml-qnn/shared/CMakeLists.txt b/ggml/src/ggml-qnn/shared/CMakeLists.txt new file mode 100644 index 0000000000000..b901e656b9ee0 --- /dev/null +++ b/ggml/src/ggml-qnn/shared/CMakeLists.txt @@ -0,0 +1,35 @@ + +file(GLOB common_srcs "${CMAKE_CURRENT_LIST_DIR}/*.cpp") + +add_library(runtime-common STATIC + ${common_srcs} +) + +target_include_directories(runtime-common PUBLIC + ${CMAKE_CURRENT_LIST_DIR}/ + ${CMAKE_CURRENT_LIST_DIR}/../ + ${CMAKE_CURRENT_LIST_DIR}/../../ + ${CMAKE_CURRENT_LIST_DIR}/../../../include/ # TODO: figure out how to remove this +) + +if(GGML_QNN_ENABLE_HEXAGON_BACKEND) + if(DEFINED ENV{QNN_SDK_PATH}) + set(HEXAGON_SDK_ROOT $ENV{HEXAGON_SDK_ROOT}) + message("found HEXAGON_SDK_ROOT, setting to ${HEXAGON_SDK_ROOT}") + else() + message(FATAL_ERROR "HEXAGON_SDK_ROOT not defined") + endif() + + target_include_directories(runtime-common PUBLIC + ${HEXAGON_SDK_ROOT}/incs/ + ${HEXAGON_SDK_ROOT}/incs/stddef/ + ${HEXAGON_SDK_ROOT}/incs/HAP/ + ${HEXAGON_SDK_ROOT}/rtos/qurt/ + ${HEXAGON_SDK_ROOT}/utils/examples/ + ) + 
target_compile_definitions(runtime-common PRIVATE + GGML_QNN_ENABLE_HEXAGON_BACKEND + ) +else() + message("HEXAGON_SDK_ROOT not defined, not appending to include directories") +endif() diff --git a/ggml/src/ggml-qnn/shared/common.cpp b/ggml/src/ggml-qnn/shared/common.cpp new file mode 100644 index 0000000000000..d89a31c20ef39 --- /dev/null +++ b/ggml/src/ggml-qnn/shared/common.cpp @@ -0,0 +1,146 @@ + +#include "common.hpp" + +#include + +#include "ggml-backend-impl.h" +#include "ggml-impl.h" +#include "ggml-qnn.h" + +#ifdef _WIN32 +# include +#else +# include +# include +#endif + +namespace { + +struct ggml_backend_qnn_reg_impl : ggml_backend_reg { + std::vector device_proxies; + std::vector devices; + + explicit ggml_backend_qnn_reg_impl(ggml_backend_reg_i backend_iface) { + context = this; + iface = backend_iface; + + LOG_INFO("backend registry init\n"); + for (size_t i = 0; i < TOTAL_BACKEND_COUNT; i++) { + const auto device_enum = + (backend_index_type) (TOTAL_BACKEND_COUNT - 1 - i); // init from the last device, i.e. NPU + + backend_device_proxy_ptr device_proxy; + if (device_enum < QNN_BACKEND_COUNT) { +#ifdef GGML_HEXAGON_NPU_ONLY + device_proxy = create_qnn_backend_context(device_enum); +#else + LOG_DEBUG("skip qnn device %d\n", (int) device_enum); + continue; +#endif + } else { +#ifdef GGML_QNN_ENABLE_HEXAGON_BACKEND + device_proxy = create_hexagon_backend_context(device_enum); +#else + LOG_DEBUG("skip hexagon device %d\n", (int) device_enum); + continue; +#endif + } + + if (!device_proxy) { + LOG_DEBUG("skip device %d\n", (int) device_enum); + continue; + } + + devices.emplace_back(ggml_backend_device{ + /* iface = */ device_proxy->get_iface(), + /* reg = */ this, + /* context = */ device_proxy->get_context(), + }); + + device_proxies.emplace_back(device_proxy); + } + } +}; + +const char * ggml_backend_qnn_reg_get_name(ggml_backend_reg_t reg) { + GGML_UNUSED(reg); + // TODO: should we use a different name? 
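+    // name reported to the ggml backend registry for all devices created by
+    // this backend (QNN CPU/GPU/NPU and the optional Hexagon device)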
+ return "qualcomm"; +} + +size_t ggml_backend_qnn_reg_get_device_count(ggml_backend_reg_t reg) { + auto * ctx = (ggml_backend_qnn_reg_impl *) reg->context; + return ctx->devices.size(); +} + +ggml_backend_dev_t ggml_backend_qnn_reg_get_device(ggml_backend_reg_t reg, size_t index) { + auto * ctx = (ggml_backend_qnn_reg_impl *) reg->context; + GGML_ASSERT(index < ctx->devices.size()); + return &(ctx->devices[index]); +} + +const ggml_backend_reg_i ggml_backend_qnn_reg_interface = { + /* .get_name = */ ggml_backend_qnn_reg_get_name, + /* .get_device_count = */ ggml_backend_qnn_reg_get_device_count, + /* .get_device_get = */ ggml_backend_qnn_reg_get_device, + /* .get_proc_address = */ nullptr, +}; + +} // namespace + +ggml_backend_reg_t ggml_backend_qnn_reg() { + static ggml_backend_qnn_reg_impl reg{ ggml_backend_qnn_reg_interface }; + return ® +} + +namespace common { + +#ifdef _WIN32 + +size_t get_system_total_memory_in_bytes() { + MEMORYSTATUSEX mem = {}; + mem.dwLength = sizeof(mem); + if (GlobalMemoryStatusEx(&mem)) { + return mem.ullTotalPhys; + } + + return 0; +} + +size_t get_system_free_memory_in_bytes() { + MEMORYSTATUSEX mem = {}; + mem.dwLength = sizeof(mem); + if (GlobalMemoryStatusEx(&mem)) { + return mem.ullAvailPhys; + } + + return 0; +} + +#else + +size_t get_system_total_memory_in_bytes() { + struct sysinfo info = {}; + if (sysinfo(&info) == 0) { + return (info.totalram + info.totalswap) * info.mem_unit; + } + + auto pages = (size_t) sysconf(_SC_PHYS_PAGES); + auto page_size = (size_t) sysconf(_SC_PAGE_SIZE); + return pages * page_size; +} + +size_t get_system_free_memory_in_bytes() { + struct sysinfo info = {}; + if (sysinfo(&info) == 0) { + return (info.freeram + info.freeswap) * info.mem_unit; + } + + auto avail_pages = (size_t) sysconf(_SC_AVPHYS_PAGES); + auto page_size = (size_t) sysconf(_SC_PAGE_SIZE); + return avail_pages * page_size; +} + +#endif + +} // namespace common diff --git a/ggml/src/ggml-qnn/shared/common.hpp b/ggml/src/ggml-qnn/shared/common.hpp new file mode 100644 index 0000000000000..4feb3365ce102 --- /dev/null +++ b/ggml/src/ggml-qnn/shared/common.hpp @@ -0,0 +1,56 @@ +#pragma once + +#include +#include + +#include "ggml-backend-impl.h" +#include "ggml-impl.h" + +enum backend_index_type { + QNN_BACKEND_CPU = 0, + QNN_BACKEND_GPU, + QNN_BACKEND_NPU, + + HEXAGON_BACKEND, + + TOTAL_BACKEND_COUNT, + QNN_BACKEND_COUNT = HEXAGON_BACKEND, +}; + +class backend_device_proxy { + public: + virtual ~backend_device_proxy() = default; + + virtual const ggml_backend_device_i & get_iface() const = 0; + virtual void * get_context() = 0; +}; + +using backend_device_proxy_ptr = std::shared_ptr; + +backend_device_proxy_ptr create_qnn_backend_context(backend_index_type device); +backend_device_proxy_ptr create_hexagon_backend_context(backend_index_type device); + +namespace common { + +size_t get_system_total_memory_in_bytes(); +size_t get_system_free_memory_in_bytes(); + +} // namespace common + +#define DISABLE_COPY(class_name) \ + class_name(const class_name &) = delete; \ + void operator=(const class_name &) = delete + +#define DISABLE_MOVE(class_name) \ + class_name(class_name &&) = delete; \ + void operator=(class_name &&) = delete + +#define LOG_ERROR(...) (GGML_LOG_ERROR(__VA_ARGS__)) +#define LOG_WARN(...) (GGML_LOG_WARN(__VA_ARGS__)) +#define LOG_INFO(...) (GGML_LOG_INFO(__VA_ARGS__)) + +#ifndef NDEBUG +# define LOG_DEBUG(...) (GGML_LOG_DEBUG(__VA_ARGS__)) +#else +# define LOG_DEBUG(...) 
+#endif diff --git a/ggml/src/ggml-qnn/dl-loader.hpp b/ggml/src/ggml-qnn/shared/dyn-lib-loader.hpp similarity index 67% rename from ggml/src/ggml-qnn/dl-loader.hpp rename to ggml/src/ggml-qnn/shared/dyn-lib-loader.hpp index e183d190ce18f..22cf8901f3cbc 100644 --- a/ggml/src/ggml-qnn/dl-loader.hpp +++ b/ggml/src/ggml-qnn/shared/dyn-lib-loader.hpp @@ -13,20 +13,20 @@ #include -namespace qnn { +namespace common { #ifdef __linux__ typedef void * dl_handler_t; -inline qnn::dl_handler_t dl_load(const std::string & lib_path) { - return dlopen(lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); +inline dl_handler_t dl_load(const std::string & lib_path) { + return dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL); } -inline void * dl_sym(qnn::dl_handler_t handle, const std::string & symbol) { +inline void * dl_sym(dl_handler_t handle, const std::string & symbol) { return dlsym(handle, symbol.c_str()); } -inline bool dl_unload(qnn::dl_handler_t handle) { +inline bool dl_unload(dl_handler_t handle) { return dlclose(handle) == 0; } @@ -36,7 +36,7 @@ inline const char * dl_error() { #elif defined(_WIN32) using dl_handler_t = HMODULE; -inline qnn::dl_handler_t dl_load(const std::string & lib_path) { +inline dl_handler_t dl_load(const std::string & lib_path) { // suppress error dialogs for missing DLLs auto old_mode = SetErrorMode(SEM_FAILCRITICALERRORS); SetErrorMode(old_mode | SEM_FAILCRITICALERRORS); @@ -47,7 +47,7 @@ inline qnn::dl_handler_t dl_load(const std::string & lib_path) { return handle; } -inline void * dl_sym(qnn::dl_handler_t handle, const std::string & symbol) { +inline void * dl_sym(dl_handler_t handle, const std::string & symbol) { auto old_mode = SetErrorMode(SEM_FAILCRITICALERRORS); SetErrorMode(old_mode | SEM_FAILCRITICALERRORS); @@ -57,7 +57,7 @@ inline void * dl_sym(qnn::dl_handler_t handle, const std::string & symbol) { return p; } -inline bool dl_unload(qnn::dl_handler_t handle) { +inline bool dl_unload(dl_handler_t handle) { FreeLibrary(handle); return true; } @@ -69,8 +69,8 @@ inline const char * dl_error() { #endif -template Fn dl_sym_typed(qnn::dl_handler_t handle, const std::string & function_name) { +template Fn dl_sym_typed(dl_handler_t handle, const std::string & function_name) { return reinterpret_cast(dl_sym(handle, function_name)); } -} // namespace qnn +} // namespace common diff --git a/ggml/src/ggml-qnn/shared/rpc-interface.hpp b/ggml/src/ggml-qnn/shared/rpc-interface.hpp new file mode 100644 index 0000000000000..5a64a03646e67 --- /dev/null +++ b/ggml/src/ggml-qnn/shared/rpc-interface.hpp @@ -0,0 +1,223 @@ +#pragma once + +#include + +#include "common.hpp" +#include "dyn-lib-loader.hpp" +#ifdef GGML_QNN_ENABLE_HEXAGON_BACKEND +# include +#else +// TODO: remove this when not needed + +/** + * @enum fastrpc_map_flags for fastrpc_mmap and fastrpc_munmap + * @brief Types of maps with cache maintenance + */ +enum fastrpc_map_flags { + /** + * Map memory pages with RW- permission and CACHE WRITEBACK. + * Driver will clean cache when buffer passed in a FastRPC call. + * Same remote virtual address will be assigned for subsequent + * FastRPC calls. + */ + FASTRPC_MAP_STATIC, + + /** Reserved for compatibility with deprecated flag */ + FASTRPC_MAP_RESERVED, + + /** + * Map memory pages with RW- permission and CACHE WRITEBACK. + * Mapping tagged with a file descriptor. User is responsible for + * maintenance of CPU and DSP caches for the buffer. Get virtual address + * of buffer on DSP using HAP_mmap_get() and HAP_mmap_put() functions. 
+ */ + FASTRPC_MAP_FD, + + /** + * Mapping delayed until user calls HAP_mmap() and HAP_munmap() + * functions on DSP. User is responsible for maintenance of CPU and DSP + * caches for the buffer. Delayed mapping is useful for users to map + * buffer on DSP with other than default permissions and cache modes + * using HAP_mmap() and HAP_munmap() functions. + */ + FASTRPC_MAP_FD_DELAYED, + + /** Reserved for compatibility **/ + FASTRPC_MAP_RESERVED_4, + FASTRPC_MAP_RESERVED_5, + FASTRPC_MAP_RESERVED_6, + FASTRPC_MAP_RESERVED_7, + FASTRPC_MAP_RESERVED_8, + FASTRPC_MAP_RESERVED_9, + FASTRPC_MAP_RESERVED_10, + FASTRPC_MAP_RESERVED_11, + FASTRPC_MAP_RESERVED_12, + FASTRPC_MAP_RESERVED_13, + FASTRPC_MAP_RESERVED_14, + FASTRPC_MAP_RESERVED_15, + + /** + * This flag is used to skip CPU mapping, + * otherwise behaves similar to FASTRPC_MAP_FD_DELAYED flag. + */ + FASTRPC_MAP_FD_NOMAP, + + /** Update FASTRPC_MAP_MAX when adding new value to this enum **/ +}; + +#endif + +namespace common { + +#ifdef _WIN32 +constexpr const char * kQnnRpcLibName = "libcdsprpc.dll"; +#else +constexpr const char * kQnnRpcLibName = "libcdsprpc.so"; +#endif + +class rpc_interface { + using rpc_mem_init_t = void (*)(); + using rpc_mem_deinit_t = void (*)(); + using rpc_mem_alloc_t = void * (*) (int heapid, uint32_t flags, int size); + using rpc_mem_alloc2_t = void * (*) (int heapid, uint32_t flags, size_t size); + using rpc_mem_free_t = void (*)(void * po); + using rpc_mem_to_fd_t = int (*)(void * po); + using rpc_mem_fastrpc_mmap_t = int (*)(int domain, int fd, void * addr, int offset, size_t length, + enum fastrpc_map_flags flags); + using rpc_mem_fastrpc_munmap_t = int (*)(int domain, int fd, void * addr, size_t length); + using remote_handle_control_t = int (*)(uint32_t req, void * data, uint32_t datalen); + using remote_session_control_t = int (*)(uint32_t req, void * data, uint32_t datalen); + + public: + rpc_interface(const std::string & rpc_lib_path = kQnnRpcLibName) { + _rpc_lib_handle = dl_load(rpc_lib_path); + if (!_rpc_lib_handle) { + LOG_ERROR("failed to load %s, error: %s\n", rpc_lib_path.c_str(), dl_error()); + return; + } + + _rpc_mem_init = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_init")); + _rpc_mem_deinit = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_deinit")); + _rpc_mem_alloc = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_alloc")); + _rpc_mem_alloc2 = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_alloc2")); + _rpc_mem_free = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_free")); + _rpc_mem_to_fd = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_to_fd")); + _rpc_mem_fastrpc_mmap = reinterpret_cast(dl_sym(_rpc_lib_handle, "fastrpc_mmap")); + _rpc_mem_fastrpc_munmap = reinterpret_cast(dl_sym(_rpc_lib_handle, "fastrpc_munmap")); + _remote_handle_control = + reinterpret_cast(dl_sym(_rpc_lib_handle, "remote_handle_control")); + _remote_session_control = + reinterpret_cast(dl_sym(_rpc_lib_handle, "remote_session_control")); + } + + bool is_valid() const { return _rpc_lib_handle != nullptr; } + + bool is_alloc2_available() const { return _rpc_mem_alloc2 != nullptr; } + + void rpcmem_init() { + if (_rpc_mem_init) { + _rpc_mem_init(); + } + } + + void rpcmem_deinit() { + if (_rpc_mem_deinit) { + _rpc_mem_deinit(); + } + } + + void * rpcmem_alloc(int heapid, uint32_t flags, int size) { + if (!is_valid()) { + return nullptr; + } + + return _rpc_mem_alloc(heapid, flags, size); + } + + void * rpcmem_alloc2(int heapid, uint32_t flags, size_t size) { + if (!is_valid()) { + return nullptr; + } + 
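+        // unlike rpcmem_alloc(), rpcmem_alloc2() takes a size_t, so callers can
+        // request buffers larger than INT_MAX when the symbol is available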
+ return _rpc_mem_alloc2(heapid, flags, size); + } + + void rpcmem_free(void * buf) { + if (is_valid()) { + _rpc_mem_free(buf); + } + } + + int rpcmem_to_fd(void * buf) { + int mem_fd = -1; + if (is_valid()) { + mem_fd = _rpc_mem_to_fd(buf); + } + + return mem_fd; + } + + int fastrpc_mmap(int domain, int fd, void * addr, int offset, size_t length, enum fastrpc_map_flags flags) { + if (!is_valid()) { + return -1; + } + + return _rpc_mem_fastrpc_mmap(domain, fd, addr, offset, length, flags); + } + + int fastrpc_munmap(int domain, int fd, void * addr, size_t length) { + if (!is_valid()) { + return -1; + } + + return _rpc_mem_fastrpc_munmap(domain, fd, addr, length); + } + + int remote_handle_control(uint32_t req, void * data, uint32_t datalen) { + if (!is_valid()) { + return -1; + } + + return _remote_handle_control(req, data, datalen); + } + + int remote_session_control(uint32_t req, void * data, uint32_t datalen) { + if (!is_valid()) { + return -1; + } + + return _remote_session_control(req, data, datalen); + } + + ~rpc_interface() { + if (_rpc_lib_handle) { + if (_rpc_mem_deinit) { + _rpc_mem_deinit(); + } + + dl_unload(_rpc_lib_handle); + } + } + + private: + dl_handler_t _rpc_lib_handle = nullptr; + rpc_mem_init_t _rpc_mem_init = nullptr; + rpc_mem_deinit_t _rpc_mem_deinit = nullptr; + rpc_mem_alloc_t _rpc_mem_alloc = nullptr; + rpc_mem_alloc2_t _rpc_mem_alloc2 = nullptr; + rpc_mem_free_t _rpc_mem_free = nullptr; + rpc_mem_to_fd_t _rpc_mem_to_fd = nullptr; + rpc_mem_fastrpc_mmap_t _rpc_mem_fastrpc_mmap = nullptr; + rpc_mem_fastrpc_munmap_t _rpc_mem_fastrpc_munmap = nullptr; + remote_handle_control_t _remote_handle_control = nullptr; + remote_session_control_t _remote_session_control = nullptr; + + rpc_interface(const rpc_interface &) = delete; + rpc_interface & operator=(const rpc_interface &) = delete; + rpc_interface(rpc_interface &&) = delete; + rpc_interface & operator=(rpc_interface &&) = delete; +}; + +using rpc_interface_ptr = std::shared_ptr; + +} // namespace common diff --git a/ggml/src/ggml-qnn/shared/rpc-mem.hpp b/ggml/src/ggml-qnn/shared/rpc-mem.hpp new file mode 100644 index 0000000000000..ba8449192b5dd --- /dev/null +++ b/ggml/src/ggml-qnn/shared/rpc-mem.hpp @@ -0,0 +1,129 @@ + +#pragma once + +#include +#include + +#include "common.hpp" +#include "dyn-lib-loader.hpp" +#include "rpc-interface.hpp" + +namespace common { + +class rpc_mem { + public: + rpc_mem() { + auto interface = std::make_shared(); + if (!interface->is_valid()) { + LOG_ERROR("failed to load rpcmem lib\n"); + return; + } + + interface->rpcmem_init(); + _rpc_interface = interface; + LOG_DEBUG("load rpcmem lib successfully\n"); + } + + explicit rpc_mem(rpc_interface_ptr interface) { + if (!interface->is_valid()) { + LOG_ERROR("failed to load rpcmem lib\n"); + return; + } + + interface->rpcmem_init(); + _rpc_interface = interface; + LOG_DEBUG("load rpcmem lib successfully\n"); + } + + ~rpc_mem() { + if (!is_valid()) { + LOG_DEBUG("rpc memory not initialized\n"); + return; + } + + if (_rpc_interface) { + _rpc_interface->rpcmem_deinit(); + _rpc_interface.reset(); + } + + LOG_DEBUG("unload rpcmem lib successfully\n"); + } + + bool is_valid() const { return (bool) _rpc_interface; } + + void * alloc(int heapid, uint32_t flags, size_t size) { + if (!is_valid()) { + LOG_ERROR("rpc memory not initialized\n"); + return nullptr; + } + + if (size > get_max_alloc_size()) { + LOG_ERROR("rpc memory size %zu exceeds max alloc size %zu\n", size, get_max_alloc_size()); + return nullptr; + } + + void * buf = nullptr; + if 
(_rpc_interface->is_alloc2_available()) { + buf = _rpc_interface->rpcmem_alloc2(heapid, flags, size); + } else { + buf = _rpc_interface->rpcmem_alloc(heapid, flags, size); + } + + if (!buf) { + LOG_ERROR("failed to allocate rpc memory, size: %d MB\n", (int) (size / (1 << 20))); + return nullptr; + } + + LOG_DEBUG("rpc buffer allocated, heapid: %d, flags: 0x%x, size: %zu\n", heapid, flags, size); + return buf; + } + + void free(void * buf) { + if (!is_valid()) { + LOG_ERROR("rpc memory not initialized\n"); + } else { + _rpc_interface->rpcmem_free(buf); + } + } + + int to_fd(void * buf) { + int mem_fd = -1; + if (!is_valid()) { + LOG_ERROR("rpc memory not initialized\n"); + } else { + mem_fd = _rpc_interface->rpcmem_to_fd(buf); + } + + return mem_fd; + } + + size_t get_max_alloc_size() { + return _rpc_interface->is_alloc2_available() ? std::numeric_limits::max() : + std::numeric_limits::max(); + } + + int fastrpc_mmap(int domain, int fd, void * addr, int offset, size_t length, enum fastrpc_map_flags flags) { + if (!is_valid()) { + LOG_ERROR("rpc memory not initialized\n"); + return -1; + } + + return _rpc_interface->fastrpc_mmap(domain, fd, addr, offset, length, flags); + } + + int fastrpc_munmap(int domain, int fd, void * addr, size_t length) { + if (!is_valid()) { + LOG_ERROR("rpc memory not initialized\n"); + return -1; + } + + return _rpc_interface->fastrpc_munmap(domain, fd, addr, length); + } + + private: + rpc_interface_ptr _rpc_interface; +}; + +using rpc_mem_ptr = std::shared_ptr; + +} // namespace common From c2b6fec63f2e2a29b62f5078d03de2836f912cc0 Mon Sep 17 00:00:00 2001 From: nullname Date: Sun, 27 Apr 2025 17:43:32 +0800 Subject: [PATCH 150/166] feat: perf opt part2 (#39) * add qurt_thread * add thread pool * add thread_pool obj at device ctx * wip * small refactoring to fit the thread pool structure * set start/end threads for add * init thread pool * fix thread creation * split complete and pending signals * opt mulmat * wip * 2 threads * back to 4 threads * use barrier * remove some unnecessary package * add multi thread support for mul mat * wip * use qurt_barrier_t instead of qurt_signal_t * wip * wip * add log * split qnn cmake config * create function to calculate the start and end func * wip * fix comment * fix comment * fix comment * wip * fix typo --- ggml/src/ggml-qnn/CMakeLists.txt | 40 +- ggml/src/ggml-qnn/npu/device/device.cpp | 46 ++- ggml/src/ggml-qnn/npu/device/graph.cpp | 45 ++- ggml/src/ggml-qnn/npu/device/graph.hpp | 19 +- ggml/src/ggml-qnn/npu/device/op_impl.cpp | 43 +-- ggml/src/ggml-qnn/npu/device/op_impl.hpp | 2 +- ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp | 56 +-- ggml/src/ggml-qnn/npu/device/op_mul_mat.hpp | 2 +- ggml/src/ggml-qnn/npu/device/tensor.hpp | 5 +- ggml/src/ggml-qnn/npu/device/thread_pool.hpp | 190 ++++++++++ ggml/src/ggml-qnn/npu/device/util.hpp | 28 +- ggml/src/ggml-qnn/qnn/CMakeLists.txt | 42 +++ .../ggml-qnn/qnn/hexagon/GgmlOpPackage.xml | 88 ----- .../qnn/hexagon/GgmlOpPackage/Makefile | 357 ------------------ .../GgmlOpPackage/config/GgmlOpPackage.xml | 88 ----- .../src/GgmlOpPackageInterface.cpp | 274 -------------- .../GgmlOpPackage/src/ops/GgmlMulMat.cpp | 213 ----------- 17 files changed, 400 insertions(+), 1138 deletions(-) create mode 100644 ggml/src/ggml-qnn/npu/device/thread_pool.hpp create mode 100644 ggml/src/ggml-qnn/qnn/CMakeLists.txt delete mode 100644 ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage.xml delete mode 100644 ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/Makefile delete mode 100644 
ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/config/GgmlOpPackage.xml delete mode 100644 ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/src/GgmlOpPackageInterface.cpp delete mode 100644 ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/src/ops/GgmlMulMat.cpp diff --git a/ggml/src/ggml-qnn/CMakeLists.txt b/ggml/src/ggml-qnn/CMakeLists.txt index 3e8fa3a1b8117..e605ce8ff2d41 100644 --- a/ggml/src/ggml-qnn/CMakeLists.txt +++ b/ggml/src/ggml-qnn/CMakeLists.txt @@ -5,11 +5,9 @@ option(GGML_QNN_ENABLE_HEXAGON_BACKEND "ggml-qnn: Enable Hexagon custom package" if(CMAKE_SYSTEM_NAME STREQUAL "Android") find_library(LOG_LIB log) - set(QNN_LINK_LIBRARIES ${LOG_LIB}) - set(QNN_DEFAULT_LIB_SEARCH_PATH "/data/local/tmp/" CACHE STRING "customized library search path for QNN backend") - add_compile_options(-g -O0) + set(COMMON_LINK_LIBRARIES ${LOG_LIB}) elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows" OR CMAKE_SYSTEM_NAME STREQUAL "Linux") - set(QNN_DEFAULT_LIB_SEARCH_PATH "" CACHE STRING "customized library search path for QNN backend") + message("Building for Linux or Windows") else() message(FATAL_ERROR "QNN now only available on Android, Windows and Linux") endif() @@ -29,33 +27,15 @@ message("CMAKE_CXX_FLAGS_DEBUG: ${CMAKE_CXX_FLAGS_DEBUG}") message("CMAKE_CXX_FLAGS_RELEASE: ${CMAKE_CXX_FLAGS_RELEASE}") message("QNN_SDK_PATH: ${GGML_QNN_SDK_PATH}") -file(GLOB QNN_SOURCES "${CMAKE_CURRENT_LIST_DIR}/qnn/*.cpp") -file(GLOB COMMON_SOURCES "${CMAKE_CURRENT_LIST_DIR}/*.cpp") -ggml_add_backend_library(ggml-qnn - ${QNN_SOURCES} - ${COMMON_SOURCES} -) +message("GGML_QNN: ${GGML_QNN}") +message("GGML_QNN_ENABLE_PERFORMANCE_TRACKING: ${GGML_QNN_ENABLE_PERFORMANCE_TRACKING}") +message("GGML_QNN_ENABLE_HEXAGON_BACKEND: ${GGML_QNN_ENABLE_HEXAGON_BACKEND}") +message("GGML_HEXAGON_NPU_ONLY: ${GGML_HEXAGON_NPU_ONLY}") -target_include_directories(ggml-qnn PRIVATE - ${GGML_QNN_SDK_PATH}/include/QNN - ${CMAKE_CURRENT_LIST_DIR}/qnn - ${CMAKE_CURRENT_LIST_DIR} +ggml_add_backend_library(ggml-qnn + ../../include/ggml-qnn.h ) -target_link_libraries(ggml-qnn PRIVATE ${QNN_LINK_LIBRARIES}) - -if(NOT "${QNN_DEFAULT_LIB_SEARCH_PATH}" STREQUAL "") - string(REGEX REPLACE "/$" "" QNN_DEFAULT_LIB_SEARCH_PATH "${QNN_DEFAULT_LIB_SEARCH_PATH}") -endif() - -message("GGML_QNN_DEFAULT_LIB_SEARCH_PATH: ${QNN_DEFAULT_LIB_SEARCH_PATH}") -target_compile_definitions(ggml-qnn PRIVATE GGML_QNN_DEFAULT_LIB_SEARCH_PATH="${QNN_DEFAULT_LIB_SEARCH_PATH}") - -if(GGML_QNN_ENABLE_CPU_BACKEND) - message("GGML_QNN_ENABLE_CPU_BACKEND is enabled") - target_compile_definitions(ggml-qnn PRIVATE GGML_QNN_ENABLE_CPU_BACKEND) -else() - message("GGML_QNN_ENABLE_CPU_BACKEND is disabled") -endif() +target_link_libraries(ggml-qnn PRIVATE ${COMMON_LINK_LIBRARIES}) if(GGML_QNN_ENABLE_PERFORMANCE_TRACKING) message("GGML_QNN_ENABLE_PERFORMANCE_TRACKING is enabled") @@ -72,6 +52,8 @@ if(GGML_HEXAGON_NPU_ONLY) set(GGML_QNN_ENABLE_HEXAGON_BACKEND ON) else() message("GGML_HEXAGON_NPU_ONLY is disabled") + add_subdirectory(qnn) + target_link_libraries(ggml-qnn PRIVATE qnn-backend) endif() if(GGML_QNN_ENABLE_HEXAGON_BACKEND) diff --git a/ggml/src/ggml-qnn/npu/device/device.cpp b/ggml/src/ggml-qnn/npu/device/device.cpp index 2368d44f671ef..7281dd48d2fa8 100644 --- a/ggml/src/ggml-qnn/npu/device/device.cpp +++ b/ggml/src/ggml-qnn/npu/device/device.cpp @@ -3,6 +3,7 @@ #include #include +#include #include #include "graph.hpp" @@ -10,15 +11,30 @@ #include "op_impl.hpp" #include "remote.h" #include "tensor.hpp" +#include "thread_pool.hpp" #include "util.hpp" -#define NPU_UNUSED(x) (void) (x) - 
namespace { struct npu_device_context { - int unused = 0; - // TODO: should we add tensor context here? + std::unique_ptr thread_pool; + + bool init_thread_pool() { + if (thread_pool) { + DEVICE_LOG_DEBUG("Thread pool already initialized"); + return true; + } + + auto pool = std::make_unique(); + if (!pool) { + DEVICE_LOG_ERROR("Failed to create thread pool"); + return false; + } + + thread_pool = std::move(pool); + DEVICE_LOG_DEBUG("Thread pool initialized"); + return true; + } }; inline hexagon::tensor * tensor_from_handle(npu_device_graph_handle_t h) { @@ -37,6 +53,10 @@ inline npu_device_tensor_handle_t graph_to_handle(hexagon::graph * graph) { return reinterpret_cast(graph); } +inline npu_device_context * device_context_from_handle(remote_handle64 h) { + return reinterpret_cast(h); +} + } // namespace int npu_device_open(const char * uri, remote_handle64 * h) { @@ -47,12 +67,18 @@ int npu_device_open(const char * uri, remote_handle64 * h) { return AEE_ENOMEMORY; } + if (!context->init_thread_pool()) { + DEVICE_LOG_ERROR("Failed to initialize thread pool"); + delete context; + return AEE_EFAILED; + } + *h = reinterpret_cast(context); return AEE_SUCCESS; } int npu_device_close(remote_handle64 h) { - auto * context = reinterpret_cast(h); + auto * context = device_context_from_handle(h); if (!context) { DEVICE_LOG_ERROR("Invalid npu_device_context handle"); return AEE_EINVHANDLE; @@ -149,13 +175,19 @@ AEEResult npu_device_graph_set_tensor(remote_handle64 _h, npu_device_graph_handl } AEEResult npu_device_graph_compute(remote_handle64 _h, npu_device_graph_handle_t graph_handle) { - NPU_UNUSED(_h); + auto dev_ctx = device_context_from_handle(_h); + if (!dev_ctx) { + DEVICE_LOG_DEBUG("Invalid npu_device_context handle"); + return AEE_EINVHANDLE; + } + auto * graph = graph_from_handle(graph_handle); if (!graph) { + DEVICE_LOG_ERROR("Invalid graph handle"); return AEE_EINVHANDLE; } - if (!graph->compute()) { + if (!graph->compute(dev_ctx->thread_pool.get())) { return AEE_EFAILED; } diff --git a/ggml/src/ggml-qnn/npu/device/graph.cpp b/ggml/src/ggml-qnn/npu/device/graph.cpp index b21b8add2997c..2024d15a215be 100644 --- a/ggml/src/ggml-qnn/npu/device/graph.cpp +++ b/ggml/src/ggml-qnn/npu/device/graph.cpp @@ -8,24 +8,23 @@ namespace hexagon { +graph::graph() noexcept { + DEVICE_LOG_DEBUG("graph(%p) created\n", (void *) this); +} + graph::~graph() noexcept { - if (_tensors) { - delete[] _tensors; - } + _tensors.reset(); + DEVICE_LOG_DEBUG("graph(%p) destroyed\n", (void *) this); } void graph::set_tensor(const npu_device_tensor_handle_t * tensors, int tensor_count) { - if (_tensor_count > 0) { - delete[] _tensors; - } - if (tensor_count <= 0) { - _tensors = nullptr; + _tensors.reset(); _tensor_count = 0; return; } - _tensors = new (std::nothrow) tensor *[tensor_count]; + _tensors = std::make_unique(size_t(tensor_count)); for (int i = 0; i < tensor_count; ++i) { auto * tensor_obj = reinterpret_cast(tensors[i]); _tensors[i] = tensor_obj; @@ -37,31 +36,43 @@ void graph::set_tensor(const npu_device_tensor_handle_t * tensors, int tensor_co DEVICE_LOG_DEBUG("graph(%p) tensor count: %zu\n", (void *) this, _tensor_count); } -bool graph::compute() { +bool graph::compute(default_thread_pool * thread_pool) { if (!_tensors || !_tensor_count) { DEVICE_LOG_DEBUG("graph(%p) no tensors to compute\n", (void *) this); return true; // return success if no tensors to compute } DEVICE_LOG_DEBUG("graph(%p) compute\n", (void *) this); + thread_pool->sync_execute(reinterpret_cast(&graph::thread_pool_task), this); + + for 
(size_t i = 0; i < _tensor_count; ++i) { + auto * dst = _tensors[i]; + dst->flush(); // TODO: optimize this + } + + return true; +} + +void graph::thread_pool_task(default_thread_pool * pool, size_t thread_idx, size_t thread_count, graph * graph) { + NPU_UNUSED(pool); + graph->compute_impl(thread_idx, thread_count); +} + +void graph::compute_impl(size_t thread_idx, size_t thread_count) { for (size_t i = 0; i < _tensor_count; ++i) { auto * dst = _tensors[i]; auto op = dst->get_op(); auto * func = get_compute_func(op); if (!func) { DEVICE_LOG_ERROR("graph(%p) tensor[%zu] op %d not supported\n", (void *) this, i, op); - return false; + return; } - if (!func(dst)) { + if (!func(dst, thread_idx, thread_count)) { DEVICE_LOG_ERROR("graph(%p) tensor[%zu] op %d compute failed\n", (void *) this, i, op); - return false; + return; } - - dst->flush(); // TODO: optimize this } - - return true; } } // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/device/graph.hpp b/ggml/src/ggml-qnn/npu/device/graph.hpp index 22f6615d1435f..7ca29316991b5 100644 --- a/ggml/src/ggml-qnn/npu/device/graph.hpp +++ b/ggml/src/ggml-qnn/npu/device/graph.hpp @@ -1,29 +1,32 @@ #pragma once +#include + #include "hexagon_npu.h" #include "tensor.hpp" +#include "thread_pool.hpp" namespace hexagon { class graph { public: // TODO: add execute direction here - explicit graph() noexcept {} + explicit graph() noexcept; ~graph() noexcept; void set_tensor(const npu_device_tensor_handle_t * tensors, int tensor_count); - bool compute(); + bool compute(default_thread_pool * thread_pool); private: - tensor ** _tensors = nullptr; - size_t _tensor_count = 0; + static void thread_pool_task(default_thread_pool * pool, size_t thread_idx, size_t thread_count, graph * graph); + void compute_impl(size_t thread_idx, size_t thread_count); + + std::unique_ptr _tensors; + size_t _tensor_count = 0; - graph(const graph &) = delete; - void operator=(const graph &) = delete; - graph(graph &&) = delete; - void operator=(graph &&) = delete; + DISABLE_COPY_AND_MOVE(graph); }; } // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/device/op_impl.cpp b/ggml/src/ggml-qnn/npu/device/op_impl.cpp index 7067a1d52bc9a..8d55971a72d45 100644 --- a/ggml/src/ggml-qnn/npu/device/op_impl.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_impl.cpp @@ -76,11 +76,12 @@ inline HVX_Vector vmul_f32_f32(HVX_Vector a, HVX_Vector b) { } template -bool element_wise_op(hexagon::tensor * out) { +bool element_wise_op(hexagon::tensor * out, size_t tidx, size_t tcnt) { if (!out) { return false; } + static_assert(DEVICE_TENSOR_MAX_DIMS == 4, "element_wise_op requires max dims 4"); auto * src0 = out->get_src(0); auto * src1 = out->get_src(1); if (!src0 || !src1) { @@ -93,28 +94,24 @@ bool element_wise_op(hexagon::tensor * out) { return false; } - static_assert(DEVICE_TENSOR_MAX_DIMS == 4, "element_wise_op requires max dims 4"); - - const auto * src0_ptr = reinterpret_cast(src0->get_data()); - const auto * src1_ptr = reinterpret_cast(src1->get_data()); - auto * dst_ptr = reinterpret_cast(out->get_data()); - for (int64_t i3 = 0; i3 < out->get_ne(3); i3++) { - const auto * src0_cube = src0_ptr + i3 * src0->get_nb(3); - const auto * src1_cube = src1_ptr + (i3 % src1->get_ne(3)) * src1->get_nb(3); - auto * dst_cube = dst_ptr + i3 * out->get_nb(3); - for (int64_t i2 = 0; i2 < out->get_ne(2); i2++) { - const auto * src0_plane = src0_cube + i2 * src0->get_nb(2); - const auto * src1_plane = src1_cube + (i2 % src1->get_ne(2)) * src1->get_nb(2); - auto * dst_plane = dst_cube + i2 * out->get_nb(2); - for 
(int64_t i1 = 0; i1 < out->get_ne(1); i1++) { - // TODO: prefetch row? - auto * src0_row = src0_plane + i1 * src0->get_nb(1); - auto * src1_row = src1_plane + (i1 % src1->get_ne(1)) * src1->get_nb(1); - auto * dst_row = reinterpret_cast(dst_plane + i1 * out->get_nb(1)); - _RowFunc(reinterpret_cast(src0_row), reinterpret_cast(src1_row), - static_cast(out->get_ne(0)), reinterpret_cast<_TyDst *>(dst_row)); - } - } + const auto * src0_ptr = reinterpret_cast(src0->get_data()); + const auto * src1_ptr = reinterpret_cast(src1->get_data()); + auto * dst_ptr = reinterpret_cast(out->get_data()); + auto total_rows = out->get_ne(3) * out->get_ne(2) * out->get_ne(1); + const auto rows_per_box = out->get_ne(2) * out->get_ne(1); + const auto start_end = hexagon::get_thread_work_slice(total_rows, tidx, tcnt); + for (int64_t ir = start_end.first; ir < start_end.second; ++ir) { + const auto i03 = ir / rows_per_box; + const auto i02 = ir / out->get_ne(1) - i03 * out->get_ne(2); + const auto i01 = ir % out->get_ne(1); + const auto i13 = i03 % src1->get_ne(3); + const auto i12 = i02 % src1->get_ne(2); + const auto i11 = i01 % src1->get_ne(1); + auto * src0_row = src0_ptr + i03 * src0->get_nb(3) + i02 * src0->get_nb(2) + i01 * src0->get_nb(1); + auto * src1_row = src1_ptr + i13 * src1->get_nb(3) + i12 * src1->get_nb(2) + i11 * src1->get_nb(1); + auto * dst_row = dst_ptr + i03 * out->get_nb(3) + i02 * out->get_nb(2) + i01 * out->get_nb(1); + _RowFunc(reinterpret_cast(src0_row), reinterpret_cast(src1_row), + static_cast(out->get_ne(0)), reinterpret_cast<_TyDst *>(dst_row)); } return true; diff --git a/ggml/src/ggml-qnn/npu/device/op_impl.hpp b/ggml/src/ggml-qnn/npu/device/op_impl.hpp index 1fee7769ce04c..6b30d24819e89 100644 --- a/ggml/src/ggml-qnn/npu/device/op_impl.hpp +++ b/ggml/src/ggml-qnn/npu/device/op_impl.hpp @@ -5,7 +5,7 @@ namespace hexagon { -typedef bool (*compute_func_type)(tensor * dst); +typedef bool (*compute_func_type)(tensor * dst, size_t tidx, size_t tcnt); typedef bool (*op_is_supported_func_type)(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1, const npu_device_tensor_spec & dst, npu_device_tensor_op op); diff --git a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp index fbda69d2d7cc2..381629da3437c 100644 --- a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp @@ -71,43 +71,45 @@ inline float vec_dot_product_f32_f32(const float * src0, const float * src1, siz namespace hexagon { -bool mul_mat_f32(hexagon::tensor * out) { +bool mul_mat_f32(hexagon::tensor * out, size_t tidx, size_t tcnt) { if (!out) { return false; } + static_assert(DEVICE_TENSOR_MAX_DIMS == 4, "mul_mat_f32 requires max dims 4"); auto * src0 = out->get_src(0); auto * src1 = out->get_src(1); if (!src0 || !src1) { return true; // skip if no src } - static_assert(DEVICE_TENSOR_MAX_DIMS == 4, "mul_mat_f32 requires max dims 4"); - - const auto r02 = src1->get_ne(2) / src0->get_ne(2); - const auto r03 = src1->get_ne(3) / src0->get_ne(3); - const auto * src0_ptr = reinterpret_cast(src0->get_data()); - const auto * src1_ptr = reinterpret_cast(src1->get_data()); - auto * dst_ptr = reinterpret_cast(out->get_data()); - for (int64_t i3 = 0; i3 < out->get_ne(3); i3++) { - const auto * src0_cube = src0_ptr + i3 / r03 * src0->get_nb(3); - const auto * src1_cube = src1_ptr + i3 * src1->get_nb(3); - auto * dst_cube = dst_ptr + i3 * out->get_nb(3); - for (int64_t i2 = 0; i2 < out->get_ne(2); i2++) { - const auto * src0_plane = 
src0_cube + i2 / r02 * src0->get_nb(2); - const auto * src1_plane = src1_cube + i2 * src1->get_nb(2); - auto * dst_plane = dst_cube + i2 * out->get_nb(2); - for (int64_t i1 = 0; i1 < out->get_ne(1); i1++) { - // TODO: prefetch row? - auto * src1_row = src1_plane + i1 * src1->get_nb(1); - auto * dst_row = reinterpret_cast(dst_plane + i1 * out->get_nb(1)); - for (int64_t i0 = 0; i0 < out->get_ne(0); i0++) { - auto * src0_row = src0_plane + i0 * src0->get_nb(1); - // TODO: figure out how to handle a entire row - *dst_row++ = - vec_dot_product_f32_f32(reinterpret_cast(src0_row), - reinterpret_cast(src1_row), (size_t) src0->get_ne(0)); - } + const auto r02 = src1->get_ne(2) / src0->get_ne(2); + const auto r03 = src1->get_ne(3) / src0->get_ne(3); + const auto * src0_ptr = reinterpret_cast(src0->get_data()); + const auto * src1_ptr = reinterpret_cast(src1->get_data()); + auto * dst_ptr = reinterpret_cast(out->get_data()); + const auto total_planes = out->get_ne(3) * out->get_ne(2); + + const auto start_end_plane = (total_planes >= tcnt) ? get_thread_work_slice(total_planes, tidx, tcnt) : + std::pair{ 0, total_planes }; + const auto start_end_row = (total_planes >= tcnt) ? std::pair{ 0, out->get_ne(1) } : + get_thread_work_slice(out->get_ne(1), tidx, tcnt); + for (int64_t ip = start_end_plane.first; ip < start_end_plane.second; ip++) { + const auto i3 = ip / out->get_ne(2); + const auto i2 = ip - i3 * out->get_ne(2); + const auto * src0_plane = src0_ptr + i3 / r03 * src0->get_nb(3) + i2 / r02 * src0->get_nb(2); + const auto * src1_plane = src1_ptr + i3 * src1->get_nb(3) + i2 * src1->get_nb(2); + auto * dst_plane = dst_ptr + i3 * out->get_nb(3) + i2 * out->get_nb(2); + for (int64_t i1 = start_end_row.first; i1 < start_end_row.second; i1++) { + // TODO: prefetch row? 
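+            // each dst element in this row is the dot product of one src0 row
+            // with the current src1 row (f32 x f32)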
+ auto * src1_row = src1_plane + i1 * src1->get_nb(1); + auto * dst_row = reinterpret_cast(dst_plane + i1 * out->get_nb(1)); + for (int64_t i0 = 0; i0 < out->get_ne(0); i0++) { + auto * src0_row = src0_plane + i0 * src0->get_nb(1); + // TODO: figure out how to handle a entire row + *dst_row++ = + vec_dot_product_f32_f32(reinterpret_cast(src0_row), + reinterpret_cast(src1_row), (size_t) src0->get_ne(0)); } } } diff --git a/ggml/src/ggml-qnn/npu/device/op_mul_mat.hpp b/ggml/src/ggml-qnn/npu/device/op_mul_mat.hpp index cc57d3d1fe6d4..fc2eb2c97e3eb 100644 --- a/ggml/src/ggml-qnn/npu/device/op_mul_mat.hpp +++ b/ggml/src/ggml-qnn/npu/device/op_mul_mat.hpp @@ -20,7 +20,7 @@ inline bool is_addr_aligned(void * addr) { return unaligned_bytes(addr) == 0; } -bool mul_mat_f32(tensor * out); +bool mul_mat_f32(tensor * out, size_t tidx, size_t tcnt); bool is_mul_mat_supported(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1, const npu_device_tensor_spec & dst, npu_device_tensor_op op); diff --git a/ggml/src/ggml-qnn/npu/device/tensor.hpp b/ggml/src/ggml-qnn/npu/device/tensor.hpp index 83aa29a609cfc..ad1915ecb6418 100644 --- a/ggml/src/ggml-qnn/npu/device/tensor.hpp +++ b/ggml/src/ggml-qnn/npu/device/tensor.hpp @@ -81,10 +81,7 @@ class tensor { tensor * _src[kMaxTensorSrc] = {}; uint8_t * _data = nullptr; - tensor(const tensor &) = delete; - void operator=(const tensor &) = delete; - tensor(tensor &&) = delete; - void operator=(tensor &&) = delete; + DISABLE_COPY_AND_MOVE(tensor); }; } // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/device/thread_pool.hpp b/ggml/src/ggml-qnn/npu/device/thread_pool.hpp new file mode 100644 index 0000000000000..a936ae0c4cafc --- /dev/null +++ b/ggml/src/ggml-qnn/npu/device/thread_pool.hpp @@ -0,0 +1,190 @@ +#pragma once + +#include + +#include +#include +#include +#include + +#include "util.hpp" + +namespace hexagon { + +constexpr const size_t kMaxThreadCount = 4; +constexpr const size_t kDefaultStackSize = 1024 * 16; // 16KB +constexpr const unsigned long long kThreadTaskPendingBit = 1; + +template class qurt_thread { + public: + typedef void (*qurt_thread_func_type)(qurt_thread * thread, void * arg); + + explicit qurt_thread(const std::string & thread_name, qurt_thread_func_type thread_func, void * arg, + unsigned short priority) { + DEVICE_LOG_DEBUG("qurt_thread.create: %s", thread_name.c_str()); + qurt_thread_attr_init(&_attributes); + qurt_thread_attr_set_name(&_attributes, (char *) thread_name.c_str()); + qurt_thread_attr_set_stack_addr(&_attributes, _stack); + qurt_thread_attr_set_stack_size(&_attributes, _stack_size); + qurt_thread_attr_set_priority(&_attributes, priority); + + _func = thread_func; + _arg = arg; + auto ret = qurt_thread_create( + &_tid, &_attributes, reinterpret_cast(&qurt_thread::thread_func_impl), (void *) this); + if (ret != QURT_EOK) { + DEVICE_LOG_ERROR("Failed to create thread: %d", (int) ret); + _func = nullptr; + _arg = nullptr; + return; + } + + DEVICE_LOG_DEBUG("qurt_thread.created: %s, id: %d", thread_name.c_str(), (int) _tid); + } + + ~qurt_thread() { + DEVICE_LOG_DEBUG("qurt_thread.destroy: %d", (int) _tid); + int thread_exit_code = QURT_EOK; + auto ret = qurt_thread_join(_tid, &thread_exit_code); + if (ret != QURT_EOK && ret != QURT_ENOTHREAD) { + DEVICE_LOG_ERROR("Failed to join thread: %d", (int) ret); + return; + } + + if (thread_exit_code != QURT_EOK) { + DEVICE_LOG_ERROR("Thread exit code: %d", (int) thread_exit_code); + } + } + + bool is_valid() const { return _tid != 0 && _func != nullptr; } + + 
private: + static void thread_func_impl(qurt_thread * thread) { + if (thread->_func) { + thread->_func(thread, thread->_arg); + } + + qurt_thread_exit(QURT_EOK); + } + + uint8_t _stack[_stack_size] = {}; + qurt_thread_t _tid; + qurt_thread_attr_t _attributes; + qurt_thread_func_type _func = nullptr; + void * _arg = nullptr; + + DISABLE_COPY_AND_MOVE(qurt_thread); +}; + +using qurt_thread_ptr = std::unique_ptr>; + +template class thread_pool { + static_assert(_thread_count > 1, "Thread count must be greater than 1"); + constexpr const static size_t kMaxThreadCount = _thread_count - 1; + + public: + typedef qurt_thread thread_type; + typedef void (*task_type)(thread_pool * pool, size_t thread_idx, size_t thread_count, void * arg); + + thread_pool() { + std::string thread_name_base = "thread_pool_"; + qurt_barrier_init(&_pending, kMaxThreadCount + 1); + qurt_barrier_init(&_completed, kMaxThreadCount + 1); + for (size_t i = 0; i < kMaxThreadCount; ++i) { + auto & thread_arg = _thread_args[i]; + thread_arg.pool = this; + thread_arg.thread_idx = i + 1; + + auto thread = std::make_unique( + thread_name_base + std::to_string(i), + reinterpret_cast(&thread_pool::thread_func_impl), &thread_arg, + QURT_THREAD_ATTR_PRIORITY_DEFAULT); + if (!thread->is_valid()) { + DEVICE_LOG_ERROR("Failed to create thread: %zu", i); + // destroy all barriers and threads at destructor + return; + } + + _threads[i] = std::move(thread); + } + DEVICE_LOG_DEBUG("thread_pool.created: %zu", kMaxThreadCount); + } + + ~thread_pool() { + DEVICE_LOG_DEBUG("thread_pool.destroy"); + _thread_exit = true; + qurt_barrier_wait(&_pending); // release all task threads + + for (auto & thread : _threads) { + thread.reset(); + } + + qurt_barrier_destroy(&_completed); + qurt_barrier_destroy(&_pending); + } + + bool sync_execute(task_type task, void * arg) { + if (!task) { + DEVICE_LOG_ERROR("Invalid task"); + return false; + } + + _task = task; + _arg = arg; + qurt_barrier_wait(&_pending); + + task(this, 0, kMaxThreadCount + 1, arg); + DEVICE_LOG_DEBUG("main_thread.task_completed: 0"); + + qurt_barrier_wait(&_completed); + + _task = nullptr; + _arg = nullptr; + return true; + } + + private: + struct thread_pool_arg { + thread_pool * pool = nullptr; + size_t thread_idx = 0; + }; + + static void thread_func_impl(thread_type * thread, thread_pool_arg * arg) { + NPU_UNUSED(thread); + + DEVICE_LOG_DEBUG("thread_func_impl.start: %zu", arg->thread_idx); + + auto & pool = *arg->pool; + for (;;) { + qurt_barrier_wait(&pool._pending); + if (pool._thread_exit) { + DEVICE_LOG_DEBUG("thread_func_impl.exit: %zu", arg->thread_idx); + break; + } + + auto task = pool._task; + if (task) { + task(arg->pool, arg->thread_idx, kMaxThreadCount + 1, pool._arg); + } + + DEVICE_LOG_DEBUG("thread_func_impl.task_completed: %zu", arg->thread_idx); + qurt_barrier_wait(&pool._completed); + } + + DEVICE_LOG_DEBUG("thread_func_impl.end: %zu", arg->thread_idx); + } + + std::atomic_bool _thread_exit = false; + std::array _threads; + thread_pool_arg _thread_args[kMaxThreadCount] = {}; + qurt_barrier_t _pending = {}; + qurt_barrier_t _completed = {}; + task_type _task = nullptr; + void * _arg = nullptr; + + DISABLE_COPY_AND_MOVE(thread_pool); +}; + +using default_thread_pool = thread_pool; + +} // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/device/util.hpp b/ggml/src/ggml-qnn/npu/device/util.hpp index 12b7dde81e9c4..f6f5479694edd 100644 --- a/ggml/src/ggml-qnn/npu/device/util.hpp +++ b/ggml/src/ggml-qnn/npu/device/util.hpp @@ -2,6 +2,10 @@ #include +#include +#include 
+#include
+
 #include "hexagon_npu.h"

 #define DEVICE_LOG_ERROR(...) FARF(FATAL, __VA_ARGS__)
@@ -16,9 +20,24 @@
 #    define DEVICE_LOG_DEBUG(...) (void) 0
 #endif

+// TODO: reuse the declaration at host
+#define DISABLE_COPY(class_name)              \
+    class_name(const class_name &) = delete;  \
+    void operator=(const class_name &) = delete
+
+#define DISABLE_MOVE(class_name)         \
+    class_name(class_name &&) = delete;  \
+    void operator=(class_name &&) = delete
+
+#define DISABLE_COPY_AND_MOVE(class_name) \
+    DISABLE_COPY(class_name);             \
+    DISABLE_MOVE(class_name)
+
+#define NPU_UNUSED(x) (void) (x)
+
 namespace hexagon {

-constexpr const char * op_get_name(npu_device_tensor_op op) {
+inline constexpr const char * op_get_name(npu_device_tensor_op op) {
     switch (op) {
         case NPU_OP_MUL_MAT:
             return "MUL_MAT";
@@ -33,4 +52,11 @@ constexpr const char * op_get_name(npu_device_tensor_op op) {
     }
 }

+inline constexpr std::pair<int64_t, int64_t> get_thread_work_slice(int64_t total, size_t tidx, size_t tcnt) {
+    const auto elements_per_thread = (total + tcnt - 1) / tcnt;
+    const auto start = tidx * elements_per_thread;
+    const auto end = std::min(start + elements_per_thread, total);
+    return { start, end };
+}
+
 }  // namespace hexagon
diff --git a/ggml/src/ggml-qnn/qnn/CMakeLists.txt b/ggml/src/ggml-qnn/qnn/CMakeLists.txt
new file mode 100644
index 0000000000000..2a9455b8642cc
--- /dev/null
+++ b/ggml/src/ggml-qnn/qnn/CMakeLists.txt
@@ -0,0 +1,42 @@
+
+
+file(GLOB qnn_srcs "${CMAKE_CURRENT_LIST_DIR}/*.cpp")
+
+add_library(qnn-backend STATIC
+    ${qnn_srcs}
+)
+
+target_include_directories(qnn-backend PRIVATE
+    ${GGML_QNN_SDK_PATH}/include/QNN/
+    ${CMAKE_CURRENT_LIST_DIR}/
+    ${CMAKE_CURRENT_LIST_DIR}/../
+    ${CMAKE_CURRENT_LIST_DIR}/../../
+    ${CMAKE_CURRENT_LIST_DIR}/../../../include/ # TODO: figure out how to remove this
+    ${CMAKE_CURRENT_LIST_DIR}/../shared/
+)
+
+target_link_directories(qnn-backend PRIVATE
+    runtime-common
+)
+
+if(GGML_QNN_ENABLE_CPU_BACKEND)
+    message("GGML_QNN_ENABLE_CPU_BACKEND is enabled")
+    target_compile_definitions(qnn-backend PUBLIC GGML_QNN_ENABLE_CPU_BACKEND)
+else()
+    message("GGML_QNN_ENABLE_CPU_BACKEND is disabled")
+endif()
+
+if(CMAKE_SYSTEM_NAME STREQUAL "Android")
+    set(QNN_DEFAULT_LIB_SEARCH_PATH "/data/local/tmp/" CACHE STRING "customized library search path for QNN backend")
+elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows" OR CMAKE_SYSTEM_NAME STREQUAL "Linux")
+    set(QNN_DEFAULT_LIB_SEARCH_PATH "" CACHE STRING "customized library search path for QNN backend")
+else()
+    message(FATAL_ERROR "QNN now only available on Android, Windows and Linux")
+endif()
+
+if(NOT "${QNN_DEFAULT_LIB_SEARCH_PATH}" STREQUAL "")
+    string(REGEX REPLACE "/$" "" QNN_DEFAULT_LIB_SEARCH_PATH "${QNN_DEFAULT_LIB_SEARCH_PATH}")
+endif()
+
+message("GGML_QNN_DEFAULT_LIB_SEARCH_PATH: ${QNN_DEFAULT_LIB_SEARCH_PATH}")
+target_compile_definitions(qnn-backend PUBLIC GGML_QNN_DEFAULT_LIB_SEARCH_PATH="${QNN_DEFAULT_LIB_SEARCH_PATH}")
diff --git a/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage.xml b/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage.xml
deleted file mode 100644
index f4c6575902948..0000000000000
--- a/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage.xml
+++ /dev/null
@@ -1,88 +0,0 @@
- - - - - GgmlMulMat - - - GGML MulMat operator - - - - - in[0] - - src0 - - true - BACKEND_SPECIFIC - - 4D - NHWC - [N, C, H , W] - - - - - in[1] - - src1 - - true - BACKEND_SPECIFIC - - 4D - NHWC - [N, C, H , W] - - - - - out[0] - - dst - - true - BACKEND_SPECIFIC - - 4D - [N, C, H , W] - - - - - HTP - - - - - - - GgmlMulMat - - - - - GgmlMulMat - - - in[0] - 
QNN_DATATYPE_FLOAT_16 - QNN_DATATYPE_FLOAT_32 - - - in[1] - QNN_DATATYPE_FLOAT_16 - QNN_DATATYPE_FLOAT_32 - - - - out[0] - QNN_DATATYPE_FLOAT_16 - QNN_DATATYPE_FLOAT_32 - - - - - - diff --git a/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/Makefile b/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/Makefile deleted file mode 100644 index f177822d35a06..0000000000000 --- a/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/Makefile +++ /dev/null @@ -1,357 +0,0 @@ -# check all setup prerequisites if the command goal is not clean -ifneq ($(MAKECMDGOALS),clean) -ifndef QNN_INCLUDE -$(info "INFO: Qnn include not explicitly defined, attempting to use QNN_SDK_ROOT if it is valid") -QNN_INCLUDE := $(QNN_SDK_ROOT)/include/QNN -endif -ifeq ($(wildcard $(QNN_INCLUDE)),) -$(error "ERROR: QNN_INCLUDE path is not set. QNN include paths must be set to obtain BE headers necessary to compile the package") -endif -ifndef QNN_TARGET_LIB -$(info "INFO: Qnn target not explicitly defined, attempting to use QNN_SDK_ROOT if it is valid") -QNN_TARGET_LIB := $(QNN_SDK_ROOT)/lib/aarch64-android -endif -ifeq ($(wildcard $(QNN_TARGET_LIB)),) -ifeq ($(MAKECMDGOALS),htp_aarch64) -$(error "ERROR: QNN_TARGET_LIB is needed to compile package for aarch64") -else ifeq ($(MAKECMDGOALS),all) -$(info "WARNING:QNN_TARGET_LIB may need to be defined to compile packages") -endif -endif - -ifndef HEXAGON_SDK_ROOT -$(error "ERROR: HEXAGON_SDK_ROOT is not set. Hexagon-SDK path must be set to the latest hexagon-sdk-x.y.z") -endif - -ifeq ($(wildcard $(HEXAGON_SDK_ROOT)),) -$(error "ERROR: HEXAGON_SDK_ROOT is not set correctly. Please set HEXAGON_SDK_ROOT to latest hexagon-sdk-X.Y.Z path") -endif - -HEXAGON_SDK_BASE := $(dir $(HEXAGON_SDK_ROOT)) - -$(info "HEXAGON_SDK_ROOT is [${HEXAGON_SDK_ROOT}]") -# Users should note that the tools version may change between hexagon sdk versions -# Following combination of SDK and Tool version is supported -# fix the sdk root for new versions -HEXAGON_SDK_ROOT_V68 := $(HEXAGON_SDK_ROOT) -HEXAGON_SDK_ROOT_V69 := $(HEXAGON_SDK_ROOT) -HEXAGON_SDK_ROOT_V73 := $(HEXAGON_SDK_ROOT) -HEXAGON_SDK_ROOT_V75 := $(HEXAGON_SDK_ROOT) -HEXAGON_SDK_ROOT_V79 := $(HEXAGON_SDK_ROOT) - -#Updated to point to latest sdk to match with libQnnHtp.so -HEXAGON_SDK_ROOT_X86 := $(HEXAGON_SDK_ROOT) -HEXAGON_TOOLS_VERSION_V68 := 8.7.06 -HEXAGON_TOOLS_VERSION_V69 := 8.7.06 -HEXAGON_TOOLS_VERSION_V73 := 8.7.06 -HEXAGON_TOOLS_VERSION_V75 := 8.7.06 -HEXAGON_TOOLS_VERSION_V79 := 8.7.06 - -#Updated to point to latest sdk to match with libQnnHtp.so -HEXAGON_TOOLS_VERSION_X86 := 8.7.06 - -ifndef ANDROID_NDK_ROOT -ifeq ($(MAKECMDGOALS),htp_aarch64) -$(error "ERROR: ANDROID_NDK_ROOT is not set. Android NDK path must be set to compile package for aarch64") -else ifeq ($(MAKECMDGOALS),all) -$(info "WARNING: ANDROID_NDK_ROOT is not set. Android NDK path must be set to compile package for aarch64") -endif -endif - -ifndef PACKAGE_NAME -export -PACKAGE_NAME := $(notdir $(shell pwd)) -$(info "INFO: No package name defined. 
Using current directory name: $(PACKAGE_NAME) as the package name") -endif - -WORK := build -SRC_DIR := src -OP_SRC_DIR := src/ops -OP_INCLUDE_DIR := ./include -OP_INCLUDES = #$(wildcard $(OP_INCLUDE_DIR)/*.h) user defined if any op specific headers are needed, add -I to common flags -LIBRARY_NAME := libQnn$(PACKAGE_NAME).so -SUPPORTED_TARGETS = x86_64-linux-clang hexagon-v68 hexagon-v69 hexagon-v73 hexagon-v75 hexagon-v79 aarch64-android - - -COMMON_CXX_FLAGS = -std=c++17 -I$(QNN_INCLUDE) -fPIC -Wall -Wreorder -Wno-missing-braces -Wno-unused-function -COMMON_CXX_FLAGS += -Werror -Wno-format -Wno-unused-command-line-argument -fvisibility=default -stdlib=libc++ -COMMON_CXX_FLAGS += -DQNN_API="__attribute__((visibility(\"default\")))" -D__QAIC_HEADER_EXPORT="__attribute__((visibility(\"default\")))" - -X86_LIBNATIVE_RELEASE_DIR := $(HEXAGON_SDK_ROOT_X86)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_X86)/Tools - -# Ensure hexagon sdk tool version can be retrieved -ifeq ($(wildcard $(X86_LIBNATIVE_RELEASE_DIR)/.),) -$(error "Cannot retrieve hexagon tools from: $(X86_LIBNATIVE_RELEASE_DIR). \ - \ - Please check that hexagon tools version is correct. Expected: $(HEXAGON_TOOLS_VERSION_X86)") -endif - -#Check tools for hexagon_v68 are present. -ifeq ($(MAKECMDGOALS),htp_v68) -ifeq ($(wildcard $(HEXAGON_SDK_ROOT_V68)),) -$(error "ERROR: HEXAGON_SDK_ROOT_V68 is set incorrectly. Cannot retrieve $(HEXAGON_SDK_ROOT_V68)") -endif -endif - -ifeq ($(MAKECMDGOALS),htp_v69) -ifeq ($(wildcard $(HEXAGON_SDK_ROOT_V69)),) -$(error "ERROR: HEXAGON_SDK_ROOT_V69 is set incorrectly. Cannot retrieve $(HEXAGON_SDK_ROOT_V69)") -endif -endif - -ifeq ($(MAKECMDGOALS),htp_v73) -ifeq ($(wildcard $(HEXAGON_SDK_ROOT_V73)),) -$(error "ERROR: HEXAGON_SDK_ROOT_V73 is set incorrectly. Cannot retrieve $(HEXAGON_SDK_ROOT_V73)") -endif -endif - -ifeq ($(MAKECMDGOALS),htp_v75) -ifeq ($(wildcard $(HEXAGON_SDK_ROOT_V75)),) -$(error "ERROR: HEXAGON_SDK_ROOT_V75 is set incorrectly. Cannot retrieve $(HEXAGON_SDK_ROOT_V75)") -endif -endif - -#Check tools for hexagon_v79 are present. -ifeq ($(MAKECMDGOALS),htp_v79) -ifeq ($(wildcard $(HEXAGON_SDK_ROOT_V79)),) -$(error "ERROR: HEXAGON_SDK_ROOT_V79 is set incorrectly. 
Cannot retrieve $(HEXAGON_SDK_ROOT_V79)") -endif -endif - - - -endif -OP_SOURCES = $(wildcard $(OP_SRC_DIR)/*.cpp) -OTHER_SOURCES = $(wildcard $(SRC_DIR)/*.cpp) -HFILES = $(wildcard $(QNN_INCLUDE)/*.h) -HFILES += $(wildcard $(QNN_INCLUDE)/HTP/*.h) -HFILES += $(wildcard $(QNN_INCLUDE)/HTP/core/*.h) -OP_OBJS = $(patsubst $(SRC_DIR)/%,%,$(patsubst %.cpp,%.o,$(OP_SOURCES))) -OTHER_OBJS = $(patsubst $(SRC_DIR)/%,%,$(patsubst %.cpp,%.o,$(OTHER_SOURCES))) - -#======= Assembly ======== -OP_SOURCES_ASM_X86 += $(wildcard $(OP_SRC_DIR)/x86_asm/*.S) -OP_OBJS_ASM_X86 += $(subst /x86_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_X86)))) -OP_SOURCES_ASM_V68 += $(wildcard $(OP_SRC_DIR)/v68_asm/*.S) -OP_OBJS_ASM_V68 += $(subst /v68_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V68)))) -OP_SOURCES_ASM_V69 += $(wildcard $(OP_SRC_DIR)/v69_asm/*.S) -OP_OBJS_ASM_V69 += $(subst /v69_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V69)))) -OP_SOURCES_ASM_V73 += $(wildcard $(OP_SRC_DIR)/v73_asm/*.S) -OP_OBJS_ASM_V73 += $(subst /v73_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V73)))) -OP_SOURCES_ASM_V75 += $(wildcard $(OP_SRC_DIR)/v75_asm/*.S) -OP_OBJS_ASM_V75 += $(subst /v75_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V75)))) -OP_SOURCES_ASM_V79 += $(wildcard $(OP_SRC_DIR)/v79_asm/*.S) -OP_OBJS_ASM_V79 += $(subst /v79_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V79)))) - -OP_SOURCES_ASM_ANDROID += $(wildcard $(OP_SRC_DIR)/android_asm/*.S) -OP_OBJS_ASM_ANDROID += $(subst /android_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_ANDROID)))) - - -all: htp_v68 htp_x86 htp_aarch64 - -#============================================================================================================ -# Setup compiler, compiler instructions and linker for x86 -X86_CXX ?= clang++-9 -# Checking if clang++-9 is present. 
If not switch to clang++ -ifeq ($(shell $(X86_CXX) -v 2>&1 | grep -c "clang version"), 0) - X86_CXX := clang++ -endif -X86_LDFLAGS:= -Wl,--whole-archive -L$(X86_LIBNATIVE_RELEASE_DIR)/libnative/lib -lnative -Wl,--no-whole-archive -lpthread -L$(QNN_SDK_ROOT)/lib/x86_64-linux-clang -lHtpPrepare -X86_C_FLAGS := -D__HVXDBL__ -I$(X86_LIBNATIVE_RELEASE_DIR)/libnative/include -ffast-math -DUSE_OS_LINUX -X86_CXX_FLAGS = $(COMMON_CXX_FLAGS) $(X86_C_FLAGS) -fomit-frame-pointer -Wno-invalid-offsetof -linux_objs = -#============================================================================================================ -# Setup compiler, compiler instructions and linker for hexagon -HEXAGON_CXX_FLAGS := $(COMMON_CXX_FLAGS) -mhvx -mhvx-length=128B -mhmx -DUSE_OS_QURT -O2 -Wno-reorder -DPREPARE_DISABLED - -HEXAGON_CXX_FLAGS_V68 := $(HEXAGON_CXX_FLAGS) -mv68 -I$(HEXAGON_SDK_ROOT_V68)/rtos/qurt/computev68/include/qurt -I$(HEXAGON_SDK_ROOT_V68)/rtos/qurt/computev68/include/posix -I$(HEXAGON_SDK_ROOT_V68)/incs -I$(HEXAGON_SDK_ROOT_V68)/incs/stddef -HEXAGON_CXX_FLAGS_V69 := $(HEXAGON_CXX_FLAGS) -mv69 -I$(HEXAGON_SDK_ROOT_V69)/rtos/qurt/computev69/include/qurt -I$(HEXAGON_SDK_ROOT_V69)/rtos/qurt/computev69/include/posix -I$(HEXAGON_SDK_ROOT_V69)/incs -I$(HEXAGON_SDK_ROOT_V69)/incs/stddef -HEXAGON_CXX_FLAGS_V73 := $(HEXAGON_CXX_FLAGS) -mv73 -I$(HEXAGON_SDK_ROOT_V73)/rtos/qurt/computev73/include/qurt -I$(HEXAGON_SDK_ROOT_V73)/rtos/qurt/computev73/include/posix -I$(HEXAGON_SDK_ROOT_V73)/incs -I$(HEXAGON_SDK_ROOT_V73)/incs/stddef -HEXAGON_CXX_FLAGS_V75 := $(HEXAGON_CXX_FLAGS) -mv75 -I$(HEXAGON_SDK_ROOT_V75)/rtos/qurt/computev75/include/qurt -I$(HEXAGON_SDK_ROOT_V75)/rtos/qurt/computev75/include/posix -I$(HEXAGON_SDK_ROOT_V75)/incs -I$(HEXAGON_SDK_ROOT_V75)/incs/stddef -HEXAGON_CXX_FLAGS_V79 := $(HEXAGON_CXX_FLAGS) -mv79 -I$(HEXAGON_SDK_ROOT_V79)/rtos/qurt/computev79/include/qurt -I$(HEXAGON_SDK_ROOT_V79)/rtos/qurt/computev79/include/posix -I$(HEXAGON_SDK_ROOT_V79)/incs -I$(HEXAGON_SDK_ROOT_V79)/incs/stddef - - -HEXAGON_CXX_V68 := $(HEXAGON_SDK_ROOT_V68)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V68)/Tools/bin/hexagon-clang++ -HEXAGON_CXX_V69 := $(HEXAGON_SDK_ROOT_V69)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V69)/Tools/bin/hexagon-clang++ -HEXAGON_CXX_V73 := $(HEXAGON_SDK_ROOT_V73)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V73)/Tools/bin/hexagon-clang++ -HEXAGON_CXX_V75 := $(HEXAGON_SDK_ROOT_V75)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V75)/Tools/bin/hexagon-clang++ -HEXAGON_CXX_V79 := $(HEXAGON_SDK_ROOT_V79)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V79)/Tools/bin/hexagon-clang++ - - -HEX_LDFLAGS = -hexagon_objs = -#============================================================================================================ -# Setup compiler, compiler instructions and linker for aarch64 -AARCH64_C__FLAGS = -D__HVXDBL__ -I$(X86_LIBNATIVE_RELEASE_DIR)/libnative/include -ffast-math -DUSE_OS_LINUX -DANDROID -AARCH64_CXX_FLAGS = $(COMMON_CXX_FLAGS) $(AARCH64_C__FLAGS) -fomit-frame-pointer -Wno-invalid-offsetof -Wno-unused-variable -Wno-unused-parameter -Wno-missing-braces -Wno-sign-compare -Wno-unused-private-field -Wno-unused-variable -Wno-ignored-qualifiers -Wno-missing-field-initializers -ARM_CLANG_OPTS =--target=aarch64-none-linux-android21 --sysroot=$(ANDROID_NDK_ROOT)/toolchains/llvm/prebuilt/linux-x86_64/sysroot -stdlib=libc++ -static-libstdc++ -AARCH64_CXX = $(ANDROID_NDK_ROOT)/toolchains/llvm/prebuilt/linux-x86_64/bin/clang++ $(ARM_CLANG_OPTS) -AARCH64_LDFLAGS = -L$(QNN_TARGET_LIB) -lQnnHtp 
-lQnnHtpPrepare -aarch64_objs = -#============================================================================================================ -# Setup targets and goals - -htp_x86: X86_BUILD - -htp_v68: HEXAGON_BUILD_V68 - -htp_v69: HEXAGON_BUILD_V69 - -htp_v73: HEXAGON_BUILD_V73 - -htp_v75: HEXAGON_BUILD_V75 - -htp_v79: HEXAGON_BUILD_V79 - - - -htp_aarch64: AARCH64_BUILD - -AARCH64_BUILD: $(WORK)/aarch64-android/$(LIBRARY_NAME) - -HEXAGON_BUILD_V68: $(WORK)/hexagon-v68/$(LIBRARY_NAME) - -HEXAGON_BUILD_V69: $(WORK)/hexagon-v69/$(LIBRARY_NAME) - -HEXAGON_BUILD_V73: $(WORK)/hexagon-v73/$(LIBRARY_NAME) - -HEXAGON_BUILD_V75: $(WORK)/hexagon-v75/$(LIBRARY_NAME) - -HEXAGON_BUILD_V79: $(WORK)/hexagon-v79/$(LIBRARY_NAME) - - - -X86_BUILD: $(WORK)/x86_64-linux-clang/$(LIBRARY_NAME) - - -define build_objs = -ifneq ($(filter $(2),$(SUPPORTED_TARGETS)),) -$(2)_objs += $(foreach x,$(1),$(WORK)/$(2)/$(x)) -else -$$(error "Unknown target option provided: $(2): Supported targets are: $(SUPPORTED_TARGETS)") -endif -endef - -$(eval $(call build_objs,$(OTHER_OBJS),x86_64-linux-clang)) -$(eval $(call build_objs,$(OP_OBJS),x86_64-linux-clang)) -$(eval $(call build_objs,$(OP_OBJS_ASM_X86),x86_64-linux-clang)) -$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v68)) -$(eval $(call build_objs,$(OP_OBJS),hexagon-v68)) -$(eval $(call build_objs,$(OP_OBJS_ASM_V68),hexagon-v68)) -$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v69)) -$(eval $(call build_objs,$(OP_OBJS),hexagon-v69)) -$(eval $(call build_objs,$(OP_OBJS_ASM_V69),hexagon-v69)) -$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v73)) -$(eval $(call build_objs,$(OP_OBJS),hexagon-v73)) -$(eval $(call build_objs,$(OP_OBJS_ASM_V73),hexagon-v73)) -$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v75)) -$(eval $(call build_objs,$(OP_OBJS),hexagon-v75)) -$(eval $(call build_objs,$(OP_OBJS_ASM_V75),hexagon-v75)) -$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v79)) -$(eval $(call build_objs,$(OP_OBJS),hexagon-v79)) -$(eval $(call build_objs,$(OP_OBJS_ASM_V75),hexagon-v79)) - -$(eval $(call build_objs,$(OTHER_OBJS),aarch64-android)) -$(eval $(call build_objs,$(OP_OBJS),aarch64-android)) -$(eval $(call build_objs,$(OP_OBJS_ASM_ANDROID),aarch64-android)) - -# x86 -$(WORK)/x86_64-linux-clang $(WORK)/hexagon-v68 $(WORK)/hexagon-v69 $(WORK)/hexagon-v73 $(WORK)/hexagon-v75 $(WORK)/hexagon-v79 $(WORK)/aarch64-android: - @mkdir -p $@/ops - -$(WORK)/x86_64-linux-clang/%.o: $(SRC_DIR)/%.cpp | $(WORK)/x86_64-linux-clang - $(X86_CXX) $(X86_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ - -$(WORK)/x86_64-linux-clang/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/x86_64-linux-clang - $(X86_CXX) $(X86_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ - -$(WORK)/x86_64-linux-clang/ops/%.o: $(OP_SRC_DIR)/x86_asm/%.S | $(WORK)/x86_64-linux-clang - $(X86_CXX) $(X86_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ - -$(WORK)/x86_64-linux-clang/$(LIBRARY_NAME): $(x86_64-linux-clang_objs) | $(HFILES) - $(X86_CXX) -fPIC -std=c++17 -g -shared -o $@ $^ $(X86_LDFLAGS) - -# v68 -$(WORK)/hexagon-v68/%.o: $(SRC_DIR)/%.cpp | $(WORK)/hexagon-v68 - $(HEXAGON_CXX_V68) $(HEXAGON_CXX_FLAGS_V68) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ - -$(WORK)/hexagon-v68/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/hexagon-v68 - $(HEXAGON_CXX_V68) $(HEXAGON_CXX_FLAGS_V68) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ - -$(WORK)/hexagon-v68/ops/%.o: $(OP_SRC_DIR)/v68_asm/%.S | $(WORK)/hexagon-v68 - $(HEXAGON_CXX_V68) $(HEXAGON_CXX_FLAGS_V68) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ 
- -$(WORK)/hexagon-v68/$(LIBRARY_NAME): $(hexagon-v68_objs) | $(HFILES) - $(HEXAGON_CXX_V68) -fPIC -std=c++17 -g -shared -o $@ $^ $(HEX_LDFLAGS) - -# v69 -$(WORK)/hexagon-v69/%.o: $(SRC_DIR)/%.cpp | $(WORK)/hexagon-v69 - $(HEXAGON_CXX_V69) $(HEXAGON_CXX_FLAGS_V69) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ - -$(WORK)/hexagon-v69/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/hexagon-v69 - $(HEXAGON_CXX_V69) $(HEXAGON_CXX_FLAGS_V69) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ - -$(WORK)/hexagon-v69/ops/%.o: $(OP_SRC_DIR)/v69_asm/%.S | $(WORK)/hexagon-v69 - $(HEXAGON_CXX_V69) $(HEXAGON_CXX_FLAGS_V69) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ - -$(WORK)/hexagon-v69/$(LIBRARY_NAME): $(hexagon-v69_objs) | $(HFILES) - $(HEXAGON_CXX_V69) -fPIC -std=c++17 -g -shared -o $@ $^ $(HEX_LDFLAGS) - -# v73 -$(WORK)/hexagon-v73/%.o: $(SRC_DIR)/%.cpp | $(WORK)/hexagon-v73 - $(HEXAGON_CXX_V73) $(HEXAGON_CXX_FLAGS_V73) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ - -$(WORK)/hexagon-v73/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/hexagon-v73 - $(HEXAGON_CXX_V73) $(HEXAGON_CXX_FLAGS_V73) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ - -$(WORK)/hexagon-v73/ops/%.o: $(OP_SRC_DIR)/v73_asm/%.S | $(WORK)/hexagon-v73 - $(HEXAGON_CXX_V73) $(HEXAGON_CXX_FLAGS_V73) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ - -$(WORK)/hexagon-v73/$(LIBRARY_NAME): $(hexagon-v73_objs) | $(HFILES) - $(HEXAGON_CXX_V73) -fPIC -std=c++17 -g -shared -o $@ $^ $(HEX_LDFLAGS) - -#v75 -$(WORK)/hexagon-v75/%.o: $(SRC_DIR)/%.cpp | $(WORK)/hexagon-v75 - $(HEXAGON_CXX_V75) $(HEXAGON_CXX_FLAGS_V75) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ - -$(WORK)/hexagon-v75/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/hexagon-v75 - $(HEXAGON_CXX_V75) $(HEXAGON_CXX_FLAGS_V75) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ - -$(WORK)/hexagon-v75/ops/%.o: $(OP_SRC_DIR)/v75_asm/%.S | $(WORK)/hexagon-v75 - $(HEXAGON_CXX_V75) $(HEXAGON_CXX_FLAGS_V75) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ - -$(WORK)/hexagon-v75/$(LIBRARY_NAME): $(hexagon-v75_objs) | $(HFILES) - $(HEXAGON_CXX_V75) -fPIC -std=c++17 -g -shared -o $@ $^ $(HEX_LDFLAGS) - -#v79 -$(WORK)/hexagon-v79/%.o: $(SRC_DIR)/%.cpp | $(WORK)/hexagon-v79 - $(HEXAGON_CXX_V79) $(HEXAGON_CXX_FLAGS_V79) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ - -$(WORK)/hexagon-v79/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/hexagon-v79 - $(HEXAGON_CXX_V79) $(HEXAGON_CXX_FLAGS_V79) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ - -$(WORK)/hexagon-v79/ops/%.o: $(OP_SRC_DIR)/v79_asm/%.S | $(WORK)/hexagon-v79 - $(HEXAGON_CXX_V79) $(HEXAGON_CXX_FLAGS_V79) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ - -$(WORK)/hexagon-v79/$(LIBRARY_NAME): $(hexagon-v79_objs) | $(HFILES) - $(HEXAGON_CXX_V79) -fPIC -std=c++17 -g -shared -o $@ $^ $(HEX_LDFLAGS) - - - -# aarch64 -$(WORK)/aarch64-android/%.o: $(SRC_DIR)/%.cpp | $(WORK)/aarch64-android - $(AARCH64_CXX) $(AARCH64_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ - -$(WORK)/aarch64-android/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/aarch64-android - $(AARCH64_CXX) $(AARCH64_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ - -$(WORK)/aarch64-android/ops/%.o: $(OP_SRC_DIR)/android_asm/%.S | $(WORK)/aarch64-android - $(AARCH64_CXX) $(AARCH64_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ - -$(WORK)/aarch64-android/$(LIBRARY_NAME): $(aarch64-android_objs) | $(HFILES) - $(AARCH64_CXX) -fPIC -std=c++17 -g -shared -o $@ $^ $(AARCH64_LDFLAGS) - -clean: - -rm -rf $(WORK) - -.PHONY: all clean diff --git 
a/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/config/GgmlOpPackage.xml b/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/config/GgmlOpPackage.xml deleted file mode 100644 index f4c6575902948..0000000000000 --- a/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/config/GgmlOpPackage.xml +++ /dev/null @@ -1,88 +0,0 @@ - - - - - GgmlMulMat - - - GGML MulMat operator - - - - - in[0] - - src0 - - true - BACKEND_SPECIFIC - - 4D - NHWC - [N, C, H , W] - - - - - in[1] - - src1 - - true - BACKEND_SPECIFIC - - 4D - NHWC - [N, C, H , W] - - - - - out[0] - - dst - - true - BACKEND_SPECIFIC - - 4D - [N, C, H , W] - - - - - HTP - - - - - - - GgmlMulMat - - - - - GgmlMulMat - - - in[0] - QNN_DATATYPE_FLOAT_16 - QNN_DATATYPE_FLOAT_32 - - - in[1] - QNN_DATATYPE_FLOAT_16 - QNN_DATATYPE_FLOAT_32 - - - - out[0] - QNN_DATATYPE_FLOAT_16 - QNN_DATATYPE_FLOAT_32 - - - - - - diff --git a/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/src/GgmlOpPackageInterface.cpp b/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/src/GgmlOpPackageInterface.cpp deleted file mode 100644 index df9ab364209b5..0000000000000 --- a/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/src/GgmlOpPackageInterface.cpp +++ /dev/null @@ -1,274 +0,0 @@ -//============================================================================== -// Auto Generated Code for GgmlOpPackage -//============================================================================== - -#include "HTP/QnnHtpCommon.h" -#include "HTP/core/constraints.h" -#include "HTP/core/op_package_feature_support.h" -#include "HTP/core/op_register_ext.h" -#include "HTP/core/optimize.h" -#include "HTP/core/simple_reg.h" -#include "HTP/core/unique_types.h" -#include "QnnOpPackage.h" -#include "QnnSdkBuildId.h" - -DEFINE_UNIQ_TY() -BEGIN_PKG_OPS_OPTS_LIST() - -/** Note that the order of declarations given here defines the order in which ops and graph optimizations are - * registered to the HTP Core. - * Append the latest OpName at the bottom - */ -DECLARE_PKG_OPS_OPTS_LIST(PKG_GgmlMulMat) - -END_PKG_OPS_OPTS_LIST() - -// op package info -static constexpr auto sg_packageName = THIS_PKG_NAME_STR; // package name passed in as compile flag - -static std::array sg_opNames{{"GgmlMulMat"}}; - -static Qnn_ApiVersion_t sg_sdkApiVersion = QNN_HTP_API_VERSION_INIT; -static QnnOpPackage_Info_t sg_packageInfo = QNN_OP_PACKAGE_INFO_INIT; - -// global data -static QnnOpPackage_GlobalInfrastructure_t sg_globalInfra = -nullptr; // global infrastructure not in use for now -static bool sg_packageInitialized = false; - -/* - * user provided logging call back function - * currently only supported on linux x86-64 and nonrpc versions - * typedef void (*QnnLog_Callback_t)(const char* fmt, - * QnnLog_Level_t level, - * uint64_t timestamp, - * va_list args); - * usage: if(sg_logInitialized && level <= sg_maxLogLevel) - * sg_logCallback(fmt, level, timestamp, args); - * - * for cross rpc versions, skel side user provided logging call back function - * can be defined as part of op packages. 
maximal log level sg_maxLogLevel - * can be set by Qnn_ErrorHandle_t GgmlOpPackageLogSetLevel(QnnLog_Level_t maxLogLevel) - */ -/* - * for alternative logging method provided by HTP core, please refer to log.h - */ -static QnnLog_Callback_t sg_logCallback = - nullptr; // user provided call back function pointer for logging -static QnnLog_Level_t sg_maxLogLevel = - (QnnLog_Level_t)0; // maximal log level used in user provided logging -static bool sg_logInitialized = - false; // tracks whether user provided logging method has been initialized - - -/* -* op initialization -* needs to be global in the package -* one initialization per package before any op definitions -* syntax: INIT_PACKAGE_OP_DEF() -*/ -INIT_PACKAGE_OP_DEF() - -/* -* optimization initialization -* needs to be global in the package -* one initialization per package before any optimization definitions -* syntax: INIT_PACKAGE_OPTIMIZATION_DEF() -*/ -INIT_PACKAGE_OPTIMIZATION_DEF() - -/* - * op parameter order initialization - * needs to be global in the package - * one initialization per package before any op parameter order definitions - * syntax: INIT_PACKAGE_PARAM_ORDER_DEF() - */ -INIT_PACKAGE_PARAM_ORDER_DEF() - -/* - * axis parameter name list - * optional - * needs to be global in the package - * one list per package - * for listing axis parameter names passed into Qnn_AddNode API - * HTP backend auto-adjusts values in axis parameters based on HTP backfilling - * note: HTP backend backfills tensor dimensions to 4 dimensions - * syntax: LIST_PACKAGE_AXIS_PARAMS(...) - * e.g. LIST_PACKAGE_AXIS_PARAMS("Axis", "AXIS", "axis") - */ -// LIST_PACKAGE_AXIS_PARAMS() - -/* - * per-channel quantized op name list - * optional - * needs to be global in the package - * one list per package - * for listing op names which support per-channel quantization - * per-axis quantization info of an op is embeded in axisScaleOffsetEncoding - * inside Qnn_Tensor_t types - * HTP backend only supports per-channel scale ops - * i.e. along last dimension, offset is always zero - * if an op name is marked as having per-channel scale support, and in - * QNN_AddNode, at least one input, parameter, or output has - * QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET type: - * then: - * HTP backend will pass to op implementation function the following: - * output(s), input(s), parameter(s), - * outputPerChannelScale(s), inputPerChannelScale(s), paramPerChannelScale(s) - * - * optimization rules can be used to remove extra perChannelScale tensors - * - * syntax: LIST_PACKAGE_PER_CHANNEL_QUANTIZED_OPS(...) - * e.g. LIST_PACKAGE_PER_CHANNEL_QUANTIZED_OPS(sg_op1Name, sg_op2Name) - */ - -// LIST_PACKAGE_PER_CHANNEL_QUANTIZED_OPS() - -/* -* Declare and define the special intialize function for HTP Backend to load -*/ -INIT_PKG_CORE_INIT_FUNC() - -/* op package API's */ - -Qnn_ErrorHandle_t GgmlOpPackageInit(QnnOpPackage_GlobalInfrastructure_t infrastructure) { - if (sg_packageInitialized) return QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED; - - /* - * op parameter order registration - * registers all defined op parameter orders in the package - * syntax: REGISTER_PACKAGE_PARAM_ORDERS() - */ - REGISTER_PACKAGE_PARAM_ORDERS() - - /* - * op axis parameter name registration - * registers all axis parameter names in the package - * used with LIST_PACKAGE_AXIS_PARAMS(...) 
- * syntax: REGISTER_PACKAGE_AXIS_PARAMS() - */ - REGISTER_PACKAGE_AXIS_PARAMS() - - /* - * per-channel scale op name registration - * registers all per-channel scale op names in the package - * used with LIST_PACKAGE_PER_CHANNEL_QUANTIZED_OPS(...) - * syntax: REGISTER_PACKAGE_PER_CHANNEL_QUANTIZED_OPS() - */ - REGISTER_PACKAGE_PER_CHANNEL_QUANTIZED_OPS() - - sg_globalInfra = infrastructure; - sg_packageInitialized = true; - return QNN_SUCCESS; -} - -Qnn_ErrorHandle_t GgmlOpPackageGetInfo(const QnnOpPackage_Info_t** info) { - if (!sg_packageInitialized) return QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED; - if (!info) return QNN_OP_PACKAGE_ERROR_INVALID_INFO; - - sg_packageInfo = QNN_OP_PACKAGE_INFO_INIT; - sg_packageInfo.packageName = sg_packageName; - sg_packageInfo.operationNames = sg_opNames.data(); - sg_packageInfo.numOperations = sg_opNames.size(); - sg_packageInfo.sdkBuildId = QNN_SDK_BUILD_ID; - sg_packageInfo.sdkApiVersion = &sg_sdkApiVersion; - - *info = &sg_packageInfo; - return QNN_SUCCESS; -} - -Qnn_ErrorHandle_t GgmlOpPackageLogInitialize(QnnLog_Callback_t callback, QnnLog_Level_t maxLogLevel) { - if (sg_logInitialized) return QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED; - if (!callback) return QNN_LOG_ERROR_INVALID_ARGUMENT; - if (maxLogLevel < QNN_LOG_LEVEL_ERROR) return QNN_LOG_ERROR_INVALID_ARGUMENT; - sg_logCallback = callback; - sg_maxLogLevel = maxLogLevel; - sg_logInitialized = true; - return QNN_SUCCESS; -} - -Qnn_ErrorHandle_t GgmlOpPackageLogSetLevel(QnnLog_Level_t maxLogLevel) { - if (maxLogLevel < QNN_LOG_LEVEL_ERROR) return QNN_LOG_ERROR_INVALID_ARGUMENT; - sg_maxLogLevel = maxLogLevel; - return QNN_SUCCESS; -} - -Qnn_ErrorHandle_t GgmlOpPackageLogTerminate() { - if (!sg_logInitialized) return QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED; - sg_logCallback = nullptr; - sg_maxLogLevel = (QnnLog_Level_t)0; - sg_logInitialized = false; - return QNN_SUCCESS; -} - -Qnn_ErrorHandle_t GgmlOpPackageValidateOpConfig (Qnn_OpConfig_t opConfig){ - if (std::string(sg_packageName) != opConfig.v1.packageName) { - return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; - } - - /* auto-generated validation code below - * Check if op config type matches any registered ops - * If a match is found, check number of inputs, outputs and params - */ - if (std::string(opConfig.v1.typeName) == "GgmlMulMat"){ - if (opConfig.v1.numOfParams != 0 || opConfig.v1.numOfInputs != 2 || opConfig.v1.numOfOutputs != 1){ - return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; - } - } - else{ - return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; - } - - /* - * additional validation code here - * */ - - return QNN_SUCCESS; -} - -/* The following three functions in this comment are not called by HTP backend for now, - * no auto-generated implementations are created. Users should see example for full function signatures. 
- * (version 1.3.0) Qnn_ErrorHandle_t GgmlOpPackageCreateKernels (QnnOpPackage_GraphInfrastructure_t - * graphInfrastructure, QnnOpPackage_Node_t node, QnnOpPackage_Kernel_t** kernels, uint32_t* - * numKernels) - * (version 1.3.0) Qnn_ErrorHandle_t GgmlOpPackageFreeKernels (QnnOpPackage_Kernel_t* kernels) - * - * (version 1.4.0) Qnn_ErrorHandle_t GgmlOpPackageCreateOpImpl (QnnOpPackage_GraphInfrastructure_t - * graphInfrastructure, QnnOpPackage_Node_t node, QnnOpPackage_OpImpl_t* opImpl) - *(version 1.4.0) Qnn_ErrorHandle_t GgmlOpPackageFreeOpImpl (QnnOpPackage_OpImpl_t opImpl) - */ - -Qnn_ErrorHandle_t GgmlOpPackageTerminate() { -if (!sg_packageInitialized) return QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED; - -sg_globalInfra = nullptr; -sg_packageInitialized = false; -return QNN_SUCCESS; -} - -#ifdef __cplusplus -extern "C" { -#endif - - -/* latest version */ -Qnn_ErrorHandle_t GgmlOpPackageInterfaceProvider(QnnOpPackage_Interface_t* interface) { - if (!interface) return QNN_OP_PACKAGE_ERROR_INVALID_ARGUMENT; - interface->interfaceVersion = {1, 4, 0}; - interface->v1_4.init = GgmlOpPackageInit; - interface->v1_4.terminate = GgmlOpPackageTerminate; - interface->v1_4.getInfo = GgmlOpPackageGetInfo; - interface->v1_4.validateOpConfig = GgmlOpPackageValidateOpConfig; - interface->v1_4.createOpImpl = nullptr; - interface->v1_4.freeOpImpl = nullptr; - interface->v1_4.logInitialize = GgmlOpPackageLogInitialize; - interface->v1_4.logSetLevel = GgmlOpPackageLogSetLevel; - interface->v1_4.logTerminate = GgmlOpPackageLogTerminate; - return QNN_SUCCESS; -} - -#ifdef __cplusplus -} -#endif - - diff --git a/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/src/ops/GgmlMulMat.cpp b/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/src/ops/GgmlMulMat.cpp deleted file mode 100644 index 137522cc80773..0000000000000 --- a/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/src/ops/GgmlMulMat.cpp +++ /dev/null @@ -1,213 +0,0 @@ -//============================================================================== -// Auto Generated Code for GgmlOpPackage -//============================================================================== - -#include "HTP/core/constraints.h" -#include "HTP/core/op_package_feature_support.h" -#include "HTP/core/op_register_ext.h" -#include "HTP/core/optimize.h" -#include "HTP/core/simple_reg.h" -#include "QnnOpPackage.h" - -BEGIN_PKG_OP_DEFINITION(PKG_GgmlMulMat); - -// op execute function declarations -template -GraphStatus ggmlmulmatImpl(TensorType & out_0, const TensorType & in_0, const TensorType & in_1); - -// forward declaration of sample cost function -static float ggmlmulmatCostFunc(const Op * op); - -/* - * method 1 for defining op, using default cost value (i.e. GLACIAL) and default flag (Flags::RESOURCE_HVX) - * syntax: DEF_PACKAGE_OP(F,OP) - * e.g. DEF_PACKAGE_OP((ggmlmulmatImpl), "GgmlMulMat") - */ -DEF_PACKAGE_OP((ggmlmulmatImpl), "GgmlMulMat") - -/* - * method 2 for defining op with specified cost value (one of GLACIAL, SNAIL, FAST, FREE) - * and provided flags - * syntax: DEF_PACKAGE_OP_AND_COST_AND_FLAGS(F,OP,COST,...) - * can use zero or more flags, FLAG options are IS_CONST, INHIBIT_CONST_PROP, - * RESOURCE_HVX, RESOURCE_HMX(not supported in external op packages) - * e.g. DEF_PACKAGE_OP_AND_COST_AND_FLAGS((ggmlmulmatImpl), "GgmlMulMat", SNAIL) - */ - -/* - * method 3 for defining op with cost function pointer and provided flags - * cost function pointer type: typedef float (*cost_function) (const Op * op); - * syntax: DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS(F,OP,COST_F,...) - * e.g. 
DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS((ggmlmulmatImpl), - * "GgmlMulMat", ggmlmulmatCostFunc, Flags::RESOURCE_HVX) - */ - -/* - * optimization definitions - * need to be global in the package - * one definition per optimization - * syntax: DEF_PACKAGE_OPTIMIZATION(PRIORITY,MATCHCODE,CONSTRAINTCODE,REPLACECODE) - * PRIORITY predefined values include EARLY(2000), MIDDLE(3000), LATE(4000) - * HTP core provides some replacement functions for op package to use - * for more information about optimization rules, please refer to HTP core documentations - */ - -/* - * op parameter order definitions - * need to be global in the package - * one definition per op, and this is optional - * syntax: DEF_PACKAGE_PARAM_ORDER(OP,PARAM1,MANDATORY1,DEFAULT1,PARAM2,MANDATORY2,DEFAULT2...) - * one or more parameters can be specified for each op - * order of parameters listed determines the order of parameters passed into op execution functions - * if an op does not have a parameter order definition, parameter order passed into Qnn_addNode - * will be passed into op execution functions - * if an op has a parameter order definition, any parameter passed into Qnn_addNode with unlisted - * name will be abandoned - * if two or more op packages with the same package name will be registered, they cannot list - * conflicting parameter orders - * PARAM refers to parameter name as a string literal - * MANDATORY refers to whether this parameter is required to be provided at Qnn_addNode - * DEFAULT is used when MANDATORY is false - * if provided as Qnn_Param_t*, - * DEFAULT will be used for graph construction when this parameter is not provided at - * Qnn_addNode - * if provided as nullptr, - * graph construction will skip this parameter when this parameter is not provided at - * Qnn_addNode - */ - -namespace { - -constexpr const size_t kBytesPerVector = sizeof(HVX_Vector); // 128 for v73 -constexpr const size_t kFloatsPerVector = kBytesPerVector / sizeof(float); -constexpr const size_t kAlignMask = kBytesPerVector - 1; - -inline size_t unaligned_bytes(const void * addr) { - return ((size_t) addr) & kAlignMask; -} - -inline bool is_addr_aligned(void * addr) { - return unaligned_bytes(addr) == 0; -} - -inline float vec_dot_product_f32(const float * src0, const float * src1, size_t count) { - HVX_Vector * iptr0 = ((HVX_Vector *) src0); - HVX_Vector * iptr0_end = ((HVX_Vector *) src0) + (count / kFloatsPerVector); - HVX_Vector * iptr1 = ((HVX_Vector *) src1); - HVX_Vector prev0 = *iptr0++; - HVX_Vector prev1 = *iptr1++; - HVX_Vector sum = Q6_V_vzero(); - - // TODO: prefetch? - while (iptr0 < iptr0_end) { - HVX_Vector curr0 = *iptr0++; - HVX_Vector curr1 = *iptr1++; - HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); - HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); - sum = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_Vqf32_vmpy_VsfVsf(s0, s1), sum); - prev0 = curr0; - prev1 = curr1; - } - - if ((iptr0_end - ((HVX_Vector *) src0)) > 0) { - // handle the last vector - // see also: https://github.com/UbiquitousLearning/mllm/blob/babf4410352ce8730824c87699c025a0d4ce3a6f/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/src/ops/LLaMAMul.cpp#L147 - HVX_Vector curr0 = is_addr_aligned(iptr0) ? prev0 : *iptr0++; - HVX_Vector curr1 = is_addr_aligned(iptr1) ? 
prev1 : *iptr1++; - HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); - HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); - sum = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_Vqf32_vmpy_VsfVsf(s0, s1), sum); - prev0 = curr0; - prev1 = curr1; - } - - const size_t leftover = count % kFloatsPerVector; - const size_t leftover_bytes = leftover * sizeof(float); - if (leftover > 0) { - // handle the leftover elements - HVX_Vector curr0 = (leftover_bytes + unaligned_bytes(iptr0) > kBytesPerVector) ? *iptr0 : prev0; - curr0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); - - HVX_Vector curr1 = (leftover_bytes + unaligned_bytes(iptr1) > kBytesPerVector) ? *iptr1 : prev1; - curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); - - sum = Q6_Vqf32_vadd_Vqf32Vqf32( - Q6_V_valign_VVR(Q6_Vqf32_vmpy_VsfVsf(curr0, curr1), Q6_V_vzero(), leftover_bytes), sum); - } - - // TODO: do we have a better way to do the reduction? - for (size_t i = kFloatsPerVector / 2; i > 0; i /= 2) { - sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum, Q6_V_vror_VR(sum, i * sizeof(float))); - } - - float result; - q6op_vstu_variable_ARV(&result, sizeof(float), Q6_Vsf_equals_Vqf32(sum)); - return result; -} - -template -inline GraphStatus mul_mat_2d_f32(TensorType & out_0, const TensorType & in_0, const TensorType & in_1) { - // TODO: handle strides? - if (in_1.dim(1) != in_0.dim(1)) { - return GraphStatus::ErrorDimensions; - } - - size_t dims[4] = { in_1.dim(0), in_0.dim(0) }; - out_0.set_dims(dims); - - auto in0_ptr = (float *) in_0.raw_data_const(); - auto in1_ptr = (float *) in_1.raw_data_const(); - auto out_ptr = (float *) out_0.raw_data(); - - for (size_t i = 0; i < dims[0]; i++) { - // TODO: prefetch? - auto * in1_row = in1_ptr + i * in_1.dim(1); - auto * out_row = out_ptr + i * dims[1]; - for (size_t j = 0; j < dims[1]; j++) { - *out_row++ = vec_dot_product_f32(in0_ptr + j * in_0.dim(1), in1_row, in_0.dim(1)); - } - } - - return GraphStatus::Success; -} - -} // namespace - -/* execute functions for ops */ - -template -GraphStatus ggmlmulmatImpl(TensorType & out_0, const TensorType & in_0, const TensorType & in_1) { - if (!in_0.raw_data_const() || !in_1.raw_data_const() || !out_0.raw_data()) { - return GraphStatus::ErrorBadInput; - } - - if (in_0.rank() != in_1.rank()) { - return GraphStatus::ErrorRank; - } - - auto rank = in_0.rank(); - switch (rank) { - case 4: - case 3: - // TODO: add implementation - return GraphStatus::ErrorUnsupported; - case 2: - return mul_mat_2d_f32(out_0, in_0, in_1); - } - - return GraphStatus::ErrorRank; -} - -__attribute__((unused)) static float ggmlmulmatCostFunc(const Op * op) { - /* - * add code here - * */ - - float cost = 0.0; // add cost computation here - return cost; -} - -/* At the bottom of the op file, call END_PKG_OP_DEFINITION(), - where is as BEGIN_PKG_OP_DEFINITION -*/ -END_PKG_OP_DEFINITION(PKG_GgmlMulMat); From 161c4ee1246396104015c9ea5fad20d51e9c70a6 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Tue, 6 May 2025 20:07:06 +0800 Subject: [PATCH 151/166] fix typo --- ggml/src/ggml-qnn/shared/common.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-qnn/shared/common.cpp b/ggml/src/ggml-qnn/shared/common.cpp index d89a31c20ef39..534b5b5a4b55e 100644 --- a/ggml/src/ggml-qnn/shared/common.cpp +++ b/ggml/src/ggml-qnn/shared/common.cpp @@ -31,7 +31,7 @@ struct ggml_backend_qnn_reg_impl : ggml_backend_reg { backend_device_proxy_ptr device_proxy; if (device_enum < QNN_BACKEND_COUNT) { -#ifdef GGML_HEXAGON_NPU_ONLY +#ifndef GGML_HEXAGON_NPU_ONLY device_proxy = 
create_qnn_backend_context(device_enum); #else LOG_DEBUG("skip qnn device %d\n", (int) device_enum); From 039f835410cbf4f43c541d35c1b425ff3307cb09 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Thu, 8 May 2025 10:17:11 +0800 Subject: [PATCH 152/166] fix compiling error --- ggml/src/ggml-qnn/qnn/backend-ops.cpp | 1 + ggml/src/ggml-qnn/qnn/op-config-caps.cpp | 2 ++ 2 files changed, 3 insertions(+) diff --git a/ggml/src/ggml-qnn/qnn/backend-ops.cpp b/ggml/src/ggml-qnn/qnn/backend-ops.cpp index d4d2c57cbf4fe..1446115a57ba0 100644 --- a/ggml/src/ggml-qnn/qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/qnn/backend-ops.cpp @@ -112,6 +112,7 @@ constexpr const bool kQnnSupportedOps[] = { false, // GGML_OP_CONV_TRANSPOSE_1D false, // GGML_OP_IM2COL false, // GGML_OP_IM2COL_BACK + false, // GGML_OP_CONV_2D_DW false, // GGML_OP_CONV_TRANSPOSE_2D false, // GGML_OP_POOL_1D false, // GGML_OP_POOL_2D diff --git a/ggml/src/ggml-qnn/qnn/op-config-caps.cpp b/ggml/src/ggml-qnn/qnn/op-config-caps.cpp index d5b55eff970c9..081b7fba7f3ef 100644 --- a/ggml/src/ggml-qnn/qnn/op-config-caps.cpp +++ b/ggml/src/ggml-qnn/qnn/op-config-caps.cpp @@ -157,6 +157,7 @@ constexpr const qnn_op_caps_t kOpCaps[] = { {}, // GGML_OP_CONV_TRANSPOSE_1D {}, // GGML_OP_IM2COL {}, // GGML_OP_IM2COL_BACK + {}, // GGML_OP_CONV_2D_DW {}, // GGML_OP_CONV_TRANSPOSE_2D {}, // GGML_OP_POOL_1D {}, // GGML_OP_POOL_2D @@ -329,6 +330,7 @@ constexpr const op_constructor_t kOpConstructors[] = { nullptr, // GGML_OP_CONV_TRANSPOSE_1D nullptr, // GGML_OP_IM2COL nullptr, // GGML_OP_IM2COL_BACK + nullptr, // GGML_OP_CONV_2D_DW nullptr, // GGML_OP_CONV_TRANSPOSE_2D nullptr, // GGML_OP_POOL_1D nullptr, // GGML_OP_POOL_2D From 0ce53ce7cdba0c02f5acead4e9a2af5ff0c74503 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Thu, 8 May 2025 12:19:40 +0800 Subject: [PATCH 153/166] fix linking error --- ggml/src/ggml-qnn/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-qnn/CMakeLists.txt b/ggml/src/ggml-qnn/CMakeLists.txt index e605ce8ff2d41..43b1fe42f9126 100644 --- a/ggml/src/ggml-qnn/CMakeLists.txt +++ b/ggml/src/ggml-qnn/CMakeLists.txt @@ -53,7 +53,7 @@ if(GGML_HEXAGON_NPU_ONLY) else() message("GGML_HEXAGON_NPU_ONLY is disabled") add_subdirectory(qnn) - target_link_libraries(ggml-qnn PRIVATE qnn-backend) + target_link_libraries(runtime-common PUBLIC qnn-backend) endif() if(GGML_QNN_ENABLE_HEXAGON_BACKEND) From 02af8ff65379be31992e7666c65cb9f1be5f3ea3 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Thu, 8 May 2025 19:53:04 +0800 Subject: [PATCH 154/166] fix qnn only build flag --- ggml/src/ggml-qnn/CMakeLists.txt | 1 - ggml/src/ggml-qnn/qnn/qnn-lib.cpp | 24 ------------------------ ggml/src/ggml-qnn/shared/CMakeLists.txt | 4 ++++ 3 files changed, 4 insertions(+), 25 deletions(-) diff --git a/ggml/src/ggml-qnn/CMakeLists.txt b/ggml/src/ggml-qnn/CMakeLists.txt index 43b1fe42f9126..b563bc798da8a 100644 --- a/ggml/src/ggml-qnn/CMakeLists.txt +++ b/ggml/src/ggml-qnn/CMakeLists.txt @@ -48,7 +48,6 @@ add_subdirectory(shared) if(GGML_HEXAGON_NPU_ONLY) message("GGML_HEXAGON_NPU_ONLY is enabled") - add_compile_definitions(GGML_HEXAGON_NPU_ONLY) set(GGML_QNN_ENABLE_HEXAGON_BACKEND ON) else() message("GGML_HEXAGON_NPU_ONLY is disabled") diff --git a/ggml/src/ggml-qnn/qnn/qnn-lib.cpp b/ggml/src/ggml-qnn/qnn/qnn-lib.cpp index 12e94aaac747c..474bf53434628 100644 --- a/ggml/src/ggml-qnn/qnn/qnn-lib.cpp +++ b/ggml/src/ggml-qnn/qnn/qnn-lib.cpp @@ -323,30 +323,6 @@ bool qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { } } - 
{ - auto & op_package_info = get_op_package_lib_info(_soc_info.soc_model, _soc_info.htp_arch); - if (op_package_info.extra_lib_name) { - _custom_op_extra_lib_handle = - load_lib_with_fallback(op_package_info.extra_lib_name, _additional_lib_load_path); - } - - qnn_status = _qnn_interface->qnn_backend_register_op_package(_qnn_backend_handle, op_package_info.lib_name, - op_package_info.interface, op_package_info.type); - if (qnn_status != QNN_SUCCESS) { - QNN_LOG_WARN("failed to register op package %s, interface: %s, error: %s\n", op_package_info.lib_name, - op_package_info.interface, qnn::get_qnn_error_string(qnn_status)); - } else { - QNN_LOG_DEBUG("register op package %s successfully, ID %u\n", op_package_info.lib_name, - _qnn_interface->get_backend_id()); - _has_custom_op_package = true; - } - } - - /* TODO: not used, keep it for further usage - QnnContext_Config_t qnn_context_config = QNN_CONTEXT_CONFIG_INIT; - qnn_context_config.priority = QNN_PRIORITY_DEFAULT; - const QnnContext_Config_t * context_configs[] = {&qnn_context_config, nullptr}; - */ _qnn_interface->qnn_context_create(_qnn_backend_handle, _qnn_device_handle, nullptr, &_qnn_context_handle); if (!_qnn_context_handle) { QNN_LOG_WARN("failed to initialize qnn context\n"); diff --git a/ggml/src/ggml-qnn/shared/CMakeLists.txt b/ggml/src/ggml-qnn/shared/CMakeLists.txt index b901e656b9ee0..b08b2f07eb11c 100644 --- a/ggml/src/ggml-qnn/shared/CMakeLists.txt +++ b/ggml/src/ggml-qnn/shared/CMakeLists.txt @@ -12,6 +12,10 @@ target_include_directories(runtime-common PUBLIC ${CMAKE_CURRENT_LIST_DIR}/../../../include/ # TODO: figure out how to remove this ) +if(GGML_HEXAGON_NPU_ONLY) + add_compile_definitions(GGML_HEXAGON_NPU_ONLY) +endif() + if(GGML_QNN_ENABLE_HEXAGON_BACKEND) if(DEFINED ENV{QNN_SDK_PATH}) set(HEXAGON_SDK_ROOT $ENV{HEXAGON_SDK_ROOT}) From db2a1254386add5d736e988d42ef49ba653c4208 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Tue, 13 May 2025 20:18:09 +0800 Subject: [PATCH 155/166] fix GGML_QNN_ENABLE_PERFORMANCE_TRACKING option --- ggml/src/ggml-qnn/CMakeLists.txt | 7 ------- ggml/src/ggml-qnn/qnn/CMakeLists.txt | 7 +++++++ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-qnn/CMakeLists.txt b/ggml/src/ggml-qnn/CMakeLists.txt index b563bc798da8a..64c5d1a91ec64 100644 --- a/ggml/src/ggml-qnn/CMakeLists.txt +++ b/ggml/src/ggml-qnn/CMakeLists.txt @@ -37,13 +37,6 @@ ggml_add_backend_library(ggml-qnn ) target_link_libraries(ggml-qnn PRIVATE ${COMMON_LINK_LIBRARIES}) -if(GGML_QNN_ENABLE_PERFORMANCE_TRACKING) - message("GGML_QNN_ENABLE_PERFORMANCE_TRACKING is enabled") - target_compile_definitions(ggml-qnn PRIVATE GGML_QNN_ENABLE_PERFORMANCE_TRACKING) -else() - message("GGML_QNN_ENABLE_PERFORMANCE_TRACKING is disabled") -endif() - add_subdirectory(shared) if(GGML_HEXAGON_NPU_ONLY) diff --git a/ggml/src/ggml-qnn/qnn/CMakeLists.txt b/ggml/src/ggml-qnn/qnn/CMakeLists.txt index 2a9455b8642cc..010fcf08db186 100644 --- a/ggml/src/ggml-qnn/qnn/CMakeLists.txt +++ b/ggml/src/ggml-qnn/qnn/CMakeLists.txt @@ -26,6 +26,13 @@ else() message("GGML_QNN_ENABLE_CPU_BACKEND is disabled") endif() +if(GGML_QNN_ENABLE_PERFORMANCE_TRACKING) + message("GGML_QNN_ENABLE_PERFORMANCE_TRACKING is enabled") + target_compile_definitions(qnn-backend PUBLIC GGML_QNN_ENABLE_PERFORMANCE_TRACKING) +else() + message("GGML_QNN_ENABLE_PERFORMANCE_TRACKING is disabled") +endif() + if(CMAKE_SYSTEM_NAME STREQUAL "Android") set(QNN_DEFAULT_LIB_SEARCH_PATH "/data/local/tmp/" CACHE STRING "customized library search path for QNN backend") 
elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows" OR CMAKE_SYSTEM_NAME STREQUAL "Linux") From 295f7f5957acc33a4b050ee42cc85f8b051a578f Mon Sep 17 00:00:00 2001 From: nullname Date: Fri, 16 May 2025 19:57:33 +0800 Subject: [PATCH 156/166] feat: perf opt part3 (#42) * add f16 support to etl wise op * wip * Revert "wip" This reverts commit efa88deb0e8265614fd91db3c3dba777c00e858b. * qf32 for mul * wip * Revert "wip" This reverts commit bb419f89ca4599470d61d636fe6fa1e033d62748. * disable fp16 add/sub * tempate trick * wip * add f16 mulmat * add log * fix view liked op * add log * fix f16 mulmat * add quant type * wip * add l2fetch * add vtcm_mem * wip * fix fetch * use vtcm cache in mulmat * revert vtcm cache * cache plane * small opt for plane cache * cache plane for some element wise op * wip * enable fetch even on vtcm * wip * copy sysMonApp * small opt * init ltu * add compute_params * add op common header * move vtcm_mem allocation to compute_param * fallback to memcache when vtcm allocate failed * pre-calculate quantize type * wip * try fix test failure * try fix mulmat nan * fix inf in mulmat * remove debug logs * wip * small refactoring on the dequant row func * fix typo * improve logging * add q4_0 and q8_0 * wip * wip * build hexagon libs in cmake * wip * fix qnn only build flag * fix typo * fix todo * wip * wip * add to_float * use to)float directly instead of ltu * wip * cache f16_to_f32 table into vtcm * print tensor dims at log * init device in supports_op_impl * revert cache ltu * wip * wip * fix graph calc issues by validate cache manually after each op * add cache invalidate func * enable cache fallback only in quantize tensors * add option to disable quantized tensors * propagate the asan flag to npu build * fix asan option * wip * invalidate tensors after finished * implement backend_buffer_reset * wip * wip * refactoring plane cache mechanism * wip * split row elements across thread * use table for f16 to f32 conversion * sync after each op * small refactoring to invalidate l2 cahce * wip * opt on float fetching * unroll for loop manually * reduce vtcm usage * add perf tracking for npu * print dimensions for profiler log * wip * wip * wip * add sub proc tracker * fix typo * print pcycles * wip * wip * prefetch rows * add l2fetch_row * small tweak based on perf tracer * opt l2 fetching * wip --- ggml/src/ggml-qnn/CMakeLists.txt | 52 +-- ggml/src/ggml-qnn/npu/CMakeLists.txt | 107 ++++++- ggml/src/ggml-qnn/npu/device/device.cpp | 38 ++- ggml/src/ggml-qnn/npu/device/graph.cpp | 40 ++- ggml/src/ggml-qnn/npu/device/graph.hpp | 7 +- ggml/src/ggml-qnn/npu/device/op_impl.cpp | 188 ++++++++--- ggml/src/ggml-qnn/npu/device/op_impl.hpp | 9 +- ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp | 317 ++++++++++++++++--- ggml/src/ggml-qnn/npu/device/op_mul_mat.hpp | 35 +- ggml/src/ggml-qnn/npu/device/op_types.hpp | 58 ++++ ggml/src/ggml-qnn/npu/device/quants.cpp | 151 +++++++++ ggml/src/ggml-qnn/npu/device/quants.hpp | 78 +++++ ggml/src/ggml-qnn/npu/device/tensor.hpp | 16 +- ggml/src/ggml-qnn/npu/device/thread_pool.hpp | 2 + ggml/src/ggml-qnn/npu/device/util.hpp | 107 ++++++- ggml/src/ggml-qnn/npu/device/vtcm_mem.hpp | 101 ++++++ ggml/src/ggml-qnn/npu/host/buffer.cpp | 13 +- ggml/src/ggml-qnn/npu/host/buffer.hpp | 2 + ggml/src/ggml-qnn/npu/host/graph.cpp | 7 +- ggml/src/ggml-qnn/npu/host/host.cpp | 2 +- ggml/src/ggml-qnn/npu/host/host_device.cpp | 113 +++++-- ggml/src/ggml-qnn/npu/host/host_device.hpp | 25 +- ggml/src/ggml-qnn/npu/host/tensor.hpp | 12 +- ggml/src/ggml-qnn/npu/host/util.cpp | 71 +++++ 
ggml/src/ggml-qnn/npu/host/util.hpp | 2 + ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl | 26 ++ ggml/src/ggml-qnn/qnn/CMakeLists.txt | 8 +- ggml/src/ggml-qnn/qnn/graph.cpp | 4 +- ggml/src/ggml-qnn/qnn/graph.hpp | 2 +- ggml/src/ggml-qnn/qnn/profiler.hpp | 4 +- 30 files changed, 1358 insertions(+), 239 deletions(-) create mode 100644 ggml/src/ggml-qnn/npu/device/op_types.hpp create mode 100644 ggml/src/ggml-qnn/npu/device/quants.cpp create mode 100644 ggml/src/ggml-qnn/npu/device/quants.hpp create mode 100644 ggml/src/ggml-qnn/npu/device/vtcm_mem.hpp diff --git a/ggml/src/ggml-qnn/CMakeLists.txt b/ggml/src/ggml-qnn/CMakeLists.txt index 64c5d1a91ec64..bd37ada3ac7c9 100644 --- a/ggml/src/ggml-qnn/CMakeLists.txt +++ b/ggml/src/ggml-qnn/CMakeLists.txt @@ -2,6 +2,8 @@ message(STATUS "Using QNN backend") option(GGML_HEXAGON_NPU_ONLY "ggml-qnn: Only use Hexagon NPU" OFF) option(GGML_QNN_ENABLE_HEXAGON_BACKEND "ggml-qnn: Enable Hexagon custom package" ${GGML_HEXAGON_NPU_ONLY}) +option(GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS "ggml-qnn: Enable quantized tensors support" OFF) +option(GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING "ggml-qnn: Enable performance tracking" OFF) if(CMAKE_SYSTEM_NAME STREQUAL "Android") find_library(LOG_LIB log) @@ -17,6 +19,9 @@ if(NOT DEFINED GGML_QNN_SDK_PATH) # TODO: create a function to search for the SDK path if(DEFINED ENV{QNN_SDK_PATH}) set(GGML_QNN_SDK_PATH $ENV{QNN_SDK_PATH}) + elseif(DEFINED ENV{QNN_SDK_ROOT}) + message("found QNN_SDK_ROOT: ${QNN_SDK_ROOT}") + set(GGML_QNN_SDK_PATH $ENV{QNN_SDK_ROOT}) else() message(FATAL_ERROR "GGML_QNN_SDK_PATH not defined") endif() @@ -28,9 +33,10 @@ message("CMAKE_CXX_FLAGS_RELEASE: ${CMAKE_CXX_FLAGS_RELEASE}") message("QNN_SDK_PATH: ${GGML_QNN_SDK_PATH}") message("GGML_QNN: ${GGML_QNN}") -message("GGML_QNN_ENABLE_PERFORMANCE_TRACKING: ${GGML_QNN_ENABLE_PERFORMANCE_TRACKING}") message("GGML_QNN_ENABLE_HEXAGON_BACKEND: ${GGML_QNN_ENABLE_HEXAGON_BACKEND}") message("GGML_HEXAGON_NPU_ONLY: ${GGML_HEXAGON_NPU_ONLY}") +message("GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS: ${GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS}") +message("GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING: ${GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING}") ggml_add_backend_library(ggml-qnn ../../include/ggml-qnn.h @@ -58,8 +64,8 @@ else() target_link_libraries(ggml-qnn PRIVATE runtime-common) endif() -# Copy QNN dynamic libraries -set(QNN_DYNAMIC_LIBS "") +# Copy dynamic libraries +set(BACKEND_RUNTIME_LIBS "") if(CMAKE_SYSTEM_NAME STREQUAL "Android" OR CMAKE_SYSTEM_NAME STREQUAL "Linux") if(CMAKE_SYSTEM_NAME STREQUAL "Android") @@ -73,35 +79,35 @@ if(CMAKE_SYSTEM_NAME STREQUAL "Android" OR CMAKE_SYSTEM_NAME STREQUAL "Linux") set(QNN_SDK_LIB_PATH "${GGML_QNN_SDK_PATH}/lib/aarch64-oe-linux-gcc11.2") endif() - list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/libQnnSystem.so") - list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/libQnnCpu.so") - list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/libQnnGpu.so") - list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/libQnnHtp.so") + list(APPEND BACKEND_RUNTIME_LIBS "${QNN_SDK_LIB_PATH}/libQnnSystem.so") + list(APPEND BACKEND_RUNTIME_LIBS "${QNN_SDK_LIB_PATH}/libQnnCpu.so") + list(APPEND BACKEND_RUNTIME_LIBS "${QNN_SDK_LIB_PATH}/libQnnGpu.so") + list(APPEND BACKEND_RUNTIME_LIBS "${QNN_SDK_LIB_PATH}/libQnnHtp.so") file(GLOB HTP_STUB_LIBS "${QNN_SDK_LIB_PATH}/libQnnHtp*.so") - list(APPEND QNN_DYNAMIC_LIBS ${HTP_STUB_LIBS}) + list(APPEND BACKEND_RUNTIME_LIBS ${HTP_STUB_LIBS}) if(CMAKE_SYSTEM_NAME STREQUAL "Android") file(GLOB HTP_SKEL_LIBS 
"${GGML_QNN_SDK_PATH}/lib/hexagon-*/unsigned/libQnnHtp*Skel.so") - list(APPEND QNN_DYNAMIC_LIBS ${HTP_SKEL_LIBS}) + list(APPEND BACKEND_RUNTIME_LIBS ${HTP_SKEL_LIBS}) if(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64") if(EXISTS "${CMAKE_ANDROID_NDK}/prebuilt/android-arm64/gdbserver/gdbserver") - list(APPEND QNN_DYNAMIC_LIBS "${CMAKE_ANDROID_NDK}/prebuilt/android-arm64/gdbserver/gdbserver") + list(APPEND BACKEND_RUNTIME_LIBS "${CMAKE_ANDROID_NDK}/prebuilt/android-arm64/gdbserver/gdbserver") message("old ndk, copy gdbserver") else() file(GLOB LLDB_SERVER "${CMAKE_ANDROID_NDK}/toolchains/llvm/prebuilt/linux-x86_64/lib64/clang/*/lib/linux/aarch64/lldb-server") - list(APPEND QNN_DYNAMIC_LIBS ${LLDB_SERVER}) + list(APPEND BACKEND_RUNTIME_LIBS ${LLDB_SERVER}) message("new ndk, copy lldb-server") endif() file(GLOB OMP_LIBS "${CMAKE_ANDROID_NDK}/toolchains/llvm/prebuilt/linux-x86_64/lib64/clang/*/lib/linux/aarch64/libomp.so") file(GLOB ASAN_LIBS "${CMAKE_ANDROID_NDK}/toolchains/llvm/prebuilt/linux-x86_64/lib64/clang/*/lib/linux/libclang_rt.asan-aarch64-android.so") - list(APPEND QNN_DYNAMIC_LIBS ${OMP_LIBS}) - list(APPEND QNN_DYNAMIC_LIBS ${ASAN_LIBS}) + list(APPEND BACKEND_RUNTIME_LIBS ${OMP_LIBS}) + list(APPEND BACKEND_RUNTIME_LIBS ${ASAN_LIBS}) endif() else() # Linux - list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/libHtpPrepare.so") + list(APPEND BACKEND_RUNTIME_LIBS "${QNN_SDK_LIB_PATH}/libHtpPrepare.so") endif() elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows") if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") @@ -112,24 +118,24 @@ elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows") set(QNN_SDK_LIB_PATH "${GGML_QNN_SDK_PATH}/lib/aarch64-windows-msvc") endif() - list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/QnnSystem.dll") - list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/QnnCpu.dll") - list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/QnnGpu.dll") - list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/QnnHtp.dll") + list(APPEND BACKEND_RUNTIME_LIBS "${QNN_SDK_LIB_PATH}/QnnSystem.dll") + list(APPEND BACKEND_RUNTIME_LIBS "${QNN_SDK_LIB_PATH}/QnnCpu.dll") + list(APPEND BACKEND_RUNTIME_LIBS "${QNN_SDK_LIB_PATH}/QnnGpu.dll") + list(APPEND BACKEND_RUNTIME_LIBS "${QNN_SDK_LIB_PATH}/QnnHtp.dll") file(GLOB HTP_STUB_LIBS "${QNN_SDK_LIB_PATH}/QnnHtp*.dll") if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") - list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/HtpPrepare.dll") + list(APPEND BACKEND_RUNTIME_LIBS "${QNN_SDK_LIB_PATH}/HtpPrepare.dll") endif() - list(APPEND QNN_DYNAMIC_LIBS ${HTP_STUB_LIBS}) + list(APPEND BACKEND_RUNTIME_LIBS ${HTP_STUB_LIBS}) endif() -foreach(QNN_DYNAMIC_LIB ${QNN_DYNAMIC_LIBS}) - message("Copy: ${QNN_DYNAMIC_LIB} -> ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}") +foreach(RUNTIME_LIB ${BACKEND_RUNTIME_LIBS}) + message("Copy: ${RUNTIME_LIB} -> ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}") add_custom_command( TARGET ggml-qnn POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy - ${QNN_DYNAMIC_LIB} + ${RUNTIME_LIB} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) endforeach() diff --git a/ggml/src/ggml-qnn/npu/CMakeLists.txt b/ggml/src/ggml-qnn/npu/CMakeLists.txt index 4c734bb098999..5f1009bb9bea6 100644 --- a/ggml/src/ggml-qnn/npu/CMakeLists.txt +++ b/ggml/src/ggml-qnn/npu/CMakeLists.txt @@ -26,6 +26,56 @@ set(common_incs include_directories(${common_incs}) +function(add_device_target target_name DSP_ARCH IS_SIMULATOR BUILD_CPU_COUNT) + if(${CMAKE_BUILD_TYPE} MATCHES "Debug|Dbg") + set(HEXAGON_BUILD_CONFIG "Debug") + set(EXTRA_BUILD_FLAGS + VERBOSE=1 + TREE=1 + ) + else() + set(HEXAGON_BUILD_CONFIG "Release") + set(EXTRA_BUILD_FLAGS) + 
endif() + + if(${GGML_SANITIZE_ADDRESS} OR ${LLAMA_SANITIZE_ADDRESS}) + set(GGML_HEXAGON_NPU_SANITIZE_ADDRESS ON) + else() + set(GGML_HEXAGON_NPU_SANITIZE_ADDRESS OFF) + endif() + + set(EXTRA_BUILD_FLAGS ${EXTRA_BUILD_FLAGS} GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS=${GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS}) + set(EXTRA_BUILD_FLAGS ${EXTRA_BUILD_FLAGS} GGML_HEXAGON_NPU_SANITIZE_ADDRESS=${GGML_HEXAGON_NPU_SANITIZE_ADDRESS}) + set(EXTRA_BUILD_FLAGS ${EXTRA_BUILD_FLAGS} GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING=${GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING}) + + set(HEXAGON_TOOLS_VARIANT $ENV{DEFAULT_TOOLS_VARIANT}) + set(BUILD_DIR ${CMAKE_CURRENT_LIST_DIR}/hexagon_${HEXAGON_BUILD_CONFIG}_${HEXAGON_TOOLS_VARIANT}_${DSP_ARCH}) + set(BUILD_BINARY_NAME ${BUILD_DIR}/libhexagon_npu_skel_${DSP_ARCH}.so) + + if(${IS_SIMULATOR}) + set(HEXAGON_TOOLCHAIN_TYPE "hexagonsim") + set(OUTPUT_BINARY_NAME libhexagon_npu_skel_${DSP_ARCH}_sim.so) + else() + set(HEXAGON_TOOLCHAIN_TYPE "hexagon") + set(OUTPUT_BINARY_NAME libhexagon_npu_skel_${DSP_ARCH}.so) + endif() + + add_custom_target(${target_name} ALL + COMMAND ${CMAKE_COMMAND} -E remove_directory ${BUILD_DIR} + COMMAND build_cmake ${HEXAGON_TOOLCHAIN_TYPE} DSP_ARCH=${DSP_ARCH} BUILD=${HEXAGON_BUILD_CONFIG} ${EXTRA_BUILD_FLAGS} -j${BUILD_CPU_COUNT} + COMMAND ${CMAKE_COMMAND} -E copy ${BUILD_BINARY_NAME} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${OUTPUT_BINARY_NAME} + BYPRODUCTS ${BUILD_BINARY_NAME} + WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR} + ) +endfunction() + +function(add_dsp_targets_for_host host_target DSP_ARCH BUILD_CPU_COUNT) + add_device_target(hexagon-npu-device-${DSP_ARCH} ${DSP_ARCH} FALSE ${BUILD_CPU_COUNT}) + add_device_target(hexagon-npu-device-${DSP_ARCH}-sim ${DSP_ARCH} TRUE ${BUILD_CPU_COUNT}) + add_dependencies(hexagon-npu-device-${DSP_ARCH}-sim hexagon-npu-device-${DSP_ARCH}) + add_dependencies(${host_target} hexagon-npu-device-${DSP_ARCH}-sim) +endfunction() + if(${CMAKE_SYSTEM_NAME} MATCHES "Android|Linux|Windows") # host build file(GLOB common_srcs "${CMAKE_CURRENT_LIST_DIR}/common/*.cpp") @@ -52,6 +102,12 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Android|Linux|Windows") GGML_QNN_ENABLE_HEXAGON_BACKEND ) + if(GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS) + target_compile_definitions(hexagon-npu-host PUBLIC + GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS + ) + endif() + target_include_directories(hexagon-npu-host PRIVATE ${HEXAGON_SDK_ROOT}/ipc/fastrpc/rpcmem/inc/ ${QNN_SDK_ROOT}/include/QNN/ @@ -71,6 +127,13 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Android|Linux|Windows") target_link_options(hexagon-npu-host PUBLIC -pie) endif() + if(GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING) + message("GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING is enabled") + target_compile_definitions(hexagon-npu-host PUBLIC GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING) + else() + message("GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING is disabled") + endif() + link_options(hexagon-npu-host) if(${CMAKE_SYSTEM_NAME} MATCHES "Android") @@ -84,8 +147,24 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Android|Linux|Windows") choose_dsprpc("3" dsprpc) # cdsprpc link_custom_library(hexagon-npu-host ${dsprpc}) + + cmake_host_system_information(RESULT BUILD_CPU_COUNT QUERY NUMBER_OF_PHYSICAL_CORES) + add_dsp_targets_for_host(hexagon-npu-host "v73" ${BUILD_CPU_COUNT}) + add_dsp_targets_for_host(hexagon-npu-host "v75" ${BUILD_CPU_COUNT}) + + list(APPEND NPU_RUNTIME_LIBS "${HEXAGON_SDK_ROOT}/tools/utils/sysmon/sysMonApp") + list(APPEND NPU_RUNTIME_LIBS "${HEXAGON_SDK_ROOT}/tools/utils/sysmon/sysMonAppLE") + + foreach(RUNTIME_LIB 
${NPU_RUNTIME_LIBS}) + message("Copy: ${RUNTIME_LIB} -> ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}") + add_custom_command( + TARGET hexagon-npu-host POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + ${RUNTIME_LIB} + ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) + endforeach() else() - # hexagon npu build + # hexagon npu build, this section will run inside the `build_cmake` script cmake_minimum_required(VERSION 3.14.3) project(hexagon_npu C CXX ASM) @@ -96,6 +175,8 @@ else() set(QNN_SDK_ROOT $ENV{QNN_SDK_ROOT}) message("QNN_SDK_ROOT: ${QNN_SDK_ROOT}") + message("GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS: ${GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS}") + include_directories( ${QNN_SDK_ROOT}/include/QNN/ ) @@ -124,6 +205,30 @@ else() ) endif() + if(GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS) + message("GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS is enabled") + target_compile_definitions(hexagon_npu_skel_OBJS PUBLIC + GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS + ) + endif() + + if(GGML_HEXAGON_NPU_SANITIZE_ADDRESS) + message("GGML_HEXAGON_NPU_SANITIZE_ADDRESS is enabled") + target_compile_options(hexagon_npu_skel_OBJS PUBLIC + -fsanitize=address -fno-omit-frame-pointer + ) + target_link_libraries(hexagon_npu_skel_OBJS PUBLIC + -fsanitize=address + ) + endif() + + if(GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING) + message("GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING is enabled") + target_compile_definitions(hexagon_npu_skel_OBJS PUBLIC + GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING + ) + endif() + build_idl(idl/hexagon_npu.idl hexagon_npu_skel_OBJS) # disable warnings for the skel diff --git a/ggml/src/ggml-qnn/npu/device/device.cpp b/ggml/src/ggml-qnn/npu/device/device.cpp index 7281dd48d2fa8..fbed4b0a28fa6 100644 --- a/ggml/src/ggml-qnn/npu/device/device.cpp +++ b/ggml/src/ggml-qnn/npu/device/device.cpp @@ -9,6 +9,7 @@ #include "graph.hpp" #include "hexagon_npu.h" #include "op_impl.hpp" +#include "quants.hpp" #include "remote.h" #include "tensor.hpp" #include "thread_pool.hpp" @@ -18,6 +19,37 @@ namespace { struct npu_device_context { std::unique_ptr thread_pool; + std::unique_ptr f16_to_f32_table; // TODO: store vtcm? 
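The f16_to_f32 table declared above holds one float for every possible IEEE-754 half-precision bit pattern (1 << 16 entries, 256 KB), so the device-side kernels can resolve an fp16 value with a single indexed load instead of a per-element conversion. A minimal sketch of the intended use, modeled on the dequantize_row_q8_0 routine added later in this patch (the helper name is illustrative only, not part of the change):

    // Sketch only: the per-block fp16 scale is resolved through the lookup table,
    // then applied to every quantized element of one q8_0 block.
    static void dequantize_q8_0_block_sketch(const npu_device_block_q8_0 * blk, float * dst,
                                             const float * f16_to_f32_table) {
        const float d = f16_to_f32_table[blk->d];  // fp16 bit pattern used directly as the table index
        for (int j = 0; j < QUANT_BLOCK_SIZE; ++j) {
            dst[j] = blk->qs[j] * d;
        }
    }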
+ + bool init() { + if (!init_ltu()) { + DEVICE_LOG_ERROR("Failed to initialize LTU"); + return false; + } + + if (!init_thread_pool()) { + DEVICE_LOG_ERROR("Failed to initialize thread pool"); + return false; + } + + DEVICE_LOG_DEBUG("NPU device context initialized"); + return true; + } + + private: + bool init_ltu() { + constexpr const size_t kLtuCount = 1U << 16; + + f16_to_f32_table = std::make_unique(kLtuCount); + if (!f16_to_f32_table) { + DEVICE_LOG_ERROR("Failed to allocate memory for f16_to_f32 table"); + return false; + } + + hexagon::init_f16_f32_table(f16_to_f32_table.get(), kLtuCount); + DEVICE_LOG_DEBUG("f16_to_f32 table initialized"); + return true; + } bool init_thread_pool() { if (thread_pool) { @@ -67,8 +99,8 @@ int npu_device_open(const char * uri, remote_handle64 * h) { return AEE_ENOMEMORY; } - if (!context->init_thread_pool()) { - DEVICE_LOG_ERROR("Failed to initialize thread pool"); + if (!context->init()) { + DEVICE_LOG_ERROR("Failed to initialize npu_device_context"); delete context; return AEE_EFAILED; } @@ -187,7 +219,7 @@ AEEResult npu_device_graph_compute(remote_handle64 _h, npu_device_graph_handle_t return AEE_EINVHANDLE; } - if (!graph->compute(dev_ctx->thread_pool.get())) { + if (!graph->compute(dev_ctx->thread_pool.get(), dev_ctx->f16_to_f32_table.get())) { return AEE_EFAILED; } diff --git a/ggml/src/ggml-qnn/npu/device/graph.cpp b/ggml/src/ggml-qnn/npu/device/graph.cpp index 2024d15a215be..5201edefea924 100644 --- a/ggml/src/ggml-qnn/npu/device/graph.cpp +++ b/ggml/src/ggml-qnn/npu/device/graph.cpp @@ -5,6 +5,7 @@ #include "op_impl.hpp" #include "util.hpp" +#include "vtcm_mem.hpp" namespace hexagon { @@ -28,50 +29,57 @@ void graph::set_tensor(const npu_device_tensor_handle_t * tensors, int tensor_co for (int i = 0; i < tensor_count; ++i) { auto * tensor_obj = reinterpret_cast(tensors[i]); _tensors[i] = tensor_obj; - DEVICE_LOG_DEBUG("graph(%p) set_tensor[%d]: %p(%p,%p), op: %d\n", (void *) this, i, (void *) tensor_obj, - (void *) tensor_obj->get_src(0), (void *) tensor_obj->get_src(1), tensor_obj->get_op()); + DEVICE_LOG_DEBUG("graph(%p) set_tensor[%d]: %p(%p,%p), op: %s\n", (void *) this, i, (void *) tensor_obj, + (void *) tensor_obj->get_src(0), (void *) tensor_obj->get_src(1), + op_get_name(tensor_obj->get_op())); } _tensor_count = tensor_count; DEVICE_LOG_DEBUG("graph(%p) tensor count: %zu\n", (void *) this, _tensor_count); } -bool graph::compute(default_thread_pool * thread_pool) { - if (!_tensors || !_tensor_count) { +bool graph::compute(default_thread_pool * thread_pool, const float * f16_to_f32_table) { + if (_tensors == nullptr || !_tensor_count) { DEVICE_LOG_DEBUG("graph(%p) no tensors to compute\n", (void *) this); return true; // return success if no tensors to compute } DEVICE_LOG_DEBUG("graph(%p) compute\n", (void *) this); - thread_pool->sync_execute(reinterpret_cast(&graph::thread_pool_task), this); - - for (size_t i = 0; i < _tensor_count; ++i) { - auto * dst = _tensors[i]; - dst->flush(); // TODO: optimize this + _f16_to_f32_table = f16_to_f32_table; + if (thread_pool) { + thread_pool->sync_execute(reinterpret_cast(&graph::thread_pool_task), this); + } else { + compute_impl(nullptr, 0, 1); } + _f16_to_f32_table = nullptr; return true; } void graph::thread_pool_task(default_thread_pool * pool, size_t thread_idx, size_t thread_count, graph * graph) { - NPU_UNUSED(pool); - graph->compute_impl(thread_idx, thread_count); + graph->compute_impl(pool, thread_idx, thread_count); } -void graph::compute_impl(size_t thread_idx, size_t thread_count) { 
+void graph::compute_impl(default_thread_pool * pool, size_t thread_idx, size_t thread_count) { for (size_t i = 0; i < _tensor_count; ++i) { auto * dst = _tensors[i]; auto op = dst->get_op(); - auto * func = get_compute_func(op); - if (!func) { + auto * func = get_compute_func(dst); + if (func == nullptr) { DEVICE_LOG_ERROR("graph(%p) tensor[%zu] op %d not supported\n", (void *) this, i, op); return; } - if (!func(dst, thread_idx, thread_count)) { + hexagon::compute_params params = { thread_idx, thread_count, _f16_to_f32_table }; + if (!func(dst, ¶ms)) { DEVICE_LOG_ERROR("graph(%p) tensor[%zu] op %d compute failed\n", (void *) this, i, op); - return; } + + // TODO: figure out which ops need to sync + if (pool) { + pool->sync_thread(); + } + dst->invalidate(); } } diff --git a/ggml/src/ggml-qnn/npu/device/graph.hpp b/ggml/src/ggml-qnn/npu/device/graph.hpp index 7ca29316991b5..126d2541786a0 100644 --- a/ggml/src/ggml-qnn/npu/device/graph.hpp +++ b/ggml/src/ggml-qnn/npu/device/graph.hpp @@ -17,14 +17,15 @@ class graph { void set_tensor(const npu_device_tensor_handle_t * tensors, int tensor_count); - bool compute(default_thread_pool * thread_pool); + bool compute(default_thread_pool * thread_pool, const float * f16_to_f32_table); private: static void thread_pool_task(default_thread_pool * pool, size_t thread_idx, size_t thread_count, graph * graph); - void compute_impl(size_t thread_idx, size_t thread_count); + void compute_impl(default_thread_pool * pool, size_t thread_idx, size_t thread_count); std::unique_ptr _tensors; - size_t _tensor_count = 0; + size_t _tensor_count = 0; + const float * _f16_to_f32_table = nullptr; DISABLE_COPY_AND_MOVE(graph); }; diff --git a/ggml/src/ggml-qnn/npu/device/op_impl.cpp b/ggml/src/ggml-qnn/npu/device/op_impl.cpp index 8d55971a72d45..d68fd9a53b4d4 100644 --- a/ggml/src/ggml-qnn/npu/device/op_impl.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_impl.cpp @@ -6,25 +6,27 @@ #include #include "op_mul_mat.hpp" +#include "quants.hpp" namespace { -template -inline void vec_op_f32_f32(const float * src0, const float * src1, size_t count, float * dst) { +template +inline void vec_op_impl(const _TyData * src0, const _TyData * src1, size_t count, _TyData * dst) { + constexpr const size_t kElementsPerVector = hexagon::kBytesPerVector / sizeof(_TyData); + HVX_Vector * iptr0 = ((HVX_Vector *) src0); - HVX_Vector * iptr0_end = ((HVX_Vector *) src0) + (count / hexagon::kFloatsPerVector); + HVX_Vector * iptr0_end = ((HVX_Vector *) src0) + (count / kElementsPerVector); HVX_Vector * iptr1 = ((HVX_Vector *) src1); HVX_Vector * optr = ((HVX_Vector *) dst); HVX_Vector prev0 = *iptr0++; HVX_Vector prev1 = *iptr1++; - // TODO: prefetch or just use VTCM? while (iptr0 < iptr0_end) { HVX_Vector curr0 = *iptr0++; HVX_Vector curr1 = *iptr1++; HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); - *optr++ = Q6_Vsf_equals_Vqf32(_OpIntrinsic(s0, s1)); + *optr++ = _OpIntrinsic(s0, s1); prev0 = curr0; prev1 = curr1; } @@ -42,13 +44,13 @@ inline void vec_op_f32_f32(const float * src0, const float * src1, size_t count, iptr1 = iptr1_aligned ? 
iptr1 : iptr1 + 1; HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); - *optr++ = Q6_Vsf_equals_Vqf32(_OpIntrinsic(s0, s1)); + *optr++ = _OpIntrinsic(s0, s1); prev0 = curr0; prev1 = curr1; } - const size_t leftover = count % hexagon::kFloatsPerVector; - const size_t leftover_bytes = leftover * sizeof(float); + const size_t leftover = count % kElementsPerVector; + const size_t leftover_bytes = leftover * sizeof(_TyData); if (leftover > 0) { // handle the leftover elements HVX_Vector curr0 = @@ -59,24 +61,56 @@ inline void vec_op_f32_f32(const float * src0, const float * src1, size_t count, (leftover_bytes + hexagon::unaligned_bytes(iptr1) > hexagon::kBytesPerVector) ? *iptr1 : prev1; curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); - q6op_vstu_variable_ARV(optr, leftover_bytes, Q6_Vsf_equals_Vqf32(_OpIntrinsic(curr0, curr1))); + q6op_vstu_variable_ARV(optr, leftover_bytes, _OpIntrinsic(curr0, curr1)); } } +template +inline void vec_op_f32_f32(const float * src0, const float * src1, size_t count, float * dst) { + vec_op_impl<_OpIntrinsic, float>(src0, src1, count, dst); +} + inline HVX_Vector vadd_f32_f32(HVX_Vector a, HVX_Vector b) { - return Q6_Vqf32_vadd_VsfVsf(a, b); + return Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(a, b)); } inline HVX_Vector vsub_f32_f32(HVX_Vector a, HVX_Vector b) { - return Q6_Vqf32_vsub_VsfVsf(a, b); + return Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(a, b)); } inline HVX_Vector vmul_f32_f32(HVX_Vector a, HVX_Vector b) { - return Q6_Vqf32_vmpy_VsfVsf(a, b); + return Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(a, b)); +} + +template +inline void vec_op_f16_f16(const npu_device_fp16_t * src0, const npu_device_fp16_t * src1, size_t count, + npu_device_fp16_t * dst) { + vec_op_impl<_OpIntrinsic, npu_device_fp16_t>(src0, src1, count, dst); +} + +inline HVX_Vector vadd_f16_f16(HVX_Vector a, HVX_Vector b) { + // TODO: fix this since qf16 has less precision than fp16 + return Q6_Vhf_equals_Vqf16(Q6_Vqf16_vadd_VhfVhf(a, b)); } -template -bool element_wise_op(hexagon::tensor * out, size_t tidx, size_t tcnt) { +inline HVX_Vector vsub_f16_f16(HVX_Vector a, HVX_Vector b) { + // TODO: fix this since qf16 has less precision than fp16 + return Q6_Vhf_equals_Vqf16(Q6_Vqf16_vsub_VhfVhf(a, b)); +} + +inline HVX_Vector vmul_f16_f16(HVX_Vector a, HVX_Vector b) { + return Q6_Vhf_equals_Wqf32(Q6_Wqf32_vmpy_VhfVhf(a, b)); +} + +template struct get_data_type {}; + +template struct get_data_type { + using type = _TyData; +}; + +template bool element_wise_op(hexagon::tensor * out, hexagon::compute_params * params) { + using data_type = typename get_data_type::type; + if (!out) { return false; } @@ -94,24 +128,39 @@ bool element_wise_op(hexagon::tensor * out, size_t tidx, size_t tcnt) { return false; } - const auto * src0_ptr = reinterpret_cast(src0->get_data()); - const auto * src1_ptr = reinterpret_cast(src1->get_data()); - auto * dst_ptr = reinterpret_cast(out->get_data()); - auto total_rows = out->get_ne(3) * out->get_ne(2) * out->get_ne(1); - const auto rows_per_box = out->get_ne(2) * out->get_ne(1); - const auto start_end = hexagon::get_thread_work_slice(total_rows, tidx, tcnt); + const auto * src0_ptr = reinterpret_cast(src0->get_read_buffer()); + const auto * src1_ptr = reinterpret_cast(src1->get_read_buffer()); + auto * dst_ptr = reinterpret_cast(out->get_write_buffer()); + auto total_rows = out->get_ne(3) * out->get_ne(2) * out->get_ne(1); + const auto rows_per_cube = out->get_ne(2) * out->get_ne(1); + const 
auto start_end = hexagon::get_thread_work_slice(total_rows, params->tidx, params->tcnt); + + if (start_end.first >= start_end.second) { + return true; + } + + DEVICE_SCOPED_OP_PERFORMANCE_TRACKER(out, params->tidx); + + const size_t valid_row_bytes = src0->get_ne(0) * sizeof(data_type); for (int64_t ir = start_end.first; ir < start_end.second; ++ir) { - const auto i03 = ir / rows_per_box; - const auto i02 = ir / out->get_ne(1) - i03 * out->get_ne(2); - const auto i01 = ir % out->get_ne(1); - const auto i13 = i03 % src1->get_ne(3); - const auto i12 = i02 % src1->get_ne(2); - const auto i11 = i01 % src1->get_ne(1); - auto * src0_row = src0_ptr + i03 * src0->get_nb(3) + i02 * src0->get_nb(2) + i01 * src0->get_nb(1); - auto * src1_row = src1_ptr + i13 * src1->get_nb(3) + i12 * src1->get_nb(2) + i11 * src1->get_nb(1); - auto * dst_row = dst_ptr + i03 * out->get_nb(3) + i02 * out->get_nb(2) + i01 * out->get_nb(1); - _RowFunc(reinterpret_cast(src0_row), reinterpret_cast(src1_row), - static_cast(out->get_ne(0)), reinterpret_cast<_TyDst *>(dst_row)); + const auto i03 = ir / rows_per_cube; + const auto i02 = ir / out->get_ne(1) - i03 * out->get_ne(2); + const auto i01 = ir % out->get_ne(1); // TODO: should we use divide instead of mod? + const auto i13 = i03 % src1->get_ne(3); + const auto i12 = i02 % src1->get_ne(2); + const auto i11 = i01 % src1->get_ne(1); + + auto * src1_plane = src1_ptr + i13 * src1->get_nb(3) + i12 * src1->get_nb(2); + auto * src0_row = src0_ptr + i03 * src0->get_nb(3) + i02 * src0->get_nb(2) + i01 * src0->get_nb(1); + auto * src1_row = src1_plane + i11 * src1->get_nb(1); + auto * dst_row = dst_ptr + i03 * out->get_nb(3) + i02 * out->get_nb(2) + i01 * out->get_nb(1); + if (ir + 1 < start_end.second) { + hexagon::l2fetch_row(src0_row + src0->get_nb(1), valid_row_bytes); + hexagon::l2fetch_row(src1_row + src1->get_nb(1), valid_row_bytes); + } + + _RowFunc(reinterpret_cast(src0_row), reinterpret_cast(src1_row), + static_cast(out->get_ne(0)), reinterpret_cast(dst_row)); } return true; @@ -120,19 +169,37 @@ bool element_wise_op(hexagon::tensor * out, size_t tidx, size_t tcnt) { bool is_element_wise_op_supported(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1, const npu_device_tensor_spec & dst, npu_device_tensor_op op) { if (op != NPU_OP_ADD && op != NPU_OP_SUB && op != NPU_OP_MUL) { - DEVICE_LOG_DEBUG("Unsupported element wise op: %s\n", hexagon::op_get_name(op)); + DEVICE_LOG_DEBUG("[%s]unsupported\n", hexagon::op_get_name(op)); + return false; + } + + if (dst.type != src0.type || dst.type != src1.type) { + DEVICE_LOG_DEBUG("[%s]src0.type and dst.type mismatch: %s vs %s\n", hexagon::op_get_name(op), + hexagon::get_type_name(src0.type), hexagon::get_type_name(dst.type)); + return false; + } + + if (dst.type != NPU_DATA_TYPE_F32 && dst.type != NPU_DATA_TYPE_F16) { + DEVICE_LOG_DEBUG("[%s]unsupported data type: %s\n", hexagon::op_get_name(op), hexagon::get_type_name(dst.type)); + return false; + } + + // TODO: fix FP16 add/sub + if (dst.type == NPU_DATA_TYPE_F16 && op != NPU_OP_MUL) { + DEVICE_LOG_DEBUG("[%s]unsupported data type: %s\n", hexagon::op_get_name(op), hexagon::get_type_name(dst.type)); return false; } if (src0.ne[0] != src1.ne[0]) { - DEVICE_LOG_DEBUG("src0.ne[0] and src1.ne[0] not match: %ld vs %ld\n", (long) src0.ne[0], (long) src1.ne[0]); + DEVICE_LOG_DEBUG("[%s]src0.ne[0] and src1.ne[0] not match: %ld vs %ld\n", hexagon::op_get_name(op), + (long) src0.ne[0], (long) src1.ne[0]); return false; } for (size_t i = 0; i < DEVICE_TENSOR_MAX_DIMS; 
++i) { if (src0.ne[i] != dst.ne[i]) { - DEVICE_LOG_DEBUG("src0.ne[%zu] and dst.ne[%zu] not match: %lld vs %lld\n", i, i, (long long) src0.ne[i], - (long long) dst.ne[i]); + DEVICE_LOG_DEBUG("[%s]src0.ne[%zu] and dst.ne[%zu] not match: %lld vs %lld\n", hexagon::op_get_name(op), i, + i, (long long) src0.ne[i], (long long) dst.ne[i]); return false; } } @@ -142,46 +209,67 @@ bool is_element_wise_op_supported(const npu_device_tensor_spec & src0, const npu struct op_capabilities { npu_device_tensor_op op; - hexagon::compute_func_type compute_func; hexagon::op_is_supported_func_type is_supported; + hexagon::compute_func_type compute_funcs[NPU_DATA_TYPE_COUNT]; }; constexpr const op_capabilities kOpCapabilities[] = { - { NPU_OP_MUL_MAT, hexagon::mul_mat_f32, hexagon::is_mul_mat_supported }, - { NPU_OP_ADD, element_wise_op>, is_element_wise_op_supported }, - { NPU_OP_SUB, element_wise_op>, is_element_wise_op_supported }, - { NPU_OP_MUL, element_wise_op>, is_element_wise_op_supported }, + { + NPU_OP_MUL_MAT, hexagon::is_mul_mat_supported, + { + hexagon::mul_mat_f32, // NPU_DATA_TYPE_F32 + nullptr, // NPU_DATA_TYPE_F16 + }, }, + { NPU_OP_ADD, + is_element_wise_op_supported, { + element_wise_op>, // NPU_DATA_TYPE_F32 + element_wise_op>, // NPU_DATA_TYPE_F16 + } }, + { NPU_OP_SUB, + is_element_wise_op_supported, { + element_wise_op>, // NPU_DATA_TYPE_F32 + element_wise_op>, // NPU_DATA_TYPE_F16 + } }, + { NPU_OP_MUL, + is_element_wise_op_supported, { + element_wise_op>, // NPU_DATA_TYPE_F32 + element_wise_op>, // NPU_DATA_TYPE_F16 + } }, }; -static_assert(kOpCapabilities[NPU_OP_MUL_MAT].compute_func == hexagon::mul_mat_f32, +static_assert(kOpCapabilities[NPU_OP_MUL_MAT].compute_funcs[NPU_DATA_TYPE_F32] == hexagon::mul_mat_f32, "kOpArray[NPU_OP_MUL_MAT] != mul_mat_f32"); static_assert(std::size(kOpCapabilities) == NPU_OP_COUNT); static_assert(kOpCapabilities[NPU_OP_MUL_MAT].op == NPU_OP_MUL_MAT, "kOpArray[NPU_OP_MUL_MAT].op != NPU_OP_MUL_MAT"); static_assert(kOpCapabilities[NPU_OP_MUL].op == NPU_OP_MUL, "kOpArray[NPU_OP_MUL].op != NPU_OP_MUL"); -} // namespace - -namespace hexagon { - -compute_func_type get_compute_func(npu_device_tensor_op op) { +hexagon::compute_func_type get_compute_func_impl(npu_device_tensor_op op, npu_device_tensor_data_type type) { if (op >= NPU_OP_COUNT) { return nullptr; } - return kOpCapabilities[op].compute_func; + return kOpCapabilities[op].compute_funcs[type]; +} + +} // namespace + +namespace hexagon { + +compute_func_type get_compute_func(tensor * dst) { + return get_compute_func_impl(dst->get_op(), dst->get_type()); } bool support_op(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1, const npu_device_tensor_spec & dst, npu_device_tensor_op op) { - if (get_compute_func(op) == nullptr) { - DEVICE_LOG_ERROR("Unsupported op: %s, get_compute_func failed\n", op_get_name(op)); + if (get_compute_func_impl(op, dst.type) == nullptr) { + DEVICE_LOG_ERROR("[%s]unsupported, get_compute_func failed\n", op_get_name(op)); return false; } auto is_supported_func = kOpCapabilities[op].is_supported; if (!is_supported_func || !is_supported_func(src0, src1, dst, op)) { - DEVICE_LOG_ERROR("Unsupported op: %s, is_supported_func failed\n", op_get_name(op)); + DEVICE_LOG_DEBUG("[%s]unsupported, is_supported_func failed\n", op_get_name(op)); return false; } diff --git a/ggml/src/ggml-qnn/npu/device/op_impl.hpp b/ggml/src/ggml-qnn/npu/device/op_impl.hpp index 6b30d24819e89..f9a3d01187793 100644 --- a/ggml/src/ggml-qnn/npu/device/op_impl.hpp +++ 
b/ggml/src/ggml-qnn/npu/device/op_impl.hpp @@ -1,15 +1,10 @@ #pragma once -#include "hexagon_npu.h" -#include "tensor.hpp" +#include "op_types.hpp" namespace hexagon { -typedef bool (*compute_func_type)(tensor * dst, size_t tidx, size_t tcnt); -typedef bool (*op_is_supported_func_type)(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1, - const npu_device_tensor_spec & dst, npu_device_tensor_op op); - -compute_func_type get_compute_func(npu_device_tensor_op op); +compute_func_type get_compute_func(tensor * dst); bool support_op(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1, const npu_device_tensor_spec & dst, npu_device_tensor_op op); diff --git a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp index 381629da3437c..647a5ff925737 100644 --- a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp @@ -2,17 +2,42 @@ #include +#include "quants.hpp" +#include "vtcm_mem.hpp" + namespace { +inline float vec_reduction_f32(HVX_Vector sums) { + constexpr const size_t kFloatsPerVector = hexagon::kBytesPerVector / sizeof(float); + static_assert(kFloatsPerVector == 32 || kFloatsPerVector == 16, "kFloatsPerVector should be 16 or 32"); + + // TODO: do we have a better way to do the reduction? + switch (kFloatsPerVector) { + default: + case 32: + sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 16 * sizeof(float))); + // fallthrough + case 16: + sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 8 * sizeof(float))); + sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 4 * sizeof(float))); + sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 2 * sizeof(float))); + sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, sizeof(float))); + break; + } + + return hexagon::get_flt0_from_fltv(Q6_Vsf_equals_Vqf32(sums)); +} + inline float vec_dot_product_f32_f32(const float * src0, const float * src1, size_t count) { + constexpr const size_t kElementsPerVector = hexagon::kBytesPerVector / sizeof(float); + HVX_Vector * iptr0 = ((HVX_Vector *) src0); - HVX_Vector * iptr0_end = ((HVX_Vector *) src0) + (count / hexagon::kFloatsPerVector); + HVX_Vector * iptr0_end = ((HVX_Vector *) src0) + (count / kElementsPerVector); HVX_Vector * iptr1 = ((HVX_Vector *) src1); HVX_Vector prev0 = *iptr0++; HVX_Vector prev1 = *iptr1++; HVX_Vector sum = Q6_V_vzero(); - // TODO: prefetch or just use VTCM? while (iptr0 < iptr0_end) { HVX_Vector curr0 = *iptr0++; HVX_Vector curr1 = *iptr1++; @@ -41,7 +66,7 @@ inline float vec_dot_product_f32_f32(const float * src0, const float * src1, siz prev1 = curr1; } - const size_t leftover = count % hexagon::kFloatsPerVector; + const size_t leftover = count % kElementsPerVector; const size_t leftover_bytes = leftover * sizeof(float); if (leftover > 0) { // handle the leftover elements @@ -57,21 +82,201 @@ inline float vec_dot_product_f32_f32(const float * src0, const float * src1, siz Q6_V_valign_VVR(Q6_Vqf32_vmpy_VsfVsf(curr0, curr1), Q6_V_vzero(), leftover_bytes), sum); } - // TODO: do we have a better way to do the reduction? - for (size_t i = hexagon::kFloatsPerVector / 2; i > 0; i /= 2) { - sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum, Q6_V_vror_VR(sum, i * sizeof(float))); + return vec_reduction_f32(sum); +} + +// TODO: merge with vec_dot_product_f32_f32? 
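The vectorized fp16 dot product that follows is easier to read against a scalar reference: each pair of fp16 elements is multiplied and accumulated at fp32 precision, which is what the widening Q6_Wqf32_vmpy_VhfVhf multiply and the qf32 accumulators do 64 lanes at a time. This is a sketch for illustration only; to_float stands in for an fp16-to-fp32 conversion (a helper of that name appears later in quants.cpp) and is not part of the patch's function:

    // Scalar reference (sketch) of the computation below: products and the running
    // sum are kept in fp32, matching the qf32 accumulation of the HVX version.
    static float vec_dot_product_f16_f16_scalar(const npu_device_fp16_t * src0,
                                                const npu_device_fp16_t * src1, size_t count) {
        float sum = 0.0f;
        for (size_t i = 0; i < count; ++i) {
            sum += to_float(src0[i]) * to_float(src1[i]);
        }
        return sum;
    }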
+inline float vec_dot_product_f16_f16(const npu_device_fp16_t * src0, const npu_device_fp16_t * src1, size_t count) { + constexpr const size_t kElementsPerVector = hexagon::kBytesPerVector / sizeof(npu_device_fp16_t); + constexpr const size_t kFloatsPerVector = hexagon::kBytesPerVector / sizeof(float); + + HVX_Vector * iptr0 = ((HVX_Vector *) src0); + HVX_Vector * iptr0_end = ((HVX_Vector *) src0) + (count / kElementsPerVector); + HVX_Vector * iptr1 = ((HVX_Vector *) src1); + HVX_Vector prev0 = *iptr0++; + HVX_Vector prev1 = *iptr1++; + HVX_Vector sum_hi = Q6_V_vzero(); + HVX_Vector sum_lo = Q6_V_vzero(); + + while (iptr0 < iptr0_end) { + HVX_Vector curr0 = *iptr0++; + HVX_Vector curr1 = *iptr1++; + HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); + HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); + HVX_VectorPair result = Q6_Wqf32_vmpy_VhfVhf(s0, s1); + sum_hi = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_hi_W(result), sum_hi); + sum_lo = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(result), sum_lo); + prev0 = curr0; + prev1 = curr1; + } + + if ((iptr0_end - ((HVX_Vector *) src0)) > 0) { + // handle the last vector + // see also: + // https://github.com/UbiquitousLearning/mllm/blob/babf4410352ce8730824c87699c025a0d4ce3a6f/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/src/ops/LLaMAMul.cpp#L147 + // or qualcomm sdk libs\qhl_hvx\src\qhblas_hvx\qhblas_hvx_aw_vector_add_ah.c + bool iptr0_aligned = hexagon::is_addr_aligned(iptr0); + HVX_Vector curr0 = iptr0_aligned ? prev0 : *iptr0; + iptr0 = iptr0_aligned ? iptr0 : iptr0 + 1; + bool iptr1_aligned = hexagon::is_addr_aligned(iptr1); + HVX_Vector curr1 = iptr1_aligned ? prev1 : *iptr1; + iptr1 = iptr1_aligned ? iptr1 : iptr1 + 1; + HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); + HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); + HVX_VectorPair result = Q6_Wqf32_vmpy_VhfVhf(s0, s1); + sum_hi = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_hi_W(result), sum_hi); + sum_lo = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(result), sum_lo); + prev0 = curr0; + prev1 = curr1; + } + + const size_t leftover = count % kElementsPerVector; + const size_t leftover_bytes = leftover * sizeof(npu_device_fp16_t); + if (leftover > 0) { + // handle the leftover elements + HVX_Vector curr0 = + (leftover_bytes + hexagon::unaligned_bytes(iptr0) > hexagon::kBytesPerVector) ? *iptr0 : prev0; + curr0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); + + HVX_Vector curr1 = + (leftover_bytes + hexagon::unaligned_bytes(iptr1) > hexagon::kBytesPerVector) ? *iptr1 : prev1; + curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); + + HVX_VectorPair result = Q6_Wqf32_vmpy_VhfVhf(curr0, curr1); + + // TODO: can we do this better? 
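        // Only `leftover` of the 64 fp16 lanes in `result` carry valid products at
        // this point; the valign-with-vzero combining below is there to keep the
        // invalid lanes from contributing to the sum_hi / sum_lo accumulators.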
+ if (leftover > kFloatsPerVector) { + sum_hi = Q6_Vqf32_vadd_Vqf32Vqf32( + Q6_V_valign_VVR(Q6_V_hi_W(result), Q6_V_vzero(), (leftover % kFloatsPerVector) * sizeof(float)), + sum_hi); + sum_lo = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(result), sum_lo); + } else { + sum_lo = Q6_Vqf32_vadd_Vqf32Vqf32( + Q6_V_valign_VVR(Q6_V_lo_W(result), Q6_V_vzero(), leftover * sizeof(float)), sum_lo); + } } - float result; - q6op_vstu_variable_ARV(&result, sizeof(float), Q6_Vsf_equals_Vqf32(sum)); - return result; + return vec_reduction_f32(Q6_Vqf32_vadd_Vqf32Vqf32(sum_hi, sum_lo)); +} + +template struct get_data_type {}; + +template struct get_data_type { + using type = _TyData; +}; + +template +void mul_mat_impl(hexagon::tensor * src0, hexagon::tensor * src1, hexagon::tensor * dst, + hexagon::compute_params * params) { + using data_type = typename get_data_type::type; + + const bool is_quantized = hexagon::is_quantized_type(src0->get_type()); + const auto src0_actual_row_size = hexagon::get_dequantized_row_size(src0); + auto * dequantize_row_func = hexagon::get_type_traits(src0->get_type()).dequantize_row; + if (is_quantized && dequantize_row_func == nullptr) { + DEVICE_LOG_ERROR("Unsupported quantized src0 type: %d, dequantize_row_func is null\n", src0->get_type()); + return; + } + + const auto r02 = src1->get_ne(2) / src0->get_ne(2); + const auto r03 = src1->get_ne(3) / src0->get_ne(3); + const auto * src0_ptr = reinterpret_cast(src0->get_read_buffer()); + const auto * src1_ptr = reinterpret_cast(src1->get_read_buffer()); + auto * dst_ptr = reinterpret_cast(dst->get_write_buffer()); + const auto total_planes = dst->get_ne(3) * dst->get_ne(2); + + auto start_end_plane = std::pair{ 0, total_planes }; + auto start_end_row = std::pair{ 0, dst->get_ne(1) }; + auto start_end_element = std::pair{ 0, dst->get_ne(0) }; + + if (total_planes >= params->tcnt) { + start_end_plane = hexagon::get_thread_work_slice(total_planes, params->tidx, params->tcnt); + } else if (dst->get_ne(1) >= params->tcnt) { + start_end_row = hexagon::get_thread_work_slice(dst->get_ne(1), params->tidx, params->tcnt); + } else { + start_end_element = hexagon::get_thread_work_slice(dst->get_ne(0), params->tidx, params->tcnt); + } + + if (start_end_plane.second <= start_end_plane.first || start_end_row.second <= start_end_row.first || + start_end_element.second <= start_end_element.first) { + DEVICE_LOG_DEBUG( + "mul_mat_impl: no work to do, start_end_plane: (%ld, %ld), start_end_row: (%ld, %ld), " + "start_end_element: (%ld, %ld)\n", + start_end_plane.first, start_end_plane.second, start_end_row.first, start_end_row.second, + start_end_element.first, start_end_element.second); + return; + } + + // cache the src0 plane in VTCM + const size_t src0_plane_row_count = start_end_element.second - start_end_element.first; + size_t src0_plane_cache_size = 0; + uint8_t * src0_plane_cache_ptr = nullptr; + const uint8_t * last_cached_plane_ptr = nullptr; + if (is_quantized) { + src0_plane_cache_size = src0_actual_row_size * src0_plane_row_count; + src0_plane_cache_ptr = params->get_cache(src0_plane_cache_size, is_quantized); + } + + DEVICE_LOG_DEBUG("mul_mat_impl src0_actual_row_size: %zu, is_quantized: %d, vtcm_mem: %p(%zu)\n", + src0_actual_row_size, is_quantized, (void *) src0_plane_cache_ptr, src0_plane_cache_size); + + const size_t valid_row_bytes = src1->get_ne(0) * sizeof(data_type); + DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_WITH_SUB_PROC(dst, params->tidx, dequant); + for (int64_t ip = start_end_plane.first; ip < start_end_plane.second; ip++) { + const 
auto i3 = ip / dst->get_ne(2); + const auto i2 = ip - i3 * dst->get_ne(2); + const auto * src0_plane = src0_ptr + i3 / r03 * src0->get_nb(3) + i2 / r02 * src0->get_nb(2) + + start_end_element.first * src0->get_nb(1); + const auto * src1_plane = src1_ptr + i3 * src1->get_nb(3) + i2 * src1->get_nb(2); + auto * dst_plane = dst_ptr + i3 * dst->get_nb(3) + i2 * dst->get_nb(2); + + if (src0_plane_cache_ptr) { + if (last_cached_plane_ptr != src0_plane) { + DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_SUB_PROC(dequant); + + for (int64_t ir = 0; ir < (int64_t) src0_plane_row_count; ir++) { + auto * src0_row = src0_plane + ir * src0->get_nb(1); + if (ir + 1 < src0_plane_row_count) { + hexagon::l2fetch_row(src0_row + src0->get_nb(1), src0->get_nb(1)); + } + + auto * dst_row = reinterpret_cast(src0_plane_cache_ptr + ir * src0_actual_row_size); + dequantize_row_func(src0_row, reinterpret_cast(dst_row), src0->get_ne(0), + params->f16_to_f32_table); + } + + last_cached_plane_ptr = src0_plane; + } + + src0_plane = src0_plane_cache_ptr; + } + + for (int64_t i1 = start_end_row.first; i1 < start_end_row.second; i1++) { + auto * src1_row = src1_plane + i1 * src1->get_nb(1); + auto * dst_row = reinterpret_cast(dst_plane + i1 * dst->get_nb(1)) + start_end_element.first; + for (int64_t i0 = 0; i0 < (int64_t) src0_plane_row_count; i0++) { + auto * src0_row = src0_plane + i0 * src0_actual_row_size; + if (i0 + 1 < src0_plane_row_count) { + if (!src0_plane_cache_ptr) { + hexagon::l2fetch_row(src0_row + src0_actual_row_size, valid_row_bytes); + } + } else if (ip + 1 < start_end_plane.second) { + hexagon::l2fetch_row(src1_row + src1->get_nb(1), valid_row_bytes); + } + + // TODO: figure dst how to handle a entire row + dst_row[i0] = _DotFunc(reinterpret_cast(src0_row), + reinterpret_cast(src1_row), (size_t) src0->get_ne(0)); + } + } + } } } // namespace namespace hexagon { -bool mul_mat_f32(hexagon::tensor * out, size_t tidx, size_t tcnt) { +bool mul_mat_f32(hexagon::tensor * out, compute_params * params) { if (!out) { return false; } @@ -83,62 +288,80 @@ bool mul_mat_f32(hexagon::tensor * out, size_t tidx, size_t tcnt) { return true; // skip if no src } - const auto r02 = src1->get_ne(2) / src0->get_ne(2); - const auto r03 = src1->get_ne(3) / src0->get_ne(3); - const auto * src0_ptr = reinterpret_cast(src0->get_data()); - const auto * src1_ptr = reinterpret_cast(src1->get_data()); - auto * dst_ptr = reinterpret_cast(out->get_data()); - const auto total_planes = out->get_ne(3) * out->get_ne(2); - - const auto start_end_plane = (total_planes >= tcnt) ? get_thread_work_slice(total_planes, tidx, tcnt) : - std::pair{ 0, total_planes }; - const auto start_end_row = (total_planes >= tcnt) ? std::pair{ 0, out->get_ne(1) } : - get_thread_work_slice(out->get_ne(1), tidx, tcnt); - for (int64_t ip = start_end_plane.first; ip < start_end_plane.second; ip++) { - const auto i3 = ip / out->get_ne(2); - const auto i2 = ip - i3 * out->get_ne(2); - const auto * src0_plane = src0_ptr + i3 / r03 * src0->get_nb(3) + i2 / r02 * src0->get_nb(2); - const auto * src1_plane = src1_ptr + i3 * src1->get_nb(3) + i2 * src1->get_nb(2); - auto * dst_plane = dst_ptr + i3 * out->get_nb(3) + i2 * out->get_nb(2); - for (int64_t i1 = start_end_row.first; i1 < start_end_row.second; i1++) { - // TODO: prefetch row? 
- auto * src1_row = src1_plane + i1 * src1->get_nb(1); - auto * dst_row = reinterpret_cast(dst_plane + i1 * out->get_nb(1)); - for (int64_t i0 = 0; i0 < out->get_ne(0); i0++) { - auto * src0_row = src0_plane + i0 * src0->get_nb(1); - // TODO: figure out how to handle a entire row - *dst_row++ = - vec_dot_product_f32_f32(reinterpret_cast(src0_row), - reinterpret_cast(src1_row), (size_t) src0->get_ne(0)); - } - } + // TODO: array? + switch (src1->get_type()) { + case NPU_DATA_TYPE_F32: + mul_mat_impl(src0, src1, out, params); + return true; + + case NPU_DATA_TYPE_F16: + mul_mat_impl(src0, src1, out, params); + return true; + default: + break; } - return true; + DEVICE_LOG_ERROR("Unsupported src1 tensor type: %s\n", get_type_name(src1->get_type())); + return false; } bool is_mul_mat_supported(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1, const npu_device_tensor_spec & dst, npu_device_tensor_op op) { if (op != NPU_OP_MUL_MAT) { - DEVICE_LOG_DEBUG("op is not NPU_OP_MUL_MAT: %d\n", op); + DEVICE_LOG_DEBUG("op is not MUL_MAT: %d\n", op); + return false; + } + + if (dst.type != NPU_DATA_TYPE_F32) { + DEVICE_LOG_DEBUG("[%s]dst type is not F32: %s\n", op_get_name(op), get_type_name(dst.type)); + return false; + } + + if (src0.type != src1.type) { +#ifdef GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS + if (src1.type != NPU_DATA_TYPE_F32) { + DEVICE_LOG_DEBUG("[%s]src0.type(%s) and src1.type(%s) mismatch and src1 is not F32\n", op_get_name(op), + get_type_name(src0.type), get_type_name(src1.type)); + return false; + } + + const auto type_traits = get_type_traits(src0.type); + if (!type_traits.is_quantized || type_traits.dequantize_row == nullptr) { + DEVICE_LOG_DEBUG("[%s]src0.type(%s) and src1.type(%s) mismatch and src0 is not quantized\n", + op_get_name(op), get_type_name(src0.type), get_type_name(src1.type)); + return false; + } + + if (src0.ne[0] % type_traits.blck_size) { + DEVICE_LOG_DEBUG("[%s]src0.type(%s) ne[0] is not aligned: %ld\n", op_get_name(op), get_type_name(src0.type), + (long) src0.ne[0]); + return false; + } + + DEVICE_LOG_DEBUG("[%s]supported quantized src0.type(%s) and src1.type(%s)\n", op_get_name(op), + get_type_name(src0.type), get_type_name(src1.type)); +#else + DEVICE_LOG_DEBUG("[%s]src0.type(%s) and src1.type(%s) mismatch and quantized tensors are not supported\n", + op_get_name(op), get_type_name(src0.type), get_type_name(src1.type)); return false; +#endif } if (src0.ne[0] != src1.ne[0] || src0.ne[1] != dst.ne[0]) { - DEVICE_LOG_DEBUG("src0 and src1 cannot multiply: %ldx%ld vs %ldx%ld\n", (long) src0.ne[0], (long) src0.ne[1], - (long) src1.ne[0], (long) src1.ne[1]); + DEVICE_LOG_DEBUG("[%s]src0 and src1 cannot multiply: %ldx%ld vs %ldx%ld\n", op_get_name(op), (long) src0.ne[0], + (long) src0.ne[1], (long) src1.ne[0], (long) src1.ne[1]); return false; } if (src1.ne[1] != dst.ne[1] || src1.ne[2] != dst.ne[2] || src1.ne[3] != dst.ne[3]) { - DEVICE_LOG_DEBUG("src1 and dst dimensions not match: %ldx%ld vs %ldx%ld\n", (long) src1.ne[2], - (long) src1.ne[3], (long) dst.ne[2], (long) dst.ne[3]); + DEVICE_LOG_DEBUG("[%s]src1 and dst dimensions not match: %ldx%ld vs %ldx%ld\n", op_get_name(op), + (long) src1.ne[2], (long) src1.ne[3], (long) dst.ne[2], (long) dst.ne[3]); return false; } if (src1.ne[2] % src0.ne[2] || src1.ne[3] % src0.ne[3]) { - DEVICE_LOG_DEBUG("src0 cannot broadcast to src1: %ldx%ld vs %ldx%ld\n", (long) src0.ne[2], (long) src0.ne[3], - (long) src1.ne[2], (long) src1.ne[3]); + DEVICE_LOG_DEBUG("[%s]src0 cannot broadcast to src1: %ldx%ld vs 
%ldx%ld\n", op_get_name(op), (long) src0.ne[2], + (long) src0.ne[3], (long) src1.ne[2], (long) src1.ne[3]); return false; } diff --git a/ggml/src/ggml-qnn/npu/device/op_mul_mat.hpp b/ggml/src/ggml-qnn/npu/device/op_mul_mat.hpp index fc2eb2c97e3eb..3a97858606cd4 100644 --- a/ggml/src/ggml-qnn/npu/device/op_mul_mat.hpp +++ b/ggml/src/ggml-qnn/npu/device/op_mul_mat.hpp @@ -2,15 +2,15 @@ #include -#include - +#include "op_types.hpp" #include "tensor.hpp" namespace hexagon { -constexpr const size_t kBytesPerVector = sizeof(HVX_Vector); // 128 for v73 -constexpr const size_t kFloatsPerVector = kBytesPerVector / sizeof(float); -constexpr const size_t kAlignMask = kBytesPerVector - 1; +constexpr const size_t kBytesPerVector = sizeof(HVX_Vector); // 128 for v73 +constexpr const size_t kAlignMask = kBytesPerVector - 1; +constexpr const size_t kL2CacheSize = 8 * 1024; // // 8KB L2 cache +constexpr const size_t kL2FetchAheadVectors = kL2CacheSize / kBytesPerVector; inline size_t unaligned_bytes(const void * addr) { return ((size_t) addr) & kAlignMask; @@ -20,7 +20,30 @@ inline bool is_addr_aligned(void * addr) { return unaligned_bytes(addr) == 0; } -bool mul_mat_f32(tensor * out, size_t tidx, size_t tcnt); +inline void l2fetch(const void * p, uint32_t stride, uint32_t width, uint32_t height, uint32_t dir) { + uint64_t control = HEXAGON_V64_CREATE_H(dir, stride, width, height); + __asm__ __volatile__(" l2fetch(%0,%1) " : : "r"(p), "r"(control)); +} + +inline void l2fetch_row(const uint8_t * curr_row, size_t bytes) { + // TODO: should we use small kL2FetchAheadVectors? + int32_t l2fetch_vectors = Q6_R_min_RR(bytes / kBytesPerVector, kL2FetchAheadVectors); + hexagon::l2fetch(curr_row, kBytesPerVector, kBytesPerVector, l2fetch_vectors, 0); +} + +inline float get_flt0_from_fltv(HVX_Vector vect) { + // See also: tools\HEXAGON_Tools\8.6.07\Examples\StandAlone_Applications\QFloat\QFloat.c + + union { + int32_t i; + float f; + } cvt; + + cvt.i = vect[0]; + return cvt.f; +} + +bool mul_mat_f32(tensor * out, compute_params * params); bool is_mul_mat_supported(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1, const npu_device_tensor_spec & dst, npu_device_tensor_op op); diff --git a/ggml/src/ggml-qnn/npu/device/op_types.hpp b/ggml/src/ggml-qnn/npu/device/op_types.hpp new file mode 100644 index 0000000000000..8bf10637db51c --- /dev/null +++ b/ggml/src/ggml-qnn/npu/device/op_types.hpp @@ -0,0 +1,58 @@ +#pragma once + +#include +#include +#include +#include + +#include "hexagon_npu.h" +#include "tensor.hpp" +#include "util.hpp" +#include "vtcm_mem.hpp" + +namespace hexagon { + +struct compute_params { + const size_t tidx; + const size_t tcnt; + const float * f16_to_f32_table; + std::unique_ptr vtcm_cache; + std::unique_ptr mem_cache; + size_t mem_cache_size = 0; + + uint8_t * get_cache(size_t size, bool fallback_to_mem) { + if (!vtcm_cache || vtcm_cache->get_size() < size) { + vtcm_cache = std::make_unique(size, false); + } + + if (vtcm_cache->is_valid()) { + return vtcm_cache->get_mem(); + } + + if (!fallback_to_mem) { + DEVICE_LOG_DEBUG("vtcm_mem not valid, return nullptr\n"); + return nullptr; + } + + DEVICE_LOG_DEBUG("vtcm_mem not valid, allocate from mem_cache\n"); + if (!mem_cache || mem_cache_size < size) { + mem_cache = std::make_unique(size + 256); + mem_cache_size = mem_cache ? 
size : 0; + } + + return mem_cache.get(); + } +}; + +typedef bool (*compute_func_type)(tensor * dst, compute_params * params); +typedef bool (*op_is_supported_func_type)(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1, + const npu_device_tensor_spec & dst, npu_device_tensor_op op); + +inline constexpr std::pair get_thread_work_slice(int64_t total, size_t tidx, size_t tcnt) { + const auto elements_per_thread = (total + tcnt - 1) / tcnt; + const auto start = tidx * elements_per_thread; + const auto end = std::min(start + elements_per_thread, total); + return { start, end }; +} + +} // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/device/quants.cpp b/ggml/src/ggml-qnn/npu/device/quants.cpp new file mode 100644 index 0000000000000..d873691b58e15 --- /dev/null +++ b/ggml/src/ggml-qnn/npu/device/quants.cpp @@ -0,0 +1,151 @@ +#include "quants.hpp" + +#include + +#include + +static_assert(sizeof(npu_device_block_q4_K) == + 2 * sizeof(npu_device_fp16_t) + QUANT_K_SCALE_SIZE + QUANT_K_BLOCK_SIZE / 2, + "wrong q4_K block size/padding"); + +static_assert(sizeof(npu_device_block_q4_0) == sizeof(npu_device_fp16_t) + QUANT_BLOCK_SIZE / 2, + "wrong q4_0 block size/padding"); + +static_assert(sizeof(npu_device_block_q8_0) == sizeof(npu_device_fp16_t) + QUANT_BLOCK_SIZE, + "wrong q8_0 block size/padding"); + +namespace { + +inline float to_float(const npu_device_fp16_t src) { + union { + __fp16 f16; + npu_device_fp16_t u16; + } f16; + + f16.u16 = src; + return f16.f16; +} + +inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t * d, uint8_t * m) { + if (j < 4) { + *d = q[j] & 63; + *m = q[j + 4] & 63; + } else { + *d = (q[j + 4] & 0xF) | ((q[j - 4] >> 6) << 4); + *m = (q[j + 4] >> 4) | ((q[j - 0] >> 6) << 4); + } +} + +void dequantize_row_q8_0(const void * src, float * dst, size_t count, const float * f16_to_f32_table) { + constexpr const int qk = QUANT_BLOCK_SIZE; + const int nb = count / qk; + const auto * src_ptr = reinterpret_cast(src); + + // TODO: use intrinsics + for (int i = 0; i < nb; i++) { + const float d = f16_to_f32_table[src_ptr[i].d]; + + for (int j = 0; j < qk; ++j) { + dst[i * qk + j] = src_ptr[i].qs[j] * d; + } + } +} + +void dequantize_row_q4_0(const void * src, float * dst, size_t count, const float * f16_to_f32_table) { + constexpr const int qk = QUANT_BLOCK_SIZE; + static_assert(qk % 2 == 0, "qk must be even"); + + const int nb = count / qk; + const auto * src_ptr = reinterpret_cast(src); + + // TODO: use intrinsics + for (int i = 0; i < nb; i++) { + const float d = f16_to_f32_table[src_ptr[i].d]; + + for (int j = 0; j < qk / 2; ++j) { + const int x0 = (src_ptr[i].qs[j] & 0x0F) - 8; + const int x1 = ((src_ptr[i].qs[j] >> 4) & 0xF) - 8; + + dst[i * qk + j + 0] = x0 * d; + dst[i * qk + j + qk / 2] = x1 * d; + } + } +} + +void dequantize_row_q4_K(const void * src, float * dst, size_t count, const float * f16_to_f32_table) { + const int nb = count / QUANT_K_BLOCK_SIZE; + const auto * src_ptr = reinterpret_cast(src); + + // TODO: use intrinsics + for (int i = 0; i < nb; i++) { + const uint8_t * q = src_ptr[i].qs; + + const float d = f16_to_f32_table[src_ptr[i].d]; + const float min = f16_to_f32_table[src_ptr[i].dmin]; + + int is = 0; + uint8_t sc = 0; + uint8_t m = 0; + const auto * scales = src_ptr[i].scales; + for (int j = 0; j < QUANT_K_BLOCK_SIZE; j += 64) { + get_scale_min_k4(is + 0, scales, &sc, &m); + const float d1 = d * sc; + const float m1 = min * m; + get_scale_min_k4(is + 1, scales, &sc, &m); + const float d2 = d * sc; + const float m2 = 
min * m; + for (int l = 0; l < 32; ++l) { + dst[0] = d1 * (q[l] & 0xF) - m1; + dst[32] = d2 * ((q[l] >> 4) & 0xF) - m2; + dst++; + } + dst += 32; + q += 32; + is += 2; + } + } +} + +constexpr const hexagon::device_type_traits kDeviceTypeTraits[] = { + { NPU_DATA_TYPE_F32, "F32", 1, false, nullptr }, + { NPU_DATA_TYPE_F16, "F16", 1, false, nullptr }, + { NPU_DATA_TYPE_Q8_0, "Q8_0", QUANT_BLOCK_SIZE, true, dequantize_row_q8_0 }, + { NPU_DATA_TYPE_Q4_0, "Q4_0", QUANT_BLOCK_SIZE, true, dequantize_row_q4_0 }, + { NPU_DATA_TYPE_Q4_K, "Q4_K", QUANT_K_BLOCK_SIZE, true, dequantize_row_q4_K }, +}; + +static_assert(std::size(kDeviceTypeTraits) == NPU_DATA_TYPE_COUNT, + "kDeviceTypeTraits size mismatch with npu_device_tensor_data_type enum"); +static_assert(kDeviceTypeTraits[NPU_DATA_TYPE_F32].type == NPU_DATA_TYPE_F32, + "kDeviceTypeTraits F32 type mismatch with npu_device_tensor_data_type enum"); +static_assert(kDeviceTypeTraits[NPU_DATA_TYPE_F16].type == NPU_DATA_TYPE_F16, + "kDeviceTypeTraits F16 type mismatch with npu_device_tensor_data_type enum"); +static_assert(kDeviceTypeTraits[NPU_DATA_TYPE_Q8_0].type == NPU_DATA_TYPE_Q8_0, + "kDeviceTypeTraits Q8_0 type mismatch with npu_device_tensor_data_type enum"); +static_assert(kDeviceTypeTraits[NPU_DATA_TYPE_Q4_0].type == NPU_DATA_TYPE_Q4_0, + "kDeviceTypeTraits Q4_0 type mismatch with npu_device_tensor_data_type enum"); +static_assert(kDeviceTypeTraits[NPU_DATA_TYPE_Q4_K].type == NPU_DATA_TYPE_Q4_K, + "kDeviceTypeTraits Q4_K type mismatch with npu_device_tensor_data_type enum"); + +} // namespace + +namespace hexagon { + +bool init_f16_f32_table(float * table, size_t count) { + constexpr const size_t kTableSize = (1U << 16); + if (count < kTableSize) { + return false; + } + + for (size_t i = 0; i < count; ++i) { + table[i] = to_float(i); + } + + return true; +} + +const device_type_traits & get_type_traits(npu_device_tensor_data_type type) { + return kDeviceTypeTraits[type]; +} + +} // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/device/quants.hpp b/ggml/src/ggml-qnn/npu/device/quants.hpp new file mode 100644 index 0000000000000..6ffbeb0031635 --- /dev/null +++ b/ggml/src/ggml-qnn/npu/device/quants.hpp @@ -0,0 +1,78 @@ + +#include "hexagon_npu.h" +#include "tensor.hpp" +#include "util.hpp" + +namespace hexagon { + +bool init_f16_f32_table(float * table, size_t count); + +typedef void (*dequantize_row_type)(const void * src, float * dst, size_t count, const float * f16_to_f32_table); + +struct device_type_traits { + npu_device_tensor_data_type type; + const char * type_name; + int64_t blck_size; + bool is_quantized; + dequantize_row_type dequantize_row; +}; + +const device_type_traits & get_type_traits(npu_device_tensor_data_type type); + +inline bool is_quantized_type(npu_device_tensor_data_type type) { + return get_type_traits(type).is_quantized; +} + +inline size_t get_dequantized_row_size(tensor * tensor) { + if (!is_quantized_type(tensor->get_type())) { + return tensor->get_nb(1); // for f32 and f16 + } + + auto row_elems_count = tensor->get_ne(0); + return row_elems_count * sizeof(float); // currently only f32 is supported +} + +inline const char * get_type_name(npu_device_tensor_data_type type) { + return get_type_traits(type).type_name; +} + +} // namespace hexagon + +// TODO: move this to a common header +#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING +namespace hexagon { + +inline auto make_scoped_op_perf_timer(tensor * op, size_t tidx, const char * sub_proc_log_prefix = nullptr) { + auto * src0 = op->get_src(0); + auto * src1 = 
op->get_src(1); + char buffer[512]; + if (src1 == nullptr) { + snprintf(buffer, sizeof(buffer), "[%s][%lldx%lldx%lldx%lld%s], tidx: %zu", op_get_name(op->get_op()), + src0->get_ne(0), src0->get_ne(1), src0->get_ne(2), src0->get_ne(3), get_type_name(src0->get_type()), + tidx); + } else { + snprintf(buffer, sizeof(buffer), "[%s][%lldx%lldx%lldx%lld%s],[%lldx%lldx%lldx%lld%s], tidx: %zu", + op_get_name(op->get_op()), src0->get_ne(0), src0->get_ne(1), src0->get_ne(2), src0->get_ne(3), + get_type_name(src0->get_type()), src1->get_ne(0), src1->get_ne(1), src1->get_ne(2), src1->get_ne(3), + get_type_name(src1->get_type()), tidx); + } + return npu_scoped_timer<512>(buffer, sub_proc_log_prefix); +} + +} // namespace hexagon + +# define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER(op, tidx) \ + auto __npu_op_timer_##__LINE__ = hexagon::make_scoped_op_perf_timer(op, tidx) + +# define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_WITH_SUB_PROC(op, tidx, sub_prefix) \ + auto __npu_op_timer_##sub_prefix = hexagon::make_scoped_op_perf_timer(op, tidx, #sub_prefix) + +# define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_SUB_PROC(sub_prefix) \ + hexagon::npu_sub_process_scoped_timer \ + __npu_op_sub_timer##sub_prefix(__npu_op_timer_##sub_prefix) + +#else +# define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER(op, tidx) ((void) 0) +# define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_WITH_SUB_PROC(op, tidx, sub_prefix) ((void) 0) +# define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_SUB_PROC(sub_prefix) ((void) 0) +#endif diff --git a/ggml/src/ggml-qnn/npu/device/tensor.hpp b/ggml/src/ggml-qnn/npu/device/tensor.hpp index ad1915ecb6418..9c7f6bffefff6 100644 --- a/ggml/src/ggml-qnn/npu/device/tensor.hpp +++ b/ggml/src/ggml-qnn/npu/device/tensor.hpp @@ -36,7 +36,14 @@ class tensor { DEVICE_LOG_INFO("~tensor(%p) fd: %d", (void *) this, _info.buffer_fd); } - void flush() { + void flush() const { + if (_data) { + qurt_mem_cache_clean((qurt_addr_t) (_data + _info.offset), (qurt_size_t) _info.size, QURT_MEM_CACHE_FLUSH, + QURT_MEM_DCACHE); + } + } + + void invalidate() const { if (_data) { qurt_mem_cache_clean((qurt_addr_t) (_data + _info.offset), (qurt_size_t) _info.size, QURT_MEM_CACHE_INVALIDATE, QURT_MEM_DCACHE); @@ -72,7 +79,12 @@ class tensor { npu_device_tensor_data_type get_type() const { return _info.type; } - uint8_t * get_data() const { return _data + _info.offset; } + const uint8_t * get_read_buffer() const { + invalidate(); + return _data + _info.offset; + } + + uint8_t * get_write_buffer() const { return _data + _info.offset; } bool is_valid() const { return _data != nullptr; } diff --git a/ggml/src/ggml-qnn/npu/device/thread_pool.hpp b/ggml/src/ggml-qnn/npu/device/thread_pool.hpp index a936ae0c4cafc..bd7e83dd8a485 100644 --- a/ggml/src/ggml-qnn/npu/device/thread_pool.hpp +++ b/ggml/src/ggml-qnn/npu/device/thread_pool.hpp @@ -143,6 +143,8 @@ template class thread_pool { return true; } + void sync_thread() { qurt_barrier_wait(&_completed); } + private: struct thread_pool_arg { thread_pool * pool = nullptr; diff --git a/ggml/src/ggml-qnn/npu/device/util.hpp b/ggml/src/ggml-qnn/npu/device/util.hpp index f6f5479694edd..a5e1ae5201c3b 100644 --- a/ggml/src/ggml-qnn/npu/device/util.hpp +++ b/ggml/src/ggml-qnn/npu/device/util.hpp @@ -1,9 +1,10 @@ #pragma once #include +#include -#include #include +#include #include #include "hexagon_npu.h" @@ -52,11 +53,105 @@ inline constexpr const char * op_get_name(npu_device_tensor_op op) { } } -inline constexpr std::pair get_thread_work_slice(int64_t total, size_t tidx, size_t tcnt) { - const auto 
elements_per_thread = (total + tcnt - 1) / tcnt; - const auto start = tidx * elements_per_thread; - const auto end = std::min(start + elements_per_thread, total); - return { start, end }; +#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING + +template class npu_scoped_timer { + public: + enum { kBufferCount = _buffer_count }; + + explicit npu_scoped_timer(const char * log_prefix, const char * sub_proc_log_prefix) { + strncpy(_log_prefix, log_prefix, kBufferCount - 1); + if (sub_proc_log_prefix != nullptr) { + strncpy(_sub_proc_log_prefix, sub_proc_log_prefix, kBufferCount - 1); + } + + _begin_cycles = HAP_perf_get_qtimer_count(); + _begin_pcycles = HAP_perf_get_pcycles(); + } + + npu_scoped_timer(npu_scoped_timer && other) { *this = std::move(other); } + + ~npu_scoped_timer() { print(); } + + void operator=(npu_scoped_timer && other) { + strncpy(_log_prefix, other._log_prefix, kBufferCount - 1); + strncpy(_sub_proc_log_prefix, other._sub_proc_log_prefix, kBufferCount - 1); + _begin_cycles = other._begin_cycles; + _sub_proc_cycles = other._sub_proc_cycles; + _sub_proc_count = other._sub_proc_count; + } + + void add_sub_proc_cycles(uint64_t cycles, uint64_t pcycles) { + _sub_proc_cycles += cycles; + _sub_proc_pcycles += pcycles; + _sub_proc_count++; + } + + void print() const { + auto total_cycles = HAP_perf_get_qtimer_count() - _begin_cycles; + auto total_pcycles = HAP_perf_get_pcycles() - _begin_pcycles; + auto duration = HAP_perf_qtimer_count_to_us(total_cycles); + + if (_sub_proc_count > 0) { + auto sub_proc_duration = HAP_perf_qtimer_count_to_us(_sub_proc_cycles); + DEVICE_LOG_WARN("[profiler]%s, pcyc: %llu, dur: %lluus, [%s]cnt: %llu, pcyc: %llu, dur: %lluus\n", + _log_prefix, total_pcycles, duration, _sub_proc_log_prefix, _sub_proc_count, + _sub_proc_pcycles, sub_proc_duration); + } else { + DEVICE_LOG_WARN("[profiler]%s, pcyc: %llu, dur: %lluus\n", _log_prefix, total_pcycles, duration); + } + } + + private: + char _log_prefix[kBufferCount] = {}; + char _sub_proc_log_prefix[kBufferCount] = {}; + uint64_t _begin_cycles = 0; + uint64_t _begin_pcycles = 0; + uint64_t _sub_proc_cycles = 0; + uint64_t _sub_proc_pcycles = 0; + uint64_t _sub_proc_count = 0; + + DISABLE_COPY(npu_scoped_timer); +}; + +template class npu_sub_process_scoped_timer { + public: + using npu_scoped_timer = npu_scoped_timer<_buffer_count>; + + explicit npu_sub_process_scoped_timer(npu_scoped_timer & timer) : _timer(timer) { + _begin_cycles = HAP_perf_get_qtimer_count(); + _begin_pcycles = HAP_perf_get_pcycles(); + } + + ~npu_sub_process_scoped_timer() { + _timer.add_sub_proc_cycles(HAP_perf_get_qtimer_count() - _begin_cycles, + HAP_perf_get_pcycles() - _begin_pcycles); + } + + private: + npu_scoped_timer & _timer; + uint64_t _begin_cycles = 0; + uint64_t _begin_pcycles = 0; + + DISABLE_COPY_AND_MOVE(npu_sub_process_scoped_timer); +}; + +inline auto make_scoped_perf_timer(const char * format, ...) { + va_list args; + va_start(args, format); + char buffer[512]; + vsnprintf(buffer, sizeof(buffer), format, args); + va_end(args); + return npu_scoped_timer<512>(buffer, nullptr); } +#endif + } // namespace hexagon + +#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING +# define DEVICE_SCOPED_PERFORMANCE_TRACKER(fmt, ...) \ + auto __npu_timer_##__LINE__ = hexagon::make_scoped_perf_timer(fmt, __VA_ARGS__) +#else +# define DEVICE_SCOPED_PERFORMANCE_TRACKER(fmt, ...) 
((void) 0) +#endif diff --git a/ggml/src/ggml-qnn/npu/device/vtcm_mem.hpp b/ggml/src/ggml-qnn/npu/device/vtcm_mem.hpp new file mode 100644 index 0000000000000..4c2922ca87f15 --- /dev/null +++ b/ggml/src/ggml-qnn/npu/device/vtcm_mem.hpp @@ -0,0 +1,101 @@ +#pragma once + +#include + +#include "util.hpp" + +namespace hexagon { + +class vtcm_mem { + public: + explicit vtcm_mem(size_t size, bool single_page) { + size_t avail_size = single_page ? get_avail_page_size() : get_avail_block_size(); + if (size > avail_size) { + DEVICE_LOG_ERROR("Requested VTCM size %zu exceeds available size %zu\n", size, avail_size); + return; + } + + _vtcm_mem = HAP_request_VTCM((unsigned int) size, single_page ? 1 : 0); + if (_vtcm_mem == nullptr) { + DEVICE_LOG_ERROR("Failed to allocate VTCM memory: %zu bytes\n", size); + return; + } + + _vtcm_size = size; + DEVICE_LOG_DEBUG("VTCM allocated: %p(%zu), avail: %zu\n", _vtcm_mem, size, avail_size); + } + + explicit vtcm_mem(size_t size, bool single_page, size_t timeout_us) { + _vtcm_mem = HAP_request_async_VTCM((unsigned int) size, single_page ? 1 : 0, (unsigned int) timeout_us); + if (_vtcm_mem == nullptr) { + DEVICE_LOG_ERROR("Failed to allocate VTCM memory: %zu bytes, timeout %zu us\n", size, timeout_us); + return; + } + + _vtcm_size = size; + DEVICE_LOG_DEBUG("VTCM allocated: %p(%zu), avail: %zu\n", _vtcm_mem, size, get_avail_block_size()); + } + + ~vtcm_mem() { + if (is_valid()) { + auto ret = HAP_release_VTCM(_vtcm_mem); + if (ret != AEE_SUCCESS) { + DEVICE_LOG_ERROR("Failed to release VTCM memory: %d\n", ret); + } + } + + DEVICE_LOG_DEBUG("VTCM released: %zu bytes at %p\n", _vtcm_size, _vtcm_mem); + } + + bool is_valid() const { return _vtcm_mem != nullptr; } + + uint8_t * get_mem() const { return reinterpret_cast(_vtcm_mem); } + + size_t get_size() const { return _vtcm_size; } + + static size_t get_total_size() { + unsigned int arch_page_aligned_size = 0; + unsigned int arch_page_count = 0; + auto ret = HAP_query_total_VTCM(&arch_page_aligned_size, &arch_page_count); + if (ret != AEE_SUCCESS) { + DEVICE_LOG_ERROR("Failed to query total VTCM: %d\n", ret); + return 0; + } + + return arch_page_aligned_size; + } + + static size_t get_avail_block_size() { + unsigned int avail_block_size = 0; + unsigned int avail_page_size = 0; + unsigned int num_pages = 0; + auto ret = HAP_query_avail_VTCM(&avail_block_size, &avail_page_size, &num_pages); + if (ret != AEE_SUCCESS) { + DEVICE_LOG_ERROR("Failed to query available VTCM: %d\n", ret); + return 0; + } + + return avail_block_size; + } + + static size_t get_avail_page_size() { + unsigned int avail_block_size = 0; + unsigned int avail_page_size = 0; + unsigned int num_pages = 0; + auto ret = HAP_query_avail_VTCM(&avail_block_size, &avail_page_size, &num_pages); + if (ret != AEE_SUCCESS) { + DEVICE_LOG_ERROR("Failed to query available VTCM: %d\n", ret); + return 0; + } + + return avail_page_size; + } + + private: + void * _vtcm_mem = nullptr; + size_t _vtcm_size = 0; + + DISABLE_COPY_AND_MOVE(vtcm_mem); +}; + +} // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/host/buffer.cpp b/ggml/src/ggml-qnn/npu/host/buffer.cpp index ff5c8a320c745..ace3dbee8eeec 100644 --- a/ggml/src/ggml-qnn/npu/host/buffer.cpp +++ b/ggml/src/ggml-qnn/npu/host/buffer.cpp @@ -75,6 +75,12 @@ void backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { memset(buffer_obj->get_buffer(), value, buffer_obj->get_size()); } +void backend_buffer_reset(ggml_backend_buffer_t buffer) { + auto * buffer_obj = get_buffer_object(buffer); + 
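    // Reset drops the cached tensor wrapper objects so that stale per-tensor state
    // is not carried over when the buffer is reused (e.g. by the scheduler).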
GGML_ASSERT(buffer_obj != nullptr); + buffer_obj->clear_tensors(); +} + constexpr const ggml_backend_buffer_i backend_buffer_interface = { /* .free_buffer = */ backend_buffer_free_buffer, /* .get_base = */ backend_buffer_get_base, @@ -84,7 +90,7 @@ constexpr const ggml_backend_buffer_i backend_buffer_interface = { /* .get_tensor = */ backend_buffer_get_tensor, /* .cpy_tensor = */ backend_buffer_cpy_tensor, /* .clear = */ backend_buffer_clear, - /* .reset = */ nullptr, + /* .reset = */ backend_buffer_reset, }; const char * backend_buffer_type_get_name(ggml_backend_buffer_type_t buft) { @@ -190,6 +196,11 @@ std::shared_ptr host_buffer::init_tensor(ggml_tensor * tensor, remo return tensor_object; } +void host_buffer::clear_tensors() { + _tensors.clear(); + LOG_DEBUG("clear host_buffer(%p) tensors\n", (void *) _data); +} + host_buffer_type::host_buffer_type(ggml_backend_dev_t dev, const std::string & name, common::rpc_mem_ptr rpc_mem) : _name(name), _rpc_mem(rpc_mem) { diff --git a/ggml/src/ggml-qnn/npu/host/buffer.hpp b/ggml/src/ggml-qnn/npu/host/buffer.hpp index 955944bb98f59..38c9eed815d10 100644 --- a/ggml/src/ggml-qnn/npu/host/buffer.hpp +++ b/ggml/src/ggml-qnn/npu/host/buffer.hpp @@ -25,6 +25,8 @@ class host_buffer { std::shared_ptr init_tensor(ggml_tensor * tensor, remote_handle64 device_handle); + void clear_tensors(); + private: common::rpc_mem_ptr _allocator; void * _data = nullptr; diff --git a/ggml/src/ggml-qnn/npu/host/graph.cpp b/ggml/src/ggml-qnn/npu/host/graph.cpp index 9e8cf8320408e..72ef5cc7868eb 100644 --- a/ggml/src/ggml-qnn/npu/host/graph.cpp +++ b/ggml/src/ggml-qnn/npu/host/graph.cpp @@ -32,7 +32,8 @@ bool host_graph::update(ggml_cgraph * cgraph) { _tensor_handles.reserve(cgraph->n_nodes); for (int i = 0; i < cgraph->n_nodes; ++i) { auto * node = cgraph->nodes[i]; - if (node->op == GGML_OP_NONE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE) { + if (node->op == GGML_OP_NONE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || + node->op == GGML_OP_RESHAPE) { // skip view liked ops LOG_DEBUG("node[%d]%s(%s), addr: %p, type: %s, skipped\n", i, ggml_get_name(node), ggml_op_desc(node), (void *) node, ggml_type_name(node->type)); @@ -55,8 +56,8 @@ bool host_graph::update(ggml_cgraph * cgraph) { } } - LOG_DEBUG("host_graph::update, host_graph(%p), ggml_cgraph(%p), tensor count(%zu)\n", (void *) this, - (void *) cgraph, _tensor_handles.size()); + LOG_DEBUG("host_graph::update, host_graph(%p), handle(%p), ggml_cgraph(%p), tensor count(%zu)\n", (void *) this, + (void *) _graph_handle, (void *) cgraph, _tensor_handles.size()); if (!_tensor_handles.empty()) { npu_device_graph_set_tensor(_device_handle, _graph_handle, _tensor_handles.data(), (int) _tensor_handles.size()); diff --git a/ggml/src/ggml-qnn/npu/host/host.cpp b/ggml/src/ggml-qnn/npu/host/host.cpp index 90c4cd29e8e20..28c561a49fa59 100644 --- a/ggml/src/ggml-qnn/npu/host/host.cpp +++ b/ggml/src/ggml-qnn/npu/host/host.cpp @@ -57,7 +57,7 @@ void backend_dev_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props ggml_backend_t backend_dev_init_backend(ggml_backend_dev_t dev, const char * params) { auto * dev_obj = get_device_object(dev); GGML_ASSERT(dev_obj != nullptr); - if (!dev_obj->init_device(dev, params)) { + if (!dev_obj->init_device()) { LOG_ERROR("[%s]Failed to init device\n", backend_dev_get_name(dev)); return nullptr; } diff --git a/ggml/src/ggml-qnn/npu/host/host_device.cpp b/ggml/src/ggml-qnn/npu/host/host_device.cpp index aa90cfa8bc8f1..fb1ad4dfd677b 100644 --- 
a/ggml/src/ggml-qnn/npu/host/host_device.cpp +++ b/ggml/src/ggml-qnn/npu/host/host_device.cpp @@ -7,6 +7,8 @@ #include +#include + #include "graph.hpp" #include "util.hpp" @@ -114,39 +116,13 @@ bool npu_device::is_device_initialized() const { return true; } -bool npu_device::init_device(ggml_backend_dev_t dev, const char * params) { +bool npu_device::init_device() { if (!init_rpc_mem()) { return false; } - if (!_device_handle) { - auto arch = get_dsp_arch(_rpc_interface, _dsp_domain_id); - const auto & device_lib_info = get_device_library_info(arch); - std::string device_lib_uri = device_lib_info.device_lib_uri; - device_lib_uri += get_domain_param(_dsp_domain_id); - LOG_DEBUG("[%s]NPU device arch: %s, uri: %s\n", get_name(), get_dsp_arch_desc(arch), device_lib_uri.c_str()); - auto err = npu_device_open(device_lib_uri.c_str(), &_device_handle); - if (err != AEE_SUCCESS) { - if (err == AEE_ECONNREFUSED) { - LOG_DEBUG("[%s]NPU device is not available, trying to enable unsigned DSP module and reopen\n", - get_name()); - enable_unsigned_dsp_module(_rpc_interface, _dsp_domain_id); - err = npu_device_open(device_lib_uri.c_str(), &_device_handle); - } - - if (err != AEE_SUCCESS) { - LOG_ERROR("[%s]Unable to open NPU device, err: 0x%x, uri %s\n", get_name(), err, - device_lib_uri.c_str()); - _device_handle = 0; - return false; - } - } - - _description += ' '; - _description += get_dsp_arch_desc(arch); - LOG_DEBUG("[%s]NPU device opened successfully\n", get_name()); - } else { - LOG_DEBUG("[%s]NPU device is already opened\n", get_name()); + if (!init_device_lib()) { + return false; } return true; @@ -157,10 +133,17 @@ bool npu_device::supports_buft(ggml_backend_buffer_type_t buft) const { } bool npu_device::supports_op_impl(const ggml_tensor * op) { + static_assert(std::is_same::value, + "npu_device_fp16_t should be same as ggml_fp16_t"); + if (op->op == GGML_OP_NONE) { return true; } + if (op->op == GGML_OP_VIEW || op->op == GGML_OP_RESHAPE || op->op == GGML_OP_PERMUTE) { + return true; + } + if (type_to_npu_type(op->type) == NPU_DATA_TYPE_COUNT) { LOG_DEBUG("[%s]Unsupported op tensor type: %s\n", get_name(), ggml_type_name(op->type)); return false; @@ -189,6 +172,11 @@ bool npu_device::supports_op_impl(const ggml_tensor * op) { return false; } + if (!_device_handle && !init_device()) { + LOG_DEBUG("[%s]NPU device initialization failed\n", get_name()); + return false; + } + constexpr const auto get_spec = [](const ggml_tensor * tensor) -> npu_device_tensor_spec { if (!tensor) { return npu_device_tensor_spec{}; @@ -210,12 +198,12 @@ bool npu_device::supports_op_impl(const ggml_tensor * op) { auto dst_spec = get_spec(op); auto ret = npu_device_device_support_op(_device_handle, &src0_spec, &src1_spec, &dst_spec, npu_op, &supported); if (ret != AEE_SUCCESS || !supported) { - LOG_DEBUG("[%s]Unsupported op: %s, ret: 0x%x, supported: %d\n", get_name(), ggml_op_name(op->op), ret, - supported); + LOG_DEBUG("[%s][%s]unsupported %s(%s,%s), ret: 0x%x, supported: %d\n", get_name(), ggml_op_name(op->op), + ggml_type_name(op->type), ggml_type_name(src0->type), (src1 ? 
ggml_type_name(src1->type) : "null"), + ret, supported); return false; } - LOG_DEBUG("[%s]Supported op: %s\n", get_name(), ggml_op_name(op->op)); return true; } @@ -238,11 +226,72 @@ bool npu_device::init_rpc_mem() { return true; } +bool npu_device::init_device_lib() { + if (!_device_handle) { + auto arch = get_dsp_arch(_rpc_interface, _dsp_domain_id); + const auto & device_lib_info = get_device_library_info(arch); + std::string device_lib_uri = device_lib_info.device_lib_uri; + device_lib_uri += get_domain_param(_dsp_domain_id); + LOG_DEBUG("[%s]NPU device arch: %s, uri: %s\n", get_name(), get_dsp_arch_desc(arch), device_lib_uri.c_str()); + auto err = npu_device_open(device_lib_uri.c_str(), &_device_handle); + if (err != AEE_SUCCESS) { + if (err == AEE_ECONNREFUSED) { + LOG_DEBUG("[%s]NPU device is not available, trying to enable unsigned DSP module and reopen\n", + get_name()); + enable_unsigned_dsp_module(_rpc_interface, _dsp_domain_id); + err = npu_device_open(device_lib_uri.c_str(), &_device_handle); + } + + if (err != AEE_SUCCESS) { + LOG_ERROR("[%s]Unable to open NPU device, err: 0x%x, uri %s\n", get_name(), err, + device_lib_uri.c_str()); + _device_handle = 0; + return false; + } + } + + _description += ' '; + _description += get_dsp_arch_desc(arch); + LOG_DEBUG("[%s]NPU device opened successfully\n", get_name()); + } else { + LOG_DEBUG("[%s]NPU device is already opened\n", get_name()); + } + + return true; +} + bool npu_device::offload_op(const ggml_tensor * op) { // TODO: implement this return false; } +#ifndef NDEBUG +bool npu_device::supports_op(const ggml_tensor * op) { + char op_desc[1024]; + get_op_tensor_desc(op, op_desc, sizeof(op_desc)); + + if (supports_op_impl(op)) { + if (op->op != GGML_OP_NONE && op->op != GGML_OP_VIEW && op->op != GGML_OP_RESHAPE && + op->op != GGML_OP_PERMUTE) { + _supported_op++; + LOG_DEBUG("[%s][%s]supported, %s, supported/unsupported: %u/%u\n", get_name(), ggml_op_name(op->op), + op_desc, _supported_op.load(), _unsupported_op.load()); + } + + return true; + } + + _unsupported_op++; + LOG_DEBUG("[%s][%s]unsupported, %s, supported/unsupported: %u/%u\n", get_name(), ggml_op_name(op->op), op_desc, + _supported_op.load(), _unsupported_op.load()); + return false; +} +#else +bool npu_device::supports_op(const ggml_tensor * op) { + return supports_op_impl(op); +} +#endif + ggml_backend_buffer_type_t npu_device::get_default_buffer_type(ggml_backend_dev_t dev) { // Note that this function will be called before the npu_device::init_device if (!init_rpc_mem()) { diff --git a/ggml/src/ggml-qnn/npu/host/host_device.hpp b/ggml/src/ggml-qnn/npu/host/host_device.hpp index efc7914f18615..b2fab667d6f4a 100644 --- a/ggml/src/ggml-qnn/npu/host/host_device.hpp +++ b/ggml/src/ggml-qnn/npu/host/host_device.hpp @@ -31,37 +31,18 @@ class npu_device { ggml_backend_buffer_type_t get_default_buffer_type(ggml_backend_dev_t dev); bool is_device_initialized() const; - bool init_device(ggml_backend_dev_t dev, const char * params); + bool init_device(); bool supports_buft(ggml_backend_buffer_type_t buft) const; bool offload_op(const ggml_tensor * op); - -#ifndef NDEBUG - bool supports_op(const ggml_tensor * op) { - if (supports_op_impl(op)) { - if (op->op != GGML_OP_NONE) { - _supported_op++; - LOG_DEBUG("[%s]Supported op: %s, supported/unsupported: %u/%u\n", get_name(), ggml_op_name(op->op), - _supported_op.load(), _unsupported_op.load()); - } - - return true; - } - - _unsupported_op++; - LOG_DEBUG("[%s]Unsupported op: %s, supported/unsupported: %u/%u\n", get_name(), 
ggml_op_name(op->op), - _supported_op.load(), _unsupported_op.load()); - return false; - } -#else - bool supports_op(const ggml_tensor * op) { return supports_op_impl(op); } -#endif + bool supports_op(const ggml_tensor * op); remote_handle64 get_device_handle() const { return _device_handle; } private: bool supports_op_impl(const ggml_tensor * op); bool init_rpc_mem(); + bool init_device_lib(); std::string _name = "hexagon-npu"; std::string _description = "Hexagon NPU"; diff --git a/ggml/src/ggml-qnn/npu/host/tensor.hpp b/ggml/src/ggml-qnn/npu/host/tensor.hpp index e7d5f7a88aeb4..c5d2decbc5682 100644 --- a/ggml/src/ggml-qnn/npu/host/tensor.hpp +++ b/ggml/src/ggml-qnn/npu/host/tensor.hpp @@ -40,19 +40,17 @@ class host_tensor { tensor->extra = this; _ggml_tensor = tensor; - LOG_DEBUG( - "host_tensor(%p) created, ggml_tensor(%p[%ldx%ldx%ldx%ld], nb[%ld][%ld][%ld][%ld]), " - "device_tensor_handle(%p)\n", - (void *) this, (void *) tensor, (long) tensor->ne[0], (long) tensor->ne[1], (long) tensor->ne[2], - (long) tensor->ne[3], (long) tensor->nb[0], (long) tensor->nb[1], (long) tensor->nb[2], - (long) tensor->nb[3], (void *) _device_tensor_handle); + LOG_DEBUG("host_tensor(%p), ggml_tensor(%p[%ldx%ldx%ldx%ld], nb[%ld][%ld][%ld][%ld], %s), handle(%p)\n", + (void *) this, (void *) tensor, (long) tensor->ne[0], (long) tensor->ne[1], (long) tensor->ne[2], + (long) tensor->ne[3], (long) tensor->nb[0], (long) tensor->nb[1], (long) tensor->nb[2], + (long) tensor->nb[3], ggml_type_name(tensor->type), (void *) _device_tensor_handle); } ~host_tensor() { LOG_DEBUG("host_tensor(%p) destroy, device_tensor_handle: %p\n", (void *) this, (void *) _device_tensor_handle); if (_device_tensor_handle) { npu_device_tensor_free(_device_handle, _device_tensor_handle); - _ggml_tensor->extra = nullptr; + // TODO: figure out why the _ggml_tensor is invalid here } } diff --git a/ggml/src/ggml-qnn/npu/host/util.cpp b/ggml/src/ggml-qnn/npu/host/util.cpp index 5db54b661ebde..9ce9841004235 100644 --- a/ggml/src/ggml-qnn/npu/host/util.cpp +++ b/ggml/src/ggml-qnn/npu/host/util.cpp @@ -2,6 +2,17 @@ #include +#define GGML_COMMON_DECL_CPP +#include "ggml-common.h" +#undef GGML_COMMON_DECL_CPP + +static_assert(sizeof(npu_device_block_q4_K) == sizeof(block_q4_K), "npu_device_block_q4_K size mismatch"); +static_assert(sizeof(npu_device_block_q4_0) == sizeof(block_q4_0), "npu_device_block_q4_0 size mismatch"); +static_assert(sizeof(npu_device_block_q8_0) == sizeof(block_q8_0), "npu_device_block_q8_0 size mismatch"); +static_assert(QUANT_K_SCALE_SIZE == K_SCALE_SIZE, "QUANT_K_SCALE_SIZE size mismatch"); +static_assert(QUANT_K_BLOCK_SIZE == QK_K, "QUANT_K_BLOCK_SIZE size mismatch"); +static_assert(QUANT_BLOCK_SIZE == QK4_0, "QUANT_BLOCK_SIZE size mismatch"); + namespace hexagon { enum npu_device_tensor_op op_to_npu_op(ggml_op op) { @@ -23,6 +34,14 @@ enum npu_device_tensor_data_type type_to_npu_type(ggml_type type) { switch (type) { case GGML_TYPE_F32: return NPU_DATA_TYPE_F32; + case GGML_TYPE_F16: + return NPU_DATA_TYPE_F16; + case GGML_TYPE_Q4_K: + return NPU_DATA_TYPE_Q4_K; + case GGML_TYPE_Q4_0: + return NPU_DATA_TYPE_Q4_0; + case GGML_TYPE_Q8_0: + return NPU_DATA_TYPE_Q8_0; default: return NPU_DATA_TYPE_COUNT; } @@ -93,4 +112,56 @@ void enable_unsigned_dsp_module(common::rpc_interface_ptr rpc_interface, uint32_ } } +void get_op_tensor_desc(const ggml_tensor * dst, char * out, size_t max_len) { + if (dst == nullptr) { + snprintf(out, max_len, "null"); + return; + } + + constexpr const auto print_tensor = [](const ggml_tensor * 
tensor, char * out, size_t max_len) { + auto dims = ggml_n_dims(tensor); + + switch (dims) { + default: + case 4: + snprintf(out, max_len, "%s[%ldx%ldx%ldx%ld]", ggml_type_name(tensor->type), (long) tensor->ne[0], + (long) tensor->ne[1], (long) tensor->ne[2], (long) tensor->ne[3]); + break; + case 3: + snprintf(out, max_len, "%s[%ldx%ldx%ld]", ggml_type_name(tensor->type), (long) tensor->ne[0], + (long) tensor->ne[1], (long) tensor->ne[2]); + break; + case 2: + snprintf(out, max_len, "%s[%ldx%ld]", ggml_type_name(tensor->type), (long) tensor->ne[0], + (long) tensor->ne[1]); + break; + case 1: + snprintf(out, max_len, "%s[%ld]", ggml_type_name(tensor->type), (long) tensor->ne[0]); + break; + } + }; + + auto * src0 = dst->src[0]; + if (src0 == nullptr) { + print_tensor(dst, out, max_len); + return; + } + + char dst_desc[256]; + print_tensor(dst, dst_desc, sizeof(dst_desc)); + + char src0_desc[256]; + print_tensor(src0, src0_desc, sizeof(src0_desc)); + + auto * src1 = dst->src[1]; + if (src1 == nullptr) { + snprintf(out, max_len, "dst: %s, src0: %s", dst_desc, src0_desc); + return; + } + + char src1_desc[256]; + print_tensor(src1, src1_desc, sizeof(src1_desc)); + snprintf(out, max_len, "dst: %s, src0: %s, src1: %s", dst_desc, src0_desc, src1_desc); +} + } // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/host/util.hpp b/ggml/src/ggml-qnn/npu/host/util.hpp index c001272d4cf7f..469e5066602ed 100644 --- a/ggml/src/ggml-qnn/npu/host/util.hpp +++ b/ggml/src/ggml-qnn/npu/host/util.hpp @@ -23,4 +23,6 @@ const char * get_dsp_arch_desc(hexagon_dsp_arch arch); void enable_unsigned_dsp_module(common::rpc_interface_ptr rpc_interface, uint32_t domain_id); +void get_op_tensor_desc(const ggml_tensor * dst, char * out, size_t max_len); + } // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl b/ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl index d62e65b3bd877..df3cdf4957295 100644 --- a/ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl +++ b/ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl @@ -4,6 +4,9 @@ const uint32_t DEVICE_TENSOR_MAX_DIMS = 4; const uint32_t DEVICE_TENSOR_MAX_SRC = 2; +const uint32_t QUANT_BLOCK_SIZE = 32; +const uint32_t QUANT_K_BLOCK_SIZE = 256; +const uint32_t QUANT_K_SCALE_SIZE = 12; interface npu_device : remote_handle64{ @@ -11,6 +14,25 @@ interface npu_device : remote_handle64{ typedef uint64_t tensor_handle_t; typedef uint64_t graph_handle_t; + typedef uint16_t fp16_t; + + struct block_q4_0 { + fp16_t d; + uint8_t qs[QUANT_BLOCK_SIZE / 2]; + }; + + struct block_q4_K { + fp16_t d; + fp16_t dmin; + uint8_t scales[QUANT_K_SCALE_SIZE]; + uint8_t qs[QUANT_K_BLOCK_SIZE / 2]; + }; + + struct block_q8_0 { + fp16_t d; + int8_t qs[QUANT_BLOCK_SIZE]; + }; + enum tensor_op { NPU_OP_MUL_MAT, NPU_OP_ADD, @@ -21,6 +43,10 @@ interface npu_device : remote_handle64{ enum tensor_data_type { NPU_DATA_TYPE_F32, + NPU_DATA_TYPE_F16, + NPU_DATA_TYPE_Q8_0, + NPU_DATA_TYPE_Q4_0, + NPU_DATA_TYPE_Q4_K, NPU_DATA_TYPE_COUNT }; diff --git a/ggml/src/ggml-qnn/qnn/CMakeLists.txt b/ggml/src/ggml-qnn/qnn/CMakeLists.txt index 010fcf08db186..8b1308399891b 100644 --- a/ggml/src/ggml-qnn/qnn/CMakeLists.txt +++ b/ggml/src/ggml-qnn/qnn/CMakeLists.txt @@ -26,11 +26,11 @@ else() message("GGML_QNN_ENABLE_CPU_BACKEND is disabled") endif() -if(GGML_QNN_ENABLE_PERFORMANCE_TRACKING) - message("GGML_QNN_ENABLE_PERFORMANCE_TRACKING is enabled") - target_compile_definitions(qnn-backend PUBLIC GGML_QNN_ENABLE_PERFORMANCE_TRACKING) +if(GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING) + 
message("GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING is enabled") + target_compile_definitions(qnn-backend PUBLIC GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING) else() - message("GGML_QNN_ENABLE_PERFORMANCE_TRACKING is disabled") + message("GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING is disabled") endif() if(CMAKE_SYSTEM_NAME STREQUAL "Android") diff --git a/ggml/src/ggml-qnn/qnn/graph.cpp b/ggml/src/ggml-qnn/qnn/graph.cpp index 70fc71c211c14..3094b5c3bee67 100644 --- a/ggml/src/ggml-qnn/qnn/graph.cpp +++ b/ggml/src/ggml-qnn/qnn/graph.cpp @@ -10,7 +10,7 @@ #include "profiler.hpp" #include "tensor.hpp" -#ifdef GGML_QNN_ENABLE_PERFORMANCE_TRACKING +#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING # define GRAPH_PROFILE_HANDLE (_event_tracer ? _event_tracer->get_handle() : nullptr) # define GRAPH_PROFILE_PRINT() \ if (_event_tracer) { \ @@ -381,7 +381,7 @@ qnn_graph::qnn_graph(const std::string & graph_name, backend_index_type device, return; } -#ifdef GGML_QNN_ENABLE_PERFORMANCE_TRACKING +#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING if (device == QNN_BACKEND_NPU) { _event_tracer = std::make_shared( graph_name, qnn_interface, qnn_instance->get_qnn_backend_handle(), qnn_event_tracer::PROFILE_OP_TRACE); diff --git a/ggml/src/ggml-qnn/qnn/graph.hpp b/ggml/src/ggml-qnn/qnn/graph.hpp index 5e862112fbd1e..99ffeaa3d0faa 100644 --- a/ggml/src/ggml-qnn/qnn/graph.hpp +++ b/ggml/src/ggml-qnn/qnn/graph.hpp @@ -79,7 +79,7 @@ class qnn_graph { std::vector _qnn_tensor_inputs; std::vector _qnn_tensor_outputs; -#ifdef GGML_QNN_ENABLE_PERFORMANCE_TRACKING +#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING // profiler qnn_event_tracer_ptr _event_tracer; #endif diff --git a/ggml/src/ggml-qnn/qnn/profiler.hpp b/ggml/src/ggml-qnn/qnn/profiler.hpp index 34db09e0bf865..0d4f839fda270 100644 --- a/ggml/src/ggml-qnn/qnn/profiler.hpp +++ b/ggml/src/ggml-qnn/qnn/profiler.hpp @@ -12,7 +12,7 @@ namespace qnn { -#ifdef GGML_QNN_ENABLE_PERFORMANCE_TRACKING +#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING class qnn_scoped_timer { public: @@ -92,7 +92,7 @@ using qnn_event_tracer_ptr = std::shared_ptr; } // namespace qnn -#ifdef GGML_QNN_ENABLE_PERFORMANCE_TRACKING +#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING # define QNN_SCOPED_PERFORMANCE_TRACKER(fmt, ...) 
\ auto __qnn_timer_##__LINE__ = qnn::make_scope_perf_timer(fmt, __VA_ARGS__) #else From 2306f82a58fc07d59571bba7f6e19457f87bc3b1 Mon Sep 17 00:00:00 2001 From: nullname Date: Tue, 27 May 2025 06:35:41 +0000 Subject: [PATCH 157/166] fix compiling error --- ggml/src/ggml-qnn/qnn/backend-ops.cpp | 1 + ggml/src/ggml-qnn/qnn/op-config-caps.cpp | 2 ++ 2 files changed, 3 insertions(+) diff --git a/ggml/src/ggml-qnn/qnn/backend-ops.cpp b/ggml/src/ggml-qnn/qnn/backend-ops.cpp index 1446115a57ba0..784e1deec77d6 100644 --- a/ggml/src/ggml-qnn/qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/qnn/backend-ops.cpp @@ -164,6 +164,7 @@ constexpr const bool kQnnSupportedOps[] = { false, // GGML_UNARY_OP_HARDSWISH false, // GGML_UNARY_OP_HARDSIGMOID false, // GGML_UNARY_OP_EXP + false, // GGML_UNARY_OP_GELU_ERF }; static_assert(kQnnSupportedOps[GGML_OP_NONE], "GGML_OP_NONE is not true"); diff --git a/ggml/src/ggml-qnn/qnn/op-config-caps.cpp b/ggml/src/ggml-qnn/qnn/op-config-caps.cpp index 081b7fba7f3ef..2a6f7abca4044 100644 --- a/ggml/src/ggml-qnn/qnn/op-config-caps.cpp +++ b/ggml/src/ggml-qnn/qnn/op-config-caps.cpp @@ -213,6 +213,7 @@ constexpr const qnn_op_caps_t kOpCaps[] = { {}, // GGML_UNARY_OP_HARDSWISH {}, // GGML_UNARY_OP_HARDSIGMOID {}, // GGML_UNARY_OP_EXP + {}, // GGML_UNARY_OP_GELU_ERF }; static_assert(kOpCaps[GGML_OP_NONE].get_desc == nullptr, "GGML_OP_NONE should not have get_desc function"); @@ -382,6 +383,7 @@ constexpr const op_constructor_t kOpConstructors[] = { nullptr, // GGML_UNARY_OP_HARDSWISH nullptr, // GGML_UNARY_OP_HARDSIGMOID nullptr, // GGML_UNARY_OP_EXP + nullptr, // GGML_UNARY_OP_GELU_ERF }; static_assert(kOpConstructors[GGML_OP_NONE] == nullptr, "GGML_OP_NONE does not match the nullptr function"); From c23ab465c033fc14bb4ee68fb333cdcc487dd46d Mon Sep 17 00:00:00 2001 From: nullname Date: Wed, 28 May 2025 00:00:42 +0800 Subject: [PATCH 158/166] feat: perf opt part4 (#43) * wip * refactor: rewrite dequantize_row_q4_0 by intrinsic * log for debug * fix q4 intrinsic * small opt * wip * wip * add vtcm_quota_size * add perf log for hexagon-npu backend * wip * add log * sync after a specfic op * increase worker thread priority * fix unbalanced thread slice * small slict to fit in vtcm cache * limit the supported row element size * opt 4_0 dequant * fix q4 dequant * add power_utils * add rms_norm * wip * enable rms_norm f32 * fix rms_norm with param * fix compiling flags * use float * fix small row size * vectorized rms norm * wip * read 2 vectors * rename * add perf log on update * set empty tensors handle also * merge some rpc functions * opt param update * wip * print more log * add struct for update param config * add npu_device_graph_set_tensor_with_param * merge tensor and params update * wip * wip * make as template to reuse * vectorize dequantize_row_q8_0 * opt * avoid using union to store q data * wip * wip * wip --- ggml/src/ggml-qnn/npu/CMakeLists.txt | 7 +- ggml/src/ggml-qnn/npu/device/device.cpp | 52 ++-- ggml/src/ggml-qnn/npu/device/graph.cpp | 15 +- ggml/src/ggml-qnn/npu/device/graph.hpp | 1 + ggml/src/ggml-qnn/npu/device/op_impl.cpp | 224 ++++++++++++-- ggml/src/ggml-qnn/npu/device/op_impl.hpp | 2 + ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp | 289 ++++++++++-------- ggml/src/ggml-qnn/npu/device/op_mul_mat.hpp | 30 +- ggml/src/ggml-qnn/npu/device/op_types.hpp | 46 ++- ggml/src/ggml-qnn/npu/device/quants.cpp | 120 ++++++-- ggml/src/ggml-qnn/npu/device/quants.hpp | 6 +- ggml/src/ggml-qnn/npu/device/tensor.hpp | 40 ++- ggml/src/ggml-qnn/npu/device/thread_pool.hpp | 33 +- 
ggml/src/ggml-qnn/npu/device/util.hpp | 105 +++++++ ggml/src/ggml-qnn/npu/device/vtcm_mem.hpp | 2 +- ggml/src/ggml-qnn/npu/host/buffer.cpp | 2 +- ggml/src/ggml-qnn/npu/host/graph.cpp | 40 ++- ggml/src/ggml-qnn/npu/host/graph.hpp | 7 +- ggml/src/ggml-qnn/npu/host/host_device.cpp | 6 +- ggml/src/ggml-qnn/npu/host/tensor.hpp | 100 +++++- ggml/src/ggml-qnn/npu/host/util.cpp | 19 ++ ggml/src/ggml-qnn/npu/host/util.hpp | 1 + ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl | 24 +- .../qnn/{profiler.cpp => event_tracer.cpp} | 2 +- ggml/src/ggml-qnn/qnn/event_tracer.hpp | 45 +++ ggml/src/ggml-qnn/qnn/graph.cpp | 15 +- ggml/src/ggml-qnn/qnn/graph.hpp | 2 +- ggml/src/ggml-qnn/qnn/profiler.hpp | 100 ------ ggml/src/ggml-qnn/qnn/qnn-lib.cpp | 21 +- ggml/src/ggml-qnn/shared/common.hpp | 4 + ggml/src/ggml-qnn/shared/profiler.hpp | 61 ++++ ggml/src/ggml-qnn/shared/rpc-mem.hpp | 2 + 32 files changed, 1020 insertions(+), 403 deletions(-) rename ggml/src/ggml-qnn/qnn/{profiler.cpp => event_tracer.cpp} (99%) create mode 100644 ggml/src/ggml-qnn/qnn/event_tracer.hpp delete mode 100644 ggml/src/ggml-qnn/qnn/profiler.hpp create mode 100644 ggml/src/ggml-qnn/shared/profiler.hpp diff --git a/ggml/src/ggml-qnn/npu/CMakeLists.txt b/ggml/src/ggml-qnn/npu/CMakeLists.txt index 5f1009bb9bea6..5e1281c3d5cf4 100644 --- a/ggml/src/ggml-qnn/npu/CMakeLists.txt +++ b/ggml/src/ggml-qnn/npu/CMakeLists.txt @@ -231,6 +231,11 @@ else() build_idl(idl/hexagon_npu.idl hexagon_npu_skel_OBJS) + add_subdirectory(${HEXAGON_SDK_ROOT}/libs/qprintf qprintf_dir) + target_include_directories(hexagon_npu_skel_OBJS PUBLIC + ${HEXAGON_SDK_ROOT}/libs/qprintf/inc/ + ) + # disable warnings for the skel set_source_files_properties( ${skel_srcs} @@ -239,12 +244,12 @@ else() ) add_library(hexagon_npu_skel SHARED $) - target_link_libraries(hexagon_npu_skel ${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc++abi.a ${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc++.a ) set_target_properties(hexagon_npu_skel PROPERTIES OUTPUT_NAME "hexagon_npu_skel_${HEXAGON_ARCH}") + target_link_libraries(hexagon_npu_skel qprintf_static) copy_binaries(hexagon_npu_skel) endif() diff --git a/ggml/src/ggml-qnn/npu/device/device.cpp b/ggml/src/ggml-qnn/npu/device/device.cpp index fbed4b0a28fa6..8a10e9e7525b1 100644 --- a/ggml/src/ggml-qnn/npu/device/device.cpp +++ b/ggml/src/ggml-qnn/npu/device/device.cpp @@ -106,6 +106,7 @@ int npu_device_open(const char * uri, remote_handle64 * h) { } *h = reinterpret_cast(context); + DEVICE_LOG_INFO("NPU device context created: %p", (void *) *h); return AEE_SUCCESS; } @@ -117,6 +118,7 @@ int npu_device_close(remote_handle64 h) { } delete context; + DEVICE_LOG_INFO("NPU device context destroyed: %p", (void *) h); return AEE_SUCCESS; } @@ -130,6 +132,12 @@ AEEResult npu_device_device_support_op(remote_handle64 _h, const npu_device_tens const npu_device_tensor_spec * src1, const npu_device_tensor_spec * dst, npu_device_tensor_op op, boolean * is_supported) { NPU_UNUSED(_h); + + if (!src0 || !src1 || !dst || !is_supported) { + DEVICE_LOG_ERROR("npu_device_device_support_op: Invalid arguments"); + return AEE_EINVARGS; + } + *is_supported = hexagon::support_op(*src0, *src1, *dst, op); return AEE_SUCCESS; } @@ -147,28 +155,15 @@ AEEResult npu_device_tensor_init(remote_handle64 _h, const npu_device_tensor_con return AEE_SUCCESS; } -AEEResult npu_device_tensor_set_src(remote_handle64 _h, npu_device_tensor_handle_t tensor_handle, uint64_t index, - npu_device_tensor_handle_t src) { +AEEResult npu_device_tensor_update_params(remote_handle64 _h, 
npu_device_tensor_handle_t tensor_handle, + const npu_device_tensor_update_config * config) { NPU_UNUSED(_h); auto * tensor = tensor_from_handle(tensor_handle); - if (!tensor) { - return AEE_EINVHANDLE; - } - - auto * src_tensor = tensor_from_handle(src); - tensor->set_src(index, src_tensor); - return AEE_SUCCESS; -} - -AEEResult npu_device_tensor_set_op(remote_handle64 _h, npu_device_tensor_handle_t tensor_handle, - npu_device_tensor_op op) { - NPU_UNUSED(_h); - auto * tensor = tensor_from_handle(tensor_handle); - if (!tensor) { + if (!tensor || !config) { return AEE_EINVHANDLE; } - tensor->set_op(op); + tensor->update_config(*config); return AEE_SUCCESS; } @@ -206,6 +201,29 @@ AEEResult npu_device_graph_set_tensor(remote_handle64 _h, npu_device_graph_handl return AEE_SUCCESS; } +AEEResult npu_device_graph_set_tensor_with_param(remote_handle64 _h, npu_device_graph_handle_t graph_handle, + const npu_device_tensor_handle_t * tensor_handles, + int tensor_handlesLen, + const npu_device_tensor_update_config * tensor_params, + int tensor_paramsLen) { + NPU_UNUSED(_h); + auto * graph = graph_from_handle(graph_handle); + if (!graph || !tensor_handles || tensor_handlesLen <= 0 || !tensor_params || + tensor_handlesLen != tensor_paramsLen) { + return AEE_EINVHANDLE; + } + + graph->set_tensor(tensor_handles, tensor_handlesLen); + for (int i = 0; i < tensor_handlesLen; ++i) { + auto * tensor = tensor_from_handle(tensor_handles[i]); + if (tensor) { + tensor->update_config(tensor_params[i]); + } + } + + return AEE_SUCCESS; +} + AEEResult npu_device_graph_compute(remote_handle64 _h, npu_device_graph_handle_t graph_handle) { auto dev_ctx = device_context_from_handle(_h); if (!dev_ctx) { diff --git a/ggml/src/ggml-qnn/npu/device/graph.cpp b/ggml/src/ggml-qnn/npu/device/graph.cpp index 5201edefea924..c9cad772320f1 100644 --- a/ggml/src/ggml-qnn/npu/device/graph.cpp +++ b/ggml/src/ggml-qnn/npu/device/graph.cpp @@ -10,7 +10,8 @@ namespace hexagon { graph::graph() noexcept { - DEVICE_LOG_DEBUG("graph(%p) created\n", (void *) this); + _vtcm_quota_size = hexagon::vtcm_mem::get_avail_block_size(); // TODO: move to device init? 
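+    // compute_impl() divides this quota by the worker-thread count; mul_mat uses the per-thread share to
+    // size its VTCM dequantization cache and falls back to a plain memory cache if the VTCM request fails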
+ DEVICE_LOG_DEBUG("graph(%p) created: vtcm quota size: %zu\n", (void *) this, _vtcm_quota_size); } graph::~graph() noexcept { @@ -45,6 +46,8 @@ bool graph::compute(default_thread_pool * thread_pool, const float * f16_to_f32_ } DEVICE_LOG_DEBUG("graph(%p) compute\n", (void *) this); + + DEVICE_SCOPED_PERFORMANCE_TRACKER("[%p]compute", (void *) this); _f16_to_f32_table = f16_to_f32_table; if (thread_pool) { thread_pool->sync_execute(reinterpret_cast(&graph::thread_pool_task), this); @@ -61,6 +64,8 @@ void graph::thread_pool_task(default_thread_pool * pool, size_t thread_idx, size } void graph::compute_impl(default_thread_pool * pool, size_t thread_idx, size_t thread_count) { + hexagon::compute_params params = { thread_idx, thread_count, _vtcm_quota_size / thread_count, _f16_to_f32_table }; + for (size_t i = 0; i < _tensor_count; ++i) { auto * dst = _tensors[i]; auto op = dst->get_op(); @@ -69,14 +74,14 @@ void graph::compute_impl(default_thread_pool * pool, size_t thread_idx, size_t t DEVICE_LOG_ERROR("graph(%p) tensor[%zu] op %d not supported\n", (void *) this, i, op); return; } - - hexagon::compute_params params = { thread_idx, thread_count, _f16_to_f32_table }; if (!func(dst, ¶ms)) { DEVICE_LOG_ERROR("graph(%p) tensor[%zu] op %d compute failed\n", (void *) this, i, op); } - // TODO: figure out which ops need to sync - if (pool) { + DEVICE_SCOPED_PERFORMANCE_TRACKER("[%p]sync_thread, tidx: %zu", (void *) this, thread_idx); + + const bool should_sync = requires_thread_barrier(op); + if (pool && should_sync && i < _tensor_count - 1) { pool->sync_thread(); } dst->invalidate(); diff --git a/ggml/src/ggml-qnn/npu/device/graph.hpp b/ggml/src/ggml-qnn/npu/device/graph.hpp index 126d2541786a0..c6b68c4eeadd9 100644 --- a/ggml/src/ggml-qnn/npu/device/graph.hpp +++ b/ggml/src/ggml-qnn/npu/device/graph.hpp @@ -25,6 +25,7 @@ class graph { std::unique_ptr _tensors; size_t _tensor_count = 0; + size_t _vtcm_quota_size = 0; const float * _f16_to_f32_table = nullptr; DISABLE_COPY_AND_MOVE(graph); diff --git a/ggml/src/ggml-qnn/npu/device/op_impl.cpp b/ggml/src/ggml-qnn/npu/device/op_impl.cpp index d68fd9a53b4d4..777072024a450 100644 --- a/ggml/src/ggml-qnn/npu/device/op_impl.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_impl.cpp @@ -5,6 +5,8 @@ #include #include +#include + #include "op_mul_mat.hpp" #include "quants.hpp" @@ -17,7 +19,7 @@ inline void vec_op_impl(const _TyData * src0, const _TyData * src1, size_t count HVX_Vector * iptr0 = ((HVX_Vector *) src0); HVX_Vector * iptr0_end = ((HVX_Vector *) src0) + (count / kElementsPerVector); HVX_Vector * iptr1 = ((HVX_Vector *) src1); - HVX_Vector * optr = ((HVX_Vector *) dst); + HVX_Vector * optr = ((HVX_Vector *) dst); // framework will ensure the dst is aligned HVX_Vector prev0 = *iptr0++; HVX_Vector prev1 = *iptr1++; @@ -108,6 +110,12 @@ template struct get_data_type +struct get_data_type { + using type = _TyData; + using param_type = typename std::remove_cv::type>::type; +}; + template bool element_wise_op(hexagon::tensor * out, hexagon::compute_params * params) { using data_type = typename get_data_type::type; @@ -166,6 +174,16 @@ template bool element_wise_op(hexagon::tensor * out, hexagon::co return true; } +bool is_same_shape(const npu_device_tensor_spec & src, const npu_device_tensor_spec & dst) { + for (size_t i = 0; i < DEVICE_TENSOR_MAX_DIMS; ++i) { + if (src.ne[i] != dst.ne[i]) { + return false; + } + } + + return true; +} + bool is_element_wise_op_supported(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1, const 
npu_device_tensor_spec & dst, npu_device_tensor_op op) { if (op != NPU_OP_ADD && op != NPU_OP_SUB && op != NPU_OP_MUL) { @@ -196,12 +214,149 @@ bool is_element_wise_op_supported(const npu_device_tensor_spec & src0, const npu return false; } - for (size_t i = 0; i < DEVICE_TENSOR_MAX_DIMS; ++i) { - if (src0.ne[i] != dst.ne[i]) { - DEVICE_LOG_DEBUG("[%s]src0.ne[%zu] and dst.ne[%zu] not match: %lld vs %lld\n", hexagon::op_get_name(op), i, - i, (long long) src0.ne[i], (long long) dst.ne[i]); - return false; + if (!is_same_shape(src0, dst)) { + DEVICE_LOG_DEBUG("[%s]src0 and dst have different shape\n", hexagon::op_get_name(op)); + return false; + } + + return true; +} + +void rms_norm_vec_f32(const float * src, size_t count, float eps, float * dst) { + constexpr const size_t kElementsPerVector = hexagon::kBytesPerVector / sizeof(float); + + HVX_Vector * src_vec_ptr = ((HVX_Vector *) src); + HVX_Vector * src_vec_end = ((HVX_Vector *) src) + (count / kElementsPerVector); + HVX_Vector prev = *src_vec_ptr++; + HVX_Vector sum = Q6_V_vzero(); + while (src_vec_ptr < src_vec_end) { + HVX_Vector curr = *src_vec_ptr++; + HVX_Vector s0 = Q6_V_valign_VVR(curr, prev, (size_t) src); + sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum, Q6_Vqf32_vmpy_VsfVsf(s0, s0)); + prev = curr; + } + + if ((src_vec_end - ((HVX_Vector *) src)) > 0) { + // handle the last vector + bool src_ptr_aligned = hexagon::is_addr_aligned(src_vec_ptr); + HVX_Vector curr = src_ptr_aligned ? prev : *src_vec_ptr; + src_vec_ptr = src_ptr_aligned ? src_vec_ptr : src_vec_ptr + 1; + HVX_Vector s0 = Q6_V_valign_VVR(curr, prev, (size_t) src); + sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum, Q6_Vqf32_vmpy_VsfVsf(s0, s0)); + prev = curr; + } + + const size_t leftover = count % kElementsPerVector; + const size_t leftover_bytes = leftover * sizeof(float); + if (leftover > 0) { + // handle the leftover elements + HVX_Vector curr = + (leftover_bytes + hexagon::unaligned_bytes(src_vec_ptr) > hexagon::kBytesPerVector) ? *src_vec_ptr : prev; + curr = Q6_V_valign_VVR(curr, prev, (size_t) src); + sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum, + Q6_V_valign_VVR(Q6_Vqf32_vmpy_VsfVsf(curr, curr), Q6_V_vzero(), leftover_bytes)); + } + + const float mean = hexagon::vec_reduction_f32(sum) / count; // TODO: figure out how to do division in vector + const float scale = 1.0f / sqrtf(mean + eps); // TODO: use buildin blas sqrtf? + + HVX_Vector scale_vec = Q6_V_vsplat_R(reinterpret_cast(scale)); + src_vec_ptr = ((HVX_Vector *) src); + prev = *src_vec_ptr++; + HVX_Vector * dst_vec_ptr = ((HVX_Vector *) dst); // framework will ensure the dst is aligned + while (src_vec_ptr < src_vec_end) { + HVX_Vector curr = *src_vec_ptr++; + HVX_Vector s0 = Q6_V_valign_VVR(curr, prev, (size_t) src); + *dst_vec_ptr++ = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(s0, scale_vec)); + prev = curr; + } + + if ((src_vec_end - ((HVX_Vector *) src)) > 0) { + // handle the last vector + bool src_ptr_aligned = hexagon::is_addr_aligned(src_vec_ptr); + HVX_Vector curr = src_ptr_aligned ? prev : *src_vec_ptr; + src_vec_ptr = src_ptr_aligned ? src_vec_ptr : src_vec_ptr + 1; + HVX_Vector s0 = Q6_V_valign_VVR(curr, prev, (size_t) src); + *dst_vec_ptr++ = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(s0, scale_vec)); + prev = curr; + } + + if (leftover > 0) { + // handle the leftover elements + HVX_Vector curr = + (leftover_bytes + hexagon::unaligned_bytes(src_vec_ptr) > hexagon::kBytesPerVector) ? 
*src_vec_ptr : prev; + curr = Q6_V_valign_VVR(curr, prev, (size_t) src); + q6op_vstu_variable_ARV(dst_vec_ptr, leftover_bytes, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(curr, scale_vec))); + } +} + +// TODO: merge with element_wise_op? +template bool unary_op(hexagon::tensor * out, hexagon::compute_params * params) { + using data_type = typename get_data_type::type; + using param_type = typename get_data_type::param_type; + + if (!out) { + return false; + } + + static_assert(DEVICE_TENSOR_MAX_DIMS == 4, "element_wise_op requires max dims 4"); + auto * src0 = out->get_src(0); + if (!src0) { + return true; // skip if no src + } + + const auto * src0_ptr = reinterpret_cast(src0->get_read_buffer()); + auto * dst_ptr = reinterpret_cast(out->get_write_buffer()); + auto total_rows = out->get_ne(3) * out->get_ne(2) * out->get_ne(1); + const auto rows_per_cube = out->get_ne(2) * out->get_ne(1); + const auto start_end = hexagon::get_thread_work_slice(total_rows, params->tidx, params->tcnt); + if (start_end.first >= start_end.second) { + return true; + } + + DEVICE_SCOPED_OP_PERFORMANCE_TRACKER(out, params->tidx); + + const auto param = out->get_op_param(0); + const size_t valid_row_bytes = src0->get_ne(0) * sizeof(data_type); + for (int64_t ir = start_end.first; ir < start_end.second; ++ir) { + const auto i03 = ir / rows_per_cube; + const auto i02 = ir / out->get_ne(1) - i03 * out->get_ne(2); + const auto i01 = ir % out->get_ne(1); // TODO: should we use divide instead of mod? + + auto * src0_row = src0_ptr + i03 * src0->get_nb(3) + i02 * src0->get_nb(2) + i01 * src0->get_nb(1); + auto * dst_row = dst_ptr + i03 * out->get_nb(3) + i02 * out->get_nb(2) + i01 * out->get_nb(1); + if (ir + 1 < start_end.second) { + hexagon::l2fetch_row(src0_row + src0->get_nb(1), valid_row_bytes); } + + _RowFunc(reinterpret_cast(src0_row), static_cast(out->get_ne(0)), param, + reinterpret_cast(dst_row)); + } + + return true; +} + +bool is_unary_op_supported(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1, + const npu_device_tensor_spec & dst, npu_device_tensor_op op) { + if (op != NPU_OP_RMS_NORM) { + DEVICE_LOG_DEBUG("[%s]unsupported\n", hexagon::op_get_name(op)); + return false; + } + + if (dst.type != src0.type) { + DEVICE_LOG_DEBUG("[%s]src0.type and dst.type mismatch: %s vs %s\n", hexagon::op_get_name(op), + hexagon::get_type_name(src0.type), hexagon::get_type_name(dst.type)); + return false; + } + + if (dst.type != NPU_DATA_TYPE_F32) { + DEVICE_LOG_DEBUG("[%s]unsupported data type: %s\n", hexagon::op_get_name(op), hexagon::get_type_name(dst.type)); + return false; + } + + if (!is_same_shape(src0, dst)) { + DEVICE_LOG_DEBUG("[%s]src0 and dst have different shape\n", hexagon::op_get_name(op)); + return false; } return true; @@ -211,6 +366,7 @@ struct op_capabilities { npu_device_tensor_op op; hexagon::op_is_supported_func_type is_supported; hexagon::compute_func_type compute_funcs[NPU_DATA_TYPE_COUNT]; + bool requires_thread_barrier = false; }; constexpr const op_capabilities kOpCapabilities[] = { @@ -219,22 +375,36 @@ constexpr const op_capabilities kOpCapabilities[] = { { hexagon::mul_mat_f32, // NPU_DATA_TYPE_F32 nullptr, // NPU_DATA_TYPE_F16 - }, }, - { NPU_OP_ADD, - is_element_wise_op_supported, { - element_wise_op>, // NPU_DATA_TYPE_F32 - element_wise_op>, // NPU_DATA_TYPE_F16 - } }, - { NPU_OP_SUB, - is_element_wise_op_supported, { - element_wise_op>, // NPU_DATA_TYPE_F32 - element_wise_op>, // NPU_DATA_TYPE_F16 - } }, - { NPU_OP_MUL, - is_element_wise_op_supported, { - 
element_wise_op>, // NPU_DATA_TYPE_F32 - element_wise_op>, // NPU_DATA_TYPE_F16 - } }, + }, true, + }, + { + NPU_OP_ADD, is_element_wise_op_supported, + { + element_wise_op>, // NPU_DATA_TYPE_F32 + element_wise_op>, // NPU_DATA_TYPE_F16 + }, false, + }, + { + NPU_OP_SUB, is_element_wise_op_supported, + { + element_wise_op>, // NPU_DATA_TYPE_F32 + element_wise_op>, // NPU_DATA_TYPE_F16 + }, false, + }, + { + NPU_OP_MUL, is_element_wise_op_supported, + { + element_wise_op>, // NPU_DATA_TYPE_F32 + element_wise_op>, // NPU_DATA_TYPE_F16 + }, false, + }, + { + NPU_OP_RMS_NORM, is_unary_op_supported, + { + unary_op, // NPU_DATA_TYPE_F32 + nullptr, // NPU_DATA_TYPE_F16 + }, false, + }, }; static_assert(kOpCapabilities[NPU_OP_MUL_MAT].compute_funcs[NPU_DATA_TYPE_F32] == hexagon::mul_mat_f32, @@ -243,6 +413,8 @@ static_assert(kOpCapabilities[NPU_OP_MUL_MAT].compute_funcs[NPU_DATA_TYPE_F32] = static_assert(std::size(kOpCapabilities) == NPU_OP_COUNT); static_assert(kOpCapabilities[NPU_OP_MUL_MAT].op == NPU_OP_MUL_MAT, "kOpArray[NPU_OP_MUL_MAT].op != NPU_OP_MUL_MAT"); static_assert(kOpCapabilities[NPU_OP_MUL].op == NPU_OP_MUL, "kOpArray[NPU_OP_MUL].op != NPU_OP_MUL"); +static_assert(kOpCapabilities[NPU_OP_RMS_NORM].op == NPU_OP_RMS_NORM, + "kOpArray[NPU_OP_RMS_NORM].op != NPU_OP_RMS_NORM"); hexagon::compute_func_type get_compute_func_impl(npu_device_tensor_op op, npu_device_tensor_data_type type) { if (op >= NPU_OP_COUNT) { @@ -260,6 +432,14 @@ compute_func_type get_compute_func(tensor * dst) { return get_compute_func_impl(dst->get_op(), dst->get_type()); } +bool requires_thread_barrier(npu_device_tensor_op op) { + if (op >= NPU_OP_COUNT) { + return false; + } + + return kOpCapabilities[op].requires_thread_barrier; +} + bool support_op(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1, const npu_device_tensor_spec & dst, npu_device_tensor_op op) { if (get_compute_func_impl(op, dst.type) == nullptr) { diff --git a/ggml/src/ggml-qnn/npu/device/op_impl.hpp b/ggml/src/ggml-qnn/npu/device/op_impl.hpp index f9a3d01187793..9b75ec6d47967 100644 --- a/ggml/src/ggml-qnn/npu/device/op_impl.hpp +++ b/ggml/src/ggml-qnn/npu/device/op_impl.hpp @@ -6,6 +6,8 @@ namespace hexagon { compute_func_type get_compute_func(tensor * dst); +bool requires_thread_barrier(npu_device_tensor_op op); + bool support_op(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1, const npu_device_tensor_spec & dst, npu_device_tensor_op op); diff --git a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp index 647a5ff925737..6087673ac65af 100644 --- a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp @@ -3,44 +3,43 @@ #include #include "quants.hpp" +#include "thread_pool.hpp" // TODO: remove this dependency #include "vtcm_mem.hpp" namespace { -inline float vec_reduction_f32(HVX_Vector sums) { - constexpr const size_t kFloatsPerVector = hexagon::kBytesPerVector / sizeof(float); - static_assert(kFloatsPerVector == 32 || kFloatsPerVector == 16, "kFloatsPerVector should be 16 or 32"); - - // TODO: do we have a better way to do the reduction? 
- switch (kFloatsPerVector) { - default: - case 32: - sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 16 * sizeof(float))); - // fallthrough - case 16: - sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 8 * sizeof(float))); - sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 4 * sizeof(float))); - sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 2 * sizeof(float))); - sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, sizeof(float))); - break; - } - - return hexagon::get_flt0_from_fltv(Q6_Vsf_equals_Vqf32(sums)); -} - inline float vec_dot_product_f32_f32(const float * src0, const float * src1, size_t count) { constexpr const size_t kElementsPerVector = hexagon::kBytesPerVector / sizeof(float); - HVX_Vector * iptr0 = ((HVX_Vector *) src0); - HVX_Vector * iptr0_end = ((HVX_Vector *) src0) + (count / kElementsPerVector); - HVX_Vector * iptr1 = ((HVX_Vector *) src1); - HVX_Vector prev0 = *iptr0++; - HVX_Vector prev1 = *iptr1++; - HVX_Vector sum = Q6_V_vzero(); + HVX_Vector * src0_vec_ptr = ((HVX_Vector *) src0); + HVX_Vector * src0_vec_ptr_end = ((HVX_Vector *) src0) + count / kElementsPerVector; + HVX_Vector * src1_vec_ptr = ((HVX_Vector *) src1); + HVX_Vector prev0 = *src0_vec_ptr++; + HVX_Vector prev1 = *src1_vec_ptr++; + HVX_Vector sum = Q6_V_vzero(); + + while (src0_vec_ptr_end - src0_vec_ptr > 1) { + HVX_Vector curr0_lo = src0_vec_ptr[0]; + HVX_Vector curr0_hi = src0_vec_ptr[1]; + HVX_Vector curr1_lo = src1_vec_ptr[0]; + HVX_Vector curr1_hi = src1_vec_ptr[1]; + + HVX_Vector l0 = Q6_V_valign_VVR(curr0_lo, prev0, (size_t) src0); + HVX_Vector l1 = Q6_V_valign_VVR(curr1_lo, prev1, (size_t) src1); + HVX_Vector h0 = Q6_V_valign_VVR(curr0_hi, curr0_lo, (size_t) src0); + HVX_Vector h1 = Q6_V_valign_VVR(curr1_hi, curr1_lo, (size_t) src1); + sum = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_Vqf32_vmpy_VsfVsf(l0, l1), sum); + sum = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_Vqf32_vmpy_VsfVsf(h0, h1), sum); + + prev0 = curr0_hi; + prev1 = curr1_hi; + src0_vec_ptr += 2; + src1_vec_ptr += 2; + } - while (iptr0 < iptr0_end) { - HVX_Vector curr0 = *iptr0++; - HVX_Vector curr1 = *iptr1++; + if (src0_vec_ptr_end - src0_vec_ptr > 0) { + HVX_Vector curr0 = *src0_vec_ptr++; + HVX_Vector curr1 = *src1_vec_ptr++; HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); sum = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_Vqf32_vmpy_VsfVsf(s0, s1), sum); @@ -48,17 +47,17 @@ inline float vec_dot_product_f32_f32(const float * src0, const float * src1, siz prev1 = curr1; } - if ((iptr0_end - ((HVX_Vector *) src0)) > 0) { + if ((src0_vec_ptr_end - ((HVX_Vector *) src0)) > 0) { // handle the last vector // see also: // https://github.com/UbiquitousLearning/mllm/blob/babf4410352ce8730824c87699c025a0d4ce3a6f/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/src/ops/LLaMAMul.cpp#L147 // or qualcomm sdk libs\qhl_hvx\src\qhblas_hvx\qhblas_hvx_aw_vector_add_ah.c - bool iptr0_aligned = hexagon::is_addr_aligned(iptr0); - HVX_Vector curr0 = iptr0_aligned ? prev0 : *iptr0; - iptr0 = iptr0_aligned ? iptr0 : iptr0 + 1; - bool iptr1_aligned = hexagon::is_addr_aligned(iptr1); - HVX_Vector curr1 = iptr1_aligned ? prev1 : *iptr1; - iptr1 = iptr1_aligned ? iptr1 : iptr1 + 1; + bool iptr0_aligned = hexagon::is_addr_aligned(src0_vec_ptr); + HVX_Vector curr0 = iptr0_aligned ? prev0 : *src0_vec_ptr; + src0_vec_ptr = iptr0_aligned ? src0_vec_ptr : src0_vec_ptr + 1; + bool iptr1_aligned = hexagon::is_addr_aligned(src1_vec_ptr); + HVX_Vector curr1 = iptr1_aligned ? 
prev1 : *src1_vec_ptr; + src1_vec_ptr = iptr1_aligned ? src1_vec_ptr : src1_vec_ptr + 1; HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); sum = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_Vqf32_vmpy_VsfVsf(s0, s1), sum); @@ -70,19 +69,21 @@ inline float vec_dot_product_f32_f32(const float * src0, const float * src1, siz const size_t leftover_bytes = leftover * sizeof(float); if (leftover > 0) { // handle the leftover elements - HVX_Vector curr0 = - (leftover_bytes + hexagon::unaligned_bytes(iptr0) > hexagon::kBytesPerVector) ? *iptr0 : prev0; - curr0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); + HVX_Vector curr0 = (leftover_bytes + hexagon::unaligned_bytes(src0_vec_ptr) > hexagon::kBytesPerVector) ? + *src0_vec_ptr : + prev0; + curr0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); - HVX_Vector curr1 = - (leftover_bytes + hexagon::unaligned_bytes(iptr1) > hexagon::kBytesPerVector) ? *iptr1 : prev1; - curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); + HVX_Vector curr1 = (leftover_bytes + hexagon::unaligned_bytes(src1_vec_ptr) > hexagon::kBytesPerVector) ? + *src1_vec_ptr : + prev1; + curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); sum = Q6_Vqf32_vadd_Vqf32Vqf32( Q6_V_valign_VVR(Q6_Vqf32_vmpy_VsfVsf(curr0, curr1), Q6_V_vzero(), leftover_bytes), sum); } - return vec_reduction_f32(sum); + return hexagon::vec_reduction_f32(sum); } // TODO: merge with vec_dot_product_f32_f32? @@ -90,17 +91,17 @@ inline float vec_dot_product_f16_f16(const npu_device_fp16_t * src0, const npu_d constexpr const size_t kElementsPerVector = hexagon::kBytesPerVector / sizeof(npu_device_fp16_t); constexpr const size_t kFloatsPerVector = hexagon::kBytesPerVector / sizeof(float); - HVX_Vector * iptr0 = ((HVX_Vector *) src0); - HVX_Vector * iptr0_end = ((HVX_Vector *) src0) + (count / kElementsPerVector); - HVX_Vector * iptr1 = ((HVX_Vector *) src1); - HVX_Vector prev0 = *iptr0++; - HVX_Vector prev1 = *iptr1++; - HVX_Vector sum_hi = Q6_V_vzero(); - HVX_Vector sum_lo = Q6_V_vzero(); - - while (iptr0 < iptr0_end) { - HVX_Vector curr0 = *iptr0++; - HVX_Vector curr1 = *iptr1++; + HVX_Vector * src0_vec_ptr = ((HVX_Vector *) src0); + HVX_Vector * src0_vec_ptr_end = ((HVX_Vector *) src0) + (count / kElementsPerVector); + HVX_Vector * src1_vec_ptr = ((HVX_Vector *) src1); + HVX_Vector prev0 = *src0_vec_ptr++; + HVX_Vector prev1 = *src1_vec_ptr++; + HVX_Vector sum_hi = Q6_V_vzero(); + HVX_Vector sum_lo = Q6_V_vzero(); + + while (src0_vec_ptr < src0_vec_ptr_end) { + HVX_Vector curr0 = *src0_vec_ptr++; + HVX_Vector curr1 = *src1_vec_ptr++; HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); HVX_VectorPair result = Q6_Wqf32_vmpy_VhfVhf(s0, s1); @@ -110,17 +111,17 @@ inline float vec_dot_product_f16_f16(const npu_device_fp16_t * src0, const npu_d prev1 = curr1; } - if ((iptr0_end - ((HVX_Vector *) src0)) > 0) { + if ((src0_vec_ptr_end - ((HVX_Vector *) src0)) > 0) { // handle the last vector // see also: // https://github.com/UbiquitousLearning/mllm/blob/babf4410352ce8730824c87699c025a0d4ce3a6f/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/src/ops/LLaMAMul.cpp#L147 // or qualcomm sdk libs\qhl_hvx\src\qhblas_hvx\qhblas_hvx_aw_vector_add_ah.c - bool iptr0_aligned = hexagon::is_addr_aligned(iptr0); - HVX_Vector curr0 = iptr0_aligned ? prev0 : *iptr0; - iptr0 = iptr0_aligned ? iptr0 : iptr0 + 1; - bool iptr1_aligned = hexagon::is_addr_aligned(iptr1); - HVX_Vector curr1 = iptr1_aligned ? 
prev1 : *iptr1; - iptr1 = iptr1_aligned ? iptr1 : iptr1 + 1; + bool iptr0_aligned = hexagon::is_addr_aligned(src0_vec_ptr); + HVX_Vector curr0 = iptr0_aligned ? prev0 : *src0_vec_ptr; + src0_vec_ptr = iptr0_aligned ? src0_vec_ptr : src0_vec_ptr + 1; + bool iptr1_aligned = hexagon::is_addr_aligned(src1_vec_ptr); + HVX_Vector curr1 = iptr1_aligned ? prev1 : *src1_vec_ptr; + src1_vec_ptr = iptr1_aligned ? src1_vec_ptr : src1_vec_ptr + 1; HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); HVX_VectorPair result = Q6_Wqf32_vmpy_VhfVhf(s0, s1); @@ -134,13 +135,15 @@ inline float vec_dot_product_f16_f16(const npu_device_fp16_t * src0, const npu_d const size_t leftover_bytes = leftover * sizeof(npu_device_fp16_t); if (leftover > 0) { // handle the leftover elements - HVX_Vector curr0 = - (leftover_bytes + hexagon::unaligned_bytes(iptr0) > hexagon::kBytesPerVector) ? *iptr0 : prev0; - curr0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); + HVX_Vector curr0 = (leftover_bytes + hexagon::unaligned_bytes(src0_vec_ptr) > hexagon::kBytesPerVector) ? + *src0_vec_ptr : + prev0; + curr0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); - HVX_Vector curr1 = - (leftover_bytes + hexagon::unaligned_bytes(iptr1) > hexagon::kBytesPerVector) ? *iptr1 : prev1; - curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); + HVX_Vector curr1 = (leftover_bytes + hexagon::unaligned_bytes(src1_vec_ptr) > hexagon::kBytesPerVector) ? + *src1_vec_ptr : + prev1; + curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); HVX_VectorPair result = Q6_Wqf32_vmpy_VhfVhf(curr0, curr1); @@ -156,7 +159,7 @@ inline float vec_dot_product_f16_f16(const npu_device_fp16_t * src0, const npu_d } } - return vec_reduction_f32(Q6_Vqf32_vadd_Vqf32Vqf32(sum_hi, sum_lo)); + return hexagon::vec_reduction_f32(Q6_Vqf32_vadd_Vqf32Vqf32(sum_hi, sum_lo)); } template struct get_data_type {}; @@ -208,70 +211,118 @@ void mul_mat_impl(hexagon::tensor * src0, hexagon::tensor * src1, hexagon::tenso } // cache the src0 plane in VTCM - const size_t src0_plane_row_count = start_end_element.second - start_end_element.first; - size_t src0_plane_cache_size = 0; - uint8_t * src0_plane_cache_ptr = nullptr; - const uint8_t * last_cached_plane_ptr = nullptr; + size_t src0_plane_slice_row_count = start_end_element.second - start_end_element.first; + size_t src0_plane_cache_size = 0; + uint8_t * src0_plane_cache_ptr = nullptr; + const uint8_t * last_cached_plane_ptr = nullptr; + bool is_mem_cache = false; if (is_quantized) { - src0_plane_cache_size = src0_actual_row_size * src0_plane_row_count; - src0_plane_cache_ptr = params->get_cache(src0_plane_cache_size, is_quantized); + src0_plane_slice_row_count = + std::min(params->vtcm_quota_size / src0_actual_row_size, src0_plane_slice_row_count); + src0_plane_cache_size = src0_actual_row_size * src0_plane_slice_row_count; + src0_plane_cache_ptr = params->get_vtcm_cache(src0_plane_cache_size); + if (src0_plane_cache_ptr == nullptr) { + DEVICE_LOG_DEBUG( + "mul_mat_impl: failed to get VTCM cache for src0, size: %zu, src0_plane_slice_row_count: %zu, " + "src0_actual_row_size: %zu, will fallback to mem cache\n", + src0_plane_cache_size, src0_plane_slice_row_count, src0_actual_row_size); + src0_plane_cache_ptr = params->get_mem_cache(src0_plane_cache_size); + is_mem_cache = true; + } } - DEVICE_LOG_DEBUG("mul_mat_impl src0_actual_row_size: %zu, is_quantized: %d, vtcm_mem: %p(%zu)\n", - src0_actual_row_size, is_quantized, (void *) src0_plane_cache_ptr, 
src0_plane_cache_size); + DEVICE_LOG_DEBUG( + "mul_mat_impl src0_actual_row_size: %zu, src0_plane_slice_row_count: %zu, is_quantized: %d, vtcm_mem: " + "%p(%zu)\n", + src0_actual_row_size, src0_plane_slice_row_count, is_quantized, (void *) src0_plane_cache_ptr, + src0_plane_cache_size); const size_t valid_row_bytes = src1->get_ne(0) * sizeof(data_type); DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_WITH_SUB_PROC(dst, params->tidx, dequant); for (int64_t ip = start_end_plane.first; ip < start_end_plane.second; ip++) { const auto i3 = ip / dst->get_ne(2); const auto i2 = ip - i3 * dst->get_ne(2); - const auto * src0_plane = src0_ptr + i3 / r03 * src0->get_nb(3) + i2 / r02 * src0->get_nb(2) + - start_end_element.first * src0->get_nb(1); const auto * src1_plane = src1_ptr + i3 * src1->get_nb(3) + i2 * src1->get_nb(2); auto * dst_plane = dst_ptr + i3 * dst->get_nb(3) + i2 * dst->get_nb(2); - - if (src0_plane_cache_ptr) { - if (last_cached_plane_ptr != src0_plane) { - DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_SUB_PROC(dequant); - - for (int64_t ir = 0; ir < (int64_t) src0_plane_row_count; ir++) { - auto * src0_row = src0_plane + ir * src0->get_nb(1); - if (ir + 1 < src0_plane_row_count) { - hexagon::l2fetch_row(src0_row + src0->get_nb(1), src0->get_nb(1)); + for (int64_t col_idx = start_end_element.first; col_idx < start_end_element.second; + col_idx += src0_plane_slice_row_count) { + const auto * src0_plane = + src0_ptr + i3 / r03 * src0->get_nb(3) + i2 / r02 * src0->get_nb(2) + col_idx * src0->get_nb(1); + if (src0_plane_cache_ptr) { + if (last_cached_plane_ptr != src0_plane) { + DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_SUB_PROC(dequant); + + for (int64_t ir = 0; ir < (int64_t) src0_plane_slice_row_count; ir++) { + auto * src0_row = src0_plane + ir * src0->get_nb(1); + if (ir + 1 < src0_plane_slice_row_count) { + hexagon::l2fetch_row(src0_row + src0->get_nb(1), src0->get_nb(1)); + } + + auto * dst_row = reinterpret_cast(src0_plane_cache_ptr + ir * src0_actual_row_size); + dequantize_row_func(src0_row, reinterpret_cast(dst_row), src0->get_ne(0), + params->f16_to_f32_table); } - auto * dst_row = reinterpret_cast(src0_plane_cache_ptr + ir * src0_actual_row_size); - dequantize_row_func(src0_row, reinterpret_cast(dst_row), src0->get_ne(0), - params->f16_to_f32_table); + last_cached_plane_ptr = src0_plane; } - last_cached_plane_ptr = src0_plane; + src0_plane = src0_plane_cache_ptr; } - src0_plane = src0_plane_cache_ptr; - } - - for (int64_t i1 = start_end_row.first; i1 < start_end_row.second; i1++) { - auto * src1_row = src1_plane + i1 * src1->get_nb(1); - auto * dst_row = reinterpret_cast(dst_plane + i1 * dst->get_nb(1)) + start_end_element.first; - for (int64_t i0 = 0; i0 < (int64_t) src0_plane_row_count; i0++) { - auto * src0_row = src0_plane + i0 * src0_actual_row_size; - if (i0 + 1 < src0_plane_row_count) { - if (!src0_plane_cache_ptr) { - hexagon::l2fetch_row(src0_row + src0_actual_row_size, valid_row_bytes); + for (int64_t i1 = start_end_row.first; i1 < start_end_row.second; i1++) { + auto * src1_row = src1_plane + i1 * src1->get_nb(1); + auto * dst_row = reinterpret_cast(dst_plane + i1 * dst->get_nb(1)) + col_idx; + for (int64_t i0 = 0; i0 < (int64_t) src0_plane_slice_row_count; i0++) { + auto * src0_row = src0_plane + i0 * src0_actual_row_size; + if (i0 + 1 < src0_plane_slice_row_count) { + if (!src0_plane_cache_ptr || is_mem_cache) { + hexagon::l2fetch_row(src0_row + src0_actual_row_size, valid_row_bytes); + } + } else if (ip + 1 < start_end_plane.second) { + hexagon::l2fetch_row(src1_row + 
src1->get_nb(1), valid_row_bytes); } - } else if (ip + 1 < start_end_plane.second) { - hexagon::l2fetch_row(src1_row + src1->get_nb(1), valid_row_bytes); - } - // TODO: figure dst how to handle a entire row - dst_row[i0] = _DotFunc(reinterpret_cast(src0_row), - reinterpret_cast(src1_row), (size_t) src0->get_ne(0)); + // TODO: figure dst how to handle a entire row + dst_row[i0] = _DotFunc(reinterpret_cast(src0_row), + reinterpret_cast(src1_row), (size_t) src0->get_ne(0)); + } } } } } +bool is_quantized_mul_mat_supported(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1) { + if (src1.type != NPU_DATA_TYPE_F32) { + DEVICE_LOG_DEBUG("[MUL_MAT]src0.type(%s) and src1.type(%s) mismatch and src1 is not F32\n", + hexagon::get_type_name(src0.type), hexagon::get_type_name(src1.type)); + return false; + } + + const auto type_traits = hexagon::get_type_traits(src0.type); + if (!type_traits.is_quantized || type_traits.dequantize_row == nullptr) { + DEVICE_LOG_DEBUG("[MUL_MAT]src0.type(%s) and src1.type(%s) mismatch and src0 is not quantized\n", + hexagon::get_type_name(src0.type), hexagon::get_type_name(src1.type)); + return false; + } + + if (src0.ne[0] % type_traits.blck_size) { + DEVICE_LOG_DEBUG("[MUL_MAT]src0.type(%s) ne[0] is not aligned: %ld\n", hexagon::get_type_name(src0.type), + (long) src0.ne[0]); + return false; + } + + const auto vtcm_thread_quota_size = hexagon::vtcm_mem::get_total_size() / hexagon::kMaxThreadCount; + if (src0.ne[0] * sizeof(hexagon::dequantized_element_type) > vtcm_thread_quota_size) { + DEVICE_LOG_DEBUG("[MUL_MAT]src0.type(%s) ne[0] is too large: %ld, vtcm_thread_quota_size: %zu\n", + hexagon::get_type_name(src0.type), (long) src0.ne[0], vtcm_thread_quota_size); + return false; + } + + DEVICE_LOG_DEBUG("[MUL_MAT]supported quantized src0.type(%s) and src1.type(%s)\n", + hexagon::get_type_name(src0.type), hexagon::get_type_name(src1.type)); + return true; +} + } // namespace namespace hexagon { @@ -319,27 +370,9 @@ bool is_mul_mat_supported(const npu_device_tensor_spec & src0, const npu_device_ if (src0.type != src1.type) { #ifdef GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS - if (src1.type != NPU_DATA_TYPE_F32) { - DEVICE_LOG_DEBUG("[%s]src0.type(%s) and src1.type(%s) mismatch and src1 is not F32\n", op_get_name(op), - get_type_name(src0.type), get_type_name(src1.type)); - return false; - } - - const auto type_traits = get_type_traits(src0.type); - if (!type_traits.is_quantized || type_traits.dequantize_row == nullptr) { - DEVICE_LOG_DEBUG("[%s]src0.type(%s) and src1.type(%s) mismatch and src0 is not quantized\n", - op_get_name(op), get_type_name(src0.type), get_type_name(src1.type)); + if (!is_quantized_mul_mat_supported(src0, src1)) { return false; } - - if (src0.ne[0] % type_traits.blck_size) { - DEVICE_LOG_DEBUG("[%s]src0.type(%s) ne[0] is not aligned: %ld\n", op_get_name(op), get_type_name(src0.type), - (long) src0.ne[0]); - return false; - } - - DEVICE_LOG_DEBUG("[%s]supported quantized src0.type(%s) and src1.type(%s)\n", op_get_name(op), - get_type_name(src0.type), get_type_name(src1.type)); #else DEVICE_LOG_DEBUG("[%s]src0.type(%s) and src1.type(%s) mismatch and quantized tensors are not supported\n", op_get_name(op), get_type_name(src0.type), get_type_name(src1.type)); diff --git a/ggml/src/ggml-qnn/npu/device/op_mul_mat.hpp b/ggml/src/ggml-qnn/npu/device/op_mul_mat.hpp index 3a97858606cd4..8cf41e0a99d86 100644 --- a/ggml/src/ggml-qnn/npu/device/op_mul_mat.hpp +++ b/ggml/src/ggml-qnn/npu/device/op_mul_mat.hpp @@ -7,11 +7,6 @@ namespace hexagon { 
-constexpr const size_t kBytesPerVector = sizeof(HVX_Vector); // 128 for v73 -constexpr const size_t kAlignMask = kBytesPerVector - 1; -constexpr const size_t kL2CacheSize = 8 * 1024; // // 8KB L2 cache -constexpr const size_t kL2FetchAheadVectors = kL2CacheSize / kBytesPerVector; - inline size_t unaligned_bytes(const void * addr) { return ((size_t) addr) & kAlignMask; } @@ -43,6 +38,31 @@ inline float get_flt0_from_fltv(HVX_Vector vect) { return cvt.f; } +inline HVX_Vector vec_reduction_qf32(HVX_Vector sums) { + constexpr const size_t kFloatsPerVector = hexagon::kBytesPerVector / sizeof(float); + static_assert(kFloatsPerVector == 32 || kFloatsPerVector == 16, "kFloatsPerVector should be 16 or 32"); + + // TODO: do we have a better way to do the reduction? + switch (kFloatsPerVector) { + default: + case 32: + sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 16 * sizeof(float))); + // fallthrough + case 16: + sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 8 * sizeof(float))); + sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 4 * sizeof(float))); + sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 2 * sizeof(float))); + sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, sizeof(float))); + break; + } + + return sums; +} + +inline float vec_reduction_f32(HVX_Vector sums) { + return hexagon::get_flt0_from_fltv(Q6_Vsf_equals_Vqf32(vec_reduction_qf32(sums))); +} + bool mul_mat_f32(tensor * out, compute_params * params); bool is_mul_mat_supported(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1, const npu_device_tensor_spec & dst, npu_device_tensor_op op); diff --git a/ggml/src/ggml-qnn/npu/device/op_types.hpp b/ggml/src/ggml-qnn/npu/device/op_types.hpp index 8bf10637db51c..153bbab058b89 100644 --- a/ggml/src/ggml-qnn/npu/device/op_types.hpp +++ b/ggml/src/ggml-qnn/npu/device/op_types.hpp @@ -1,5 +1,7 @@ #pragma once +#include + #include #include #include @@ -15,26 +17,25 @@ namespace hexagon { struct compute_params { const size_t tidx; const size_t tcnt; + const size_t vtcm_quota_size; const float * f16_to_f32_table; std::unique_ptr vtcm_cache; std::unique_ptr mem_cache; size_t mem_cache_size = 0; - uint8_t * get_cache(size_t size, bool fallback_to_mem) { + uint8_t * get_vtcm_cache(size_t size) { if (!vtcm_cache || vtcm_cache->get_size() < size) { vtcm_cache = std::make_unique(size, false); } - if (vtcm_cache->is_valid()) { - return vtcm_cache->get_mem(); - } - - if (!fallback_to_mem) { - DEVICE_LOG_DEBUG("vtcm_mem not valid, return nullptr\n"); + if (!vtcm_cache->is_valid()) { return nullptr; } - DEVICE_LOG_DEBUG("vtcm_mem not valid, allocate from mem_cache\n"); + return vtcm_cache->get_mem(); + } + + uint8_t * get_mem_cache(size_t size) { if (!mem_cache || mem_cache_size < size) { mem_cache = std::make_unique(size + 256); mem_cache_size = mem_cache ? 
size : 0; @@ -49,10 +50,31 @@ typedef bool (*op_is_supported_func_type)(const npu_device_tensor_spec & src0, c const npu_device_tensor_spec & dst, npu_device_tensor_op op); inline constexpr std::pair get_thread_work_slice(int64_t total, size_t tidx, size_t tcnt) { - const auto elements_per_thread = (total + tcnt - 1) / tcnt; - const auto start = tidx * elements_per_thread; - const auto end = std::min(start + elements_per_thread, total); - return { start, end }; + if (total <= 0 || tidx >= tcnt) { + return { 0, 0 }; // No work for this thread + } + + const auto elements_per_thread = total / tcnt; + const auto remainder = total % tcnt; + + int64_t start = 0; + int64_t end = 0; + if (tidx < remainder) { + // First 'remainder' threads get one extra item + start = tidx * (elements_per_thread + 1); + end = start + elements_per_thread + 1; + } else { + // Remaining threads get the base number of elements + start = remainder * (elements_per_thread + 1) + (tidx - remainder) * elements_per_thread; + end = start + elements_per_thread; + } + + return { start, std::min(end, total) }; } +constexpr const size_t kBytesPerVector = sizeof(HVX_Vector); // 128 for v73 +constexpr const size_t kAlignMask = kBytesPerVector - 1; +constexpr const size_t kL2CacheSize = 8 * 1024; // // 8KB L2 cache +constexpr const size_t kL2FetchAheadVectors = kL2CacheSize / kBytesPerVector; + } // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/device/quants.cpp b/ggml/src/ggml-qnn/npu/device/quants.cpp index d873691b58e15..67e77c2fc2a2a 100644 --- a/ggml/src/ggml-qnn/npu/device/quants.cpp +++ b/ggml/src/ggml-qnn/npu/device/quants.cpp @@ -4,6 +4,8 @@ #include +#include "op_types.hpp" // TODO: remove this include + static_assert(sizeof(npu_device_block_q4_K) == 2 * sizeof(npu_device_fp16_t) + QUANT_K_SCALE_SIZE + QUANT_K_BLOCK_SIZE / 2, "wrong q4_K block size/padding"); @@ -16,14 +18,34 @@ static_assert(sizeof(npu_device_block_q8_0) == sizeof(npu_device_fp16_t) + QUANT namespace { +inline HVX_Vector vmemu(const void * unaligned_ptr) { + HVX_Vector ret = *reinterpret_cast(unaligned_ptr); + return ret; +} + inline float to_float(const npu_device_fp16_t src) { - union { - __fp16 f16; - npu_device_fp16_t u16; - } f16; + return reinterpret_cast(src); +} + +template inline HVX_Vector load_block_generic(const _TBlock & src) { + uint8_t buffer[hexagon::kBytesPerVector]; + + static_assert(sizeof(buffer) == sizeof(HVX_Vector), "wrong cvt size/padding"); + static_assert(sizeof(buffer) >= sizeof(src.qs), "wrong q4_0 block size/padding"); - f16.u16 = src; - return f16.f16; + memcpy(&buffer[0], src.qs, sizeof(src.qs)); + return *reinterpret_cast(buffer); +} + +template inline HVX_Vector load_dual_block_generic(const _TBlock & src1, const _TBlock & src2) { + uint8_t buffer[hexagon::kBytesPerVector]; + + static_assert(sizeof(buffer) == sizeof(HVX_Vector), "wrong cvt size/padding"); + static_assert(sizeof(buffer) >= sizeof(src1.qs) * 2, "wrong q4_0 block size/padding"); + + memcpy(&buffer[0], src1.qs, sizeof(src1.qs)); + memcpy(&buffer[sizeof(src1.qs)], src2.qs, sizeof(src2.qs)); + return *reinterpret_cast(buffer); } inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t * d, uint8_t * m) { @@ -37,38 +59,78 @@ inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t * d, uint8_t * m) } void dequantize_row_q8_0(const void * src, float * dst, size_t count, const float * f16_to_f32_table) { - constexpr const int qk = QUANT_BLOCK_SIZE; - const int nb = count / qk; - const auto * src_ptr = reinterpret_cast(src); + constexpr const int 
qk = QUANT_BLOCK_SIZE; + static_assert(QUANT_BLOCK_SIZE == hexagon::kBytesPerVector / sizeof(float)); - // TODO: use intrinsics - for (int i = 0; i < nb; i++) { - const float d = f16_to_f32_table[src_ptr[i].d]; + const int nb = count / qk; + const auto * src_ptr = reinterpret_cast(src); + HVX_UVector * out = ((HVX_UVector *) dst); // TODO: opt for aligned access - for (int j = 0; j < qk; ++j) { - dst[i * qk + j] = src_ptr[i].qs[j] * d; - } + for (int i = 0; i < nb; i++) { + const auto & src = src_ptr[i]; + HVX_Vector d = Q6_Vh_vsplat_R(src.d); + + HVX_Vector q_lo = load_block_generic(src); + HVX_VectorPair q = Q6_Wh_vunpack_Vb(q_lo); + q = Q6_Wh_vunpack_Vb(Q6_V_lo_W(q)); + q_lo = Q6_Vhf_equals_Vh(Q6_V_lo_W(q)); + q = Q6_Wqf32_vmpy_VhfVhf(q_lo, d); + out[i] = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(q)); } } void dequantize_row_q4_0(const void * src, float * dst, size_t count, const float * f16_to_f32_table) { constexpr const int qk = QUANT_BLOCK_SIZE; static_assert(qk % 2 == 0, "qk must be even"); + static_assert(QUANT_BLOCK_SIZE == hexagon::kBytesPerVector / sizeof(float)); + constexpr const uint32_t kSizeOfQs = sizeof(npu_device_block_q4_0::qs); + + const int nb = count / qk; + const auto * src_ptr = reinterpret_cast(src); + HVX_Vector mask = Q6_Vb_vsplat_R(0x0F); + HVX_Vector minus = Q6_Vb_vsplat_R(8); + HVX_UVector * out = ((HVX_UVector *) dst); // TODO: opt for aligned access + + const int loop_count = nb - (nb % 2); + for (int i = 0; i < loop_count; i += 2) { + const auto & src1 = src_ptr[i]; + const auto & src2 = src_ptr[i + 1]; + + HVX_Vector d1 = Q6_Vh_vsplat_R(src1.d); + HVX_Vector d2 = Q6_Vh_vsplat_R(src2.d); + d1 = Q6_V_valign_VVR(d1, Q6_V_vzero(), hexagon::kBytesPerVector / 2); + d1 = Q6_V_valign_VVR(d2, d1, hexagon::kBytesPerVector / 2); + HVX_Vector d = Q6_Vh_vshuff_Vh(d1); + + HVX_Vector q_lo = load_dual_block_generic(src1, src2); + HVX_Vector q_hi = Q6_Vub_vlsr_VubR(q_lo, 4); + HVX_VectorPair q = Q6_W_vshuff_VVR(q_hi, Q6_V_vand_VV(q_lo, mask), kSizeOfQs); + q_lo = Q6_V_valign_VVR(Q6_V_lo_W(q), Q6_V_vzero(), hexagon::kBytesPerVector / 2); + q_lo = Q6_V_valign_VVR(Q6_V_hi_W(q), q_lo, hexagon::kBytesPerVector / 2); + q_lo = Q6_Vb_vshuff_Vb(q_lo); + q_lo = Q6_Vb_vsub_VbVb(q_lo, minus); + q = Q6_Wh_vunpack_Vb(q_lo); + q_lo = Q6_Vhf_equals_Vh(Q6_V_lo_W(q)); + q = Q6_Wqf32_vmpy_VhfVhf(q_lo, d); + out[i] = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(q)); + out[i + 1] = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(q)); + } - const int nb = count / qk; - const auto * src_ptr = reinterpret_cast(src); - - // TODO: use intrinsics - for (int i = 0; i < nb; i++) { - const float d = f16_to_f32_table[src_ptr[i].d]; - - for (int j = 0; j < qk / 2; ++j) { - const int x0 = (src_ptr[i].qs[j] & 0x0F) - 8; - const int x1 = ((src_ptr[i].qs[j] >> 4) & 0xF) - 8; - - dst[i * qk + j + 0] = x0 * d; - dst[i * qk + j + qk / 2] = x1 * d; - } + if (loop_count < nb) { + const auto & curr_blk = src_ptr[nb - 1]; + HVX_Vector d = Q6_Vh_vsplat_R(curr_blk.d); + + HVX_Vector q_lo = load_block_generic(curr_blk); + HVX_Vector q_hi = Q6_Vub_vlsr_VubR(q_lo, 4); + q_lo = Q6_V_valign_VVR(Q6_V_vand_VV(q_lo, mask), Q6_V_vzero(), sizeof(curr_blk.qs)); + q_lo = Q6_V_valign_VVR(q_hi, q_lo, hexagon::kBytesPerVector - sizeof(curr_blk.qs)); + q_lo = Q6_Vb_vsub_VbVb(q_lo, minus); + + HVX_VectorPair q = Q6_Wh_vunpack_Vb(q_lo); + q = Q6_Wh_vunpack_Vb(Q6_V_lo_W(q)); + q_lo = Q6_Vhf_equals_Vh(Q6_V_lo_W(q)); + q = Q6_Wqf32_vmpy_VhfVhf(q_lo, d); + out[nb - 1] = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(q)); } } diff --git a/ggml/src/ggml-qnn/npu/device/quants.hpp 
b/ggml/src/ggml-qnn/npu/device/quants.hpp index 6ffbeb0031635..6006cd22e93a4 100644 --- a/ggml/src/ggml-qnn/npu/device/quants.hpp +++ b/ggml/src/ggml-qnn/npu/device/quants.hpp @@ -23,13 +23,15 @@ inline bool is_quantized_type(npu_device_tensor_data_type type) { return get_type_traits(type).is_quantized; } -inline size_t get_dequantized_row_size(tensor * tensor) { +using dequantized_element_type = float; + +inline size_t get_dequantized_row_size(const tensor * tensor) { if (!is_quantized_type(tensor->get_type())) { return tensor->get_nb(1); // for f32 and f16 } auto row_elems_count = tensor->get_ne(0); - return row_elems_count * sizeof(float); // currently only f32 is supported + return row_elems_count * sizeof(dequantized_element_type); // currently only f32 is supported } inline const char * get_type_name(npu_device_tensor_data_type type) { diff --git a/ggml/src/ggml-qnn/npu/device/tensor.hpp b/ggml/src/ggml-qnn/npu/device/tensor.hpp index 9c7f6bffefff6..7e980d8402fb2 100644 --- a/ggml/src/ggml-qnn/npu/device/tensor.hpp +++ b/ggml/src/ggml-qnn/npu/device/tensor.hpp @@ -8,7 +8,8 @@ namespace hexagon { -constexpr const size_t kMaxTensorSrc = DEVICE_TENSOR_MAX_SRC; +constexpr const size_t kMaxTensorSrc = DEVICE_TENSOR_MAX_SRC; +constexpr const size_t kMaxParamsCount = DEVICE_TENSOR_MAX_OP_PARAMS; class tensor { public: @@ -50,17 +51,17 @@ class tensor { } } - bool set_src(size_t index, tensor * src) { - if (index >= kMaxTensorSrc) { - return false; - } + void update_config(const npu_device_tensor_update_config & config) { + static_assert(sizeof(_op_params) == sizeof(config.params), "op params size mismatch"); - _src[index] = src; - return true; + _info.op = config.op; + memcpy(_op_params, config.params, sizeof(_op_params)); + for (size_t i = 0; i < DEVICE_TENSOR_MAX_SRC; ++i) { + auto src_handle = config.src_handles[i]; + _src[i] = (src_handle ? 
reinterpret_cast(src_handle) : nullptr); + } } - void set_op(npu_device_tensor_op op) { _info.op = op; } - tensor * get_src(size_t index) const { if (index >= kMaxTensorSrc) { return nullptr; @@ -77,6 +78,20 @@ class tensor { npu_device_tensor_op get_op() const { return _info.op; } + template const _TyParam get_op_param(size_t index) const { + static_assert(sizeof(_TyParam) <= sizeof(_op_params), "_op_param type size exceeds op params size"); + + if (sizeof(_TyParam) * (index + 1) >= sizeof(_op_params)) { + return 0; + } + + return reinterpret_cast(_op_params)[index]; + } + + const int32_t * get_op_params() const { return _op_params; } + + const size_t get_op_param_count() const { return kMaxParamsCount; } + npu_device_tensor_data_type get_type() const { return _info.type; } const uint8_t * get_read_buffer() const { @@ -89,9 +104,10 @@ class tensor { bool is_valid() const { return _data != nullptr; } private: - npu_device_tensor_config _info; - tensor * _src[kMaxTensorSrc] = {}; - uint8_t * _data = nullptr; + npu_device_tensor_config _info = {}; + int32_t _op_params[kMaxParamsCount] = {}; + tensor * _src[kMaxTensorSrc] = {}; + uint8_t * _data = nullptr; DISABLE_COPY_AND_MOVE(tensor); }; diff --git a/ggml/src/ggml-qnn/npu/device/thread_pool.hpp b/ggml/src/ggml-qnn/npu/device/thread_pool.hpp index bd7e83dd8a485..9a525213c9fad 100644 --- a/ggml/src/ggml-qnn/npu/device/thread_pool.hpp +++ b/ggml/src/ggml-qnn/npu/device/thread_pool.hpp @@ -12,7 +12,7 @@ namespace hexagon { constexpr const size_t kMaxThreadCount = 4; -constexpr const size_t kDefaultStackSize = 1024 * 16; // 16KB +constexpr const size_t kDefaultStackSize = 1024 * 32; // 32KB constexpr const unsigned long long kThreadTaskPendingBit = 1; template class qurt_thread { @@ -80,7 +80,7 @@ using qurt_thread_ptr = std::unique_ptr>; template class thread_pool { static_assert(_thread_count > 1, "Thread count must be greater than 1"); - constexpr const static size_t kMaxThreadCount = _thread_count - 1; + constexpr const static size_t kMaxSubThreadCount = _thread_count - 1; public: typedef qurt_thread thread_type; @@ -88,9 +88,10 @@ template class thread_pool { thread_pool() { std::string thread_name_base = "thread_pool_"; - qurt_barrier_init(&_pending, kMaxThreadCount + 1); - qurt_barrier_init(&_completed, kMaxThreadCount + 1); - for (size_t i = 0; i < kMaxThreadCount; ++i) { + qurt_barrier_init(&_pending, kMaxSubThreadCount + 1); + qurt_barrier_init(&_completed, kMaxSubThreadCount + 1); + const auto priority = qurt_thread_get_priority(qurt_thread_get_id()); + for (size_t i = 0; i < kMaxSubThreadCount; ++i) { auto & thread_arg = _thread_args[i]; thread_arg.pool = this; thread_arg.thread_idx = i + 1; @@ -98,7 +99,7 @@ template class thread_pool { auto thread = std::make_unique( thread_name_base + std::to_string(i), reinterpret_cast(&thread_pool::thread_func_impl), &thread_arg, - QURT_THREAD_ATTR_PRIORITY_DEFAULT); + priority); if (!thread->is_valid()) { DEVICE_LOG_ERROR("Failed to create thread: %zu", i); // destroy all barriers and threads at destructor @@ -107,7 +108,7 @@ template class thread_pool { _threads[i] = std::move(thread); } - DEVICE_LOG_DEBUG("thread_pool.created: %zu", kMaxThreadCount); + DEVICE_LOG_DEBUG("thread_pool.created: %zu", kMaxSubThreadCount); } ~thread_pool() { @@ -133,7 +134,7 @@ template class thread_pool { _arg = arg; qurt_barrier_wait(&_pending); - task(this, 0, kMaxThreadCount + 1, arg); + task(this, 0, kMaxSubThreadCount + 1, arg); DEVICE_LOG_DEBUG("main_thread.task_completed: 0"); 
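        // note: by this point the _pending barrier has released all kMaxSubThreadCount worker
        // threads and the main thread has just executed slice 0 itself; the wait on _completed
        // below is the join half of the fork/join, so sync_execute returns only after every
        // worker has finished its slice as well.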
qurt_barrier_wait(&_completed); @@ -166,7 +167,7 @@ template class thread_pool { auto task = pool._task; if (task) { - task(arg->pool, arg->thread_idx, kMaxThreadCount + 1, pool._arg); + task(arg->pool, arg->thread_idx, kMaxSubThreadCount + 1, pool._arg); } DEVICE_LOG_DEBUG("thread_func_impl.task_completed: %zu", arg->thread_idx); @@ -176,13 +177,13 @@ template class thread_pool { DEVICE_LOG_DEBUG("thread_func_impl.end: %zu", arg->thread_idx); } - std::atomic_bool _thread_exit = false; - std::array _threads; - thread_pool_arg _thread_args[kMaxThreadCount] = {}; - qurt_barrier_t _pending = {}; - qurt_barrier_t _completed = {}; - task_type _task = nullptr; - void * _arg = nullptr; + std::atomic_bool _thread_exit = false; + std::array _threads; + thread_pool_arg _thread_args[kMaxSubThreadCount] = {}; + qurt_barrier_t _pending = {}; + qurt_barrier_t _completed = {}; + task_type _task = nullptr; + void * _arg = nullptr; DISABLE_COPY_AND_MOVE(thread_pool); }; diff --git a/ggml/src/ggml-qnn/npu/device/util.hpp b/ggml/src/ggml-qnn/npu/device/util.hpp index a5e1ae5201c3b..3ae7f100de507 100644 --- a/ggml/src/ggml-qnn/npu/device/util.hpp +++ b/ggml/src/ggml-qnn/npu/device/util.hpp @@ -1,7 +1,9 @@ #pragma once +#include #include #include +#include #include #include @@ -48,11 +50,114 @@ inline constexpr const char * op_get_name(npu_device_tensor_op op) { return "SUB"; case NPU_OP_MUL: return "MUL"; + case NPU_OP_RMS_NORM: + return "RMS_NORM"; default: return "UNKNOWN"; } } +class power_utils { + public: + power_utils() { + _context_ptr = HAP_utils_create_context(); + if (_context_ptr == nullptr) { + DEVICE_LOG_ERROR("Failed to create power context\n"); + } + } + + ~power_utils() { + if (_context_ptr != nullptr) { + HAP_utils_destroy_context(_context_ptr); + } + } + + unsigned int get_clock_speed_hz() const { + if (!is_valid()) { + DEVICE_LOG_ERROR("Power context is not initialized\n"); + return 0; + } + + HAP_power_response_t response = {}; + response.type = HAP_power_get_clk_Freq; + auto ret = HAP_power_get(_context_ptr, &response); + if (ret != AEE_SUCCESS) { + DEVICE_LOG_ERROR("Failed to get clock speed: %d\n", ret); + return 0; + } + + return response.clkFreqHz; + } + + bool get_dvcs_enabled() const { + if (!is_valid()) { + DEVICE_LOG_ERROR("Power context is not initialized\n"); + return false; + } + + HAP_power_response_t response = {}; + response.type = HAP_power_get_dcvsEnabled; + auto ret = HAP_power_get(_context_ptr, &response); + if (ret != AEE_SUCCESS) { + DEVICE_LOG_ERROR("Failed to get DVCS enabled: %d\n", ret); + return false; + } + + return response.dcvsEnabled; + } + + void set_dvcs_performance_mode(bool enable) { + if (!is_valid()) { + DEVICE_LOG_ERROR("Power context is not initialized\n"); + return; + } + + HAP_power_request_t request = {}; + request.type = HAP_power_set_DCVS_v3; + request.dcvs_v3.dcvs_enable = enable ? TRUE : FALSE; + if (enable) { + request.dcvs_v3.dcvs_option = HAP_DCVS_V2_PERFORMANCE_MODE; + /* + * sleep_latency : To request for sleep latency in micro-seconds. 
+ * Sleep latency is the minimum time before which the DSP sleeps + * Set latency to 65535 to reset it to the default value + */ + request.dcvs_v3.set_latency = TRUE; + request.dcvs_v3.latency = 1000; + + request.dcvs_v3.set_bus_params = TRUE; + request.dcvs_v3.bus_params.min_corner = HAP_DCVS_VCORNER_SVS; + request.dcvs_v3.bus_params.max_corner = HAP_DCVS_VCORNER_TURBO; + request.dcvs_v3.bus_params.target_corner = HAP_DCVS_VCORNER_NOM; + } + + auto ret = HAP_power_set(_context_ptr, &request); + if (ret != AEE_SUCCESS) { + DEVICE_LOG_ERROR("Failed to set DVCS performance mode: %d\n", ret); + } + } + + void set_sleep_mode(bool enable) { + if (!is_valid()) { + DEVICE_LOG_ERROR("Power context is not initialized\n"); + return; + } + + boolean sleep_disable = enable ? FALSE : TRUE; + auto ret = HAP_power_set_sleep_mode(_context_ptr, sleep_disable); + if (ret != AEE_SUCCESS) { + DEVICE_LOG_ERROR("Failed to set sleep mode: %d\n", ret); + } + } + + bool is_valid() const { return _context_ptr != nullptr; } + + private: + void * _context_ptr = nullptr; + + DISABLE_COPY_AND_MOVE(power_utils); +}; + #ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING template class npu_scoped_timer { diff --git a/ggml/src/ggml-qnn/npu/device/vtcm_mem.hpp b/ggml/src/ggml-qnn/npu/device/vtcm_mem.hpp index 4c2922ca87f15..ab1041f626205 100644 --- a/ggml/src/ggml-qnn/npu/device/vtcm_mem.hpp +++ b/ggml/src/ggml-qnn/npu/device/vtcm_mem.hpp @@ -47,7 +47,7 @@ class vtcm_mem { DEVICE_LOG_DEBUG("VTCM released: %zu bytes at %p\n", _vtcm_size, _vtcm_mem); } - bool is_valid() const { return _vtcm_mem != nullptr; } + bool is_valid() const { return _vtcm_mem != nullptr && _vtcm_size != 0; } uint8_t * get_mem() const { return reinterpret_cast(_vtcm_mem); } diff --git a/ggml/src/ggml-qnn/npu/host/buffer.cpp b/ggml/src/ggml-qnn/npu/host/buffer.cpp index ace3dbee8eeec..7d3c1fbd9f7ac 100644 --- a/ggml/src/ggml-qnn/npu/host/buffer.cpp +++ b/ggml/src/ggml-qnn/npu/host/buffer.cpp @@ -177,7 +177,7 @@ std::shared_ptr host_buffer::init_tensor(ggml_tensor * tensor, remo auto ret = _allocator->fastrpc_mmap((int) _domain_id, _buffer_fd, _data, 0, _size, FASTRPC_MAP_FD); if (ret != AEE_SUCCESS) { - LOG_ERROR("failed to mmap rpc memory, fd: %d, ret: %d\n", _buffer_fd, ret); + LOG_ERROR("failed to mmap rpc memory, fd: %d, size: %zu, ret: %d\n", _buffer_fd, _size, ret); return std::shared_ptr(); } diff --git a/ggml/src/ggml-qnn/npu/host/graph.cpp b/ggml/src/ggml-qnn/npu/host/graph.cpp index 72ef5cc7868eb..d891280e5694c 100644 --- a/ggml/src/ggml-qnn/npu/host/graph.cpp +++ b/ggml/src/ggml-qnn/npu/host/graph.cpp @@ -1,5 +1,6 @@ #include "graph.hpp" +#include "profiler.hpp" #include "tensor.hpp" namespace hexagon { @@ -28,8 +29,12 @@ bool host_graph::update(ggml_cgraph * cgraph) { return false; } + SCOPED_PERFORMANCE_TRACKER("[hexagon-npu][%p]update, handle(%p)", (void *) this, (void *) _graph_handle); + _tensor_handles.clear(); + _tensor_update_configs.clear(); _tensor_handles.reserve(cgraph->n_nodes); + _tensor_update_configs.reserve(cgraph->n_nodes); for (int i = 0; i < cgraph->n_nodes; ++i) { auto * node = cgraph->nodes[i]; if (node->op == GGML_OP_NONE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || @@ -40,28 +45,38 @@ bool host_graph::update(ggml_cgraph * cgraph) { continue; } + // TODO: move to tensor? 
auto * tensor_obj = host_tensor::from_ggml_tensor(node); if (!tensor_obj) { LOG_DEBUG("Unable to get host tensor from ggml tensor: %p\n", (void *) node); continue; } - tensor_obj->set_op(node->op); _tensor_handles.push_back(tensor_obj->get_device_tensor_handle()); - LOG_DEBUG("node[%d]%s(%s), addr: %p, type: %s, tensor_handle: %p\n", i, ggml_get_name(node), ggml_op_desc(node), - (void *) node, ggml_type_name(node->type), (void *) tensor_obj->get_device_tensor_handle()); - for (size_t j = 0; j < GGML_MAX_SRC && node->src[j]; ++j) { - auto * src = host_tensor::from_ggml_tensor(node->src[j]); - tensor_obj->set_src(j, src); - } + _tensor_update_configs.push_back(tensor_obj->update_hosts_params_only(node)); + LOG_DEBUG("[%p]node[%d]%s(%s), addr: %p, type: %s, tensor_handle: %p\n", (void *) this, i, ggml_get_name(node), + ggml_op_desc(node), (void *) node, ggml_type_name(node->type), + (void *) tensor_obj->get_device_tensor_handle()); } - LOG_DEBUG("host_graph::update, host_graph(%p), handle(%p), ggml_cgraph(%p), tensor count(%zu)\n", (void *) this, - (void *) _graph_handle, (void *) cgraph, _tensor_handles.size()); - if (!_tensor_handles.empty()) { - npu_device_graph_set_tensor(_device_handle, _graph_handle, _tensor_handles.data(), - (int) _tensor_handles.size()); + GGML_ASSERT(_tensor_handles.size() == _tensor_update_configs.size()); + + constexpr const npu_device_tensor_handle_t kEmptyTensorHandle = 0; + constexpr const npu_device_tensor_update_config kEmptyUpdateConfig = {}; + + auto ret = npu_device_graph_set_tensor_with_param( + _device_handle, _graph_handle, _tensor_handles.size() ? _tensor_handles.data() : &kEmptyTensorHandle, + (int) _tensor_handles.size(), + _tensor_update_configs.size() ? _tensor_update_configs.data() : &kEmptyUpdateConfig, + (int) _tensor_update_configs.size()); + + if (ret != AEE_SUCCESS) { + LOG_ERROR("Failed to set tensors in host_graph: 0x%x\n", (int) ret); + return false; } + + LOG_DEBUG("[%p]host_graph::update, handle(%p), ggml_cgraph(%p), tensor count(%zu)\n", (void *) this, + (void *) _graph_handle, (void *) cgraph, _tensor_handles.size()); return true; } @@ -71,6 +86,7 @@ bool host_graph::compute() { return false; } + SCOPED_PERFORMANCE_TRACKER("[hexagon-npu][%p]compute, handle(%p)", (void *) this, (void *) _graph_handle); auto status = npu_device_graph_compute(_device_handle, _graph_handle); if (status != AEE_SUCCESS) { LOG_ERROR("Failed to compute host_graph: 0x%x\n", (int) status); diff --git a/ggml/src/ggml-qnn/npu/host/graph.hpp b/ggml/src/ggml-qnn/npu/host/graph.hpp index 20c917e1203ca..b871c125563f2 100644 --- a/ggml/src/ggml-qnn/npu/host/graph.hpp +++ b/ggml/src/ggml-qnn/npu/host/graph.hpp @@ -21,9 +21,10 @@ class host_graph { bool compute(); private: - remote_handle64 _device_handle = 0; - npu_device_graph_handle_t _graph_handle = 0; - std::vector _tensor_handles; + remote_handle64 _device_handle = 0; + npu_device_graph_handle_t _graph_handle = 0; + std::vector _tensor_handles; + std::vector _tensor_update_configs; DISABLE_COPY(host_graph); DISABLE_MOVE(host_graph); diff --git a/ggml/src/ggml-qnn/npu/host/host_device.cpp b/ggml/src/ggml-qnn/npu/host/host_device.cpp index fb1ad4dfd677b..443abe5c9e6fe 100644 --- a/ggml/src/ggml-qnn/npu/host/host_device.cpp +++ b/ggml/src/ggml-qnn/npu/host/host_device.cpp @@ -151,7 +151,7 @@ bool npu_device::supports_op_impl(const ggml_tensor * op) { auto * src0 = op->src[0]; if (!src0) { - LOG_DEBUG("[%s]Unsupported inplace op: %s\n", get_name(), ggml_op_name(op->op)); + LOG_DEBUG("[%s]Unsupported inplace op: %s\n", 
get_name(), ggml_op_desc(op)); return false; } @@ -168,7 +168,7 @@ bool npu_device::supports_op_impl(const ggml_tensor * op) { auto npu_op = op_to_npu_op(op->op); if (npu_op == NPU_OP_COUNT) { - LOG_DEBUG("[%s]Unsupported op: %s\n", get_name(), ggml_op_name(op->op)); + LOG_DEBUG("[%s]Unsupported op: %s\n", get_name(), ggml_op_desc(op)); return false; } @@ -179,7 +179,7 @@ bool npu_device::supports_op_impl(const ggml_tensor * op) { constexpr const auto get_spec = [](const ggml_tensor * tensor) -> npu_device_tensor_spec { if (!tensor) { - return npu_device_tensor_spec{}; + return npu_device_tensor_spec{ {}, NPU_DATA_TYPE_COUNT }; } static_assert(DEVICE_TENSOR_MAX_DIMS == GGML_MAX_DIMS, "tensor dimensions mismatch"); diff --git a/ggml/src/ggml-qnn/npu/host/tensor.hpp b/ggml/src/ggml-qnn/npu/host/tensor.hpp index c5d2decbc5682..71205b39fb7a8 100644 --- a/ggml/src/ggml-qnn/npu/host/tensor.hpp +++ b/ggml/src/ggml-qnn/npu/host/tensor.hpp @@ -1,5 +1,7 @@ #pragma once +#include + #include "common.hpp" #include "ggml-impl.h" #include "hexagon_npu.h" @@ -19,11 +21,15 @@ class host_tensor { explicit host_tensor(ggml_tensor * tensor, int buffer_fd, uint64_t offset, remote_handle64 device_handle) : _device_handle(device_handle) { + + // TODO: figure out why the npu_device_tensor_config can't be larger than 100 bytes + static_assert(sizeof(npu_device_tensor_config) < 100, "npu_device_tensor_config size too large"); + _info.buffer_fd = buffer_fd; _info.offset = offset; _info.type = type_to_npu_type(tensor->type); - _info.op = op_to_npu_op(tensor->op); _info.size = ggml_nbytes(tensor); + // _info.op will be updated in update_params() static_assert(DEVICE_TENSOR_MAX_DIMS == GGML_MAX_DIMS, "tensor dimensions mismatch"); static_assert(sizeof(_info.ne) == sizeof(tensor->ne), "tensor ne size mismatch"); @@ -56,28 +62,96 @@ class host_tensor { npu_device_tensor_handle_t get_device_tensor_handle() const { return _device_tensor_handle; } - void set_src(size_t index, host_tensor * src) { - if (index >= DEVICE_TENSOR_MAX_SRC) { - LOG_ERROR("host_tensor(%p) set_src[%zu] out of range\n", (void *) this, index); + void update_params(ggml_tensor * ggml_tensor) { + static_assert(sizeof(_info_update.params) <= sizeof(_ggml_tensor->op_params), + "device tensor params size mismatch"); + static_assert(DEVICE_TENSOR_MAX_SRC <= GGML_MAX_SRC, "device tensor src size mismatch"); + + GGML_ASSERT(ggml_tensor == _ggml_tensor); + if (!_ggml_tensor) { + LOG_DEBUG("host_tensor(%p) _ggml_tensor is null\n", (void *) this); return; } - LOG_DEBUG("host_tensor(%p) set_src[%zu]: %p\n", (void *) this, index, (void *) src); - npu_device_tensor_set_src(_device_handle, _device_tensor_handle, index, src->get_device_tensor_handle()); + auto new_op = op_to_npu_op(_ggml_tensor->op); + bool params_changed = new_op != _info_update.op; + if (params_changed) { + LOG_DEBUG("host_tensor(%p) op changed: %s -> %s\n", (void *) this, get_npu_op_desc(_info.op), + get_npu_op_desc(new_op)); + } + + _info.op = new_op; + _info_update.op = new_op; + + if (memcmp(_info_update.params, _ggml_tensor->op_params, sizeof(_info_update.params)) != 0) { + params_changed = true; + memcpy(_info_update.params, _ggml_tensor->op_params, sizeof(_info_update.params)); + LOG_DEBUG("host_tensor(%p) op_params changed: [%x, %x, %x, %x]\n", (void *) this, + (int) _info_update.params[0], (int) _info_update.params[1], (int) _info_update.params[2], + (int) _info_update.params[3]); + } + + npu_device_tensor_handle_t src_tensor_handles[DEVICE_TENSOR_MAX_SRC] = {}; + for (size_t j = 0; j < 
DEVICE_TENSOR_MAX_SRC && _ggml_tensor->src[j]; ++j) { + auto * src = host_tensor::from_ggml_tensor(_ggml_tensor->src[j]); + src_tensor_handles[j] = src->get_device_tensor_handle(); + LOG_DEBUG("host_tensor(%p) set_src[%zu]: %p\n", (void *) this, j, (void *) src); + } + + static_assert(std::is_same::value, + "src tensor handles type mismatch"); + + if (memcmp(_info_update.src_handles, src_tensor_handles, sizeof(_info_update.src_handles)) != 0) { + params_changed = true; + memcpy(_info_update.src_handles, src_tensor_handles, sizeof(_info_update.src_handles)); + LOG_DEBUG("host_tensor(%p) src changed, handles: [%p, %p]\n", (void *) this, + (void *) _info_update.src_handles[0], (void *) _info_update.src_handles[1]); + } + + if (params_changed) { + npu_device_tensor_update_params(_device_handle, _device_tensor_handle, &_info_update); + LOG_DEBUG("host_tensor(%p) update_params, op: %s, params: [%x, %x, %x, %x]\n", (void *) this, + ggml_op_desc(_ggml_tensor), (int) _info_update.params[0], (int) _info_update.params[1], + (int) _info_update.params[2], (int) _info_update.params[3]); + } else { + LOG_DEBUG("host_tensor(%p) update_params, no changes, op: %s, params: [%x, %x, %x, %x]\n", (void *) this, + ggml_op_desc(_ggml_tensor), (int) _info_update.params[0], (int) _info_update.params[1], + (int) _info_update.params[2], (int) _info_update.params[3]); + } } - void set_op(ggml_op op) { - _info.op = op_to_npu_op(op); - npu_device_tensor_set_op(_device_handle, _device_tensor_handle, _info.op); + const npu_device_tensor_update_config & update_hosts_params_only(ggml_tensor * ggml_tensor) { + static_assert(sizeof(_info_update.params) <= sizeof(ggml_tensor->op_params), + "device tensor params size mismatch"); + static_assert(DEVICE_TENSOR_MAX_SRC <= GGML_MAX_SRC, "device tensor src size mismatch"); + + GGML_ASSERT(ggml_tensor == _ggml_tensor); + + auto new_op = op_to_npu_op(_ggml_tensor->op); + _info.op = new_op; + _info_update.op = new_op; + memcpy(_info_update.params, _ggml_tensor->op_params, sizeof(_info_update.params)); + + for (size_t j = 0; j < DEVICE_TENSOR_MAX_SRC && _ggml_tensor->src[j]; ++j) { + auto * src = host_tensor::from_ggml_tensor(_ggml_tensor->src[j]); + _info_update.src_handles[j] = src->get_device_tensor_handle(); + LOG_DEBUG("host_tensor(%p) set_src[%zu]: %p\n", (void *) this, j, (void *) src); + } + + LOG_DEBUG("host_tensor(%p) update_params, op: %s, params: [%x, %x, %x, %x]\n", (void *) this, + ggml_op_desc(_ggml_tensor), (int) _info_update.params[0], (int) _info_update.params[1], + (int) _info_update.params[2], (int) _info_update.params[3]); + return _info_update; } bool is_valid() const { return _device_tensor_handle != 0; } private: - remote_handle64 _device_handle = 0; - npu_device_tensor_handle_t _device_tensor_handle = 0; - npu_device_tensor_config _info = {}; - ggml_tensor * _ggml_tensor = nullptr; + remote_handle64 _device_handle = 0; + npu_device_tensor_handle_t _device_tensor_handle = 0; + npu_device_tensor_config _info = {}; + npu_device_tensor_update_config _info_update = {}; + ggml_tensor * _ggml_tensor = nullptr; DISABLE_COPY(host_tensor); DISABLE_MOVE(host_tensor); diff --git a/ggml/src/ggml-qnn/npu/host/util.cpp b/ggml/src/ggml-qnn/npu/host/util.cpp index 9ce9841004235..b62370d1ad845 100644 --- a/ggml/src/ggml-qnn/npu/host/util.cpp +++ b/ggml/src/ggml-qnn/npu/host/util.cpp @@ -25,11 +25,30 @@ enum npu_device_tensor_op op_to_npu_op(ggml_op op) { return NPU_OP_SUB; case GGML_OP_MUL: return NPU_OP_MUL; + case GGML_OP_RMS_NORM: + return NPU_OP_RMS_NORM; default: return 
NPU_OP_COUNT; } } +const char * get_npu_op_desc(enum npu_device_tensor_op op) { + switch (op) { + case NPU_OP_MUL_MAT: + return ggml_op_name(GGML_OP_MUL_MAT); + case NPU_OP_ADD: + return ggml_op_name(GGML_OP_ADD); + case NPU_OP_SUB: + return ggml_op_name(GGML_OP_SUB); + case NPU_OP_MUL: + return ggml_op_name(GGML_OP_MUL); + case NPU_OP_RMS_NORM: + return ggml_op_name(GGML_OP_RMS_NORM); + default: + return "UNKNOWN"; + } +} + enum npu_device_tensor_data_type type_to_npu_type(ggml_type type) { switch (type) { case GGML_TYPE_F32: diff --git a/ggml/src/ggml-qnn/npu/host/util.hpp b/ggml/src/ggml-qnn/npu/host/util.hpp index 469e5066602ed..f8ec5c3b9f537 100644 --- a/ggml/src/ggml-qnn/npu/host/util.hpp +++ b/ggml/src/ggml-qnn/npu/host/util.hpp @@ -5,6 +5,7 @@ namespace hexagon { enum npu_device_tensor_op op_to_npu_op(ggml_op op); +const char * get_npu_op_desc(enum npu_device_tensor_op op); enum npu_device_tensor_data_type type_to_npu_type(ggml_type type); // TODO: merge with qcom_htp_arch diff --git a/ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl b/ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl index df3cdf4957295..ed20c125b379c 100644 --- a/ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl +++ b/ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl @@ -4,6 +4,7 @@ const uint32_t DEVICE_TENSOR_MAX_DIMS = 4; const uint32_t DEVICE_TENSOR_MAX_SRC = 2; +const uint32_t DEVICE_TENSOR_MAX_OP_PARAMS = 4; const uint32_t QUANT_BLOCK_SIZE = 32; const uint32_t QUANT_K_BLOCK_SIZE = 256; const uint32_t QUANT_K_SCALE_SIZE = 12; @@ -38,6 +39,7 @@ interface npu_device : remote_handle64{ NPU_OP_ADD, NPU_OP_SUB, NPU_OP_MUL, + NPU_OP_RMS_NORM, NPU_OP_COUNT }; @@ -55,6 +57,12 @@ interface npu_device : remote_handle64{ tensor_data_type type; }; + struct tensor_update_config { + tensor_op op; + int32_t params[DEVICE_TENSOR_MAX_OP_PARAMS]; + tensor_handle_t src_handles[DEVICE_TENSOR_MAX_SRC]; + }; + struct tensor_config { ne_type ne; uint64_t nb[DEVICE_TENSOR_MAX_DIMS]; @@ -82,15 +90,9 @@ interface npu_device : remote_handle64{ rout tensor_handle_t tensor_handle ); - AEEResult tensor_set_src( + AEEResult tensor_update_params( in tensor_handle_t tensor_handle, - in uint64_t index, - in tensor_handle_t src - ); - - AEEResult tensor_set_op( - in tensor_handle_t tensor_handle, - in tensor_op op + in tensor_update_config config ); AEEResult tensor_free( @@ -106,6 +108,12 @@ interface npu_device : remote_handle64{ in sequence tensor_handles ); + AEEResult graph_set_tensor_with_param( + in graph_handle_t graph_handle, + in sequence tensor_handles, + in sequence tensor_params + ); + AEEResult graph_compute( in graph_handle_t graph_handle ); diff --git a/ggml/src/ggml-qnn/qnn/profiler.cpp b/ggml/src/ggml-qnn/qnn/event_tracer.cpp similarity index 99% rename from ggml/src/ggml-qnn/qnn/profiler.cpp rename to ggml/src/ggml-qnn/qnn/event_tracer.cpp index 5625c3acf7ebb..41bf0ab88eb30 100644 --- a/ggml/src/ggml-qnn/qnn/profiler.cpp +++ b/ggml/src/ggml-qnn/qnn/event_tracer.cpp @@ -1,5 +1,5 @@ -#include "profiler.hpp" +#include "event_tracer.hpp" #include #include diff --git a/ggml/src/ggml-qnn/qnn/event_tracer.hpp b/ggml/src/ggml-qnn/qnn/event_tracer.hpp new file mode 100644 index 0000000000000..3db137ebe167c --- /dev/null +++ b/ggml/src/ggml-qnn/qnn/event_tracer.hpp @@ -0,0 +1,45 @@ +#pragma once + +#include + +#include +#include +#include +#include + +#include "logger.hpp" +#include "profiler.hpp" +#include "qnn-types.hpp" + +namespace qnn { + +// forward declaration of qnn_interface +class qnn_interface; + +class qnn_event_tracer { + public: + // ref: + // 
https://github.com/pytorch/executorch/blob/ae3d558d5e6aa04fc52a3065399fe6a773702f52/backends/qualcomm/serialization/qc_schema.py#L53 + // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/overview.html#supported-snapdragon-devices + enum sdk_profile_level { PROFILE_OFF = 0, PROFILE_BASIC, PROFILE_DETAIL, PROFILE_OP_TRACE }; + + explicit qnn_event_tracer(const std::string & prefix, std::shared_ptr interface, + Qnn_BackendHandle_t backend_handle, sdk_profile_level level); + ~qnn_event_tracer(); + + Qnn_ProfileHandle_t get_handle() const { return _handle; } + + void print_profile_events(); + + private: + std::shared_ptr _interface; + Qnn_ProfileHandle_t _handle = nullptr; + std::string _prefix; + + DISABLE_COPY(qnn_event_tracer); + DISABLE_MOVE(qnn_event_tracer); +}; + +using qnn_event_tracer_ptr = std::shared_ptr; + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/qnn/graph.cpp b/ggml/src/ggml-qnn/qnn/graph.cpp index 3094b5c3bee67..b4dcc7797dfd9 100644 --- a/ggml/src/ggml-qnn/qnn/graph.cpp +++ b/ggml/src/ggml-qnn/qnn/graph.cpp @@ -4,10 +4,10 @@ #include #include +#include "event_tracer.hpp" #include "ggml-impl.h" #include "logger.hpp" #include "op-config.hpp" -#include "profiler.hpp" #include "tensor.hpp" #ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING @@ -411,8 +411,8 @@ bool qnn_graph::build_graph_from_ggml_graph(const ggml_cgraph * cgraph) { GGML_TYPE_COUNT > GGML_TYPE_Q8_0 && GGML_TYPE_Q8_0 > GGML_TYPE_F16 && GGML_TYPE_F16 > GGML_TYPE_F32, "GGML_TYPE enum order is not correct"); - QNN_SCOPED_PERFORMANCE_TRACKER("[%s][%s]build_graph_from_ggml_graph", get_backend_name(_device), - _graph_name.c_str()); + SCOPED_PERFORMANCE_TRACKER("[%s][%s]build_graph_from_ggml_graph", get_backend_name(_device), + _graph_name.c_str()); auto override_data_type = get_override_data_type(inputs, outputs); if (override_data_type != GGML_TYPE_COUNT) { @@ -466,8 +466,7 @@ bool qnn_graph::execute(const ggml_cgraph * cgraph, std::shared_ptrqnn_graph_execute(_graph_handle, qnn_tensor_inputs.data(), @@ -529,7 +528,7 @@ bool qnn_graph::execute(const ggml_cgraph * cgraph, std::shared_ptr #include "convert.hpp" +#include "event_tracer.hpp" #include "ggml-qnn.h" #include "op-config.hpp" -#include "profiler.hpp" #include "qnn-lib.hpp" namespace qnn { diff --git a/ggml/src/ggml-qnn/qnn/profiler.hpp b/ggml/src/ggml-qnn/qnn/profiler.hpp deleted file mode 100644 index 0d4f839fda270..0000000000000 --- a/ggml/src/ggml-qnn/qnn/profiler.hpp +++ /dev/null @@ -1,100 +0,0 @@ -#pragma once - -#include - -#include -#include -#include -#include - -#include "logger.hpp" -#include "qnn-types.hpp" - -namespace qnn { - -#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING - -class qnn_scoped_timer { - public: - qnn_scoped_timer(const std::string & log_prefix) : _log_prefix(std::move(log_prefix)) { - _begin_us = ggml_time_us(); - } - - qnn_scoped_timer(qnn_scoped_timer && other) { - _begin_us = other._begin_us; - _log_prefix = std::move(other._log_prefix); - } - - ~qnn_scoped_timer() { print(); } - - void operator=(qnn_scoped_timer && other) { - _begin_us = other._begin_us; - _log_prefix = std::move(other._log_prefix); - } - - void print() const { - auto duration = (ggml_time_us() - _begin_us) / 1000.0; - QNN_LOG_INFO("[profiler]%s, duration: %.4f ms\n", _log_prefix.c_str(), duration); - } - - - private: - int64_t _begin_us = 0LL; - std::string _log_prefix; - - qnn_scoped_timer(const qnn_scoped_timer &) = delete; - void operator=(const qnn_scoped_timer &) = delete; -}; - -inline qnn_scoped_timer make_scope_perf_timer(const char 
* format, ...) { - va_list args; - va_start(args, format); - char buffer[4096]; - vsnprintf(buffer, sizeof(buffer), format, args); - va_end(args); - return qnn_scoped_timer(buffer); -} - -#else - -inline void make_scope_perf_timer(const char *, ...) {} - -#endif - -// forward declaration of qnn_interface -class qnn_interface; - -class qnn_event_tracer { - public: - // ref: - // https://github.com/pytorch/executorch/blob/ae3d558d5e6aa04fc52a3065399fe6a773702f52/backends/qualcomm/serialization/qc_schema.py#L53 - // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/overview.html#supported-snapdragon-devices - enum sdk_profile_level { PROFILE_OFF = 0, PROFILE_BASIC, PROFILE_DETAIL, PROFILE_OP_TRACE }; - - explicit qnn_event_tracer(const std::string & prefix, std::shared_ptr interface, - Qnn_BackendHandle_t backend_handle, sdk_profile_level level); - ~qnn_event_tracer(); - - Qnn_ProfileHandle_t get_handle() const { return _handle; } - - void print_profile_events(); - - private: - std::shared_ptr _interface; - Qnn_ProfileHandle_t _handle = nullptr; - std::string _prefix; - - DISABLE_COPY(qnn_event_tracer); - DISABLE_MOVE(qnn_event_tracer); -}; - -using qnn_event_tracer_ptr = std::shared_ptr; - -} // namespace qnn - -#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING -# define QNN_SCOPED_PERFORMANCE_TRACKER(fmt, ...) \ - auto __qnn_timer_##__LINE__ = qnn::make_scope_perf_timer(fmt, __VA_ARGS__) -#else -# define QNN_SCOPED_PERFORMANCE_TRACKER(fmt, ...) ((void) 0) -#endif diff --git a/ggml/src/ggml-qnn/qnn/qnn-lib.cpp b/ggml/src/ggml-qnn/qnn/qnn-lib.cpp index 474bf53434628..e32bab5f9247d 100644 --- a/ggml/src/ggml-qnn/qnn/qnn-lib.cpp +++ b/ggml/src/ggml-qnn/qnn/qnn-lib.cpp @@ -34,21 +34,36 @@ constexpr const qnn::device_caps kDeviceCaps[] = { { // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/CpuOpDefSupplement.html#matmul kQnnCpuLibName, GGML_BACKEND_DEVICE_TYPE_ACCEL, (1L << GGML_TYPE_I8) | (1L << GGML_TYPE_F32), - 0xFFFFFE, // all quantized types can be offload to CPU, at current implementation, those types will be dequantized into float32 on cpu - 0, // 0 for no limitation +#ifdef GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS + // all quantized types can be offload to CPU, at current implementation, those types will be dequantized into float32 on cpu + 0xFFFFFE, +#else + 0, +#endif + + 0, // 0 for no limitation }, { // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/GpuOpDefSupplement.html#matmul kQnnGpuLibName, GGML_BACKEND_DEVICE_TYPE_GPU, (1L << GGML_TYPE_F32) | (1L << GGML_TYPE_F16), +#ifdef GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS // all quantized types can be offload to GPU, at current implementation, those types will be dequantized into float32 on cpu - 0xFFFFFE, (128256L * 4096 * + 0xFFFFFE, +#else + 0, +#endif + (128256L * 4096 * sizeof(float)), // tested on 8 gen 2, failed to allocate tensor with size 128256x4096 and float32 }, { // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/HtpOpDefSupplement.html#matmul kQnnNpuLibName, GGML_BACKEND_DEVICE_TYPE_ACCEL, +#ifdef GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS (1L << GGML_TYPE_F32) | (1L << GGML_TYPE_F16) | (1L << GGML_TYPE_I16), (1L << GGML_TYPE_Q2_K) | (1L << GGML_TYPE_Q3_K) | (1L << GGML_TYPE_Q4_K) | (1L << GGML_TYPE_Q8_K), +#else + 0, +#endif (8192L * 2048 + 8192 * 512 + 2048 * 512) * sizeof(float), // TODO: should have a better way to get this value }, }; diff --git a/ggml/src/ggml-qnn/shared/common.hpp b/ggml/src/ggml-qnn/shared/common.hpp index 4feb3365ce102..b5e1e5213e8c3 
100644 --- a/ggml/src/ggml-qnn/shared/common.hpp +++ b/ggml/src/ggml-qnn/shared/common.hpp @@ -45,6 +45,10 @@ size_t get_system_free_memory_in_bytes(); class_name(class_name &&) = delete; \ void operator=(class_name &&) = delete +#define DISABLE_COPY_AND_MOVE(class_name) \ + DISABLE_COPY(class_name); \ + DISABLE_MOVE(class_name) + #define LOG_ERROR(...) (GGML_LOG_ERROR(__VA_ARGS__)) #define LOG_WARN(...) (GGML_LOG_WARN(__VA_ARGS__)) #define LOG_INFO(...) (GGML_LOG_INFO(__VA_ARGS__)) diff --git a/ggml/src/ggml-qnn/shared/profiler.hpp b/ggml/src/ggml-qnn/shared/profiler.hpp new file mode 100644 index 0000000000000..7180dc02957bc --- /dev/null +++ b/ggml/src/ggml-qnn/shared/profiler.hpp @@ -0,0 +1,61 @@ +#pragma once + +#include +#include +#include + +#include "common.hpp" +#include "ggml-impl.h" + +namespace profiler { + +#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING + +class scoped_timer { + public: + scoped_timer(const std::string & log_prefix) : _log_prefix(std::move(log_prefix)) { _begin_us = ggml_time_us(); } + + scoped_timer(scoped_timer && other) { + _begin_us = other._begin_us; + _log_prefix = std::move(other._log_prefix); + } + + ~scoped_timer() { print(); } + + void operator=(scoped_timer && other) { + _begin_us = other._begin_us; + _log_prefix = std::move(other._log_prefix); + } + + void print() const { + auto duration = ggml_time_us() - _begin_us; + GGML_LOG_INFO("[profiler]%s, dur: %lld us\n", _log_prefix.c_str(), (long long) duration); + } + + + private: + int64_t _begin_us = 0LL; + std::string _log_prefix; + + DISABLE_COPY(scoped_timer); +}; + +inline scoped_timer make_scope_perf_timer(const char * format, ...) { + va_list args; + va_start(args, format); + char buffer[4096]; + vsnprintf(buffer, sizeof(buffer), format, args); + va_end(args); + return scoped_timer(buffer); +} + +#endif + +} // namespace profiler + +#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING +# define SCOPED_PERFORMANCE_TRACKER(fmt, ...) \ + auto __scoped_timer_##__LINE__ = profiler::make_scope_perf_timer(fmt, __VA_ARGS__) +#else +# define SCOPED_PERFORMANCE_TRACKER(fmt, ...) 
((void) 0) +#endif diff --git a/ggml/src/ggml-qnn/shared/rpc-mem.hpp b/ggml/src/ggml-qnn/shared/rpc-mem.hpp index ba8449192b5dd..9552ca9555380 100644 --- a/ggml/src/ggml-qnn/shared/rpc-mem.hpp +++ b/ggml/src/ggml-qnn/shared/rpc-mem.hpp @@ -64,8 +64,10 @@ class rpc_mem { void * buf = nullptr; if (_rpc_interface->is_alloc2_available()) { + LOG_DEBUG("rpcmem_alloc2 available, using it\n"); buf = _rpc_interface->rpcmem_alloc2(heapid, flags, size); } else { + LOG_DEBUG("rpcmem_alloc2 not available, using rpcmem_alloc\n"); buf = _rpc_interface->rpcmem_alloc(heapid, flags, size); } From af620a12f7a83ce74e1a94e34eaa88b99dc1127b Mon Sep 17 00:00:00 2001 From: nullname Date: Wed, 18 Jun 2025 10:32:08 +0800 Subject: [PATCH 159/166] feat: flash attention support for hexagon-npu (#45) * add flash attn op * expend src tensor size * add flash attn sources * add quantize row functions * make a separated file for vec_dot * wip * wip * refactor: rename quants.hpp includes and add vec_dot to type traits * add flash_attn impl * split vec_scale_f32 * move vec_reduction_qf32 to vec_ops * add vec_scale_f16 * opt * add vec_mad * implement vec_mad_f16 * opt * add op template * opt * add align version * enable flash attn * wip * log print improve * add profiler log * wip * wip * add multi sub proc perf tracker * increase log buffer * remove sub prov pcycle * wip * wip * add prefetch for vec_dot * wip * wip * opt f16 vec dot * opt f16 vecdot * reuse vec_dot_product_impl in vec dot f32 * small opt to unblock pipeline * opt on aligned address wip * Revert "opt on aligned address" This reverts commit 27be1eb61a7d29d2f5fa6f90383e1b5d7fdf9b6a. * add profiler log at thread_pool * wip * invalidate all... * Reapply "opt on aligned address" This reverts commit f075a4c4586e32b7e5819c1fe7f9b6ed218b1767. * add is_constant for tensor config * disable align tensor opt in mul_mat * wip * wip * vec_scale_impl: unrolling the loop * wip * wip * replace reinterpret_cast with direct pointer access for write/read buffers * add fetch * wip * wip * wip * add log * check tensor shape at flash_attn * wip * wip * fix: update tensor type handling in flash_attn_impl * wip * fix: align cache size * fix: qf16->hf * fix: swap order of elements in vector combine for correct scaling * fix: opt f16 scale and mad * fix leftover fetch * wip * load into vector pair * opt cache size calculation in flash_attn_impl * refactoring: hold vtcm at thread local object * wip * add profiler log * mark tensors as modified * restrict tensor invalidation to the first thread in compute_impl * Revert "restrict tensor invalidation to the first thread in compute_impl" This reverts commit 0a8ff2b1bcf366097c16d7437c091382eacbef8b. 
* invalidate last tensor in compute_impl * invalidate last tensor in compute function * wip * refactor dequantize_row_q4_0 to simplify vector alignment * wip * refactoring: move VTCM quota calculation to thread pool * wip * fix: correct condition check for HEXAGON_SDK_ROOT existence * wip * wip * wip * wip * fix: update condition checks match the naming * fix: improve tensor handling checks and logging in graph and operation implementations * wip --- ggml/src/ggml-qnn/npu/CMakeLists.txt | 2 + ggml/src/ggml-qnn/npu/device/device.cpp | 28 +- ggml/src/ggml-qnn/npu/device/graph.cpp | 29 +- ggml/src/ggml-qnn/npu/device/graph.hpp | 6 +- .../src/ggml-qnn/npu/device/op_flash_attn.cpp | 321 ++++++++++++ .../src/ggml-qnn/npu/device/op_flash_attn.hpp | 11 + ggml/src/ggml-qnn/npu/device/op_impl.cpp | 235 ++++----- ggml/src/ggml-qnn/npu/device/op_impl.hpp | 4 +- ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp | 226 ++------- ggml/src/ggml-qnn/npu/device/op_mul_mat.hpp | 60 +-- ggml/src/ggml-qnn/npu/device/op_types.hpp | 64 +-- ggml/src/ggml-qnn/npu/device/quants.cpp | 213 -------- ggml/src/ggml-qnn/npu/device/tensor.hpp | 34 +- ggml/src/ggml-qnn/npu/device/thread_pool.hpp | 126 +++-- ggml/src/ggml-qnn/npu/device/type_traits.cpp | 467 ++++++++++++++++++ .../device/{quants.hpp => type_traits.hpp} | 37 +- ggml/src/ggml-qnn/npu/device/util.hpp | 139 ++++-- ggml/src/ggml-qnn/npu/device/vec_ops.cpp | 156 ++++++ ggml/src/ggml-qnn/npu/device/vec_ops.hpp | 274 ++++++++++ ggml/src/ggml-qnn/npu/host/buffer.cpp | 4 +- ggml/src/ggml-qnn/npu/host/graph.cpp | 16 +- ggml/src/ggml-qnn/npu/host/host_device.cpp | 59 +-- ggml/src/ggml-qnn/npu/host/tensor.hpp | 52 +- ggml/src/ggml-qnn/npu/host/util.cpp | 78 ++- ggml/src/ggml-qnn/npu/host/util.hpp | 2 + ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl | 16 +- ggml/src/ggml-qnn/shared/CMakeLists.txt | 2 + 27 files changed, 1860 insertions(+), 801 deletions(-) create mode 100644 ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp create mode 100644 ggml/src/ggml-qnn/npu/device/op_flash_attn.hpp delete mode 100644 ggml/src/ggml-qnn/npu/device/quants.cpp create mode 100644 ggml/src/ggml-qnn/npu/device/type_traits.cpp rename ggml/src/ggml-qnn/npu/device/{quants.hpp => type_traits.hpp} (65%) create mode 100644 ggml/src/ggml-qnn/npu/device/vec_ops.cpp create mode 100644 ggml/src/ggml-qnn/npu/device/vec_ops.hpp diff --git a/ggml/src/ggml-qnn/npu/CMakeLists.txt b/ggml/src/ggml-qnn/npu/CMakeLists.txt index 5e1281c3d5cf4..e8ce255fec6a0 100644 --- a/ggml/src/ggml-qnn/npu/CMakeLists.txt +++ b/ggml/src/ggml-qnn/npu/CMakeLists.txt @@ -3,6 +3,8 @@ cmake_policy(SET CMP0115 OLD) if(DEFINED ENV{HEXAGON_SDK_ROOT}) set(HEXAGON_SDK_ROOT $ENV{HEXAGON_SDK_ROOT}) + message("HEXAGON_SDK_ROOT (from environment): ${HEXAGON_SDK_ROOT}") +elseif(DEFINED HEXAGON_SDK_ROOT) message("HEXAGON_SDK_ROOT: ${HEXAGON_SDK_ROOT}") else() message(FATAL_ERROR "HEXAGON_SDK_ROOT not defined") diff --git a/ggml/src/ggml-qnn/npu/device/device.cpp b/ggml/src/ggml-qnn/npu/device/device.cpp index 8a10e9e7525b1..ff2819bae65e5 100644 --- a/ggml/src/ggml-qnn/npu/device/device.cpp +++ b/ggml/src/ggml-qnn/npu/device/device.cpp @@ -9,10 +9,10 @@ #include "graph.hpp" #include "hexagon_npu.h" #include "op_impl.hpp" -#include "quants.hpp" #include "remote.h" #include "tensor.hpp" #include "thread_pool.hpp" +#include "type_traits.hpp" #include "util.hpp" namespace { @@ -124,21 +124,20 @@ int npu_device_close(remote_handle64 h) { AEEResult npu_device_device_get_alignment(remote_handle64 _h, uint32_t * alignment) { NPU_UNUSED(_h); - *alignment = 
sizeof(HVX_Vector); + *alignment = sizeof(HVX_VectorPair); return AEE_SUCCESS; } -AEEResult npu_device_device_support_op(remote_handle64 _h, const npu_device_tensor_spec * src0, - const npu_device_tensor_spec * src1, const npu_device_tensor_spec * dst, - npu_device_tensor_op op, boolean * is_supported) { +AEEResult npu_device_device_support_op(remote_handle64 _h, npu_device_tensor_op op, const npu_device_tensor_spec * dst, + const npu_device_tensor_spec * srcs, int srcsLen, boolean * is_supported) { NPU_UNUSED(_h); - if (!src0 || !src1 || !dst || !is_supported) { + if (!srcs || srcsLen <= 0 || !dst || !is_supported) { DEVICE_LOG_ERROR("npu_device_device_support_op: Invalid arguments"); return AEE_EINVARGS; } - *is_supported = hexagon::support_op(*src0, *src1, *dst, op); + *is_supported = hexagon::support_op(op, dst, srcs, srcsLen); return AEE_SUCCESS; } @@ -208,19 +207,20 @@ AEEResult npu_device_graph_set_tensor_with_param(remote_handle64 _h, npu_device_ int tensor_paramsLen) { NPU_UNUSED(_h); auto * graph = graph_from_handle(graph_handle); - if (!graph || !tensor_handles || tensor_handlesLen <= 0 || !tensor_params || - tensor_handlesLen != tensor_paramsLen) { + if (!graph || tensor_handlesLen != tensor_paramsLen || tensor_handlesLen < 0) { return AEE_EINVHANDLE; } - graph->set_tensor(tensor_handles, tensor_handlesLen); - for (int i = 0; i < tensor_handlesLen; ++i) { - auto * tensor = tensor_from_handle(tensor_handles[i]); - if (tensor) { - tensor->update_config(tensor_params[i]); + if (tensor_params && tensor_handles) { + for (int i = 0; i < tensor_handlesLen; ++i) { + auto * tensor = tensor_from_handle(tensor_handles[i]); + if (tensor) { + tensor->update_config(tensor_params[i]); + } } } + graph->set_tensor(tensor_handles, tensor_handlesLen); return AEE_SUCCESS; } diff --git a/ggml/src/ggml-qnn/npu/device/graph.cpp b/ggml/src/ggml-qnn/npu/device/graph.cpp index c9cad772320f1..5bc14a0aca7af 100644 --- a/ggml/src/ggml-qnn/npu/device/graph.cpp +++ b/ggml/src/ggml-qnn/npu/device/graph.cpp @@ -10,8 +10,7 @@ namespace hexagon { graph::graph() noexcept { - _vtcm_quota_size = hexagon::vtcm_mem::get_avail_block_size(); // TODO: move to device init? 
- DEVICE_LOG_DEBUG("graph(%p) created: vtcm quota size: %zu\n", (void *) this, _vtcm_quota_size); + DEVICE_LOG_DEBUG("graph(%p) created\n", (void *) this); } graph::~graph() noexcept { @@ -20,9 +19,10 @@ graph::~graph() noexcept { } void graph::set_tensor(const npu_device_tensor_handle_t * tensors, int tensor_count) { - if (tensor_count <= 0) { + if (tensor_count <= 0 || !tensors) { _tensors.reset(); _tensor_count = 0; + DEVICE_LOG_DEBUG("graph(%p) set_tensor: no tensors to set\n", (void *) this); return; } @@ -50,21 +50,27 @@ bool graph::compute(default_thread_pool * thread_pool, const float * f16_to_f32_ DEVICE_SCOPED_PERFORMANCE_TRACKER("[%p]compute", (void *) this); _f16_to_f32_table = f16_to_f32_table; if (thread_pool) { - thread_pool->sync_execute(reinterpret_cast(&graph::thread_pool_task), this); + thread_pool->sync_execute(&graph::thread_pool_task, this); } else { - compute_impl(nullptr, 0, 1); + default_thread_pool::thread_params param = { + 0, 1, nullptr, hexagon::vtcm_mem::get_avail_block_size() + }; // TODO: should have a better way to initialize thread_params + + compute_impl(nullptr, ¶m); } + _tensors[_tensor_count - 1]->invalidate(); _f16_to_f32_table = nullptr; return true; } -void graph::thread_pool_task(default_thread_pool * pool, size_t thread_idx, size_t thread_count, graph * graph) { - graph->compute_impl(pool, thread_idx, thread_count); +void graph::thread_pool_task(default_thread_pool * pool, default_thread_pool::thread_params * thread_params, + void * graph) { + reinterpret_cast(graph)->compute_impl(pool, thread_params); } -void graph::compute_impl(default_thread_pool * pool, size_t thread_idx, size_t thread_count) { - hexagon::compute_params params = { thread_idx, thread_count, _vtcm_quota_size / thread_count, _f16_to_f32_table }; +void graph::compute_impl(default_thread_pool * pool, default_thread_pool::thread_params * thread_params) { + hexagon::compute_params params = { thread_params, _f16_to_f32_table }; for (size_t i = 0; i < _tensor_count; ++i) { auto * dst = _tensors[i]; @@ -78,13 +84,12 @@ void graph::compute_impl(default_thread_pool * pool, size_t thread_idx, size_t t DEVICE_LOG_ERROR("graph(%p) tensor[%zu] op %d compute failed\n", (void *) this, i, op); } - DEVICE_SCOPED_PERFORMANCE_TRACKER("[%p]sync_thread, tidx: %zu", (void *) this, thread_idx); - const bool should_sync = requires_thread_barrier(op); if (pool && should_sync && i < _tensor_count - 1) { + DEVICE_SCOPED_PERFORMANCE_TRACKER("[%p]sync_thread, tidx: %zu, tensor[%zu/%zu]", (void *) this, + params.get_thread_index(), i, _tensor_count); pool->sync_thread(); } - dst->invalidate(); } } diff --git a/ggml/src/ggml-qnn/npu/device/graph.hpp b/ggml/src/ggml-qnn/npu/device/graph.hpp index c6b68c4eeadd9..36cf5bfc5c452 100644 --- a/ggml/src/ggml-qnn/npu/device/graph.hpp +++ b/ggml/src/ggml-qnn/npu/device/graph.hpp @@ -20,12 +20,12 @@ class graph { bool compute(default_thread_pool * thread_pool, const float * f16_to_f32_table); private: - static void thread_pool_task(default_thread_pool * pool, size_t thread_idx, size_t thread_count, graph * graph); - void compute_impl(default_thread_pool * pool, size_t thread_idx, size_t thread_count); + static void thread_pool_task(default_thread_pool * pool, default_thread_pool::thread_params * thread_params, + void * graph); + void compute_impl(default_thread_pool * pool, default_thread_pool::thread_params * thread_params); std::unique_ptr _tensors; size_t _tensor_count = 0; - size_t _vtcm_quota_size = 0; const float * _f16_to_f32_table = nullptr; 
DISABLE_COPY_AND_MOVE(graph); diff --git a/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp b/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp new file mode 100644 index 0000000000000..0c1ac778ba6f0 --- /dev/null +++ b/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp @@ -0,0 +1,321 @@ + +#include "op_flash_attn.hpp" + +#include "type_traits.hpp" +#include "util.hpp" +#include "vec_ops.hpp" + +namespace { + +// TODO: use a more efficient conversion +inline float f16_to_f32(const npu_device_fp16_t src) { + return reinterpret_cast(src); +} + +// From: ggml/src/ggml-cpu/ops.cpp +void flash_attn_impl(hexagon::tensor * out, const hexagon::tensor * q, const hexagon::tensor * k, + const hexagon::tensor * v, const hexagon::tensor * mask, hexagon::compute_params * params) { + static_assert(3 <= hexagon::kMaxParamsCount, "flash_attn op params count exceeds max params count"); + + float scale = out->get_op_param(0); + const float max_bias = out->get_op_param(1); + const float logit_softcap = out->get_op_param(2); + + if (logit_softcap != 0) { + scale /= logit_softcap; + } + + // broadcast factors + const int64_t rk2 = q->get_ne(2) / k->get_ne(2); + const int64_t rk3 = q->get_ne(3) / k->get_ne(3); + const int64_t rv2 = q->get_ne(2) / v->get_ne(2); + const int64_t rv3 = q->get_ne(3) / v->get_ne(3); + + const uint32_t n_head = q->get_ne(2); + const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head)); + + const float m0 = powf(2.0f, -(max_bias) / n_head_log2); + const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); + + const auto q_to_vec_dot = hexagon::get_type_traits(k->get_type()).from_float; // TODO: fix this + const auto kq_vec_dot = hexagon::get_type_traits(k->get_type()).vec_dot; + const auto v_to_float = hexagon::get_type_traits(v->get_type()).to_float; + if (!q_to_vec_dot || !kq_vec_dot) { + DEVICE_LOG_ERROR("flash_attn_impl: unsupported data type for q, k, or v\n"); + return; + } + + const int64_t total_rows = q->get_ne(1) * q->get_ne(2) * q->get_ne(3); // total number of rows in Q + const auto start_end_row = params->get_work_slice(total_rows); // work slice for this thread + + const auto DK = k->get_ne(0); + const auto DV = v->get_ne(0); + const auto row_bytes_q = q->get_ne(0) * hexagon::get_type_traits(q->get_type()).type_size; + const auto row_bytes_k = DK * hexagon::get_type_traits(k->get_type()).type_size; + const auto row_bytes_v = DV * hexagon::get_type_traits(v->get_type()).type_size; + + constexpr const size_t kFloatsPerVector = hexagon::kBytesPerVector / sizeof(float); + const auto aligned_dk = (DK + kFloatsPerVector - 1) / kFloatsPerVector * kFloatsPerVector; + const auto aligned_dv = (DV + kFloatsPerVector - 1) / kFloatsPerVector * kFloatsPerVector; + size_t total_cache_size = sizeof(float) * (aligned_dk + 2 * aligned_dv); + auto * cache_ptr = params->get_vtcm_cache(total_cache_size); + if (!cache_ptr) { + DEVICE_LOG_ERROR("Failed to allocate VTCM cache for flash_attn: %zu bytes\n", total_cache_size); + return; + } + + // loop over n_batch and n_head + const auto rows_per_batch = q->get_ne(2) * q->get_ne(1); + const auto out_rows_per_batch = out->get_ne(2) * out->get_ne(1); + const bool is_v_f16 = + v->get_type() == NPU_DATA_TYPE_F16; // check if V is in FP16 format, otherwise it is in FP32 format + uint8_t * dst_ptr = out->get_write_buffer(); + if (!dst_ptr) { + DEVICE_LOG_ERROR("flash_attn_impl: dst_ptr is not writable, tensor: %p, type: %s\n", (void *) out, + hexagon::get_type_name(out->get_type())); + return; + } + + 
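// Standalone sketch (for clarity only, not code from the patch) of the per-head
// ALiBi slope that flash_attn_impl derives from m0/m1/n_head_log2 below: heads below
// the largest power of two use base m0, the remaining heads use the interpolated
// base m1, and a zero max_bias disables the bias entirely.
#include <cmath>
#include <cstdint>

static float alibi_slope(uint32_t h, uint32_t n_head, float max_bias) {
    if (max_bias <= 0.0f) {
        return 1.0f;  // mask values are applied unscaled when ALiBi is disabled
    }
    const uint32_t n_head_log2 = 1u << (uint32_t) std::floor(std::log2((float) n_head));
    const float    m0          = std::pow(2.0f, -max_bias / n_head_log2);
    const float    m1          = std::pow(2.0f, -(max_bias / 2.0f) / n_head_log2);
    return h < n_head_log2 ? std::pow(m0, (float) (h + 1))
                           : std::pow(m1, (float) (2 * (h - n_head_log2) + 1));
}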
DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_WITH_MULTI_SUB_PROC(out, params->get_thread_index(), flash_attn); + const uint8_t * q_ptr = q->get_read_buffer(); + const uint8_t * k_ptr = k->get_read_buffer(); + const uint8_t * v_ptr = v->get_read_buffer(); + const uint8_t * mask_ptr = mask ? mask->get_read_buffer() : nullptr; + for (auto ir = start_end_row.first; ir < start_end_row.second; ++ir) { + // q indices + const auto iq3 = ir / rows_per_batch; + const auto iq2 = (ir - iq3 * rows_per_batch) / q->get_ne(1); + const auto iq1 = (ir - iq3 * rows_per_batch - iq2 * q->get_ne(1)); + + const uint32_t h = iq2; // head index + const float slope = + (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2 * (h - n_head_log2) + 1) : 1.0f; + + float S = 0.0f; // sum + float M = -INFINITY; // maximum KQ value + + float * VKQ32 = reinterpret_cast(cache_ptr); // FP32 VKQ accumulator + float * V32 = VKQ32 + aligned_dv; // (temporary) FP32 V buffer + auto * VKQ16 = reinterpret_cast(VKQ32 + aligned_dv); // (temporary) FP16 VKQ accumulator + auto * Q_q = reinterpret_cast( + VKQ32 + 2 * aligned_dv); // (temporary) buffer for Q converted to quantized/FP16 + + if (is_v_f16) { + memset(VKQ16, 0, DV * sizeof(npu_device_fp16_t)); + } else { + memset(VKQ32, 0, DV * sizeof(float)); + } + + const npu_device_fp16_t * mp = + mask_ptr ? reinterpret_cast(mask_ptr + iq1 * mask->get_nb(1)) : nullptr; + + // k indices + const int ik3 = iq3 / rk3; + const int ik2 = iq2 / rk2; + + // v indices + const int iv3 = iq3 / rv3; + const int iv2 = iq2 / rv2; + + const auto * q_data = q_ptr + (iq1 * q->get_nb(1) + iq2 * q->get_nb(2) + iq3 * q->get_nb(3)); + if (iq1 < q->get_ne(1) - 1) { + hexagon::l2fetch_row(q_data + q->get_nb(1), row_bytes_q); + } + + q_to_vec_dot(reinterpret_cast(q_data), Q_q, DK, params->f16_to_f32_table); + + // online softmax / attention + // loop over n_kv and n_head_kv + // ref: https://arxiv.org/pdf/2112.05682.pdf + for (int64_t ic = 0; ic < k->get_ne(1); ++ic) { + DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(flash_attn, 0, loop); + float mv = mp ? (slope * f16_to_f32(mp[ic])) : 0.0f; + if (mv == -INFINITY) { + continue; + } + + float s = 0.f; + { + DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(flash_attn, 1, kq_dot); + const auto * k_data = k_ptr + (ic * k->get_nb(1) + ik2 * k->get_nb(2) + ik3 * k->get_nb(3)); + if (ic < k->get_ne(1) - 1) { + hexagon::l2fetch_row(k_data + k->get_nb(1), row_bytes_k); + } + + s = kq_vec_dot(k_data, Q_q, DK); // KQ value + s = s * scale; // scale KQ value + if (logit_softcap != 0.0f) { + s = logit_softcap * tanhf(s); // TODO: vectorize this? 
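// Note on the softcap step above (explanatory comment, not from the patch): because
// `scale` was divided by logit_softcap before the loop, at this point
//     s = (q . k) * original_scale / logit_softcap,
// so `s = logit_softcap * tanhf(s)` evaluates
//     softcap(x) = logit_softcap * tanh(x / logit_softcap)
// on the raw scaled score x = (q . k) * original_scale, smoothly bounding |s| by
// logit_softcap before the mask/slope bias is added and the online softmax runs.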
+ } + + s += mv; // apply mask + } + + const float Mold = M; + + float ms = 1.0f; // upon new higher max val, scale VKQ and KQ sum with this value + float vs = 1.0f; // post-softmax KQ value, expf(s - M) + + const auto * v_data = v_ptr + (ic * v->get_nb(1) + iv2 * v->get_nb(2) + iv3 * v->get_nb(3)); + if (ic < v->get_ne(1)) { + hexagon::l2fetch_row(v_data, row_bytes_v); + } + + if (is_v_f16) { + if (s > M) { + // s is new maximum, ms < 1.0f, vs == expf(s - s) == 1.0f + M = s; + ms = expf(Mold - M); + + // V = V*expf(Mold - M) + hexagon::vec_scale_f16(VKQ16, ms, VKQ16, DV); + } else { + // no new maximum, ms == 1.0f, vs != 1.0f + vs = expf(s - M); + } + + // V += v*expf(s - M) + DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(flash_attn, 2, mad); + hexagon::vec_mad_f16(reinterpret_cast(v_data), vs, VKQ16, DV); + } else { + if (s > M) { + // s is new maximum, ms < 1.0f, vs == expf(s - s) == 1.0f + M = s; + ms = expf(Mold - M); + + // V = V*expf(Mold - M) + hexagon::vec_scale_f32(VKQ32, ms, VKQ32, DV); + } else { + // no new maximum, ms == 1.0f, vs != 1.0f + vs = expf(s - M); + } + + // V += v*expf(s - M) + DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(flash_attn, 2, mad); + if (v_to_float) { + v_to_float(v_data, V32, DV, params->f16_to_f32_table); + hexagon::vec_mad_f32(V32, vs, VKQ32, DV); + } else { + // V is F32 + hexagon::vec_mad_f32(reinterpret_cast(v_data), vs, VKQ32, DV); + } + } + + S = S * ms + vs; // scale and increment sum with partial sum + } + + if (is_v_f16) { + // TODO: use a more efficient conversion + for (int64_t d = 0; d < DV; ++d) { + VKQ32[d] = f16_to_f32(VKQ16[d]); + } + } + + // V /= S + const float S_inv = 1.0f / S; + hexagon::vec_scale_f32(VKQ32, S_inv, VKQ32, DV); + + // dst indices + const int i1 = iq1; + const int i2 = iq2; + const int i3 = iq3; + + // permute(0, 2, 1, 3) + memcpy(dst_ptr + (i3 * out_rows_per_batch + i2 + i1 * out->get_ne(1)) * out->get_nb(1), VKQ32, out->get_nb(1)); + } + + out->release_write_buffer(); // mark the output tensor as modified +} + +} // namespace + +namespace hexagon { + +bool flash_attn_f32(tensor * out, compute_params * params) { + if (!out || !params) { + DEVICE_LOG_DEBUG("invalid out or params\n"); + return false; + } + + const auto * q = out->get_src(0); + const auto * k = out->get_src(1); + const auto * v = out->get_src(2); + const auto * mask = out->get_src(3); + if (!q || !k || !v || !mask) { + DEVICE_LOG_DEBUG("invalid src tensors: q: %p, k: %p, v: %p, mask: %p\n", (void *) q, (void *) k, (void *) v, + (void *) mask); + return false; + } + + flash_attn_impl(out, q, k, v, mask, params); + return true; +} + +bool is_flash_attn_supported(npu_device_tensor_op op, const npu_device_tensor_spec * dst, + const npu_device_tensor_spec * srcs, size_t src_len) { + if (op != NPU_OP_FLASH_ATTN) { + DEVICE_LOG_DEBUG("op is not NPU_OP_FLASH_ATTN: %d\n", op); + return false; + } + + if (!dst || !srcs || src_len < 4) { + DEVICE_LOG_DEBUG("[%s]invalid dst or srcs\n", op_get_name(op)); + return false; + } + + if (dst->type != NPU_DATA_TYPE_F32) { + DEVICE_LOG_DEBUG("[%s]dst type is not F32: %s\n", op_get_name(op), get_type_name(dst->type)); + return false; + } + + const auto * q = &srcs[0]; + if (q->type != NPU_DATA_TYPE_F32) { + DEVICE_LOG_DEBUG("[%s]q type is not F32: %s\n", op_get_name(op), get_type_name(q->type)); + return false; + } + + const auto * k = &srcs[1]; + if (k->type != NPU_DATA_TYPE_F16) { // TODO: support more k types + DEVICE_LOG_DEBUG("[%s]k type is not F16: %s\n", op_get_name(op), get_type_name(k->type)); + 
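// Scalar reference (illustration only, not the patch's code) of the streaming
// softmax accumulation implemented above with vec_scale_*/vec_mad_*: a single pass
// over the KV rows keeps a running maximum M, a running denominator S and an
// unnormalised accumulator acc, and rescales them whenever a new maximum appears
// (ref: https://arxiv.org/pdf/2112.05682.pdf).
#include <cmath>
#include <cstddef>
#include <vector>

static void streaming_softmax_ref(const std::vector<float> &              scores,  // one score per KV row
                                  const std::vector<std::vector<float>> & values,  // one value row per score
                                  std::vector<float> &                    out) {
    const size_t       dv = values.empty() ? 0 : values[0].size();
    std::vector<float> acc(dv, 0.0f);
    float              M = -INFINITY;  // running maximum score
    float              S = 0.0f;       // running softmax denominator

    for (size_t i = 0; i < scores.size(); ++i) {
        const float s  = scores[i];
        float       ms = 1.0f;  // rescale factor for the old accumulator
        float       vs = 1.0f;  // weight of the current row, exp(s - M)
        if (s > M) {
            ms = std::exp(M - s);
            M  = s;
            for (size_t d = 0; d < dv; ++d) acc[d] *= ms;  // matches vec_scale_*(VKQ, ms, ...)
        } else {
            vs = std::exp(s - M);
        }
        for (size_t d = 0; d < dv; ++d) acc[d] += vs * values[i][d];  // matches vec_mad_*
        S = S * ms + vs;
    }

    out.assign(dv, 0.0f);
    if (S > 0.0f) {
        for (size_t d = 0; d < dv; ++d) out[d] = acc[d] / S;  // the final V /= S step
    }
}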
return false; + } + + const auto * v = &srcs[2]; + if (v->type != k->type) { // TODO: support more v types + DEVICE_LOG_DEBUG("[%s]v type is not the same as k: %s vs %s\n", op_get_name(op), get_type_name(v->type), + get_type_name(k->type)); + return false; + } + + const auto * mask = &srcs[3]; + if (mask->type != NPU_DATA_TYPE_F16) { + DEVICE_LOG_DEBUG("[%s]mask type is not F16: %s\n", op_get_name(op), get_type_name(mask->type)); + return false; + } + + if (dst->ne[0] != v->ne[0] || dst->ne[2] != q->ne[1]) { + DEVICE_LOG_DEBUG( + "[%s]dst shape does not match q and v: dst ne: %ld, %ld, %ld, %ld, q ne: %ld, %ld, %ld, %ld, " + "v ne: %ld, %ld, %ld, %ld\n", + op_get_name(op), dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], q->ne[0], q->ne[1], q->ne[2], q->ne[3], + v->ne[0], v->ne[1], v->ne[2], v->ne[3]); + return false; + } + + if (is_transposed_or_permuted(dst->nb)) { + DEVICE_LOG_DEBUG("[%s]dst cannot be transposed or permuted, nb: %zu, %zu, %zu, %zu\n", op_get_name(op), + dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3]); + return false; + } + + if (q->ne[0] != k->ne[0]) { + DEVICE_LOG_DEBUG("[%s]q and k shapes do not match: q ne: %ld, %ld, %ld, %ld, k ne: %ld, %ld, %ld, %ld\n", + op_get_name(op), q->ne[0], q->ne[1], q->ne[2], q->ne[3], k->ne[0], k->ne[1], k->ne[2], + k->ne[3]); + return false; + } + + return true; +} + +} // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/device/op_flash_attn.hpp b/ggml/src/ggml-qnn/npu/device/op_flash_attn.hpp new file mode 100644 index 0000000000000..63d6d09d54990 --- /dev/null +++ b/ggml/src/ggml-qnn/npu/device/op_flash_attn.hpp @@ -0,0 +1,11 @@ +#pragma once + +#include "op_types.hpp" + +namespace hexagon { + +bool flash_attn_f32(tensor * out, compute_params * params); +bool is_flash_attn_supported(npu_device_tensor_op op, const npu_device_tensor_spec * dst, + const npu_device_tensor_spec * srcs, size_t src_len); + +} // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/device/op_impl.cpp b/ggml/src/ggml-qnn/npu/device/op_impl.cpp index 777072024a450..4d271a4899aec 100644 --- a/ggml/src/ggml-qnn/npu/device/op_impl.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_impl.cpp @@ -2,13 +2,12 @@ #include "op_impl.hpp" -#include -#include - #include +#include "op_flash_attn.hpp" #include "op_mul_mat.hpp" -#include "quants.hpp" +#include "type_traits.hpp" +#include "vec_ops.hpp" namespace { @@ -16,12 +15,12 @@ template inline void vec_op_impl(const _TyData * src0, const _TyData * src1, size_t count, _TyData * dst) { constexpr const size_t kElementsPerVector = hexagon::kBytesPerVector / sizeof(_TyData); - HVX_Vector * iptr0 = ((HVX_Vector *) src0); - HVX_Vector * iptr0_end = ((HVX_Vector *) src0) + (count / kElementsPerVector); - HVX_Vector * iptr1 = ((HVX_Vector *) src1); - HVX_Vector * optr = ((HVX_Vector *) dst); // framework will ensure the dst is aligned - HVX_Vector prev0 = *iptr0++; - HVX_Vector prev1 = *iptr1++; + HVX_Vector * iptr0 = ((HVX_Vector *) src0); + HVX_Vector * const iptr0_end = ((HVX_Vector *) src0) + (count / kElementsPerVector); + HVX_Vector * iptr1 = ((HVX_Vector *) src1); + HVX_Vector * optr = ((HVX_Vector *) dst); // framework will ensure the dst is aligned + HVX_Vector prev0 = *iptr0++; + HVX_Vector prev1 = *iptr1++; while (iptr0 < iptr0_end) { HVX_Vector curr0 = *iptr0++; @@ -33,25 +32,25 @@ inline void vec_op_impl(const _TyData * src0, const _TyData * src1, size_t count prev1 = curr1; } + const size_t leftover = count % kElementsPerVector; if ((iptr0_end - ((HVX_Vector *) src0)) > 0) { // handle the last vector // see also: // 
https://github.com/UbiquitousLearning/mllm/blob/babf4410352ce8730824c87699c025a0d4ce3a6f/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/src/ops/LLaMAMul.cpp#L147 // or qualcomm sdk libs\qhl_hvx\src\qhblas_hvx\qhblas_hvx_aw_vector_add_ah.c - bool iptr0_aligned = hexagon::is_addr_aligned(iptr0); - HVX_Vector curr0 = iptr0_aligned ? prev0 : *iptr0; - iptr0 = iptr0_aligned ? iptr0 : iptr0 + 1; - bool iptr1_aligned = hexagon::is_addr_aligned(iptr1); - HVX_Vector curr1 = iptr1_aligned ? prev1 : *iptr1; - iptr1 = iptr1_aligned ? iptr1 : iptr1 + 1; - HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); - HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); - *optr++ = _OpIntrinsic(s0, s1); - prev0 = curr0; - prev1 = curr1; - } - - const size_t leftover = count % kElementsPerVector; + bool should_fetch_src0 = leftover != 0 || !hexagon::is_addr_aligned(iptr0); + bool should_fetch_src1 = leftover != 0 || !hexagon::is_addr_aligned(iptr1); + HVX_Vector curr0 = should_fetch_src0 ? *iptr0 : prev0; + HVX_Vector curr1 = should_fetch_src1 ? *iptr1 : prev1; + iptr0 += should_fetch_src0 ? 1 : 0; + iptr1 += should_fetch_src1 ? 1 : 0; + HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); + HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); + *optr++ = _OpIntrinsic(s0, s1); + prev0 = curr0; + prev1 = curr1; + } + const size_t leftover_bytes = leftover * sizeof(_TyData); if (leftover > 0) { // handle the leftover elements @@ -136,18 +135,23 @@ template bool element_wise_op(hexagon::tensor * out, hexagon::co return false; } - const auto * src0_ptr = reinterpret_cast(src0->get_read_buffer()); - const auto * src1_ptr = reinterpret_cast(src1->get_read_buffer()); - auto * dst_ptr = reinterpret_cast(out->get_write_buffer()); - auto total_rows = out->get_ne(3) * out->get_ne(2) * out->get_ne(1); - const auto rows_per_cube = out->get_ne(2) * out->get_ne(1); - const auto start_end = hexagon::get_thread_work_slice(total_rows, params->tidx, params->tcnt); + uint8_t * dst_ptr = out->get_write_buffer(); + if (!dst_ptr) { + DEVICE_LOG_ERROR("element_wise_op: dst_ptr is not writable, tensor: %p, type: %s\n", (void *) out, + hexagon::get_type_name(out->get_type())); + return false; + } + const uint8_t * src0_ptr = src0->get_read_buffer(); + const uint8_t * src1_ptr = src1->get_read_buffer(); + auto total_rows = out->get_ne(3) * out->get_ne(2) * out->get_ne(1); + const auto rows_per_cube = out->get_ne(2) * out->get_ne(1); + const auto start_end = params->get_work_slice(total_rows); if (start_end.first >= start_end.second) { return true; } - DEVICE_SCOPED_OP_PERFORMANCE_TRACKER(out, params->tidx); + DEVICE_SCOPED_OP_PERFORMANCE_TRACKER(out, params->get_thread_index()); const size_t valid_row_bytes = src0->get_ne(0) * sizeof(data_type); for (int64_t ir = start_end.first; ir < start_end.second; ++ir) { @@ -171,6 +175,7 @@ template bool element_wise_op(hexagon::tensor * out, hexagon::co static_cast(out->get_ne(0)), reinterpret_cast(dst_row)); } + out->release_write_buffer(); // mark the output tensor as modified return true; } @@ -184,27 +189,36 @@ bool is_same_shape(const npu_device_tensor_spec & src, const npu_device_tensor_s return true; } -bool is_element_wise_op_supported(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1, - const npu_device_tensor_spec & dst, npu_device_tensor_op op) { +bool is_element_wise_op_supported(npu_device_tensor_op op, const npu_device_tensor_spec * dst, + const npu_device_tensor_spec * srcs, size_t src_len) { if (op != NPU_OP_ADD && op != 
NPU_OP_SUB && op != NPU_OP_MUL) { DEVICE_LOG_DEBUG("[%s]unsupported\n", hexagon::op_get_name(op)); return false; } - if (dst.type != src0.type || dst.type != src1.type) { + if (!dst || !srcs || src_len < 2) { + DEVICE_LOG_DEBUG("[%s]invalid dst or srcs\n", hexagon::op_get_name(op)); + return false; + } + + const auto & src0 = srcs[0]; + const auto & src1 = srcs[1]; + if (dst->type != src0.type || dst->type != src1.type) { DEVICE_LOG_DEBUG("[%s]src0.type and dst.type mismatch: %s vs %s\n", hexagon::op_get_name(op), - hexagon::get_type_name(src0.type), hexagon::get_type_name(dst.type)); + hexagon::get_type_name(src0.type), hexagon::get_type_name(dst->type)); return false; } - if (dst.type != NPU_DATA_TYPE_F32 && dst.type != NPU_DATA_TYPE_F16) { - DEVICE_LOG_DEBUG("[%s]unsupported data type: %s\n", hexagon::op_get_name(op), hexagon::get_type_name(dst.type)); + if (dst->type != NPU_DATA_TYPE_F32 && dst->type != NPU_DATA_TYPE_F16) { + DEVICE_LOG_DEBUG("[%s]unsupported data type: %s\n", hexagon::op_get_name(op), + hexagon::get_type_name(dst->type)); return false; } // TODO: fix FP16 add/sub - if (dst.type == NPU_DATA_TYPE_F16 && op != NPU_OP_MUL) { - DEVICE_LOG_DEBUG("[%s]unsupported data type: %s\n", hexagon::op_get_name(op), hexagon::get_type_name(dst.type)); + if (dst->type == NPU_DATA_TYPE_F16 && op != NPU_OP_MUL) { + DEVICE_LOG_DEBUG("[%s]unsupported data type: %s\n", hexagon::op_get_name(op), + hexagon::get_type_name(dst->type)); return false; } @@ -214,7 +228,7 @@ bool is_element_wise_op_supported(const npu_device_tensor_spec & src0, const npu return false; } - if (!is_same_shape(src0, dst)) { + if (!is_same_shape(src0, *dst)) { DEVICE_LOG_DEBUG("[%s]src0 and dst have different shape\n", hexagon::op_get_name(op)); return false; } @@ -225,10 +239,10 @@ bool is_element_wise_op_supported(const npu_device_tensor_spec & src0, const npu void rms_norm_vec_f32(const float * src, size_t count, float eps, float * dst) { constexpr const size_t kElementsPerVector = hexagon::kBytesPerVector / sizeof(float); - HVX_Vector * src_vec_ptr = ((HVX_Vector *) src); - HVX_Vector * src_vec_end = ((HVX_Vector *) src) + (count / kElementsPerVector); - HVX_Vector prev = *src_vec_ptr++; - HVX_Vector sum = Q6_V_vzero(); + HVX_Vector * src_vec_ptr = ((HVX_Vector *) src); + HVX_Vector * const src_vec_end = ((HVX_Vector *) src) + (count / kElementsPerVector); + HVX_Vector prev = *src_vec_ptr++; + HVX_Vector sum = Q6_V_vzero(); while (src_vec_ptr < src_vec_end) { HVX_Vector curr = *src_vec_ptr++; HVX_Vector s0 = Q6_V_valign_VVR(curr, prev, (size_t) src); @@ -236,17 +250,17 @@ void rms_norm_vec_f32(const float * src, size_t count, float eps, float * dst) { prev = curr; } + const size_t leftover = count % kElementsPerVector; if ((src_vec_end - ((HVX_Vector *) src)) > 0) { // handle the last vector - bool src_ptr_aligned = hexagon::is_addr_aligned(src_vec_ptr); - HVX_Vector curr = src_ptr_aligned ? prev : *src_vec_ptr; - src_vec_ptr = src_ptr_aligned ? src_vec_ptr : src_vec_ptr + 1; - HVX_Vector s0 = Q6_V_valign_VVR(curr, prev, (size_t) src); - sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum, Q6_Vqf32_vmpy_VsfVsf(s0, s0)); - prev = curr; + bool should_fetch_src = leftover != 0 || !hexagon::is_addr_aligned(src_vec_ptr); + HVX_Vector curr = should_fetch_src ? *src_vec_ptr : prev; + src_vec_ptr += should_fetch_src ? 
1 : 0; + HVX_Vector s0 = Q6_V_valign_VVR(curr, prev, (size_t) src); + sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum, Q6_Vqf32_vmpy_VsfVsf(s0, s0)); + prev = curr; } - const size_t leftover = count % kElementsPerVector; const size_t leftover_bytes = leftover * sizeof(float); if (leftover > 0) { // handle the leftover elements @@ -257,37 +271,9 @@ void rms_norm_vec_f32(const float * src, size_t count, float eps, float * dst) { Q6_V_valign_VVR(Q6_Vqf32_vmpy_VsfVsf(curr, curr), Q6_V_vzero(), leftover_bytes)); } - const float mean = hexagon::vec_reduction_f32(sum) / count; // TODO: figure out how to do division in vector - const float scale = 1.0f / sqrtf(mean + eps); // TODO: use buildin blas sqrtf? - - HVX_Vector scale_vec = Q6_V_vsplat_R(reinterpret_cast(scale)); - src_vec_ptr = ((HVX_Vector *) src); - prev = *src_vec_ptr++; - HVX_Vector * dst_vec_ptr = ((HVX_Vector *) dst); // framework will ensure the dst is aligned - while (src_vec_ptr < src_vec_end) { - HVX_Vector curr = *src_vec_ptr++; - HVX_Vector s0 = Q6_V_valign_VVR(curr, prev, (size_t) src); - *dst_vec_ptr++ = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(s0, scale_vec)); - prev = curr; - } - - if ((src_vec_end - ((HVX_Vector *) src)) > 0) { - // handle the last vector - bool src_ptr_aligned = hexagon::is_addr_aligned(src_vec_ptr); - HVX_Vector curr = src_ptr_aligned ? prev : *src_vec_ptr; - src_vec_ptr = src_ptr_aligned ? src_vec_ptr : src_vec_ptr + 1; - HVX_Vector s0 = Q6_V_valign_VVR(curr, prev, (size_t) src); - *dst_vec_ptr++ = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(s0, scale_vec)); - prev = curr; - } - - if (leftover > 0) { - // handle the leftover elements - HVX_Vector curr = - (leftover_bytes + hexagon::unaligned_bytes(src_vec_ptr) > hexagon::kBytesPerVector) ? *src_vec_ptr : prev; - curr = Q6_V_valign_VVR(curr, prev, (size_t) src); - q6op_vstu_variable_ARV(dst_vec_ptr, leftover_bytes, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(curr, scale_vec))); - } + const float mean = hexagon::vec_reduction_qf32_f32(sum) / count; // TODO: figure out how to do division in vector + const float scale = 1.0f / sqrtf(mean + eps); // TODO: use buildin blas sqrtf? + hexagon::vec_scale_f32(src, scale, dst, count); } // TODO: merge with element_wise_op? 
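// Scalar reference (illustration, not the patch's code) of what rms_norm_vec_f32
// computes: the HVX loop above accumulates the sum of squares in qf32, reduces it
// with vec_reduction_qf32_f32, and the tail of the function now reuses
// vec_scale_f32 instead of a second hand-rolled HVX scaling loop.
#include <cmath>
#include <cstddef>

static void rms_norm_ref(const float * src, size_t count, float eps, float * dst) {
    float sum_sq = 0.0f;
    for (size_t i = 0; i < count; ++i) {
        sum_sq += src[i] * src[i];
    }
    const float mean  = sum_sq / (float) count;
    const float scale = 1.0f / std::sqrt(mean + eps);  // same scale the vectorized code splats
    for (size_t i = 0; i < count; ++i) {
        dst[i] = src[i] * scale;
    }
}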
@@ -305,16 +291,22 @@ template bool unary_op(hexagon::tensor * out, hexagon::compute_p return true; // skip if no src } - const auto * src0_ptr = reinterpret_cast(src0->get_read_buffer()); - auto * dst_ptr = reinterpret_cast(out->get_write_buffer()); + auto * dst_ptr = out->get_write_buffer(); + if (!dst_ptr) { + DEVICE_LOG_ERROR("unary_op: dst_ptr is not writable, tensor: %p, type: %s\n", (void *) out, + hexagon::get_type_name(out->get_type())); + return false; + } + + const auto * src0_ptr = src0->get_read_buffer(); auto total_rows = out->get_ne(3) * out->get_ne(2) * out->get_ne(1); const auto rows_per_cube = out->get_ne(2) * out->get_ne(1); - const auto start_end = hexagon::get_thread_work_slice(total_rows, params->tidx, params->tcnt); + const auto start_end = params->get_work_slice(total_rows); if (start_end.first >= start_end.second) { return true; } - DEVICE_SCOPED_OP_PERFORMANCE_TRACKER(out, params->tidx); + DEVICE_SCOPED_OP_PERFORMANCE_TRACKER(out, params->get_thread_index()); const auto param = out->get_op_param(0); const size_t valid_row_bytes = src0->get_ne(0) * sizeof(data_type); @@ -333,28 +325,36 @@ template bool unary_op(hexagon::tensor * out, hexagon::compute_p reinterpret_cast(dst_row)); } + out->release_write_buffer(); // mark the output tensor as modified return true; } -bool is_unary_op_supported(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1, - const npu_device_tensor_spec & dst, npu_device_tensor_op op) { +bool is_unary_op_supported(npu_device_tensor_op op, const npu_device_tensor_spec * dst, + const npu_device_tensor_spec * srcs, size_t src_len) { if (op != NPU_OP_RMS_NORM) { DEVICE_LOG_DEBUG("[%s]unsupported\n", hexagon::op_get_name(op)); return false; } - if (dst.type != src0.type) { + if (!dst || !srcs || src_len < 1) { + DEVICE_LOG_DEBUG("[%s]invalid dst or srcs\n", hexagon::op_get_name(op)); + return false; + } + + const auto & src0 = srcs[0]; + if (dst->type != src0.type) { DEVICE_LOG_DEBUG("[%s]src0.type and dst.type mismatch: %s vs %s\n", hexagon::op_get_name(op), - hexagon::get_type_name(src0.type), hexagon::get_type_name(dst.type)); + hexagon::get_type_name(src0.type), hexagon::get_type_name(dst->type)); return false; } - if (dst.type != NPU_DATA_TYPE_F32) { - DEVICE_LOG_DEBUG("[%s]unsupported data type: %s\n", hexagon::op_get_name(op), hexagon::get_type_name(dst.type)); + if (dst->type != NPU_DATA_TYPE_F32) { + DEVICE_LOG_DEBUG("[%s]unsupported data type: %s\n", hexagon::op_get_name(op), + hexagon::get_type_name(dst->type)); return false; } - if (!is_same_shape(src0, dst)) { + if (!is_same_shape(src0, *dst)) { DEVICE_LOG_DEBUG("[%s]src0 and dst have different shape\n", hexagon::op_get_name(op)); return false; } @@ -371,40 +371,47 @@ struct op_capabilities { constexpr const op_capabilities kOpCapabilities[] = { { - NPU_OP_MUL_MAT, hexagon::is_mul_mat_supported, + NPU_OP_MUL_MAT, hexagon::is_mul_mat_supported, { hexagon::mul_mat_f32, // NPU_DATA_TYPE_F32 nullptr, // NPU_DATA_TYPE_F16 - }, true, - }, + }, true, // requires_thread_barrier + }, { - NPU_OP_ADD, is_element_wise_op_supported, + NPU_OP_ADD, is_element_wise_op_supported, { element_wise_op>, // NPU_DATA_TYPE_F32 element_wise_op>, // NPU_DATA_TYPE_F16 - }, false, - }, + }, false, // requires_thread_barrier + }, { - NPU_OP_SUB, is_element_wise_op_supported, + NPU_OP_SUB, is_element_wise_op_supported, { element_wise_op>, // NPU_DATA_TYPE_F32 element_wise_op>, // NPU_DATA_TYPE_F16 - }, false, - }, + }, false, // requires_thread_barrier + }, { - NPU_OP_MUL, 
is_element_wise_op_supported, + NPU_OP_MUL, is_element_wise_op_supported, { element_wise_op>, // NPU_DATA_TYPE_F32 element_wise_op>, // NPU_DATA_TYPE_F16 - }, false, - }, + }, false, // requires_thread_barrier + }, { - NPU_OP_RMS_NORM, is_unary_op_supported, + NPU_OP_RMS_NORM, is_unary_op_supported, { unary_op, // NPU_DATA_TYPE_F32 nullptr, // NPU_DATA_TYPE_F16 - }, false, - }, + }, false, // requires_thread_barrier + }, + { + NPU_OP_FLASH_ATTN,hexagon::is_flash_attn_supported, + { + hexagon::flash_attn_f32, // NPU_DATA_TYPE_F32 + nullptr, // NPU_DATA_TYPE_F16 + }, true, // requires_thread_barrier + }, }; static_assert(kOpCapabilities[NPU_OP_MUL_MAT].compute_funcs[NPU_DATA_TYPE_F32] == hexagon::mul_mat_f32, @@ -415,6 +422,8 @@ static_assert(kOpCapabilities[NPU_OP_MUL_MAT].op == NPU_OP_MUL_MAT, "kOpArray[NP static_assert(kOpCapabilities[NPU_OP_MUL].op == NPU_OP_MUL, "kOpArray[NPU_OP_MUL].op != NPU_OP_MUL"); static_assert(kOpCapabilities[NPU_OP_RMS_NORM].op == NPU_OP_RMS_NORM, "kOpArray[NPU_OP_RMS_NORM].op != NPU_OP_RMS_NORM"); +static_assert(kOpCapabilities[NPU_OP_FLASH_ATTN].op == NPU_OP_FLASH_ATTN, + "kOpArray[NPU_OP_FLASH_ATTN].op != NPU_OP_FLASH_ATTN"); hexagon::compute_func_type get_compute_func_impl(npu_device_tensor_op op, npu_device_tensor_data_type type) { if (op >= NPU_OP_COUNT) { @@ -440,16 +449,16 @@ bool requires_thread_barrier(npu_device_tensor_op op) { return kOpCapabilities[op].requires_thread_barrier; } -bool support_op(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1, - const npu_device_tensor_spec & dst, npu_device_tensor_op op) { - if (get_compute_func_impl(op, dst.type) == nullptr) { +bool support_op(npu_device_tensor_op op, const npu_device_tensor_spec * dst, const npu_device_tensor_spec * srcs, + size_t src_len) { + if (get_compute_func_impl(op, dst->type) == nullptr) { DEVICE_LOG_ERROR("[%s]unsupported, get_compute_func failed\n", op_get_name(op)); return false; } auto is_supported_func = kOpCapabilities[op].is_supported; - if (!is_supported_func || !is_supported_func(src0, src1, dst, op)) { - DEVICE_LOG_DEBUG("[%s]unsupported, is_supported_func failed\n", op_get_name(op)); + if (!is_supported_func || !is_supported_func(op, dst, srcs, src_len)) { + DEVICE_LOG_DEBUG("[%s]unsupported, is_supported_func return false\n", op_get_name(op)); return false; } diff --git a/ggml/src/ggml-qnn/npu/device/op_impl.hpp b/ggml/src/ggml-qnn/npu/device/op_impl.hpp index 9b75ec6d47967..709d493428688 100644 --- a/ggml/src/ggml-qnn/npu/device/op_impl.hpp +++ b/ggml/src/ggml-qnn/npu/device/op_impl.hpp @@ -8,7 +8,7 @@ compute_func_type get_compute_func(tensor * dst); bool requires_thread_barrier(npu_device_tensor_op op); -bool support_op(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1, - const npu_device_tensor_spec & dst, npu_device_tensor_op op); +bool support_op(npu_device_tensor_op op, const npu_device_tensor_spec * dst, const npu_device_tensor_spec * srcs, + size_t src_len); } // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp index 6087673ac65af..449f0edee1544 100644 --- a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp @@ -1,167 +1,12 @@ #include "op_mul_mat.hpp" -#include - -#include "quants.hpp" #include "thread_pool.hpp" // TODO: remove this dependency +#include "type_traits.hpp" +#include "vec_ops.hpp" #include "vtcm_mem.hpp" namespace { -inline float vec_dot_product_f32_f32(const float * src0, const float * src1, 
size_t count) { - constexpr const size_t kElementsPerVector = hexagon::kBytesPerVector / sizeof(float); - - HVX_Vector * src0_vec_ptr = ((HVX_Vector *) src0); - HVX_Vector * src0_vec_ptr_end = ((HVX_Vector *) src0) + count / kElementsPerVector; - HVX_Vector * src1_vec_ptr = ((HVX_Vector *) src1); - HVX_Vector prev0 = *src0_vec_ptr++; - HVX_Vector prev1 = *src1_vec_ptr++; - HVX_Vector sum = Q6_V_vzero(); - - while (src0_vec_ptr_end - src0_vec_ptr > 1) { - HVX_Vector curr0_lo = src0_vec_ptr[0]; - HVX_Vector curr0_hi = src0_vec_ptr[1]; - HVX_Vector curr1_lo = src1_vec_ptr[0]; - HVX_Vector curr1_hi = src1_vec_ptr[1]; - - HVX_Vector l0 = Q6_V_valign_VVR(curr0_lo, prev0, (size_t) src0); - HVX_Vector l1 = Q6_V_valign_VVR(curr1_lo, prev1, (size_t) src1); - HVX_Vector h0 = Q6_V_valign_VVR(curr0_hi, curr0_lo, (size_t) src0); - HVX_Vector h1 = Q6_V_valign_VVR(curr1_hi, curr1_lo, (size_t) src1); - sum = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_Vqf32_vmpy_VsfVsf(l0, l1), sum); - sum = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_Vqf32_vmpy_VsfVsf(h0, h1), sum); - - prev0 = curr0_hi; - prev1 = curr1_hi; - src0_vec_ptr += 2; - src1_vec_ptr += 2; - } - - if (src0_vec_ptr_end - src0_vec_ptr > 0) { - HVX_Vector curr0 = *src0_vec_ptr++; - HVX_Vector curr1 = *src1_vec_ptr++; - HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); - HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); - sum = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_Vqf32_vmpy_VsfVsf(s0, s1), sum); - prev0 = curr0; - prev1 = curr1; - } - - if ((src0_vec_ptr_end - ((HVX_Vector *) src0)) > 0) { - // handle the last vector - // see also: - // https://github.com/UbiquitousLearning/mllm/blob/babf4410352ce8730824c87699c025a0d4ce3a6f/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/src/ops/LLaMAMul.cpp#L147 - // or qualcomm sdk libs\qhl_hvx\src\qhblas_hvx\qhblas_hvx_aw_vector_add_ah.c - bool iptr0_aligned = hexagon::is_addr_aligned(src0_vec_ptr); - HVX_Vector curr0 = iptr0_aligned ? prev0 : *src0_vec_ptr; - src0_vec_ptr = iptr0_aligned ? src0_vec_ptr : src0_vec_ptr + 1; - bool iptr1_aligned = hexagon::is_addr_aligned(src1_vec_ptr); - HVX_Vector curr1 = iptr1_aligned ? prev1 : *src1_vec_ptr; - src1_vec_ptr = iptr1_aligned ? src1_vec_ptr : src1_vec_ptr + 1; - HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); - HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); - sum = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_Vqf32_vmpy_VsfVsf(s0, s1), sum); - prev0 = curr0; - prev1 = curr1; - } - - const size_t leftover = count % kElementsPerVector; - const size_t leftover_bytes = leftover * sizeof(float); - if (leftover > 0) { - // handle the leftover elements - HVX_Vector curr0 = (leftover_bytes + hexagon::unaligned_bytes(src0_vec_ptr) > hexagon::kBytesPerVector) ? - *src0_vec_ptr : - prev0; - curr0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); - - HVX_Vector curr1 = (leftover_bytes + hexagon::unaligned_bytes(src1_vec_ptr) > hexagon::kBytesPerVector) ? - *src1_vec_ptr : - prev1; - curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); - - sum = Q6_Vqf32_vadd_Vqf32Vqf32( - Q6_V_valign_VVR(Q6_Vqf32_vmpy_VsfVsf(curr0, curr1), Q6_V_vzero(), leftover_bytes), sum); - } - - return hexagon::vec_reduction_f32(sum); -} - -// TODO: merge with vec_dot_product_f32_f32? 
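// Illustration (not from the patch) of the tail-handling predicate the reworked HVX
// loops use in place of the older alignment-only check removed here: after the main
// loop, `prev` holds the last vector that was loaded, and one extra load is required
// unless the element count landed exactly on a vector boundary (leftover == 0) and
// the running pointer is itself vector-aligned, in which case `prev` already
// contains every remaining byte. The 128-byte width mirrors sizeof(HVX_Vector) on
// the v73 targets mentioned in this code; names with the _sketch suffix are mine.
#include <cstddef>

constexpr size_t kBytesPerVectorSketch = 128;
constexpr size_t kAlignMaskSketch      = kBytesPerVectorSketch - 1;

static inline size_t unaligned_bytes_sketch(const void * addr) {
    return ((size_t) addr) & kAlignMaskSketch;
}

static inline bool needs_tail_load(const void * ptr, size_t leftover_elems) {
    return leftover_elems != 0 || unaligned_bytes_sketch(ptr) != 0;
}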
-inline float vec_dot_product_f16_f16(const npu_device_fp16_t * src0, const npu_device_fp16_t * src1, size_t count) { - constexpr const size_t kElementsPerVector = hexagon::kBytesPerVector / sizeof(npu_device_fp16_t); - constexpr const size_t kFloatsPerVector = hexagon::kBytesPerVector / sizeof(float); - - HVX_Vector * src0_vec_ptr = ((HVX_Vector *) src0); - HVX_Vector * src0_vec_ptr_end = ((HVX_Vector *) src0) + (count / kElementsPerVector); - HVX_Vector * src1_vec_ptr = ((HVX_Vector *) src1); - HVX_Vector prev0 = *src0_vec_ptr++; - HVX_Vector prev1 = *src1_vec_ptr++; - HVX_Vector sum_hi = Q6_V_vzero(); - HVX_Vector sum_lo = Q6_V_vzero(); - - while (src0_vec_ptr < src0_vec_ptr_end) { - HVX_Vector curr0 = *src0_vec_ptr++; - HVX_Vector curr1 = *src1_vec_ptr++; - HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); - HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); - HVX_VectorPair result = Q6_Wqf32_vmpy_VhfVhf(s0, s1); - sum_hi = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_hi_W(result), sum_hi); - sum_lo = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(result), sum_lo); - prev0 = curr0; - prev1 = curr1; - } - - if ((src0_vec_ptr_end - ((HVX_Vector *) src0)) > 0) { - // handle the last vector - // see also: - // https://github.com/UbiquitousLearning/mllm/blob/babf4410352ce8730824c87699c025a0d4ce3a6f/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/src/ops/LLaMAMul.cpp#L147 - // or qualcomm sdk libs\qhl_hvx\src\qhblas_hvx\qhblas_hvx_aw_vector_add_ah.c - bool iptr0_aligned = hexagon::is_addr_aligned(src0_vec_ptr); - HVX_Vector curr0 = iptr0_aligned ? prev0 : *src0_vec_ptr; - src0_vec_ptr = iptr0_aligned ? src0_vec_ptr : src0_vec_ptr + 1; - bool iptr1_aligned = hexagon::is_addr_aligned(src1_vec_ptr); - HVX_Vector curr1 = iptr1_aligned ? prev1 : *src1_vec_ptr; - src1_vec_ptr = iptr1_aligned ? src1_vec_ptr : src1_vec_ptr + 1; - HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); - HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); - HVX_VectorPair result = Q6_Wqf32_vmpy_VhfVhf(s0, s1); - sum_hi = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_hi_W(result), sum_hi); - sum_lo = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(result), sum_lo); - prev0 = curr0; - prev1 = curr1; - } - - const size_t leftover = count % kElementsPerVector; - const size_t leftover_bytes = leftover * sizeof(npu_device_fp16_t); - if (leftover > 0) { - // handle the leftover elements - HVX_Vector curr0 = (leftover_bytes + hexagon::unaligned_bytes(src0_vec_ptr) > hexagon::kBytesPerVector) ? - *src0_vec_ptr : - prev0; - curr0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); - - HVX_Vector curr1 = (leftover_bytes + hexagon::unaligned_bytes(src1_vec_ptr) > hexagon::kBytesPerVector) ? - *src1_vec_ptr : - prev1; - curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); - - HVX_VectorPair result = Q6_Wqf32_vmpy_VhfVhf(curr0, curr1); - - // TODO: can we do this better? 
- if (leftover > kFloatsPerVector) { - sum_hi = Q6_Vqf32_vadd_Vqf32Vqf32( - Q6_V_valign_VVR(Q6_V_hi_W(result), Q6_V_vzero(), (leftover % kFloatsPerVector) * sizeof(float)), - sum_hi); - sum_lo = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(result), sum_lo); - } else { - sum_lo = Q6_Vqf32_vadd_Vqf32Vqf32( - Q6_V_valign_VVR(Q6_V_lo_W(result), Q6_V_vzero(), leftover * sizeof(float)), sum_lo); - } - } - - return hexagon::vec_reduction_f32(Q6_Vqf32_vadd_Vqf32Vqf32(sum_hi, sum_lo)); -} - template struct get_data_type {}; template struct get_data_type { @@ -175,29 +20,26 @@ void mul_mat_impl(hexagon::tensor * src0, hexagon::tensor * src1, hexagon::tenso const bool is_quantized = hexagon::is_quantized_type(src0->get_type()); const auto src0_actual_row_size = hexagon::get_dequantized_row_size(src0); - auto * dequantize_row_func = hexagon::get_type_traits(src0->get_type()).dequantize_row; + auto * dequantize_row_func = hexagon::get_type_traits(src0->get_type()).to_float; if (is_quantized && dequantize_row_func == nullptr) { DEVICE_LOG_ERROR("Unsupported quantized src0 type: %d, dequantize_row_func is null\n", src0->get_type()); return; } - const auto r02 = src1->get_ne(2) / src0->get_ne(2); - const auto r03 = src1->get_ne(3) / src0->get_ne(3); - const auto * src0_ptr = reinterpret_cast(src0->get_read_buffer()); - const auto * src1_ptr = reinterpret_cast(src1->get_read_buffer()); - auto * dst_ptr = reinterpret_cast(dst->get_write_buffer()); - const auto total_planes = dst->get_ne(3) * dst->get_ne(2); + const auto r02 = src1->get_ne(2) / src0->get_ne(2); + const auto r03 = src1->get_ne(3) / src0->get_ne(3); + const auto total_planes = dst->get_ne(3) * dst->get_ne(2); auto start_end_plane = std::pair{ 0, total_planes }; auto start_end_row = std::pair{ 0, dst->get_ne(1) }; auto start_end_element = std::pair{ 0, dst->get_ne(0) }; - if (total_planes >= params->tcnt) { - start_end_plane = hexagon::get_thread_work_slice(total_planes, params->tidx, params->tcnt); - } else if (dst->get_ne(1) >= params->tcnt) { - start_end_row = hexagon::get_thread_work_slice(dst->get_ne(1), params->tidx, params->tcnt); + if (total_planes >= params->get_thread_count()) { + start_end_plane = params->get_work_slice(total_planes); + } else if (dst->get_ne(1) >= params->get_thread_count()) { + start_end_row = params->get_work_slice(dst->get_ne(1)); } else { - start_end_element = hexagon::get_thread_work_slice(dst->get_ne(0), params->tidx, params->tcnt); + start_end_element = params->get_work_slice(dst->get_ne(0)); } if (start_end_plane.second <= start_end_plane.first || start_end_row.second <= start_end_row.first || @@ -218,7 +60,7 @@ void mul_mat_impl(hexagon::tensor * src0, hexagon::tensor * src1, hexagon::tenso bool is_mem_cache = false; if (is_quantized) { src0_plane_slice_row_count = - std::min(params->vtcm_quota_size / src0_actual_row_size, src0_plane_slice_row_count); + std::min(params->get_vtcm_quota_size() / src0_actual_row_size, src0_plane_slice_row_count); src0_plane_cache_size = src0_actual_row_size * src0_plane_slice_row_count; src0_plane_cache_ptr = params->get_vtcm_cache(src0_plane_cache_size); if (src0_plane_cache_ptr == nullptr) { @@ -238,7 +80,17 @@ void mul_mat_impl(hexagon::tensor * src0, hexagon::tensor * src1, hexagon::tenso src0_plane_cache_size); const size_t valid_row_bytes = src1->get_ne(0) * sizeof(data_type); - DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_WITH_SUB_PROC(dst, params->tidx, dequant); + DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_WITH_SUB_PROC(dst, params->get_thread_index(), dequant); + + uint8_t * dst_ptr = 
dst->get_write_buffer(); + if (!dst_ptr) { + DEVICE_LOG_ERROR("mul_mat_impl: dst_ptr is not writable, tensor: %p, type: %s\n", (void *) dst, + hexagon::get_type_name(dst->get_type())); + return; + } + + const uint8_t * src0_ptr = src0->get_read_buffer(); + const uint8_t * src1_ptr = src1->get_read_buffer(); for (int64_t ip = start_end_plane.first; ip < start_end_plane.second; ip++) { const auto i3 = ip / dst->get_ne(2); const auto i2 = ip - i3 * dst->get_ne(2); @@ -289,6 +141,8 @@ void mul_mat_impl(hexagon::tensor * src0, hexagon::tensor * src1, hexagon::tenso } } } + + dst->release_write_buffer(); // mark the output tensor as modified } bool is_quantized_mul_mat_supported(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1) { @@ -299,7 +153,7 @@ bool is_quantized_mul_mat_supported(const npu_device_tensor_spec & src0, const n } const auto type_traits = hexagon::get_type_traits(src0.type); - if (!type_traits.is_quantized || type_traits.dequantize_row == nullptr) { + if (!type_traits.is_quantized || type_traits.to_float == nullptr) { DEVICE_LOG_DEBUG("[MUL_MAT]src0.type(%s) and src1.type(%s) mismatch and src0 is not quantized\n", hexagon::get_type_name(src0.type), hexagon::get_type_name(src1.type)); return false; @@ -311,7 +165,7 @@ bool is_quantized_mul_mat_supported(const npu_device_tensor_spec & src0, const n return false; } - const auto vtcm_thread_quota_size = hexagon::vtcm_mem::get_total_size() / hexagon::kMaxThreadCount; + const auto vtcm_thread_quota_size = hexagon::default_thread_pool::get_per_thread_vtcm_quota(); if (src0.ne[0] * sizeof(hexagon::dequantized_element_type) > vtcm_thread_quota_size) { DEVICE_LOG_DEBUG("[MUL_MAT]src0.type(%s) ne[0] is too large: %ld, vtcm_thread_quota_size: %zu\n", hexagon::get_type_name(src0.type), (long) src0.ne[0], vtcm_thread_quota_size); @@ -339,14 +193,13 @@ bool mul_mat_f32(hexagon::tensor * out, compute_params * params) { return true; // skip if no src } - // TODO: array? 
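// Illustration (not from the patch) of how the quantized mul_mat path above sizes
// its VTCM slice: each thread dequantizes at most as many src0 rows as fit in its
// per-thread VTCM quota, and support is rejected earlier if even one dequantized row
// would not fit. The real quota comes from
// default_thread_pool::get_per_thread_vtcm_quota(); the numbers below are made up.
#include <algorithm>
#include <cstddef>

static size_t rows_per_vtcm_slice(size_t vtcm_quota_bytes, size_t dequantized_row_bytes, size_t rows_in_plane) {
    if (dequantized_row_bytes == 0 || vtcm_quota_bytes < dequantized_row_bytes) {
        return 0;  // would already be rejected by is_quantized_mul_mat_supported
    }
    return std::min(vtcm_quota_bytes / dequantized_row_bytes, rows_in_plane);
}

// e.g. a 512 KB per-thread quota and 4096-float rows (16 KB dequantized) give
// rows_per_vtcm_slice(512 * 1024, 16 * 1024, 128) == 32 rows cached per pass.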
switch (src1->get_type()) { case NPU_DATA_TYPE_F32: - mul_mat_impl(src0, src1, out, params); + mul_mat_impl(src0, src1, out, params); return true; case NPU_DATA_TYPE_F16: - mul_mat_impl(src0, src1, out, params); + mul_mat_impl(src0, src1, out, params); return true; default: break; @@ -356,18 +209,25 @@ bool mul_mat_f32(hexagon::tensor * out, compute_params * params) { return false; } -bool is_mul_mat_supported(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1, - const npu_device_tensor_spec & dst, npu_device_tensor_op op) { +bool is_mul_mat_supported(npu_device_tensor_op op, const npu_device_tensor_spec * dst, + const npu_device_tensor_spec * srcs, size_t src_len) { if (op != NPU_OP_MUL_MAT) { DEVICE_LOG_DEBUG("op is not MUL_MAT: %d\n", op); return false; } - if (dst.type != NPU_DATA_TYPE_F32) { - DEVICE_LOG_DEBUG("[%s]dst type is not F32: %s\n", op_get_name(op), get_type_name(dst.type)); + if (!dst || !srcs || src_len < 2) { + DEVICE_LOG_DEBUG("[%s]invalid dst or srcs\n", hexagon::op_get_name(op)); + return false; + } + + if (dst->type != NPU_DATA_TYPE_F32) { + DEVICE_LOG_DEBUG("[%s]dst type is not F32: %s\n", op_get_name(op), get_type_name(dst->type)); return false; } + const auto & src0 = srcs[0]; + const auto & src1 = srcs[1]; if (src0.type != src1.type) { #ifdef GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS if (!is_quantized_mul_mat_supported(src0, src1)) { @@ -380,15 +240,15 @@ bool is_mul_mat_supported(const npu_device_tensor_spec & src0, const npu_device_ #endif } - if (src0.ne[0] != src1.ne[0] || src0.ne[1] != dst.ne[0]) { + if (src0.ne[0] != src1.ne[0] || src0.ne[1] != dst->ne[0]) { DEVICE_LOG_DEBUG("[%s]src0 and src1 cannot multiply: %ldx%ld vs %ldx%ld\n", op_get_name(op), (long) src0.ne[0], (long) src0.ne[1], (long) src1.ne[0], (long) src1.ne[1]); return false; } - if (src1.ne[1] != dst.ne[1] || src1.ne[2] != dst.ne[2] || src1.ne[3] != dst.ne[3]) { + if (src1.ne[1] != dst->ne[1] || src1.ne[2] != dst->ne[2] || src1.ne[3] != dst->ne[3]) { DEVICE_LOG_DEBUG("[%s]src1 and dst dimensions not match: %ldx%ld vs %ldx%ld\n", op_get_name(op), - (long) src1.ne[2], (long) src1.ne[3], (long) dst.ne[2], (long) dst.ne[3]); + (long) src1.ne[2], (long) src1.ne[3], (long) dst->ne[2], (long) dst->ne[3]); return false; } diff --git a/ggml/src/ggml-qnn/npu/device/op_mul_mat.hpp b/ggml/src/ggml-qnn/npu/device/op_mul_mat.hpp index 8cf41e0a99d86..434406f9301f9 100644 --- a/ggml/src/ggml-qnn/npu/device/op_mul_mat.hpp +++ b/ggml/src/ggml-qnn/npu/device/op_mul_mat.hpp @@ -7,64 +7,8 @@ namespace hexagon { -inline size_t unaligned_bytes(const void * addr) { - return ((size_t) addr) & kAlignMask; -} - -inline bool is_addr_aligned(void * addr) { - return unaligned_bytes(addr) == 0; -} - -inline void l2fetch(const void * p, uint32_t stride, uint32_t width, uint32_t height, uint32_t dir) { - uint64_t control = HEXAGON_V64_CREATE_H(dir, stride, width, height); - __asm__ __volatile__(" l2fetch(%0,%1) " : : "r"(p), "r"(control)); -} - -inline void l2fetch_row(const uint8_t * curr_row, size_t bytes) { - // TODO: should we use small kL2FetchAheadVectors? 
- int32_t l2fetch_vectors = Q6_R_min_RR(bytes / kBytesPerVector, kL2FetchAheadVectors); - hexagon::l2fetch(curr_row, kBytesPerVector, kBytesPerVector, l2fetch_vectors, 0); -} - -inline float get_flt0_from_fltv(HVX_Vector vect) { - // See also: tools\HEXAGON_Tools\8.6.07\Examples\StandAlone_Applications\QFloat\QFloat.c - - union { - int32_t i; - float f; - } cvt; - - cvt.i = vect[0]; - return cvt.f; -} - -inline HVX_Vector vec_reduction_qf32(HVX_Vector sums) { - constexpr const size_t kFloatsPerVector = hexagon::kBytesPerVector / sizeof(float); - static_assert(kFloatsPerVector == 32 || kFloatsPerVector == 16, "kFloatsPerVector should be 16 or 32"); - - // TODO: do we have a better way to do the reduction? - switch (kFloatsPerVector) { - default: - case 32: - sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 16 * sizeof(float))); - // fallthrough - case 16: - sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 8 * sizeof(float))); - sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 4 * sizeof(float))); - sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 2 * sizeof(float))); - sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, sizeof(float))); - break; - } - - return sums; -} - -inline float vec_reduction_f32(HVX_Vector sums) { - return hexagon::get_flt0_from_fltv(Q6_Vsf_equals_Vqf32(vec_reduction_qf32(sums))); -} - bool mul_mat_f32(tensor * out, compute_params * params); -bool is_mul_mat_supported(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1, - const npu_device_tensor_spec & dst, npu_device_tensor_op op); +bool is_mul_mat_supported(npu_device_tensor_op op, const npu_device_tensor_spec * dst, + const npu_device_tensor_spec * srcs, size_t src_len); } // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/device/op_types.hpp b/ggml/src/ggml-qnn/npu/device/op_types.hpp index 153bbab058b89..bad83ad95e452 100644 --- a/ggml/src/ggml-qnn/npu/device/op_types.hpp +++ b/ggml/src/ggml-qnn/npu/device/op_types.hpp @@ -9,46 +9,12 @@ #include "hexagon_npu.h" #include "tensor.hpp" +#include "thread_pool.hpp" #include "util.hpp" -#include "vtcm_mem.hpp" +#include "vec_ops.hpp" namespace hexagon { -struct compute_params { - const size_t tidx; - const size_t tcnt; - const size_t vtcm_quota_size; - const float * f16_to_f32_table; - std::unique_ptr vtcm_cache; - std::unique_ptr mem_cache; - size_t mem_cache_size = 0; - - uint8_t * get_vtcm_cache(size_t size) { - if (!vtcm_cache || vtcm_cache->get_size() < size) { - vtcm_cache = std::make_unique(size, false); - } - - if (!vtcm_cache->is_valid()) { - return nullptr; - } - - return vtcm_cache->get_mem(); - } - - uint8_t * get_mem_cache(size_t size) { - if (!mem_cache || mem_cache_size < size) { - mem_cache = std::make_unique(size + 256); - mem_cache_size = mem_cache ? 
size : 0; - } - - return mem_cache.get(); - } -}; - -typedef bool (*compute_func_type)(tensor * dst, compute_params * params); -typedef bool (*op_is_supported_func_type)(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1, - const npu_device_tensor_spec & dst, npu_device_tensor_op op); - inline constexpr std::pair get_thread_work_slice(int64_t total, size_t tidx, size_t tcnt) { if (total <= 0 || tidx >= tcnt) { return { 0, 0 }; // No work for this thread @@ -72,9 +38,27 @@ inline constexpr std::pair get_thread_work_slice(int64_t total return { start, std::min(end, total) }; } -constexpr const size_t kBytesPerVector = sizeof(HVX_Vector); // 128 for v73 -constexpr const size_t kAlignMask = kBytesPerVector - 1; -constexpr const size_t kL2CacheSize = 8 * 1024; // // 8KB L2 cache -constexpr const size_t kL2FetchAheadVectors = kL2CacheSize / kBytesPerVector; +struct compute_params { + default_thread_pool::thread_params * const thread_params; + const float * f16_to_f32_table; + + uint8_t * get_vtcm_cache(size_t size) { return thread_params->get_vtcm_cache(size); } + + uint8_t * get_mem_cache(size_t size) { return thread_params->get_mem_cache(size); } + + std::pair get_work_slice(int64_t total) const { + return get_thread_work_slice(total, thread_params->tidx, thread_params->tcnt); + } + + size_t get_vtcm_quota_size() const { return thread_params->vtcm_quota_size; } + + size_t get_thread_count() const { return thread_params->tcnt; } + + size_t get_thread_index() const { return thread_params->tidx; } +}; + +typedef bool (*compute_func_type)(tensor * dst, compute_params * params); +typedef bool (*op_is_supported_func_type)(npu_device_tensor_op op, const npu_device_tensor_spec * dst, + const npu_device_tensor_spec * srcs, size_t src_len); } // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/device/quants.cpp b/ggml/src/ggml-qnn/npu/device/quants.cpp deleted file mode 100644 index 67e77c2fc2a2a..0000000000000 --- a/ggml/src/ggml-qnn/npu/device/quants.cpp +++ /dev/null @@ -1,213 +0,0 @@ -#include "quants.hpp" - -#include - -#include - -#include "op_types.hpp" // TODO: remove this include - -static_assert(sizeof(npu_device_block_q4_K) == - 2 * sizeof(npu_device_fp16_t) + QUANT_K_SCALE_SIZE + QUANT_K_BLOCK_SIZE / 2, - "wrong q4_K block size/padding"); - -static_assert(sizeof(npu_device_block_q4_0) == sizeof(npu_device_fp16_t) + QUANT_BLOCK_SIZE / 2, - "wrong q4_0 block size/padding"); - -static_assert(sizeof(npu_device_block_q8_0) == sizeof(npu_device_fp16_t) + QUANT_BLOCK_SIZE, - "wrong q8_0 block size/padding"); - -namespace { - -inline HVX_Vector vmemu(const void * unaligned_ptr) { - HVX_Vector ret = *reinterpret_cast(unaligned_ptr); - return ret; -} - -inline float to_float(const npu_device_fp16_t src) { - return reinterpret_cast(src); -} - -template inline HVX_Vector load_block_generic(const _TBlock & src) { - uint8_t buffer[hexagon::kBytesPerVector]; - - static_assert(sizeof(buffer) == sizeof(HVX_Vector), "wrong cvt size/padding"); - static_assert(sizeof(buffer) >= sizeof(src.qs), "wrong q4_0 block size/padding"); - - memcpy(&buffer[0], src.qs, sizeof(src.qs)); - return *reinterpret_cast(buffer); -} - -template inline HVX_Vector load_dual_block_generic(const _TBlock & src1, const _TBlock & src2) { - uint8_t buffer[hexagon::kBytesPerVector]; - - static_assert(sizeof(buffer) == sizeof(HVX_Vector), "wrong cvt size/padding"); - static_assert(sizeof(buffer) >= sizeof(src1.qs) * 2, "wrong q4_0 block size/padding"); - - memcpy(&buffer[0], src1.qs, sizeof(src1.qs)); - 
memcpy(&buffer[sizeof(src1.qs)], src2.qs, sizeof(src2.qs)); - return *reinterpret_cast(buffer); -} - -inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t * d, uint8_t * m) { - if (j < 4) { - *d = q[j] & 63; - *m = q[j + 4] & 63; - } else { - *d = (q[j + 4] & 0xF) | ((q[j - 4] >> 6) << 4); - *m = (q[j + 4] >> 4) | ((q[j - 0] >> 6) << 4); - } -} - -void dequantize_row_q8_0(const void * src, float * dst, size_t count, const float * f16_to_f32_table) { - constexpr const int qk = QUANT_BLOCK_SIZE; - static_assert(QUANT_BLOCK_SIZE == hexagon::kBytesPerVector / sizeof(float)); - - const int nb = count / qk; - const auto * src_ptr = reinterpret_cast(src); - HVX_UVector * out = ((HVX_UVector *) dst); // TODO: opt for aligned access - - for (int i = 0; i < nb; i++) { - const auto & src = src_ptr[i]; - HVX_Vector d = Q6_Vh_vsplat_R(src.d); - - HVX_Vector q_lo = load_block_generic(src); - HVX_VectorPair q = Q6_Wh_vunpack_Vb(q_lo); - q = Q6_Wh_vunpack_Vb(Q6_V_lo_W(q)); - q_lo = Q6_Vhf_equals_Vh(Q6_V_lo_W(q)); - q = Q6_Wqf32_vmpy_VhfVhf(q_lo, d); - out[i] = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(q)); - } -} - -void dequantize_row_q4_0(const void * src, float * dst, size_t count, const float * f16_to_f32_table) { - constexpr const int qk = QUANT_BLOCK_SIZE; - static_assert(qk % 2 == 0, "qk must be even"); - static_assert(QUANT_BLOCK_SIZE == hexagon::kBytesPerVector / sizeof(float)); - constexpr const uint32_t kSizeOfQs = sizeof(npu_device_block_q4_0::qs); - - const int nb = count / qk; - const auto * src_ptr = reinterpret_cast(src); - HVX_Vector mask = Q6_Vb_vsplat_R(0x0F); - HVX_Vector minus = Q6_Vb_vsplat_R(8); - HVX_UVector * out = ((HVX_UVector *) dst); // TODO: opt for aligned access - - const int loop_count = nb - (nb % 2); - for (int i = 0; i < loop_count; i += 2) { - const auto & src1 = src_ptr[i]; - const auto & src2 = src_ptr[i + 1]; - - HVX_Vector d1 = Q6_Vh_vsplat_R(src1.d); - HVX_Vector d2 = Q6_Vh_vsplat_R(src2.d); - d1 = Q6_V_valign_VVR(d1, Q6_V_vzero(), hexagon::kBytesPerVector / 2); - d1 = Q6_V_valign_VVR(d2, d1, hexagon::kBytesPerVector / 2); - HVX_Vector d = Q6_Vh_vshuff_Vh(d1); - - HVX_Vector q_lo = load_dual_block_generic(src1, src2); - HVX_Vector q_hi = Q6_Vub_vlsr_VubR(q_lo, 4); - HVX_VectorPair q = Q6_W_vshuff_VVR(q_hi, Q6_V_vand_VV(q_lo, mask), kSizeOfQs); - q_lo = Q6_V_valign_VVR(Q6_V_lo_W(q), Q6_V_vzero(), hexagon::kBytesPerVector / 2); - q_lo = Q6_V_valign_VVR(Q6_V_hi_W(q), q_lo, hexagon::kBytesPerVector / 2); - q_lo = Q6_Vb_vshuff_Vb(q_lo); - q_lo = Q6_Vb_vsub_VbVb(q_lo, minus); - q = Q6_Wh_vunpack_Vb(q_lo); - q_lo = Q6_Vhf_equals_Vh(Q6_V_lo_W(q)); - q = Q6_Wqf32_vmpy_VhfVhf(q_lo, d); - out[i] = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(q)); - out[i + 1] = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(q)); - } - - if (loop_count < nb) { - const auto & curr_blk = src_ptr[nb - 1]; - HVX_Vector d = Q6_Vh_vsplat_R(curr_blk.d); - - HVX_Vector q_lo = load_block_generic(curr_blk); - HVX_Vector q_hi = Q6_Vub_vlsr_VubR(q_lo, 4); - q_lo = Q6_V_valign_VVR(Q6_V_vand_VV(q_lo, mask), Q6_V_vzero(), sizeof(curr_blk.qs)); - q_lo = Q6_V_valign_VVR(q_hi, q_lo, hexagon::kBytesPerVector - sizeof(curr_blk.qs)); - q_lo = Q6_Vb_vsub_VbVb(q_lo, minus); - - HVX_VectorPair q = Q6_Wh_vunpack_Vb(q_lo); - q = Q6_Wh_vunpack_Vb(Q6_V_lo_W(q)); - q_lo = Q6_Vhf_equals_Vh(Q6_V_lo_W(q)); - q = Q6_Wqf32_vmpy_VhfVhf(q_lo, d); - out[nb - 1] = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(q)); - } -} - -void dequantize_row_q4_K(const void * src, float * dst, size_t count, const float * f16_to_f32_table) { - const int nb = count / QUANT_K_BLOCK_SIZE; - const 
auto * src_ptr = reinterpret_cast(src); - - // TODO: use intrinsics - for (int i = 0; i < nb; i++) { - const uint8_t * q = src_ptr[i].qs; - - const float d = f16_to_f32_table[src_ptr[i].d]; - const float min = f16_to_f32_table[src_ptr[i].dmin]; - - int is = 0; - uint8_t sc = 0; - uint8_t m = 0; - const auto * scales = src_ptr[i].scales; - for (int j = 0; j < QUANT_K_BLOCK_SIZE; j += 64) { - get_scale_min_k4(is + 0, scales, &sc, &m); - const float d1 = d * sc; - const float m1 = min * m; - get_scale_min_k4(is + 1, scales, &sc, &m); - const float d2 = d * sc; - const float m2 = min * m; - for (int l = 0; l < 32; ++l) { - dst[0] = d1 * (q[l] & 0xF) - m1; - dst[32] = d2 * ((q[l] >> 4) & 0xF) - m2; - dst++; - } - dst += 32; - q += 32; - is += 2; - } - } -} - -constexpr const hexagon::device_type_traits kDeviceTypeTraits[] = { - { NPU_DATA_TYPE_F32, "F32", 1, false, nullptr }, - { NPU_DATA_TYPE_F16, "F16", 1, false, nullptr }, - { NPU_DATA_TYPE_Q8_0, "Q8_0", QUANT_BLOCK_SIZE, true, dequantize_row_q8_0 }, - { NPU_DATA_TYPE_Q4_0, "Q4_0", QUANT_BLOCK_SIZE, true, dequantize_row_q4_0 }, - { NPU_DATA_TYPE_Q4_K, "Q4_K", QUANT_K_BLOCK_SIZE, true, dequantize_row_q4_K }, -}; - -static_assert(std::size(kDeviceTypeTraits) == NPU_DATA_TYPE_COUNT, - "kDeviceTypeTraits size mismatch with npu_device_tensor_data_type enum"); -static_assert(kDeviceTypeTraits[NPU_DATA_TYPE_F32].type == NPU_DATA_TYPE_F32, - "kDeviceTypeTraits F32 type mismatch with npu_device_tensor_data_type enum"); -static_assert(kDeviceTypeTraits[NPU_DATA_TYPE_F16].type == NPU_DATA_TYPE_F16, - "kDeviceTypeTraits F16 type mismatch with npu_device_tensor_data_type enum"); -static_assert(kDeviceTypeTraits[NPU_DATA_TYPE_Q8_0].type == NPU_DATA_TYPE_Q8_0, - "kDeviceTypeTraits Q8_0 type mismatch with npu_device_tensor_data_type enum"); -static_assert(kDeviceTypeTraits[NPU_DATA_TYPE_Q4_0].type == NPU_DATA_TYPE_Q4_0, - "kDeviceTypeTraits Q4_0 type mismatch with npu_device_tensor_data_type enum"); -static_assert(kDeviceTypeTraits[NPU_DATA_TYPE_Q4_K].type == NPU_DATA_TYPE_Q4_K, - "kDeviceTypeTraits Q4_K type mismatch with npu_device_tensor_data_type enum"); - -} // namespace - -namespace hexagon { - -bool init_f16_f32_table(float * table, size_t count) { - constexpr const size_t kTableSize = (1U << 16); - if (count < kTableSize) { - return false; - } - - for (size_t i = 0; i < count; ++i) { - table[i] = to_float(i); - } - - return true; -} - -const device_type_traits & get_type_traits(npu_device_tensor_data_type type) { - return kDeviceTypeTraits[type]; -} - -} // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/device/tensor.hpp b/ggml/src/ggml-qnn/npu/device/tensor.hpp index 7e980d8402fb2..bad260e5e50c2 100644 --- a/ggml/src/ggml-qnn/npu/device/tensor.hpp +++ b/ggml/src/ggml-qnn/npu/device/tensor.hpp @@ -3,6 +3,8 @@ #include #include +#include + #include "hexagon_npu.h" #include "util.hpp" @@ -23,7 +25,7 @@ class tensor { } _data = static_cast(mmap_address); - DEVICE_LOG_INFO("tensor(%p[%ldx%ldx%ldx%ld]), fd: %d, offset: %zu, mmap_address: %p, phy_address: 0x%lx\n", + DEVICE_LOG_INFO("tensor(%p[%ldx%ldx%ldx%ld]), fd: %d, offset: %zu, mmap_addr: %p, phy_addr: 0x%lx\n", (void *) this, (long) _info.ne[0], (long) _info.ne[1], (long) _info.ne[2], (long) _info.ne[3], _info.buffer_fd, _info.offset, (void *) mmap_address, phy_address); } @@ -47,14 +49,14 @@ class tensor { void invalidate() const { if (_data) { qurt_mem_cache_clean((qurt_addr_t) (_data + _info.offset), (qurt_size_t) _info.size, - QURT_MEM_CACHE_INVALIDATE, QURT_MEM_DCACHE); + 
QURT_MEM_CACHE_FLUSH_INVALIDATE_ALL, QURT_MEM_DCACHE); } } void update_config(const npu_device_tensor_update_config & config) { static_assert(sizeof(_op_params) == sizeof(config.params), "op params size mismatch"); - _info.op = config.op; + _op_type = config.op; memcpy(_op_params, config.params, sizeof(_op_params)); for (size_t i = 0; i < DEVICE_TENSOR_MAX_SRC; ++i) { auto src_handle = config.src_handles[i]; @@ -76,7 +78,12 @@ class tensor { const size_t get_nb(size_t index) const { return _info.nb[index]; } - npu_device_tensor_op get_op() const { return _info.op; } + const bool is_permuted() const { + // Check if the tensor is permuted by comparing the nb values + return is_transposed_or_permuted(_info.nb); + } + + npu_device_tensor_op get_op() const { return _op_type; } template const _TyParam get_op_param(size_t index) const { static_assert(sizeof(_TyParam) <= sizeof(_op_params), "_op_param type size exceeds op params size"); @@ -95,19 +102,34 @@ class tensor { npu_device_tensor_data_type get_type() const { return _info.type; } const uint8_t * get_read_buffer() const { - invalidate(); + if (!_info.is_constant && _has_modified) { + invalidate(); + const_cast(this)->_has_modified = false; // TODO: avoid const_cast + } + + return _data + _info.offset; + } + + uint8_t * get_write_buffer() const { + if (_info.is_constant) { + DEVICE_LOG_ERROR("Attempt to write to a constant tensor: %p", (void *) this); + return nullptr; // Do not allow writing to constant tensors + } + return _data + _info.offset; } - uint8_t * get_write_buffer() const { return _data + _info.offset; } + void release_write_buffer() { _has_modified = true; } bool is_valid() const { return _data != nullptr; } private: npu_device_tensor_config _info = {}; + npu_device_tensor_op _op_type = NPU_OP_COUNT; int32_t _op_params[kMaxParamsCount] = {}; tensor * _src[kMaxTensorSrc] = {}; uint8_t * _data = nullptr; + std::atomic_bool _has_modified = false; DISABLE_COPY_AND_MOVE(tensor); }; diff --git a/ggml/src/ggml-qnn/npu/device/thread_pool.hpp b/ggml/src/ggml-qnn/npu/device/thread_pool.hpp index 9a525213c9fad..9661c006707c3 100644 --- a/ggml/src/ggml-qnn/npu/device/thread_pool.hpp +++ b/ggml/src/ggml-qnn/npu/device/thread_pool.hpp @@ -8,6 +8,7 @@ #include #include "util.hpp" +#include "vtcm_mem.hpp" namespace hexagon { @@ -78,28 +79,65 @@ template class qurt_thread { using qurt_thread_ptr = std::unique_ptr>; -template class thread_pool { - static_assert(_thread_count > 1, "Thread count must be greater than 1"); - constexpr const static size_t kMaxSubThreadCount = _thread_count - 1; +template class thread_pool { + static_assert(_ThreadCount > 1, "Thread count must be greater than 1"); + constexpr const static size_t kMaxThreadCount = _ThreadCount; + constexpr const static size_t kMaxSubThreadCount = _ThreadCount - 1; public: typedef qurt_thread thread_type; - typedef void (*task_type)(thread_pool * pool, size_t thread_idx, size_t thread_count, void * arg); + + struct thread_params { + size_t tidx; + const size_t tcnt = kMaxThreadCount; + thread_pool * pool = nullptr; + size_t vtcm_quota_size; + + std::unique_ptr vtcm_cache; + std::unique_ptr mem_cache; + size_t mem_cache_size = 0; + + uint8_t * get_vtcm_cache(size_t size) { + if (!vtcm_cache || vtcm_cache->get_size() < size) { + DEVICE_SCOPED_PERFORMANCE_TRACKER("[thread_params]get_vtcm_cache, size: %zu, tidx: %zu", size, tidx); + vtcm_cache.reset(); // reset the cache to create a new one + vtcm_cache = std::make_unique(size, false); + } + + if (!vtcm_cache->is_valid()) { + return 
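// Illustrative aside, not part of the patch: the tensor changes above add a dirty flag so
// get_read_buffer() only issues a dcache invalidate when some producer called
// release_write_buffer() since the last read, and never for constant tensors. A minimal
// host-side model of that state machine; an int counter stands in for qurt_mem_cache_clean().
#include <cstdio>

struct dirty_tracked_buffer {
    char data[16]      = {};
    bool modified      = false;
    bool is_constant   = false;
    int  invalidations = 0;

    const char * get_read_buffer() {
        if (!is_constant && modified) {
            ++invalidations; // stands in for the cache invalidate in the real tensor class
            modified = false;
        }
        return data;
    }
    char * get_write_buffer() { return is_constant ? nullptr : data; }
    void   release_write_buffer() { modified = true; }
};

int main() {
    dirty_tracked_buffer t;
    t.get_read_buffer();              // clean read: no invalidation needed
    char * w = t.get_write_buffer();  // producer obtains the buffer
    if (w) w[0] = 42;                 // ... and writes into it
    t.release_write_buffer();         // mark the tensor dirty
    t.get_read_buffer();              // consumer: exactly one invalidation
    t.get_read_buffer();              // already clean again
    std::printf("invalidations: %d\n", t.invalidations); // prints 1
    return 0;
}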
nullptr; + } + + return vtcm_cache->get_mem(); + } + + uint8_t * get_mem_cache(size_t size) { + if (!mem_cache || mem_cache_size < size) { + mem_cache.reset(); // reset the cache to create a new one + mem_cache = std::make_unique(size + 256); + mem_cache_size = mem_cache ? size : 0; + } + + return mem_cache.get(); + } + }; + + typedef void (*task_type)(thread_pool * pool, thread_params * param, void * arg); thread_pool() { - std::string thread_name_base = "thread_pool_"; + for (size_t i = 0; i < kMaxThreadCount; ++i) { + _thread_params[i].tidx = i; + _thread_params[i].vtcm_quota_size = hexagon::vtcm_mem::get_avail_block_size() / kMaxThreadCount; + _thread_params[i].pool = this; + } + qurt_barrier_init(&_pending, kMaxSubThreadCount + 1); qurt_barrier_init(&_completed, kMaxSubThreadCount + 1); - const auto priority = qurt_thread_get_priority(qurt_thread_get_id()); + const auto priority = qurt_thread_get_priority(qurt_thread_get_id()); + std::string thread_name_base = "thread_pool_"; for (size_t i = 0; i < kMaxSubThreadCount; ++i) { - auto & thread_arg = _thread_args[i]; - thread_arg.pool = this; - thread_arg.thread_idx = i + 1; - auto thread = std::make_unique( - thread_name_base + std::to_string(i), - reinterpret_cast(&thread_pool::thread_func_impl), &thread_arg, - priority); + thread_name_base + std::to_string(i), &thread_pool::thread_func_impl, &_thread_params[i + 1], priority); if (!thread->is_valid()) { DEVICE_LOG_ERROR("Failed to create thread: %zu", i); // destroy all barriers and threads at destructor @@ -108,6 +146,7 @@ template class thread_pool { _threads[i] = std::move(thread); } + DEVICE_LOG_DEBUG("thread_pool.created: %zu", kMaxSubThreadCount); } @@ -130,60 +169,85 @@ template class thread_pool { return false; } +#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING + _task_begin_cycles = HAP_perf_get_qtimer_count(); +#endif + _task = task; _arg = arg; qurt_barrier_wait(&_pending); - task(this, 0, kMaxSubThreadCount + 1, arg); + task(this, &_thread_params[0], arg); DEVICE_LOG_DEBUG("main_thread.task_completed: 0"); qurt_barrier_wait(&_completed); _task = nullptr; _arg = nullptr; + +#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING + _task_begin_cycles = 0; +#endif + return true; } void sync_thread() { qurt_barrier_wait(&_completed); } - private: - struct thread_pool_arg { - thread_pool * pool = nullptr; - size_t thread_idx = 0; - }; + static size_t get_per_thread_vtcm_quota() { return vtcm_mem::get_total_size() / kMaxThreadCount; } - static void thread_func_impl(thread_type * thread, thread_pool_arg * arg) { + private: + static void thread_func_impl(thread_type * thread, void * arg) { NPU_UNUSED(thread); - DEVICE_LOG_DEBUG("thread_func_impl.start: %zu", arg->thread_idx); + auto * param = reinterpret_cast(arg); - auto & pool = *arg->pool; + DEVICE_LOG_DEBUG("thread_func_impl.start: %zu", param->tidx); + + auto & pool = *(param->pool); for (;;) { qurt_barrier_wait(&pool._pending); if (pool._thread_exit) { - DEVICE_LOG_DEBUG("thread_func_impl.exit: %zu", arg->thread_idx); + DEVICE_LOG_DEBUG("thread_func_impl.exit: %zu", param->tidx); break; } +#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING + auto task_begin_cycles = pool._task_begin_cycles.load(); + DEVICE_LOG_WARN("[profiler]worker_thread, tidx: %zu, prepare: %lluus", param->tidx, + static_cast( + HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - task_begin_cycles))); +#endif + auto task = pool._task; if (task) { - task(arg->pool, arg->thread_idx, kMaxSubThreadCount + 1, pool._arg); + task(param->pool, param, pool._arg); } - 
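// Illustrative aside, not part of the patch: the pool above publishes one task to every worker
// through a "pending" barrier, runs slice 0 of the task on the calling thread, and joins on a
// "completed" barrier; each thread_params carries its tidx so the task can slice the work.
// A portable C++20 model of that fork/join shape (std::barrier stands in for qurt_barrier_t,
// 4 threads stand in for _ThreadCount):
#include <atomic>
#include <barrier>
#include <cstdio>
#include <thread>
#include <vector>

int main() {
    constexpr size_t kThreadCount = 4;
    std::barrier pending(kThreadCount), completed(kThreadCount);
    std::atomic<int> accumulated{0};

    auto task = [&](size_t tidx) {
        // each thread handles rows r with r % kThreadCount == tidx, like tidx/tcnt slicing
        int partial = 0;
        for (int r = 0; r < 100; ++r) {
            if (r % kThreadCount == tidx) partial += r;
        }
        accumulated += partial;
    };

    std::vector<std::thread> workers;
    for (size_t i = 1; i < kThreadCount; ++i) {
        workers.emplace_back([&, i] {
            pending.arrive_and_wait();   // wait until the main thread publishes the task
            task(i);
            completed.arrive_and_wait(); // signal completion
        });
    }

    pending.arrive_and_wait();           // "sync_execute": release the workers
    task(0);                             // the calling thread runs slice 0 itself
    completed.arrive_and_wait();         // join the phase

    for (auto & w : workers) w.join();
    std::printf("sum 0..99 = %d\n", accumulated.load()); // prints 4950
    return 0;
}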
DEVICE_LOG_DEBUG("thread_func_impl.task_completed: %zu", arg->thread_idx); + DEVICE_LOG_DEBUG("thread_func_impl.task_completed: %zu", param->tidx); qurt_barrier_wait(&pool._completed); + +#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING + DEVICE_LOG_WARN("[profiler]worker_thread, tidx: %zu, task_end: %lluus", param->tidx, + static_cast( + HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - task_begin_cycles))); +#endif } - DEVICE_LOG_DEBUG("thread_func_impl.end: %zu", arg->thread_idx); + DEVICE_LOG_DEBUG("thread_func_impl.end: %zu", param->tidx); } std::atomic_bool _thread_exit = false; std::array _threads; - thread_pool_arg _thread_args[kMaxSubThreadCount] = {}; - qurt_barrier_t _pending = {}; - qurt_barrier_t _completed = {}; - task_type _task = nullptr; - void * _arg = nullptr; + qurt_barrier_t _pending = {}; + qurt_barrier_t _completed = {}; + thread_params _thread_params[kMaxThreadCount] = {}; + task_type _task = nullptr; + void * _arg = nullptr; + +#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING + std::atomic _task_begin_cycles = 0; +#endif DISABLE_COPY_AND_MOVE(thread_pool); }; diff --git a/ggml/src/ggml-qnn/npu/device/type_traits.cpp b/ggml/src/ggml-qnn/npu/device/type_traits.cpp new file mode 100644 index 0000000000000..87d361819d620 --- /dev/null +++ b/ggml/src/ggml-qnn/npu/device/type_traits.cpp @@ -0,0 +1,467 @@ +#include "type_traits.hpp" + +#include + +#include + +#include "op_types.hpp" // TODO: remove this include +#include "vec_ops.hpp" + +static_assert(sizeof(npu_device_block_q4_k) == + 2 * sizeof(npu_device_fp16_t) + QUANT_K_SCALE_SIZE + QUANT_K_BLOCK_SIZE / 2, + "wrong q4_K block size/padding"); + +static_assert(sizeof(npu_device_block_q4_0) == sizeof(npu_device_fp16_t) + QUANT_BLOCK_SIZE / 2, + "wrong q4_0 block size/padding"); + +static_assert(sizeof(npu_device_block_q8_0) == sizeof(npu_device_fp16_t) + QUANT_BLOCK_SIZE, + "wrong q8_0 block size/padding"); + +namespace { + +inline float to_float(const npu_device_fp16_t src) { + return reinterpret_cast(src); +} + +inline npu_device_fp16_t to_fp16(const float src) { + __fp16 f16_value = static_cast<__fp16>(src); + return reinterpret_cast(f16_value); +} + +template inline HVX_Vector load_block_generic(const _TBlock & src) { + uint8_t buffer[hexagon::kBytesPerVector]; + + static_assert(sizeof(buffer) == sizeof(HVX_Vector), "wrong cvt size/padding"); + static_assert(sizeof(buffer) >= sizeof(src.qs), "wrong q4_0 block size/padding"); + + memcpy(&buffer[0], src.qs, sizeof(src.qs)); + return *reinterpret_cast(buffer); +} + +template inline HVX_Vector load_dual_block_generic(const _TBlock & src1, const _TBlock & src2) { + uint8_t buffer[hexagon::kBytesPerVector]; + + static_assert(sizeof(buffer) == sizeof(HVX_Vector), "wrong cvt size/padding"); + static_assert(sizeof(buffer) >= sizeof(src1.qs) * 2, "wrong q4_0 block size/padding"); + + memcpy(&buffer[0], src1.qs, sizeof(src1.qs)); + memcpy(&buffer[sizeof(src1.qs)], src2.qs, sizeof(src2.qs)); + return *reinterpret_cast(buffer); +} + +inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t * d, uint8_t * m) { + if (j < 4) { + *d = q[j] & 63; + *m = q[j + 4] & 63; + } else { + *d = (q[j + 4] & 0xF) | ((q[j - 4] >> 6) << 4); + *m = (q[j + 4] >> 4) | ((q[j - 0] >> 6) << 4); + } +} + +inline int nearest_int(float fval) { + float val = fval + 12582912.f; + int i = reinterpret_cast(val); + return (i & 0x007fffff) - 0x00400000; +} + +float make_qkx2_quants(int n, int nmax, const float * x, const float * weights, uint8_t * L, float * the_min, + uint8_t * Laux, float rmin, 
float rdelta, int nstep, bool use_mad) { + float min = x[0]; + float max = x[0]; + float sum_w = weights[0]; + float sum_x = sum_w * x[0]; + for (int i = 1; i < n; ++i) { + if (x[i] < min) { + min = x[i]; + } + if (x[i] > max) { + max = x[i]; + } + float w = weights[i]; + sum_w += w; + sum_x += w * x[i]; + } + if (min > 0) { + min = 0; + } + if (max == min) { + for (int i = 0; i < n; ++i) { + L[i] = 0; + } + *the_min = -min; + return 0.f; + } + float iscale = nmax / (max - min); + float scale = 1 / iscale; + float best_mad = 0; + for (int i = 0; i < n; ++i) { + int l = nearest_int(iscale * (x[i] - min)); + L[i] = std::max(0, std::min(nmax, l)); + float diff = scale * L[i] + min - x[i]; + diff = use_mad ? fabsf(diff) : diff * diff; + float w = weights[i]; + best_mad += w * diff; + } + if (nstep < 1) { + *the_min = -min; + return scale; + } + for (int is = 0; is <= nstep; ++is) { + iscale = (rmin + rdelta * is + nmax) / (max - min); + float sum_l = 0, sum_l2 = 0, sum_xl = 0; + for (int i = 0; i < n; ++i) { + int l = nearest_int(iscale * (x[i] - min)); + l = std::max(0, std::min(nmax, l)); + Laux[i] = l; + float w = weights[i]; + sum_l += w * l; + sum_l2 += w * l * l; + sum_xl += w * l * x[i]; + } + float D = sum_w * sum_l2 - sum_l * sum_l; + if (D > 0) { + float this_scale = (sum_w * sum_xl - sum_x * sum_l) / D; + float this_min = (sum_l2 * sum_x - sum_l * sum_xl) / D; + if (this_min > 0) { + this_min = 0; + this_scale = sum_xl / sum_l2; + } + float mad = 0; + for (int i = 0; i < n; ++i) { + float diff = this_scale * Laux[i] + this_min - x[i]; + diff = use_mad ? fabsf(diff) : diff * diff; + float w = weights[i]; + mad += w * diff; + } + if (mad < best_mad) { + for (int i = 0; i < n; ++i) { + L[i] = Laux[i]; + } + best_mad = mad; + scale = this_scale; + min = this_min; + } + } + } + *the_min = -min; + return scale; +} + +void quantize_row_fp16(const float * src, void * dst, size_t count, const float * f16_to_f32_table) { + auto * out = reinterpret_cast(dst); + // TODO: use hvx intrinsics for better performance + for (size_t i = 0; i < count; i++) { + out[i] = to_fp16(src[i]); + } +} + +void quantize_row_q8_0(const float * src, void * dst, size_t count, const float * f16_to_f32_table) { + const int nb = count / QUANT_BLOCK_SIZE; + auto * out = reinterpret_cast(dst); + + for (int i = 0; i < nb; i++) { + float amax = 0.0f; // absolute max + + for (int j = 0; j < QUANT_BLOCK_SIZE; j++) { + const float v = src[i * QUANT_BLOCK_SIZE + j]; + amax = std::max(amax, fabsf(v)); + } + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f / d : 0.0f; + + out[i].d = to_fp16(d); + + for (int j = 0; j < QUANT_BLOCK_SIZE; ++j) { + const float x0 = src[i * QUANT_BLOCK_SIZE + j] * id; + + out[i].qs[j] = roundf(x0); + } + } +} + +void quantize_row_q4_0(const float * src, void * dst, size_t count, const float * f16_to_f32_table) { + constexpr const int qk = QUANT_BLOCK_SIZE; + + const int nb = count / qk; + auto * out = reinterpret_cast(dst); + + for (int i = 0; i < nb; i++) { + float amax = 0.0f; // absolute max + float max = 0.0f; + + for (int j = 0; j < qk; j++) { + const float v = src[i * qk + j]; + if (amax < fabsf(v)) { + amax = fabsf(v); + max = v; + } + } + + const float d = max / -8; + const float id = d ? 
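// Illustrative aside, not part of the patch: quantize_row_q8_0() above stores one fp16 scale per
// 32-value block, d = amax / 127, plus 32 signed 8-bit codes q = round(x / d), and the matching
// dequantizer rebuilds x as q * d. A scalar round-trip showing the per-element error stays within
// half a quantization step (local names only; 32 matches QUANT_BLOCK_SIZE per the static_assert):
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>

int main() {
    constexpr int kBlock = 32;
    float  x[kBlock], rebuilt[kBlock];
    int8_t q[kBlock];

    for (int i = 0; i < kBlock; ++i) {
        x[i] = std::sin(0.3f * i) * 10.0f; // arbitrary test data in [-10, 10]
    }

    float amax = 0.0f;
    for (float v : x) amax = std::max(amax, std::fabs(v));

    const float d  = amax / 127.0f;          // one scale for the whole block
    const float id = d ? 1.0f / d : 0.0f;

    for (int i = 0; i < kBlock; ++i) q[i] = (int8_t) std::round(x[i] * id);
    for (int i = 0; i < kBlock; ++i) rebuilt[i] = q[i] * d;

    for (int i = 0; i < kBlock; ++i) {
        assert(std::fabs(rebuilt[i] - x[i]) <= 0.5f * d + 1e-4f); // quantization step is d
    }
    return 0;
}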
1.0f / d : 0.0f; + + out[i].d = to_fp16(d); + + for (int j = 0; j < qk / 2; ++j) { + const float x0 = src[i * qk + 0 + j] * id; + const float x1 = src[i * qk + qk / 2 + j] * id; + + const uint8_t xi0 = std::min(15, (x0 + 8.5f)); + const uint8_t xi1 = std::min(15, (x1 + 8.5f)); + + out[i].qs[j] = xi0; + out[i].qs[j] |= xi1 << 4; + } + } +} + +void quantize_row_q4_K(const float * src, void * dst, size_t count, const float * f16_to_f32_table) { + const int nb = count / QUANT_K_BLOCK_SIZE; + auto * out = reinterpret_cast(dst); + + uint8_t L[QUANT_K_BLOCK_SIZE]; + uint8_t Laux[32]; + float weights[32]; + float mins[QUANT_K_BLOCK_SIZE / 32]; + float scales[QUANT_K_BLOCK_SIZE / 32]; + + for (int i = 0; i < nb; i++) { + float max_scale = 0; // as we are deducting the min, scales are always positive + float max_min = 0; + for (int j = 0; j < QUANT_K_BLOCK_SIZE / 32; ++j) { + //scales[j] = make_qkx1_quants(32, 15, x + 32*j, L + 32*j, &mins[j], 9, 0.5f); + float sum_x2 = 0; + for (int l = 0; l < 32; ++l) { + sum_x2 += src[32 * j + l] * src[32 * j + l]; + } + float av_x = sqrtf(sum_x2 / 32); + for (int l = 0; l < 32; ++l) { + weights[l] = av_x + fabsf(src[32 * j + l]); + } + scales[j] = + make_qkx2_quants(32, 15, src + 32 * j, weights, L + 32 * j, &mins[j], Laux, -1.f, 0.1f, 20, false); + float scale = scales[j]; + if (scale > max_scale) { + max_scale = scale; + } + float min = mins[j]; + if (min > max_min) { + max_min = min; + } + } + + float inv_scale = max_scale > 0 ? 63.f / max_scale : 0.f; + float inv_min = max_min > 0 ? 63.f / max_min : 0.f; + for (int j = 0; j < QUANT_K_BLOCK_SIZE / 32; ++j) { + uint8_t ls = nearest_int(inv_scale * scales[j]); + uint8_t lm = nearest_int(inv_min * mins[j]); + ls = std::min(63, ls); + lm = std::min(63, lm); + if (j < 4) { + out[i].scales[j] = ls; + out[i].scales[j + 4] = lm; + } else { + out[i].scales[j + 4] = (ls & 0xF) | ((lm & 0xF) << 4); + out[i].scales[j - 4] |= ((ls >> 4) << 6); + out[i].scales[j - 0] |= ((lm >> 4) << 6); + } + } + out[i].d = to_fp16(max_scale / 63.f); + out[i].dmin = to_fp16(max_min / 63.f); + + uint8_t sc, m; + for (int j = 0; j < QUANT_K_BLOCK_SIZE / 32; ++j) { + get_scale_min_k4(j, out[i].scales, &sc, &m); + const float d = f16_to_f32_table[out[i].d] * sc; + if (!d) { + continue; + } + const float dm = f16_to_f32_table[out[i].dmin] * m; + for (int ii = 0; ii < 32; ++ii) { + int l = nearest_int((src[32 * j + ii] + dm) / d); + l = std::max(0, std::min(15, l)); + L[32 * j + ii] = l; + } + } + + uint8_t * q = out[i].qs; + for (int j = 0; j < QUANT_K_BLOCK_SIZE; j += 64) { + for (int l = 0; l < 32; ++l) { + q[l] = L[j + l] | (L[j + l + 32] << 4); + } + q += 32; + } + + src += QUANT_K_BLOCK_SIZE; + } +} + +void dequantize_row_q8_0(const void * src, float * dst, size_t count, const float * f16_to_f32_table) { + constexpr const int qk = QUANT_BLOCK_SIZE; + static_assert(QUANT_BLOCK_SIZE == hexagon::kBytesPerVector / sizeof(float)); + + const int nb = count / qk; + const auto * src_ptr = reinterpret_cast(src); + HVX_UVector * out = ((HVX_UVector *) dst); // TODO: opt for aligned access + + for (int i = 0; i < nb; i++) { + const auto & src = src_ptr[i]; + HVX_Vector d = Q6_Vh_vsplat_R(src.d); + + HVX_Vector q_lo = load_block_generic(src); + HVX_VectorPair q = Q6_Wh_vunpack_Vb(q_lo); + q = Q6_Wh_vunpack_Vb(Q6_V_lo_W(q)); + q_lo = Q6_Vhf_equals_Vh(Q6_V_lo_W(q)); + q = Q6_Wqf32_vmpy_VhfVhf(q_lo, d); + out[i] = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(q)); + } +} + +void dequantize_row_q4_0(const void * src, float * dst, size_t count, const float * 
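// Illustrative aside, not part of the patch: quantize_row_q4_0() above keeps one fp16 scale per
// 32-value block, chosen as d = max / -8 where max is the signed element of largest magnitude,
// and packs 4-bit codes two per byte: the lower nibble holds the first 16 elements, the upper
// nibble the last 16. Dequantization subtracts 8 from each nibble and multiplies by d, which is
// what the HVX path above does with vsub and vmpy. A scalar pack/unpack round-trip:
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>

int main() {
    constexpr int kBlock = 32;
    float   x[kBlock], rebuilt[kBlock];
    uint8_t qs[kBlock / 2];

    float absmax = 0.0f, max = 0.0f;
    for (int i = 0; i < kBlock; ++i) {
        x[i] = std::cos(0.7f * i) * 4.0f;
        if (std::fabs(x[i]) > absmax) { absmax = std::fabs(x[i]); max = x[i]; }
    }

    const float d  = max / -8.0f;            // sign flip maps the extreme value to code 0
    const float id = d ? 1.0f / d : 0.0f;

    for (int j = 0; j < kBlock / 2; ++j) {
        const uint8_t xi0 = (uint8_t) std::min(15.0f, x[j] * id + 8.5f);
        const uint8_t xi1 = (uint8_t) std::min(15.0f, x[j + kBlock / 2] * id + 8.5f);
        qs[j] = (uint8_t) (xi0 | (xi1 << 4));
    }

    for (int j = 0; j < kBlock / 2; ++j) {
        rebuilt[j]              = ((qs[j] & 0xF) - 8) * d; // lower nibble: first half of the block
        rebuilt[j + kBlock / 2] = ((qs[j] >> 4) - 8) * d;  // upper nibble: second half
    }

    for (int i = 0; i < kBlock; ++i) {
        assert(std::fabs(rebuilt[i] - x[i]) <= std::fabs(d) * 1.01f + 1e-4f);
    }
    return 0;
}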
f16_to_f32_table) { + constexpr const int qk = QUANT_BLOCK_SIZE; + static_assert(qk % 2 == 0, "qk must be even"); + static_assert(QUANT_BLOCK_SIZE == hexagon::kBytesPerVector / sizeof(float)); + constexpr const uint32_t kSizeOfQs = sizeof(npu_device_block_q4_0::qs); + + const int nb = count / qk; + const auto * src_ptr = reinterpret_cast(src); + HVX_Vector mask = Q6_Vb_vsplat_R(0x0F); + HVX_Vector minus = Q6_Vb_vsplat_R(8); + HVX_UVector * out = ((HVX_UVector *) dst); // TODO: opt for aligned access + + const int loop_count = nb - (nb % 2); + for (int i = 0; i < loop_count; i += 2) { + const auto & src1 = src_ptr[i]; + const auto & src2 = src_ptr[i + 1]; + + HVX_Vector d1 = Q6_Vh_vsplat_R(src1.d); + HVX_Vector d2 = Q6_Vh_vsplat_R(src2.d); + HVX_Vector d = Q6_Vh_vshuff_Vh(Q6_V_valign_VVR(d2, d1, hexagon::kBytesPerVector / 2)); + + HVX_Vector q_lo = load_dual_block_generic(src1, src2); + HVX_Vector q_hi = Q6_Vub_vlsr_VubR(q_lo, 4); + HVX_VectorPair q = Q6_W_vshuff_VVR(q_hi, Q6_V_vand_VV(q_lo, mask), kSizeOfQs); + q_lo = Q6_V_valign_VVR(Q6_V_lo_W(q), Q6_V_vzero(), hexagon::kBytesPerVector / 2); + q_lo = Q6_V_valign_VVR(Q6_V_hi_W(q), q_lo, hexagon::kBytesPerVector / 2); + q_lo = Q6_Vb_vshuff_Vb(q_lo); + q_lo = Q6_Vb_vsub_VbVb(q_lo, minus); + q = Q6_Wh_vunpack_Vb(q_lo); + q_lo = Q6_Vhf_equals_Vh(Q6_V_lo_W(q)); + q = Q6_Wqf32_vmpy_VhfVhf(q_lo, d); + out[i] = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(q)); + out[i + 1] = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(q)); + } + + if (loop_count < nb) { + const auto & curr_blk = src_ptr[nb - 1]; + HVX_Vector d = Q6_Vh_vsplat_R(curr_blk.d); + + HVX_Vector q_lo = load_block_generic(curr_blk); + HVX_Vector q_hi = Q6_Vub_vlsr_VubR(q_lo, 4); + q_lo = Q6_V_valign_VVR(Q6_V_vand_VV(q_lo, mask), Q6_V_vzero(), sizeof(curr_blk.qs)); + q_lo = Q6_V_valign_VVR(q_hi, q_lo, hexagon::kBytesPerVector - sizeof(curr_blk.qs)); + q_lo = Q6_Vb_vsub_VbVb(q_lo, minus); + + HVX_VectorPair q = Q6_Wh_vunpack_Vb(q_lo); + q = Q6_Wh_vunpack_Vb(Q6_V_lo_W(q)); + q_lo = Q6_Vhf_equals_Vh(Q6_V_lo_W(q)); + q = Q6_Wqf32_vmpy_VhfVhf(q_lo, d); + out[nb - 1] = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(q)); + } +} + +void dequantize_row_q4_K(const void * src, float * dst, size_t count, const float * f16_to_f32_table) { + const int nb = count / QUANT_K_BLOCK_SIZE; + const auto * src_ptr = reinterpret_cast(src); + + // TODO: use intrinsics + for (int i = 0; i < nb; i++) { + const uint8_t * q = src_ptr[i].qs; + + const float d = f16_to_f32_table[src_ptr[i].d]; + const float min = f16_to_f32_table[src_ptr[i].dmin]; + + int is = 0; + uint8_t sc = 0; + uint8_t m = 0; + const auto * scales = src_ptr[i].scales; + for (int j = 0; j < QUANT_K_BLOCK_SIZE; j += 64) { + get_scale_min_k4(is + 0, scales, &sc, &m); + const float d1 = d * sc; + const float m1 = min * m; + get_scale_min_k4(is + 1, scales, &sc, &m); + const float d2 = d * sc; + const float m2 = min * m; + for (int l = 0; l < 32; ++l) { + dst[0] = d1 * (q[l] & 0xF) - m1; + dst[32] = d2 * ((q[l] >> 4) & 0xF) - m2; + dst++; + } + dst += 32; + q += 32; + is += 2; + } + } +} + +template struct dot_func_traits {}; + +template struct dot_func_traits { + using param_type = std::remove_const_t>; +}; + +template float wrap_dot_func(const void * src0, const void * src1, size_t count) { + using param_type = typename dot_func_traits::param_type; + return _Func(reinterpret_cast(src0), reinterpret_cast(src1), count); +} + +constexpr const hexagon::device_type_traits kDeviceTypeTraits[] = { + { NPU_DATA_TYPE_F32, "F32", 1, sizeof(float), false, nullptr, nullptr, + wrap_dot_func }, + { 
NPU_DATA_TYPE_F16, "F16", 1, sizeof(npu_device_fp16_t), false, nullptr, quantize_row_fp16, + wrap_dot_func }, + { NPU_DATA_TYPE_Q8_0, "Q8_0", QUANT_BLOCK_SIZE, sizeof(npu_device_block_q8_0), true, dequantize_row_q8_0, + quantize_row_q8_0 }, + { NPU_DATA_TYPE_Q4_0, "Q4_0", QUANT_BLOCK_SIZE, sizeof(npu_device_block_q4_0), true, dequantize_row_q4_0, + quantize_row_q4_0 }, + { NPU_DATA_TYPE_Q4_K, "Q4_K", QUANT_K_BLOCK_SIZE, sizeof(npu_device_block_q4_k), true, dequantize_row_q4_K, + quantize_row_q4_K }, +}; + +static_assert(std::size(kDeviceTypeTraits) == NPU_DATA_TYPE_COUNT, + "kDeviceTypeTraits size mismatch with npu_device_tensor_data_type enum"); +static_assert(kDeviceTypeTraits[NPU_DATA_TYPE_F32].type == NPU_DATA_TYPE_F32, + "kDeviceTypeTraits F32 type mismatch with npu_device_tensor_data_type enum"); +static_assert(kDeviceTypeTraits[NPU_DATA_TYPE_F16].type == NPU_DATA_TYPE_F16, + "kDeviceTypeTraits F16 type mismatch with npu_device_tensor_data_type enum"); +static_assert(kDeviceTypeTraits[NPU_DATA_TYPE_Q8_0].type == NPU_DATA_TYPE_Q8_0, + "kDeviceTypeTraits Q8_0 type mismatch with npu_device_tensor_data_type enum"); +static_assert(kDeviceTypeTraits[NPU_DATA_TYPE_Q4_0].type == NPU_DATA_TYPE_Q4_0, + "kDeviceTypeTraits Q4_0 type mismatch with npu_device_tensor_data_type enum"); +static_assert(kDeviceTypeTraits[NPU_DATA_TYPE_Q4_K].type == NPU_DATA_TYPE_Q4_K, + "kDeviceTypeTraits Q4_K type mismatch with npu_device_tensor_data_type enum"); + +} // namespace + +namespace hexagon { + +bool init_f16_f32_table(float * table, size_t count) { + constexpr const size_t kTableSize = (1U << 16); + if (count < kTableSize) { + return false; + } + + for (size_t i = 0; i < count; ++i) { + table[i] = to_float(i); + } + + return true; +} + +const device_type_traits & get_type_traits(npu_device_tensor_data_type type) { + return kDeviceTypeTraits[type]; +} + +} // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/device/quants.hpp b/ggml/src/ggml-qnn/npu/device/type_traits.hpp similarity index 65% rename from ggml/src/ggml-qnn/npu/device/quants.hpp rename to ggml/src/ggml-qnn/npu/device/type_traits.hpp index 6006cd22e93a4..1a0b1665aeaad 100644 --- a/ggml/src/ggml-qnn/npu/device/quants.hpp +++ b/ggml/src/ggml-qnn/npu/device/type_traits.hpp @@ -7,14 +7,20 @@ namespace hexagon { bool init_f16_f32_table(float * table, size_t count); +typedef void (*quantize_row_type)(const float * src, void * dst, size_t count, const float * f16_to_f32_table); typedef void (*dequantize_row_type)(const void * src, float * dst, size_t count, const float * f16_to_f32_table); +typedef float (*vec_dot_type)(const void * src0, const void * src1, size_t count); struct device_type_traits { npu_device_tensor_data_type type; const char * type_name; int64_t blck_size; + size_t type_size; bool is_quantized; - dequantize_row_type dequantize_row; + + dequantize_row_type to_float; + quantize_row_type from_float; + vec_dot_type vec_dot; }; const device_type_traits & get_type_traits(npu_device_tensor_data_type type); @@ -44,10 +50,10 @@ inline const char * get_type_name(npu_device_tensor_data_type type) { #ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING namespace hexagon { -inline auto make_scoped_op_perf_timer(tensor * op, size_t tidx, const char * sub_proc_log_prefix = nullptr) { +inline auto make_scoped_op_perf_timer(tensor * op, size_t tidx) { auto * src0 = op->get_src(0); auto * src1 = op->get_src(1); - char buffer[512]; + char buffer[1024]; if (src1 == nullptr) { snprintf(buffer, sizeof(buffer), "[%s][%lldx%lldx%lldx%lld%s], tidx: %zu", 
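// Illustrative aside, not part of the patch: kDeviceTypeTraits above is a dense table indexed by
// the npu_device_tensor_data_type enum, so get_type_traits(type) is a single array lookup and the
// to_float / from_float / vec_dot members hand back the per-type kernels as function pointers;
// the static_asserts pin every entry to its enum slot. A self-contained miniature of that
// dispatch pattern with two made-up types:
#include <cstddef>
#include <cstdio>

enum fake_type { FAKE_F32 = 0, FAKE_SCALED = 1, FAKE_TYPE_COUNT };

typedef void (*to_float_fn)(const void * src, float * dst, size_t count);

static void to_float_f32(const void * src, float * dst, size_t count) {
    const float * s = (const float *) src;
    for (size_t i = 0; i < count; ++i) dst[i] = s[i];
}

static void to_float_scaled(const void * src, float * dst, size_t count) {
    const unsigned char * s = (const unsigned char *) src; // pretend storage: byte = value * 10
    for (size_t i = 0; i < count; ++i) dst[i] = s[i] / 10.0f;
}

struct fake_traits { fake_type type; const char * name; to_float_fn to_float; };

constexpr fake_traits kFakeTraits[] = {
    { FAKE_F32,    "F32",    to_float_f32    },
    { FAKE_SCALED, "SCALED", to_float_scaled },
};
static_assert(sizeof(kFakeTraits) / sizeof(kFakeTraits[0]) == FAKE_TYPE_COUNT,
              "table must cover every enum value");

int main() {
    const unsigned char raw[4] = { 10, 25, 30, 5 };
    float out[4];
    const fake_traits & traits = kFakeTraits[FAKE_SCALED]; // same shape as get_type_traits(type)
    traits.to_float(raw, out, 4);
    std::printf("%s: %.1f %.1f %.1f %.1f\n", traits.name, out[0], out[1], out[2], out[3]);
    return 0;
}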
op_get_name(op->get_op()), src0->get_ne(0), src0->get_ne(1), src0->get_ne(2), src0->get_ne(3), get_type_name(src0->get_type()), @@ -58,7 +64,7 @@ inline auto make_scoped_op_perf_timer(tensor * op, size_t tidx, const char * sub get_type_name(src0->get_type()), src1->get_ne(0), src1->get_ne(1), src1->get_ne(2), src1->get_ne(3), get_type_name(src1->get_type()), tidx); } - return npu_scoped_timer<512>(buffer, sub_proc_log_prefix); + return npu_scoped_timer<1024>(buffer); } } // namespace hexagon @@ -67,14 +73,23 @@ inline auto make_scoped_op_perf_timer(tensor * op, size_t tidx, const char * sub auto __npu_op_timer_##__LINE__ = hexagon::make_scoped_op_perf_timer(op, tidx) # define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_WITH_SUB_PROC(op, tidx, sub_prefix) \ - auto __npu_op_timer_##sub_prefix = hexagon::make_scoped_op_perf_timer(op, tidx, #sub_prefix) + auto __npu_op_timer_##sub_prefix = hexagon::make_scoped_op_perf_timer(op, tidx) + +# define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_SUB_PROC(sub_prefix) \ + hexagon::npu_sub_process_scoped_timer \ + __npu_op_sub_timer##sub_prefix(__npu_op_timer_##sub_prefix, #sub_prefix) + +# define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_WITH_MULTI_SUB_PROC(op, tidx, tracker_name) \ + auto __npu_op_timer_##tracker_name = hexagon::make_scoped_op_perf_timer(op, tidx) -# define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_SUB_PROC(sub_prefix) \ - hexagon::npu_sub_process_scoped_timer \ - __npu_op_sub_timer##sub_prefix(__npu_op_timer_##sub_prefix) +# define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(tracker_name, idx, sub_prefix) \ + hexagon::npu_sub_process_scoped_timer \ + __npu_op_sub_timer##sub_prefix(__npu_op_timer_##tracker_name, #sub_prefix) #else -# define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER(op, tidx) ((void) 0) -# define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_WITH_SUB_PROC(op, tidx, sub_prefix) ((void) 0) -# define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_SUB_PROC(sub_prefix) ((void) 0) +# define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER(op, tidx) ((void) 0) +# define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_WITH_SUB_PROC(op, tidx, sub_prefix) ((void) 0) +# define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_SUB_PROC(sub_prefix) ((void) 0) +# define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_WITH_MULTI_SUB_PROC(op, tidx, tracker_name) ((void) 0) +# define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(tracker_name, idx, sub_prefix) ((void) 0) #endif diff --git a/ggml/src/ggml-qnn/npu/device/util.hpp b/ggml/src/ggml-qnn/npu/device/util.hpp index 3ae7f100de507..8c819fe5838b2 100644 --- a/ggml/src/ggml-qnn/npu/device/util.hpp +++ b/ggml/src/ggml-qnn/npu/device/util.hpp @@ -52,11 +52,18 @@ inline constexpr const char * op_get_name(npu_device_tensor_op op) { return "MUL"; case NPU_OP_RMS_NORM: return "RMS_NORM"; + case NPU_OP_FLASH_ATTN: + return "FLASH_ATTN_EXT"; default: return "UNKNOWN"; } } +inline bool is_transposed_or_permuted(const npu_device_nb_type & nb) { + // Check if the tensor is transposed or permuted + return (nb[0] > nb[1]) || (nb[1] > nb[2]) || (nb[2] > nb[3]); +} + class power_utils { public: power_utils() { @@ -160,16 +167,22 @@ class power_utils { #ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING +struct sub_process_data { + char log_prefix[32] = {}; + uint64_t proc_cycles = 0; + uint64_t proc_pcycles = 0; + uint64_t proc_count = 0; +}; + template class npu_scoped_timer { public: - enum { kBufferCount = _buffer_count }; + enum { + kBufferCount = _buffer_count, + kSubProcCount = 4, + }; - explicit npu_scoped_timer(const char * log_prefix, const char * 
sub_proc_log_prefix) { + explicit npu_scoped_timer(const char * log_prefix) { strncpy(_log_prefix, log_prefix, kBufferCount - 1); - if (sub_proc_log_prefix != nullptr) { - strncpy(_sub_proc_log_prefix, sub_proc_log_prefix, kBufferCount - 1); - } - _begin_cycles = HAP_perf_get_qtimer_count(); _begin_pcycles = HAP_perf_get_pcycles(); } @@ -180,61 +193,121 @@ template class npu_scoped_timer { void operator=(npu_scoped_timer && other) { strncpy(_log_prefix, other._log_prefix, kBufferCount - 1); - strncpy(_sub_proc_log_prefix, other._sub_proc_log_prefix, kBufferCount - 1); - _begin_cycles = other._begin_cycles; - _sub_proc_cycles = other._sub_proc_cycles; - _sub_proc_count = other._sub_proc_count; + _begin_cycles = other._begin_cycles; + _begin_pcycles = other._begin_pcycles; + memcpy(&_sub_proc_data, &other._sub_proc_data, sizeof(_sub_proc_data)); } - void add_sub_proc_cycles(uint64_t cycles, uint64_t pcycles) { - _sub_proc_cycles += cycles; - _sub_proc_pcycles += pcycles; - _sub_proc_count++; + void add_sub_proc_cycles(size_t sub_proc_idx, const char * sub_proc_prefix, uint64_t cycles, uint64_t pcycles) { + auto & sub_proc_data = _sub_proc_data[sub_proc_idx]; + sub_proc_data.proc_cycles += cycles; + sub_proc_data.proc_pcycles += pcycles; + + if (!sub_proc_data.proc_count) { + strncpy(sub_proc_data.log_prefix, sub_proc_prefix, sizeof(sub_proc_data.log_prefix) - 1); + } + + sub_proc_data.proc_count++; } void print() const { + static_assert(kSubProcCount == 4, "Sub process count must be 4 for logging format"); + auto total_cycles = HAP_perf_get_qtimer_count() - _begin_cycles; auto total_pcycles = HAP_perf_get_pcycles() - _begin_pcycles; auto duration = HAP_perf_qtimer_count_to_us(total_cycles); - if (_sub_proc_count > 0) { - auto sub_proc_duration = HAP_perf_qtimer_count_to_us(_sub_proc_cycles); - DEVICE_LOG_WARN("[profiler]%s, pcyc: %llu, dur: %lluus, [%s]cnt: %llu, pcyc: %llu, dur: %lluus\n", - _log_prefix, total_pcycles, duration, _sub_proc_log_prefix, _sub_proc_count, - _sub_proc_pcycles, sub_proc_duration); - } else { - DEVICE_LOG_WARN("[profiler]%s, pcyc: %llu, dur: %lluus\n", _log_prefix, total_pcycles, duration); + int sub_proc_count = 0; + for (int i = kSubProcCount; i > 0; --i) { + if (_sub_proc_data[i - 1].proc_count > 0) { + sub_proc_count = i; + break; + } + } + + auto sub_proc0_duration = HAP_perf_qtimer_count_to_us(_sub_proc_data[0].proc_cycles); + auto sub_proc1_duration = HAP_perf_qtimer_count_to_us(_sub_proc_data[1].proc_cycles); + auto sub_proc2_duration = HAP_perf_qtimer_count_to_us(_sub_proc_data[2].proc_cycles); + auto sub_proc3_duration = HAP_perf_qtimer_count_to_us(_sub_proc_data[3].proc_cycles); + + switch (sub_proc_count) { + case 4: + DEVICE_LOG_WARN( + "[profiler]%s, pcyc: %llu, dur: %lluus, [%s]cnt: %llu, dur: %lluus, " + "[%s]cnt: %llu, dur: %lluus, [%s]cnt: %llu, dur: %lluus, " + "[%s]cnt: %llu, dur: %lluus\n", + _log_prefix, (unsigned long long) total_pcycles, (unsigned long long) duration, + _sub_proc_data[0].log_prefix, (unsigned long long) _sub_proc_data[0].proc_count, + (unsigned long long) sub_proc0_duration, _sub_proc_data[1].log_prefix, + (unsigned long long) _sub_proc_data[1].proc_count, (unsigned long long) sub_proc1_duration, + _sub_proc_data[2].log_prefix, (unsigned long long) _sub_proc_data[2].proc_count, + (unsigned long long) sub_proc2_duration, _sub_proc_data[3].log_prefix, + (unsigned long long) _sub_proc_data[3].proc_count, (unsigned long long) sub_proc3_duration); + break; + case 3: + DEVICE_LOG_WARN( + "[profiler]%s, pcyc: %llu, dur: %lluus, 
[%s]cnt: %llu, dur: %lluus, " + "[%s]cnt: %llu, dur: %lluus, [%s]cnt: %llu, dur: %lluus\n", + _log_prefix, (unsigned long long) total_pcycles, (unsigned long long) duration, + _sub_proc_data[0].log_prefix, (unsigned long long) _sub_proc_data[0].proc_count, + (unsigned long long) sub_proc0_duration, _sub_proc_data[1].log_prefix, + (unsigned long long) _sub_proc_data[1].proc_count, (unsigned long long) sub_proc1_duration, + _sub_proc_data[2].log_prefix, (unsigned long long) _sub_proc_data[2].proc_count, + (unsigned long long) sub_proc2_duration); + break; + case 2: + DEVICE_LOG_WARN( + "[profiler]%s, pcyc: %llu, dur: %lluus, [%s]cnt: %llu, dur: %lluus, " + "[%s]cnt: %llu, dur: %lluus\n", + _log_prefix, (unsigned long long) total_pcycles, (unsigned long long) duration, + _sub_proc_data[0].log_prefix, (unsigned long long) _sub_proc_data[0].proc_count, + (unsigned long long) sub_proc0_duration, _sub_proc_data[1].log_prefix, + (unsigned long long) _sub_proc_data[1].proc_count, (unsigned long long) sub_proc1_duration); + break; + case 1: + DEVICE_LOG_WARN("[profiler]%s, pcyc: %llu, dur: %lluus, [%s]cnt: %llu, dur: %lluus\n", _log_prefix, + (unsigned long long) total_pcycles, (unsigned long long) duration, + _sub_proc_data[0].log_prefix, (unsigned long long) _sub_proc_data[0].proc_count, + (unsigned long long) sub_proc0_duration); + break; + default: + case 0: + DEVICE_LOG_WARN("[profiler]%s, pcyc: %llu, dur: %lluus\n", _log_prefix, + (unsigned long long) total_pcycles, (unsigned long long) duration); + break; } } private: - char _log_prefix[kBufferCount] = {}; - char _sub_proc_log_prefix[kBufferCount] = {}; - uint64_t _begin_cycles = 0; - uint64_t _begin_pcycles = 0; - uint64_t _sub_proc_cycles = 0; - uint64_t _sub_proc_pcycles = 0; - uint64_t _sub_proc_count = 0; + char _log_prefix[kBufferCount] = {}; + uint64_t _begin_cycles = 0; + uint64_t _begin_pcycles = 0; + sub_process_data _sub_proc_data[kSubProcCount] = {}; DISABLE_COPY(npu_scoped_timer); }; -template class npu_sub_process_scoped_timer { +template class npu_sub_process_scoped_timer { public: + static_assert(_sub_idx < npu_scoped_timer<_buffer_count>::kSubProcCount, + "Sub process index must be less than kSubProcCount"); using npu_scoped_timer = npu_scoped_timer<_buffer_count>; - explicit npu_sub_process_scoped_timer(npu_scoped_timer & timer) : _timer(timer) { + explicit npu_sub_process_scoped_timer(npu_scoped_timer & timer, const char * prefix) : + _timer(timer), + _prefix(prefix) { _begin_cycles = HAP_perf_get_qtimer_count(); _begin_pcycles = HAP_perf_get_pcycles(); } ~npu_sub_process_scoped_timer() { - _timer.add_sub_proc_cycles(HAP_perf_get_qtimer_count() - _begin_cycles, + _timer.add_sub_proc_cycles(_sub_idx, _prefix, HAP_perf_get_qtimer_count() - _begin_cycles, HAP_perf_get_pcycles() - _begin_pcycles); } private: npu_scoped_timer & _timer; + const char * _prefix = nullptr; uint64_t _begin_cycles = 0; uint64_t _begin_pcycles = 0; @@ -244,10 +317,10 @@ template class npu_sub_process_scoped_timer { inline auto make_scoped_perf_timer(const char * format, ...) 
{ va_list args; va_start(args, format); - char buffer[512]; + char buffer[1024]; vsnprintf(buffer, sizeof(buffer), format, args); va_end(args); - return npu_scoped_timer<512>(buffer, nullptr); + return npu_scoped_timer<1024>(buffer); } #endif diff --git a/ggml/src/ggml-qnn/npu/device/vec_ops.cpp b/ggml/src/ggml-qnn/npu/device/vec_ops.cpp new file mode 100644 index 0000000000000..5bdf183e5b185 --- /dev/null +++ b/ggml/src/ggml-qnn/npu/device/vec_ops.cpp @@ -0,0 +1,156 @@ +#include "vec_ops.hpp" + +#include + +#include "util.hpp" + +namespace { + +template +inline float vec_dot_product_impl(const _TElem * src0, const _TElem * src1, size_t count) { + constexpr const size_t kElementsPerVector = hexagon::kBytesPerVector / sizeof(_TElem); + + HVX_Vector * src0_vec_ptr = ((HVX_Vector *) src0); + HVX_Vector * const src0_vec_ptr_end = ((HVX_Vector *) src0) + count / kElementsPerVector; + HVX_Vector * src1_vec_ptr = ((HVX_Vector *) src1); + HVX_Vector prev0 = *src0_vec_ptr++; + HVX_Vector prev1 = *src1_vec_ptr++; + HVX_Vector sum = Q6_V_vzero(); + HVX_Vector sum0 = Q6_V_vzero(); + HVX_Vector sum1 = Q6_V_vzero(); + + while (src0_vec_ptr_end - src0_vec_ptr > 1) { + HVX_VectorPair curr0 = reinterpret_cast(src0_vec_ptr)[0]; + HVX_VectorPair curr1 = reinterpret_cast(src1_vec_ptr)[0]; + + HVX_Vector l0 = Q6_V_valign_VVR(Q6_V_lo_W(curr0), prev0, (size_t) src0); + HVX_Vector l1 = Q6_V_valign_VVR(Q6_V_lo_W(curr1), prev1, (size_t) src1); + HVX_Vector h0 = Q6_V_valign_VVR(Q6_V_hi_W(curr0), Q6_V_lo_W(curr0), (size_t) src0); + HVX_Vector h1 = Q6_V_valign_VVR(Q6_V_hi_W(curr1), Q6_V_lo_W(curr1), (size_t) src1); + prev0 = Q6_V_hi_W(curr0); + prev1 = Q6_V_hi_W(curr1); + src0_vec_ptr += 2; + src1_vec_ptr += 2; + + sum0 = _AddFunc(_MpyFunc(l0, l1), sum0); + sum1 = _AddFunc(_MpyFunc(h0, h1), sum1); + } + + sum = _AddFunc(sum0, sum1); + if (src0_vec_ptr_end - src0_vec_ptr > 0) { + HVX_Vector curr0 = *src0_vec_ptr++; + HVX_Vector curr1 = *src1_vec_ptr++; + HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); + HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); + prev0 = curr0; + prev1 = curr1; + + sum = _AddFunc(_MpyFunc(s0, s1), sum); + } + + const size_t leftover = count % kElementsPerVector; + if ((src0_vec_ptr_end - ((HVX_Vector *) src0)) > 0) { + // handle the last vector + // see also: + // https://github.com/UbiquitousLearning/mllm/blob/babf4410352ce8730824c87699c025a0d4ce3a6f/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/src/ops/LLaMAMul.cpp#L147 + // or qualcomm sdk libs\qhl_hvx\src\qhblas_hvx\qhblas_hvx_aw_vector_add_ah.c + bool should_fetch_src0 = leftover != 0 || !hexagon::is_addr_aligned(src0_vec_ptr); + bool should_fetch_src1 = leftover != 0 || !hexagon::is_addr_aligned(src1_vec_ptr); + HVX_Vector curr0 = should_fetch_src0 ? *src0_vec_ptr : prev0; + HVX_Vector curr1 = should_fetch_src1 ? *src1_vec_ptr : prev1; + src0_vec_ptr += should_fetch_src0 ? 1 : 0; + src1_vec_ptr += should_fetch_src1 ? 1 : 0; + HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); + HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); + prev0 = curr0; + prev1 = curr1; + + sum = _AddFunc(_MpyFunc(s0, s1), sum); + } + + const size_t leftover_bytes = leftover * sizeof(_TElem); + if (leftover > 0) { + // handle the leftover elements + HVX_Vector curr0 = (leftover_bytes + hexagon::unaligned_bytes(src0_vec_ptr) > hexagon::kBytesPerVector) ? 
+ *src0_vec_ptr : + prev0; + curr0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); + + HVX_Vector curr1 = (leftover_bytes + hexagon::unaligned_bytes(src1_vec_ptr) > hexagon::kBytesPerVector) ? + *src1_vec_ptr : + prev1; + curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); + + sum = _AddFunc(Q6_V_valign_VVR(_MpyFunc(curr0, curr1), Q6_V_vzero(), leftover_bytes), sum); + } + + return _ReduceFunc(sum); +} + +template +inline float vec_dot_product_aligned_impl(const _TElem * src0, const _TElem * src1, size_t count) { + constexpr const size_t kElementsPerVector = hexagon::kBytesPerVector / sizeof(_TElem); + + HVX_Vector * src0_vec_ptr = ((HVX_Vector *) src0); + HVX_Vector * const src0_vec_ptr_end = ((HVX_Vector *) src0) + count / kElementsPerVector; + HVX_Vector * src1_vec_ptr = ((HVX_Vector *) src1); + HVX_Vector sum0 = Q6_V_vzero(); + HVX_Vector sum1 = Q6_V_vzero(); + + while (src0_vec_ptr_end - src0_vec_ptr > 1) { + HVX_Vector curr0_lo = src0_vec_ptr[0]; + HVX_Vector curr0_hi = src0_vec_ptr[1]; + HVX_Vector curr1_lo = src1_vec_ptr[0]; + HVX_Vector curr1_hi = src1_vec_ptr[1]; + src0_vec_ptr += 2; + src1_vec_ptr += 2; + + sum0 = _AddFunc(_MpyFunc(curr0_lo, curr1_lo), sum0); + sum1 = _AddFunc(_MpyFunc(curr0_hi, curr1_hi), sum1); + } + + return _ReduceFunc(_AddFunc(sum0, sum1)); +} + +inline HVX_Vector vec_mpy_qf32(HVX_Vector src0, HVX_Vector src1) { + return Q6_Vqf32_vmpy_VsfVsf(src0, src1); +} + +inline HVX_Vector vec_add_qf32(HVX_Vector sum, HVX_Vector result) { + return Q6_Vqf32_vadd_Vqf32Vqf32(sum, result); +} + +inline HVX_Vector vec_mpy_qf16(HVX_Vector src0, HVX_Vector src1) { + return Q6_Vqf16_vmpy_VhfVhf(src0, src1); +} + +inline HVX_Vector vec_add_qf16(HVX_Vector sum, HVX_Vector result) { + return Q6_Vqf16_vadd_Vqf16Vqf16(sum, result); +} + +} // namespace + +namespace hexagon { + +float vec_dot_product_f32_f32(const float * src0, const float * src1, size_t count) { + return vec_dot_product_impl(src0, src1, count); +} + +float vec_dot_product_aligned_f32_f32(const float * src0, const float * src1, size_t count) { + return vec_dot_product_aligned_impl(src0, src1, + count); +} + +float vec_dot_product_f16_f16(const npu_device_fp16_t * src0, const npu_device_fp16_t * src1, size_t count) { + return vec_dot_product_impl( + src0, src1, count); +} + +float vec_dot_product_aligned_f16_f16(const npu_device_fp16_t * src0, const npu_device_fp16_t * src1, size_t count) { + return vec_dot_product_aligned_impl( + src0, src1, count); +} + +} // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/device/vec_ops.hpp b/ggml/src/ggml-qnn/npu/device/vec_ops.hpp new file mode 100644 index 0000000000000..406075fc9cdde --- /dev/null +++ b/ggml/src/ggml-qnn/npu/device/vec_ops.hpp @@ -0,0 +1,274 @@ +#pragma once + +#include +#include + +#include + +#include "hexagon_npu.h" + +namespace hexagon { + +constexpr const size_t kBytesPerVector = sizeof(HVX_Vector); // 128 for v73 +constexpr const size_t kAlignMask = kBytesPerVector - 1; + +inline size_t unaligned_bytes(const void * addr) { + return ((size_t) addr) & kAlignMask; +} + +inline bool is_addr_aligned(const void * addr) { + return unaligned_bytes(addr) == 0; +} + +inline float get_flt0_from_fltv(HVX_Vector vect) { + static_assert(sizeof(vect[0]) == sizeof(float), "vect[0] should be a float"); + int32_t i = vect[0]; + return reinterpret_cast(i); +} + +inline HVX_UVector Q6_V_vmemu_R(const void * unaligned_ptr) { + return *reinterpret_cast(unaligned_ptr); +} + +inline HVX_Vector Q6_V_vmem_R(const void * aligned_ptr) { + return 
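// Illustrative aside, not part of the patch: both HVX dot products above compute the same thing;
// the unaligned variant uses Q6_V_valign_VVR to stitch together 128-byte vectors that straddle
// the pointers' misalignment, while the aligned variant simply streams two vectors per loop
// iteration. The scalar contract they implement is just:
#include <cstddef>

float vec_dot_product_ref(const float * src0, const float * src1, size_t count) {
    float sum = 0.0f;
    for (size_t i = 0; i < count; ++i) {
        sum += src0[i] * src1[i];
    }
    return sum; // the HVX code accumulates in qf32 lanes and reduces them at the end
}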
*reinterpret_cast(aligned_ptr); +} + +constexpr const size_t kL2CacheSize = 8 * 1024; // // 8KB L2 cache +constexpr const size_t kL2FetchAheadVectors = kL2CacheSize / kBytesPerVector; + +inline void l2fetch(const void * p, uint32_t stride, uint32_t width, uint32_t height, uint32_t dir) { + uint64_t control = HEXAGON_V64_CREATE_H(dir, stride, width, height); + __asm__ __volatile__(" l2fetch(%0,%1) " : : "r"(p), "r"(control)); +} + +inline void l2fetch_row(const uint8_t * row_ptr, size_t bytes) { + // TODO: should we use small kL2FetchAheadVectors? + int32_t l2fetch_vectors = Q6_R_min_RR(bytes / kBytesPerVector, kL2FetchAheadVectors); + hexagon::l2fetch(row_ptr, kBytesPerVector, kBytesPerVector, l2fetch_vectors, 0); +} + +/* + * This function converts a vector of IEEE float elements to a vector of qf32 elements + * See also: libs\qfe\inc\qhmath_hvx_convert.h + */ +inline HVX_Vector qhmath_hvx_vqf32_convert_vsf(HVX_Vector vin) { + return Q6_Vqf32_vadd_VsfVsf(vin, Q6_V_vzero()); +} + +/* + * This function converts a vector of IEEE half float elements to a vector of qf16 elements + * See also: libs\qfe\inc\qhmath_hvx_convert.h + */ +inline HVX_Vector qhmath_hvx_vqf16_convert_vhf(HVX_Vector vin) { + return Q6_Vqf16_vadd_VhfVhf(vin, Q6_V_vzero()); +} + +/* + * This function converts a pair of vectors of qf32 elements to a vector of IEEE half float elements + * See also: libs\qfe\inc\qhmath_hvx_convert.h + */ +inline HVX_Vector qhmath_hvx_vhf_convert_vqf32(HVX_VectorPair vin_vp) { + return Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(vin_vp)); +} + +/* + * This function converts a vector of qf16 elements to a pair of vectors of qf32 elements + * See also: libs\qfe\inc\qhmath_hvx_convert.h + */ +inline HVX_VectorPair qhmath_hvx_vqf32_convert_vqf16(HVX_Vector vxl) { + HVX_VectorPair vxw_vp, exponent_vp; + HVX_Vector mantissa_mask = Q6_Vh_vsplat_R(0xffe0); + HVX_Vector exp_mask = Q6_Vh_vsplat_R(0x1f); + HVX_Vector exp_offset = Q6_Vh_vsplat_R(0x70); + HVX_Vector mant32_shift = Q6_Vh_vsplat_R(0x10); + HVX_Vector reql, reqh, vxl_w, vxh_w, mantissa; + HVX_Vector el_exponent, eh_exponent; + + el_exponent = Q6_V_vand_VV(exp_mask, vxl); + // Obtain the mantissa part: bits (5-15) + mantissa = Q6_V_vand_VV(mantissa_mask, vxl); + // Convert qf16 biassed exponent to qf32 biased exponent + // new exp = exp + ( 127 (qf32 bias) -15(qf16 biass) ) = 112 + el_exponent = Q6_Vh_vadd_VhVh(exp_offset, el_exponent); + + vxw_vp = Q6_Ww_vunpack_Vh(mantissa); + vxl_w = Q6_V_lo_W(vxw_vp); + vxh_w = Q6_V_hi_W(vxw_vp); + + exponent_vp = Q6_Ww_vunpack_Vh(el_exponent); + el_exponent = Q6_V_lo_W(exponent_vp); + eh_exponent = Q6_V_hi_W(exponent_vp); + // Convert q16 mantiss to q32 mantissa + reql = Q6_Vw_vasl_VwVw(vxl_w, mant32_shift); + reqh = Q6_Vw_vasl_VwVw(vxh_w, mant32_shift); + // Add the exponent + vxl_w = Q6_Vw_vadd_VwVw(reql, el_exponent); + vxh_w = Q6_Vw_vadd_VwVw(reqh, eh_exponent); + + return Q6_W_vcombine_VV(vxh_w, vxl_w); +} + +inline HVX_Vector vec_reduction_qf32(HVX_Vector sums) { + constexpr const size_t kFloatsPerVector = hexagon::kBytesPerVector / sizeof(float); + static_assert(kFloatsPerVector == 32 || kFloatsPerVector == 16, "kFloatsPerVector should be 16 or 32"); + + // TODO: do we have a better way to do the reduction? 
+ switch (kFloatsPerVector) { + default: + case 32: + sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 16 * sizeof(float))); + // fallthrough + case 16: + sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 8 * sizeof(float))); + sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 4 * sizeof(float))); + sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 2 * sizeof(float))); + sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, sizeof(float))); + break; + } + + return sums; +} + +inline float vec_reduction_qf32_f32(HVX_Vector sums) { + return get_flt0_from_fltv(Q6_Vsf_equals_Vqf32(vec_reduction_qf32(sums))); +} + +inline HVX_Vector vec_reduction_qf16(HVX_Vector sums) { + constexpr const size_t kFloatsPerVector = hexagon::kBytesPerVector / sizeof(npu_device_fp16_t); + static_assert(kFloatsPerVector == 64 || kFloatsPerVector == 32, "kFloatsPerVector should be 32 or 64"); + + // TODO: do we have a better way to do the reduction? + switch (kFloatsPerVector) { + default: + case 64: + sums = Q6_Vqf16_vadd_Vqf16Vqf16(sums, Q6_V_vror_VR(sums, 32 * sizeof(npu_device_fp16_t))); + // fallthrough + case 32: + sums = Q6_Vqf16_vadd_Vqf16Vqf16(sums, Q6_V_vror_VR(sums, 16 * sizeof(npu_device_fp16_t))); + sums = Q6_Vqf16_vadd_Vqf16Vqf16(sums, Q6_V_vror_VR(sums, 8 * sizeof(npu_device_fp16_t))); + sums = Q6_Vqf16_vadd_Vqf16Vqf16(sums, Q6_V_vror_VR(sums, 4 * sizeof(npu_device_fp16_t))); + sums = Q6_Vqf16_vadd_Vqf16Vqf16(sums, Q6_V_vror_VR(sums, 2 * sizeof(npu_device_fp16_t))); + sums = Q6_Vqf16_vadd_Vqf16Vqf16(sums, Q6_V_vror_VR(sums, sizeof(npu_device_fp16_t))); + break; + } + + return sums; +} + +inline float vec_reduction_qf16_f32(HVX_Vector sums) { + HVX_Vector vect = Q6_Vhf_equals_Vqf16(vec_reduction_qf16(sums)); + uint16_t i = (vect[0] & 0xffff); + return reinterpret_cast<__fp16 &>(i); +} + +inline HVX_Vector hvx_scale_f32(float scale) { + return Q6_V_vsplat_R(reinterpret_cast(scale)); +} + +template +inline void vec_scale_impl(const _TParam * src, float scale, _TParam * dst, size_t count) { + constexpr const size_t kElementsPerVector = hexagon::kBytesPerVector / sizeof(_TParam); + + HVX_Vector * src_vec_ptr = ((HVX_Vector *) src); + HVX_Vector * const src_vec_end = ((HVX_Vector *) src) + (count / kElementsPerVector); + HVX_UVector * dst_vec_ptr = ((HVX_UVector *) dst); // TODO: opt the unaligned case? + HVX_Vector prev = *src_vec_ptr++; + const size_t leftover = count % kElementsPerVector; + const size_t leftover_bytes = leftover * sizeof(_TParam); + + HVX_Vector scale_vec = _FuncScaleConvert(scale); + + while (src_vec_end - src_vec_ptr > 1) { + HVX_VectorPair curr = reinterpret_cast(src_vec_ptr)[0]; + src_vec_ptr += 2; + + HVX_Vector lo = Q6_V_valign_VVR(Q6_V_lo_W(curr), prev, (size_t) src); + HVX_Vector hi = Q6_V_valign_VVR(Q6_V_hi_W(curr), Q6_V_lo_W(curr), (size_t) src); + + dst_vec_ptr[0] = _Func(lo, dst_vec_ptr, scale_vec); + dst_vec_ptr[1] = _Func(hi, dst_vec_ptr + 1, scale_vec); + + dst_vec_ptr += 2; + prev = Q6_V_hi_W(curr); + } + + if (src_vec_end - src_vec_ptr > 0) { + HVX_Vector curr = *src_vec_ptr++; + HVX_Vector s0 = Q6_V_valign_VVR(curr, prev, (size_t) src); + dst_vec_ptr[0] = _Func(s0, dst_vec_ptr, scale_vec); + dst_vec_ptr++; + prev = curr; + } + + if ((src_vec_end - ((HVX_Vector *) src)) > 0) { + // handle the last vector + bool should_fetch_next = leftover == 0 && hexagon::is_addr_aligned(src_vec_ptr); + HVX_Vector curr = should_fetch_next ? prev : *src_vec_ptr; + src_vec_ptr = should_fetch_next ? 
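// Illustrative aside, not part of the patch: vec_reduction_qf32() above folds the 32 float lanes
// of an HVX vector with log2(32) = 5 rotate-and-add steps (rotations of 16, 8, 4, 2 and 1 floats),
// after which every lane holds the full sum and lane 0 is extracted. The same butterfly in scalar
// form, checked against a direct sum:
#include <algorithm>
#include <cassert>
#include <cstddef>

static float tree_reduce_32(const float * in) {
    float cur[32], next[32];
    std::copy(in, in + 32, cur);
    for (size_t stride = 16; stride >= 1; stride /= 2) {
        for (size_t i = 0; i < 32; ++i) {
            next[i] = cur[i] + cur[(i + stride) % 32]; // add a rotated copy of the old vector
        }
        std::copy(next, next + 32, cur);
    }
    return cur[0]; // every lane now holds the total; lane 0 is what get_flt0_from_fltv reads
}

int main() {
    float lanes[32];
    float direct = 0.0f;
    for (int i = 0; i < 32; ++i) {
        lanes[i] = (float) (i + 1);
        direct  += lanes[i];
    }
    assert(tree_reduce_32(lanes) == direct); // 1 + 2 + ... + 32 == 528, exact in float
    return 0;
}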
src_vec_ptr : src_vec_ptr + 1; + HVX_Vector s0 = Q6_V_valign_VVR(curr, prev, (size_t) src); + dst_vec_ptr[0] = _Func(s0, dst_vec_ptr, scale_vec); + dst_vec_ptr++; + prev = curr; + } + + if (leftover > 0) { + // handle the leftover elements + HVX_Vector curr = + (leftover_bytes + hexagon::unaligned_bytes(src_vec_ptr) > hexagon::kBytesPerVector) ? *src_vec_ptr : prev; + curr = Q6_V_valign_VVR(curr, prev, (size_t) src); + q6op_vstu_variable_ARV(dst_vec_ptr, leftover_bytes, _Func(curr, dst_vec_ptr, scale_vec)); + } +} + +inline HVX_Vector hvx_vec_scale_f32_f32(HVX_Vector src, HVX_UVector *, HVX_Vector scale_vec) { + return Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(src, scale_vec)); +} + +inline HVX_Vector hvx_vec_mad_f32_f32(HVX_Vector src, HVX_UVector * dst_ptr, HVX_Vector scale_vec) { + HVX_Vector dst = *dst_ptr; // TODO: opt the unaligned case? + src = Q6_Vqf32_vmpy_VsfVsf(src, scale_vec); + src = Q6_Vqf32_vadd_Vqf32Vsf(src, dst); + return Q6_Vsf_equals_Vqf32(src); +} + +inline void vec_scale_f32(const float * src, float scale, float * dst, size_t count) { + vec_scale_impl(src, scale, dst, count); +} + +inline void vec_mad_f32(const float * src, float scale, float * dst, size_t count) { + vec_scale_impl(src, scale, dst, count); +} + +inline HVX_Vector hvx_scale_f16(float scale) { + __fp16 f16_scale = scale; + return Q6_Vh_vsplat_R(reinterpret_cast(f16_scale)); +} + +inline HVX_Vector hvx_vec_scale_f16_f16(HVX_Vector src, HVX_UVector *, HVX_Vector scale_vec) { + return Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(src, scale_vec)); +} + +inline HVX_Vector hvx_vec_mad_f16_f16(HVX_Vector src, HVX_UVector * dst_ptr, HVX_Vector scale_vec) { + HVX_Vector dst = *dst_ptr; // TODO: opt the unaligned case? + HVX_Vector scaled = Q6_Vqf16_vmpy_VhfVhf(src, scale_vec); + HVX_Vector result = Q6_Vqf16_vadd_Vqf16Vhf(scaled, dst); + return Q6_Vhf_equals_Vqf16(result); +} + +inline void vec_scale_f16(const npu_device_fp16_t * src, float scale, npu_device_fp16_t * dst, size_t count) { + vec_scale_impl(src, scale, dst, count); +} + +inline void vec_mad_f16(const npu_device_fp16_t * src, float scale, npu_device_fp16_t * dst, size_t count) { + vec_scale_impl(src, scale, dst, count); +} + +float vec_dot_product_f32_f32(const float * src0, const float * src1, size_t count); +float vec_dot_product_aligned_f32_f32(const float * src0, const float * src1, size_t count); + +float vec_dot_product_f16_f16(const npu_device_fp16_t * src0, const npu_device_fp16_t * src1, size_t count); +float vec_dot_product_aligned_f16_f16(const npu_device_fp16_t * src0, const npu_device_fp16_t * src1, size_t count); + +} // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/host/buffer.cpp b/ggml/src/ggml-qnn/npu/host/buffer.cpp index 7d3c1fbd9f7ac..c7482f8b590e6 100644 --- a/ggml/src/ggml-qnn/npu/host/buffer.cpp +++ b/ggml/src/ggml-qnn/npu/host/buffer.cpp @@ -114,7 +114,9 @@ size_t backend_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { size_t backend_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { auto * buffer_type_obj = get_buffer_type_object(buft); GGML_ASSERT(buffer_type_obj != nullptr); - return buffer_type_obj->get_max_buffer_size(); + auto size = buffer_type_obj->get_max_buffer_size(); + LOG_DEBUG("[hexagon-npu][%s]max_buffer_size: %zu\n", buffer_type_obj->get_name(), size); + return size; } bool backend_buffer_is_host(ggml_backend_buffer_type_t buft) { diff --git a/ggml/src/ggml-qnn/npu/host/graph.cpp b/ggml/src/ggml-qnn/npu/host/graph.cpp index d891280e5694c..9ac69924d3f49 100644 --- 
a/ggml/src/ggml-qnn/npu/host/graph.cpp +++ b/ggml/src/ggml-qnn/npu/host/graph.cpp @@ -29,6 +29,8 @@ bool host_graph::update(ggml_cgraph * cgraph) { return false; } + LOG_DEBUG("[%p]host_graph::update started\n", (void *) this); + SCOPED_PERFORMANCE_TRACKER("[hexagon-npu][%p]update, handle(%p)", (void *) this, (void *) _graph_handle); _tensor_handles.clear(); @@ -40,8 +42,9 @@ bool host_graph::update(ggml_cgraph * cgraph) { if (node->op == GGML_OP_NONE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_RESHAPE) { // skip view liked ops - LOG_DEBUG("node[%d]%s(%s), addr: %p, type: %s, skipped\n", i, ggml_get_name(node), ggml_op_desc(node), - (void *) node, ggml_type_name(node->type)); + LOG_DEBUG("node[%d]%s(%s), addr: %p, type: %s, dims: %ldx%ldx%ldx%ld, skipped\n", i, ggml_get_name(node), + ggml_op_desc(node), (void *) node, ggml_type_name(node->type), (long) node->ne[0], + (long) node->ne[1], (long) node->ne[2], (long) node->ne[3]); continue; } @@ -54,9 +57,10 @@ bool host_graph::update(ggml_cgraph * cgraph) { _tensor_handles.push_back(tensor_obj->get_device_tensor_handle()); _tensor_update_configs.push_back(tensor_obj->update_hosts_params_only(node)); - LOG_DEBUG("[%p]node[%d]%s(%s), addr: %p, type: %s, tensor_handle: %p\n", (void *) this, i, ggml_get_name(node), - ggml_op_desc(node), (void *) node, ggml_type_name(node->type), - (void *) tensor_obj->get_device_tensor_handle()); + LOG_DEBUG("node[%d]%s(%s), addr: %p, type: %s, dims: %ldx%ldx%ldx%ld, tensor_handle: %p\n", i, + ggml_get_name(node), ggml_op_desc(node), (void *) node, ggml_type_name(node->type), + (long) tensor_obj->get_ne(0), (long) tensor_obj->get_ne(1), (long) tensor_obj->get_ne(2), + (long) tensor_obj->get_ne(3), (void *) tensor_obj->get_device_tensor_handle()); } GGML_ASSERT(_tensor_handles.size() == _tensor_update_configs.size()); @@ -71,7 +75,7 @@ bool host_graph::update(ggml_cgraph * cgraph) { (int) _tensor_update_configs.size()); if (ret != AEE_SUCCESS) { - LOG_ERROR("Failed to set tensors in host_graph: 0x%x\n", (int) ret); + LOG_ERROR("[%p]failed to set tensors in host_graph: 0x%x\n", (void *) this, (int) ret); return false; } diff --git a/ggml/src/ggml-qnn/npu/host/host_device.cpp b/ggml/src/ggml-qnn/npu/host/host_device.cpp index 443abe5c9e6fe..e88ef002bf0c1 100644 --- a/ggml/src/ggml-qnn/npu/host/host_device.cpp +++ b/ggml/src/ggml-qnn/npu/host/host_device.cpp @@ -149,37 +149,17 @@ bool npu_device::supports_op_impl(const ggml_tensor * op) { return false; } - auto * src0 = op->src[0]; - if (!src0) { - LOG_DEBUG("[%s]Unsupported inplace op: %s\n", get_name(), ggml_op_desc(op)); - return false; - } - - if (type_to_npu_type(src0->type) == NPU_DATA_TYPE_COUNT) { - LOG_DEBUG("[%s]Unsupported src0 tensor type: %s\n", get_name(), ggml_type_name(src0->type)); - return false; - } - - auto * src1 = op->src[1]; - if (src1 && type_to_npu_type(src1->type) == NPU_DATA_TYPE_COUNT) { - LOG_DEBUG("[%s]Unsupported src1 tensor type: %s\n", get_name(), ggml_type_name(src1->type)); - return false; - } - auto npu_op = op_to_npu_op(op->op); if (npu_op == NPU_OP_COUNT) { LOG_DEBUG("[%s]Unsupported op: %s\n", get_name(), ggml_op_desc(op)); return false; } - if (!_device_handle && !init_device()) { - LOG_DEBUG("[%s]NPU device initialization failed\n", get_name()); - return false; - } - - constexpr const auto get_spec = [](const ggml_tensor * tensor) -> npu_device_tensor_spec { + int i = 0; + npu_device_tensor_spec srcs[DEVICE_TENSOR_MAX_SRC] = {}; + constexpr const auto get_spec = [](const ggml_tensor * 
tensor) -> npu_device_tensor_spec { if (!tensor) { - return npu_device_tensor_spec{ {}, NPU_DATA_TYPE_COUNT }; + return npu_device_tensor_spec{ {}, {}, NPU_DATA_TYPE_COUNT }; } static_assert(DEVICE_TENSOR_MAX_DIMS == GGML_MAX_DIMS, "tensor dimensions mismatch"); @@ -188,19 +168,40 @@ bool npu_device::supports_op_impl(const ggml_tensor * op) { spec.ne[1] = tensor->ne[1]; spec.ne[2] = tensor->ne[2]; spec.ne[3] = tensor->ne[3]; + + spec.nb[0] = tensor->nb[0]; + spec.nb[1] = tensor->nb[1]; + spec.nb[2] = tensor->nb[2]; + spec.nb[3] = tensor->nb[3]; spec.type = type_to_npu_type(tensor->type); return spec; }; + for (; i < (int) DEVICE_TENSOR_MAX_SRC && op->src[i]; ++i) { + auto * src = op->src[i]; + if (type_to_npu_type(src->type) == NPU_DATA_TYPE_COUNT) { + LOG_DEBUG("[%s]Unsupported src%d tensor type: %s\n", get_name(), i, ggml_type_name(src->type)); + return false; + } + + srcs[i] = get_spec(src); + } + + if (!_device_handle && !init_device()) { + LOG_DEBUG("[%s]NPU device initialization failed\n", get_name()); + return false; + } + boolean supported = false; - auto src0_spec = get_spec(src0); - auto src1_spec = get_spec(src1); auto dst_spec = get_spec(op); - auto ret = npu_device_device_support_op(_device_handle, &src0_spec, &src1_spec, &dst_spec, npu_op, &supported); + auto ret = npu_device_device_support_op(_device_handle, npu_op, &dst_spec, srcs, i, &supported); if (ret != AEE_SUCCESS || !supported) { +#ifndef NDEBUG + auto * src0_type = i ? ggml_type_name(op->src[0]->type) : "null"; + auto * src1_type = (i > 1) ? ggml_type_name(op->src[1]->type) : "null"; LOG_DEBUG("[%s][%s]unsupported %s(%s,%s), ret: 0x%x, supported: %d\n", get_name(), ggml_op_name(op->op), - ggml_type_name(op->type), ggml_type_name(src0->type), (src1 ? ggml_type_name(src1->type) : "null"), - ret, supported); + ggml_type_name(op->type), src0_type, src1_type, ret, supported); +#endif return false; } diff --git a/ggml/src/ggml-qnn/npu/host/tensor.hpp b/ggml/src/ggml-qnn/npu/host/tensor.hpp index 71205b39fb7a8..07e092049ce14 100644 --- a/ggml/src/ggml-qnn/npu/host/tensor.hpp +++ b/ggml/src/ggml-qnn/npu/host/tensor.hpp @@ -21,15 +21,17 @@ class host_tensor { explicit host_tensor(ggml_tensor * tensor, int buffer_fd, uint64_t offset, remote_handle64 device_handle) : _device_handle(device_handle) { - // TODO: figure out why the npu_device_tensor_config can't be larger than 100 bytes - static_assert(sizeof(npu_device_tensor_config) < 100, "npu_device_tensor_config size too large"); - - _info.buffer_fd = buffer_fd; - _info.offset = offset; - _info.type = type_to_npu_type(tensor->type); - _info.size = ggml_nbytes(tensor); + static_assert(sizeof(npu_device_tensor_config) < kMaxNpuRpcStructSize, + "npu_device_tensor_config size too large"); + + _info.buffer_fd = buffer_fd; + _info.offset = offset; + _info.type = type_to_npu_type(tensor->type); + _info.size = ggml_nbytes(tensor); + _info.is_constant = false; // TODO: support constant tensors in the future // _info.op will be updated in update_params() + _info_update.op = NPU_OP_COUNT; static_assert(DEVICE_TENSOR_MAX_DIMS == GGML_MAX_DIMS, "tensor dimensions mismatch"); static_assert(sizeof(_info.ne) == sizeof(tensor->ne), "tensor ne size mismatch"); @@ -46,10 +48,10 @@ class host_tensor { tensor->extra = this; _ggml_tensor = tensor; - LOG_DEBUG("host_tensor(%p), ggml_tensor(%p[%ldx%ldx%ldx%ld], nb[%ld][%ld][%ld][%ld], %s), handle(%p)\n", - (void *) this, (void *) tensor, (long) tensor->ne[0], (long) tensor->ne[1], (long) tensor->ne[2], + LOG_DEBUG("host_tensor(%p), 
ggml_tensor(%s[%ldx%ldx%ldx%ld], nb[%ld][%ld][%ld][%ld], %s, %p), handle(%p)\n", + (void *) this, tensor->name, (long) tensor->ne[0], (long) tensor->ne[1], (long) tensor->ne[2], (long) tensor->ne[3], (long) tensor->nb[0], (long) tensor->nb[1], (long) tensor->nb[2], - (long) tensor->nb[3], ggml_type_name(tensor->type), (void *) _device_tensor_handle); + (long) tensor->nb[3], ggml_type_name(tensor->type), (void *) tensor, (void *) _device_tensor_handle); } ~host_tensor() { @@ -76,11 +78,9 @@ class host_tensor { auto new_op = op_to_npu_op(_ggml_tensor->op); bool params_changed = new_op != _info_update.op; if (params_changed) { - LOG_DEBUG("host_tensor(%p) op changed: %s -> %s\n", (void *) this, get_npu_op_desc(_info.op), - get_npu_op_desc(new_op)); + LOG_DEBUG("host_tensor(%p) op changed: %s\n", (void *) this, get_npu_op_desc(new_op)); } - _info.op = new_op; _info_update.op = new_op; if (memcmp(_info_update.params, _ggml_tensor->op_params, sizeof(_info_update.params)) != 0) { @@ -92,15 +92,16 @@ class host_tensor { } npu_device_tensor_handle_t src_tensor_handles[DEVICE_TENSOR_MAX_SRC] = {}; + static_assert(std::is_same::value, + "src tensor handles type mismatch"); + for (size_t j = 0; j < DEVICE_TENSOR_MAX_SRC && _ggml_tensor->src[j]; ++j) { - auto * src = host_tensor::from_ggml_tensor(_ggml_tensor->src[j]); + auto * ggml_src = _ggml_tensor->src[j]; + auto * src = host_tensor::from_ggml_tensor(ggml_src); src_tensor_handles[j] = src->get_device_tensor_handle(); - LOG_DEBUG("host_tensor(%p) set_src[%zu]: %p\n", (void *) this, j, (void *) src); + LOG_DEBUG("host_tensor(%p) set_src[%zu]: %p(%s)\n", (void *) this, j, (void *) src, ggml_src->name); } - static_assert(std::is_same::value, - "src tensor handles type mismatch"); - if (memcmp(_info_update.src_handles, src_tensor_handles, sizeof(_info_update.src_handles)) != 0) { params_changed = true; memcpy(_info_update.src_handles, src_tensor_handles, sizeof(_info_update.src_handles)); @@ -128,14 +129,14 @@ class host_tensor { GGML_ASSERT(ggml_tensor == _ggml_tensor); auto new_op = op_to_npu_op(_ggml_tensor->op); - _info.op = new_op; _info_update.op = new_op; memcpy(_info_update.params, _ggml_tensor->op_params, sizeof(_info_update.params)); for (size_t j = 0; j < DEVICE_TENSOR_MAX_SRC && _ggml_tensor->src[j]; ++j) { - auto * src = host_tensor::from_ggml_tensor(_ggml_tensor->src[j]); + auto * ggml_src = _ggml_tensor->src[j]; + auto * src = host_tensor::from_ggml_tensor(ggml_src); _info_update.src_handles[j] = src->get_device_tensor_handle(); - LOG_DEBUG("host_tensor(%p) set_src[%zu]: %p\n", (void *) this, j, (void *) src); + LOG_DEBUG("host_tensor(%p) set_src[%zu]: %p(%s)\n", (void *) this, j, (void *) src, ggml_src->name); } LOG_DEBUG("host_tensor(%p) update_params, op: %s, params: [%x, %x, %x, %x]\n", (void *) this, @@ -146,6 +147,15 @@ class host_tensor { bool is_valid() const { return _device_tensor_handle != 0; } + int64_t get_ne(size_t index) const { + if (index >= DEVICE_TENSOR_MAX_DIMS) { + LOG_ERROR("host_tensor(%p) get_ne: index out of bounds: %zu\n", (void *) this, index); + return 0; + } + + return _info.ne[index]; + } + private: remote_handle64 _device_handle = 0; npu_device_tensor_handle_t _device_tensor_handle = 0; diff --git a/ggml/src/ggml-qnn/npu/host/util.cpp b/ggml/src/ggml-qnn/npu/host/util.cpp index b62370d1ad845..0b005123333d2 100644 --- a/ggml/src/ggml-qnn/npu/host/util.cpp +++ b/ggml/src/ggml-qnn/npu/host/util.cpp @@ -6,7 +6,7 @@ #include "ggml-common.h" #undef GGML_COMMON_DECL_CPP -static_assert(sizeof(npu_device_block_q4_K) 
== sizeof(block_q4_K), "npu_device_block_q4_K size mismatch"); +static_assert(sizeof(npu_device_block_q4_k) == sizeof(block_q4_K), "npu_device_block_q4_k size mismatch"); static_assert(sizeof(npu_device_block_q4_0) == sizeof(block_q4_0), "npu_device_block_q4_0 size mismatch"); static_assert(sizeof(npu_device_block_q8_0) == sizeof(block_q8_0), "npu_device_block_q8_0 size mismatch"); static_assert(QUANT_K_SCALE_SIZE == K_SCALE_SIZE, "QUANT_K_SCALE_SIZE size mismatch"); @@ -27,6 +27,8 @@ enum npu_device_tensor_op op_to_npu_op(ggml_op op) { return NPU_OP_MUL; case GGML_OP_RMS_NORM: return NPU_OP_RMS_NORM; + case GGML_OP_FLASH_ATTN_EXT: + return NPU_OP_FLASH_ATTN; default: return NPU_OP_COUNT; } @@ -44,6 +46,8 @@ const char * get_npu_op_desc(enum npu_device_tensor_op op) { return ggml_op_name(GGML_OP_MUL); case NPU_OP_RMS_NORM: return ggml_op_name(GGML_OP_RMS_NORM); + case NPU_OP_FLASH_ATTN: + return ggml_op_name(GGML_OP_FLASH_ATTN_EXT); default: return "UNKNOWN"; } @@ -160,27 +164,65 @@ void get_op_tensor_desc(const ggml_tensor * dst, char * out, size_t max_len) { } }; - auto * src0 = dst->src[0]; - if (src0 == nullptr) { - print_tensor(dst, out, max_len); - return; - } + constexpr const auto get_src_tensor_count = [](const ggml_tensor * tensor) -> size_t { + for (size_t i = 0; i < GGML_MAX_SRC; ++i) { + if (!tensor->src[i]) { + return i; + } + } + + return GGML_MAX_SRC; + }; char dst_desc[256]; print_tensor(dst, dst_desc, sizeof(dst_desc)); - - char src0_desc[256]; - print_tensor(src0, src0_desc, sizeof(src0_desc)); - - auto * src1 = dst->src[1]; - if (src1 == nullptr) { - snprintf(out, max_len, "dst: %s, src0: %s", dst_desc, src0_desc); - return; + switch (get_src_tensor_count(dst)) { + case 4: + { + char src0_desc[256]; + print_tensor(dst->src[0], src0_desc, sizeof(src0_desc)); + char src1_desc[256]; + print_tensor(dst->src[1], src1_desc, sizeof(src1_desc)); + char src2_desc[256]; + print_tensor(dst->src[2], src2_desc, sizeof(src2_desc)); + char src3_desc[256]; + print_tensor(dst->src[3], src3_desc, sizeof(src3_desc)); + snprintf(out, max_len, "dst: %s, src0: %s, src1: %s, src2: %s, src3: %s", dst_desc, src0_desc, + src1_desc, src2_desc, src3_desc); + return; + } + case 3: + { + char src0_desc[256]; + print_tensor(dst->src[0], src0_desc, sizeof(src0_desc)); + char src1_desc[256]; + print_tensor(dst->src[1], src1_desc, sizeof(src1_desc)); + char src2_desc[256]; + print_tensor(dst->src[2], src2_desc, sizeof(src2_desc)); + snprintf(out, max_len, "dst: %s, src0: %s, src1: %s, src2: %s", dst_desc, src0_desc, src1_desc, + src2_desc); + return; + } + case 2: + { + char src0_desc[256]; + print_tensor(dst->src[0], src0_desc, sizeof(src0_desc)); + char src1_desc[256]; + print_tensor(dst->src[1], src1_desc, sizeof(src1_desc)); + snprintf(out, max_len, "dst: %s, src0: %s, src1: %s", dst_desc, src0_desc, src1_desc); + return; + } + case 1: + { + char src0_desc[256]; + print_tensor(dst->src[0], src0_desc, sizeof(src0_desc)); + snprintf(out, max_len, "dst: %s, src0: %s", dst_desc, src0_desc); + return; + } + default: + snprintf(out, max_len, "dst: %s", dst_desc); + return; } - - char src1_desc[256]; - print_tensor(src1, src1_desc, sizeof(src1_desc)); - snprintf(out, max_len, "dst: %s, src0: %s, src1: %s", dst_desc, src0_desc, src1_desc); } } // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/host/util.hpp b/ggml/src/ggml-qnn/npu/host/util.hpp index f8ec5c3b9f537..b4c2355cac298 100644 --- a/ggml/src/ggml-qnn/npu/host/util.hpp +++ b/ggml/src/ggml-qnn/npu/host/util.hpp @@ -26,4 +26,6 @@ void 
enable_unsigned_dsp_module(common::rpc_interface_ptr rpc_interface, uint32_ void get_op_tensor_desc(const ggml_tensor * dst, char * out, size_t max_len); +constexpr const size_t kMaxNpuRpcStructSize = 100; // TODO: figure out the actual size + } // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl b/ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl index ed20c125b379c..70626c90cbbed 100644 --- a/ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl +++ b/ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl @@ -3,7 +3,7 @@ #include "remote.idl" const uint32_t DEVICE_TENSOR_MAX_DIMS = 4; -const uint32_t DEVICE_TENSOR_MAX_SRC = 2; +const uint32_t DEVICE_TENSOR_MAX_SRC = 4; const uint32_t DEVICE_TENSOR_MAX_OP_PARAMS = 4; const uint32_t QUANT_BLOCK_SIZE = 32; const uint32_t QUANT_K_BLOCK_SIZE = 256; @@ -12,6 +12,7 @@ const uint32_t QUANT_K_SCALE_SIZE = 12; interface npu_device : remote_handle64{ typedef int64_t ne_type[DEVICE_TENSOR_MAX_DIMS]; + typedef uint64_t nb_type[DEVICE_TENSOR_MAX_DIMS]; typedef uint64_t tensor_handle_t; typedef uint64_t graph_handle_t; @@ -22,7 +23,7 @@ interface npu_device : remote_handle64{ uint8_t qs[QUANT_BLOCK_SIZE / 2]; }; - struct block_q4_K { + struct block_q4_k { fp16_t d; fp16_t dmin; uint8_t scales[QUANT_K_SCALE_SIZE]; @@ -40,6 +41,7 @@ interface npu_device : remote_handle64{ NPU_OP_SUB, NPU_OP_MUL, NPU_OP_RMS_NORM, + NPU_OP_FLASH_ATTN, NPU_OP_COUNT }; @@ -54,6 +56,7 @@ interface npu_device : remote_handle64{ struct tensor_spec { ne_type ne; + nb_type nb; tensor_data_type type; }; @@ -65,12 +68,12 @@ interface npu_device : remote_handle64{ struct tensor_config { ne_type ne; - uint64_t nb[DEVICE_TENSOR_MAX_DIMS]; + nb_type nb; long buffer_fd; uint64_t offset; uint64_t size; tensor_data_type type; - tensor_op op; + boolean is_constant; }; AEEResult device_get_alignment( @@ -78,10 +81,9 @@ interface npu_device : remote_handle64{ ); AEEResult device_support_op( - in tensor_spec src0, - in tensor_spec src1, - in tensor_spec dst, in tensor_op op, + in tensor_spec dst, + in sequence srcs, rout boolean is_supported ); diff --git a/ggml/src/ggml-qnn/shared/CMakeLists.txt b/ggml/src/ggml-qnn/shared/CMakeLists.txt index b08b2f07eb11c..c8f9cf7a84c99 100644 --- a/ggml/src/ggml-qnn/shared/CMakeLists.txt +++ b/ggml/src/ggml-qnn/shared/CMakeLists.txt @@ -20,6 +20,8 @@ if(GGML_QNN_ENABLE_HEXAGON_BACKEND) if(DEFINED ENV{QNN_SDK_PATH}) set(HEXAGON_SDK_ROOT $ENV{HEXAGON_SDK_ROOT}) message("found HEXAGON_SDK_ROOT, setting to ${HEXAGON_SDK_ROOT}") + elseif(EXISTS ${HEXAGON_SDK_ROOT}) + message("HEXAGON_SDK_ROOT: ${HEXAGON_SDK_ROOT}") else() message(FATAL_ERROR "HEXAGON_SDK_ROOT not defined") endif() From 332514cd5c837bdf8fd568469f478b4018a2fa76 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 20 Jun 2025 20:16:23 +0800 Subject: [PATCH 160/166] qnn fix: update device capabilities for quantized types in qnn-lib to improve compatibility --- ggml/src/ggml-qnn/qnn/ggml-qnn.cpp | 3 +++ ggml/src/ggml-qnn/qnn/qnn-lib.cpp | 6 +++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-qnn/qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/qnn/ggml-qnn.cpp index e559cfdb28627..4a13f3ec05734 100644 --- a/ggml/src/ggml-qnn/qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/qnn/ggml-qnn.cpp @@ -283,8 +283,11 @@ ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, qnn::get_backend_desc(dev_ctx->device)); dev_ctx->description = buffer; } + +#ifdef GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS // TODO: remove npu from here if hardware quantization is supported 
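    // enable_cpu_dequantize: quantized weights are dequantized into float32 on the host side
    // before this backend consumes them; per the TODO above, only the QNN CPU device opts in
    // until hardware quantized-tensor support is available.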
dev_ctx->enable_cpu_dequantize = device == QNN_BACKEND_CPU; +#endif ggml_backend_t qnn_backend = new ggml_backend{ /* .guid = */ ggml_backend_qnn_guid(), diff --git a/ggml/src/ggml-qnn/qnn/qnn-lib.cpp b/ggml/src/ggml-qnn/qnn/qnn-lib.cpp index e32bab5f9247d..7dbcaf968eb1a 100644 --- a/ggml/src/ggml-qnn/qnn/qnn-lib.cpp +++ b/ggml/src/ggml-qnn/qnn/qnn-lib.cpp @@ -38,7 +38,7 @@ constexpr const qnn::device_caps kDeviceCaps[] = { // all quantized types can be offload to CPU, at current implementation, those types will be dequantized into float32 on cpu 0xFFFFFE, #else - 0, + (1L << GGML_TYPE_F32), #endif 0, // 0 for no limitation @@ -50,7 +50,7 @@ constexpr const qnn::device_caps kDeviceCaps[] = { // all quantized types can be offload to GPU, at current implementation, those types will be dequantized into float32 on cpu 0xFFFFFE, #else - 0, + (1L << GGML_TYPE_F32) | (1L << GGML_TYPE_F16), #endif (128256L * 4096 * sizeof(float)), // tested on 8 gen 2, failed to allocate tensor with size 128256x4096 and float32 @@ -62,7 +62,7 @@ constexpr const qnn::device_caps kDeviceCaps[] = { (1L << GGML_TYPE_F32) | (1L << GGML_TYPE_F16) | (1L << GGML_TYPE_I16), (1L << GGML_TYPE_Q2_K) | (1L << GGML_TYPE_Q3_K) | (1L << GGML_TYPE_Q4_K) | (1L << GGML_TYPE_Q8_K), #else - 0, + (1L << GGML_TYPE_F32) | (1L << GGML_TYPE_F16), #endif (8192L * 2048 + 8192 * 512 + 2048 * 512) * sizeof(float), // TODO: should have a better way to get this value }, From 989772c7bcd7a967773d7800ce19d1f1df9b7de5 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Mon, 30 Jun 2025 14:45:02 +0800 Subject: [PATCH 161/166] fix compiling error --- ggml/src/ggml-qnn/qnn/backend-ops.cpp | 3 +++ ggml/src/ggml-qnn/qnn/op-config-caps.cpp | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/ggml/src/ggml-qnn/qnn/backend-ops.cpp b/ggml/src/ggml-qnn/qnn/backend-ops.cpp index 784e1deec77d6..669e5bd854bf8 100644 --- a/ggml/src/ggml-qnn/qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/qnn/backend-ops.cpp @@ -101,6 +101,7 @@ constexpr const bool kQnnSupportedOps[] = { false, // GGML_OP_TRANSPOSE false, // GGML_OP_GET_ROWS false, // GGML_OP_GET_ROWS_BACK + false, // GGML_OP_SET_ROWS false, // GGML_OP_DIAG false, // GGML_OP_DIAG_MASK_INF false, // GGML_OP_DIAG_MASK_ZERO @@ -119,6 +120,7 @@ constexpr const bool kQnnSupportedOps[] = { false, // GGML_OP_POOL_2D_BACK false, // GGML_OP_UPSCALE false, // GGML_OP_PAD + false, // GGML_OP_ROLL false, // GGML_OP_PAD_REFLECT_1D false, // GGML_OP_ARANGE false, // GGML_OP_TIMESTEP_EMBEDDING @@ -148,6 +150,7 @@ constexpr const bool kQnnSupportedOps[] = { false, // GGML_OP_CROSS_ENTROPY_LOSS false, // GGML_OP_CROSS_ENTROPY_LOSS_BACK false, // GGML_OP_OPT_STEP_ADAMW + false, // GGML_OP_GLU // ggml_unary_op false, // GGML_UNARY_OP_ABS diff --git a/ggml/src/ggml-qnn/qnn/op-config-caps.cpp b/ggml/src/ggml-qnn/qnn/op-config-caps.cpp index 2a6f7abca4044..9a5abef8e9d8b 100644 --- a/ggml/src/ggml-qnn/qnn/op-config-caps.cpp +++ b/ggml/src/ggml-qnn/qnn/op-config-caps.cpp @@ -146,6 +146,7 @@ constexpr const qnn_op_caps_t kOpCaps[] = { {}, // GGML_OP_TRANSPOSE {}, // GGML_OP_GET_ROWS {}, // GGML_OP_GET_ROWS_BACK + {}, // GGML_OP_SET_ROWS {}, // GGML_OP_DIAG {}, // GGML_OP_DIAG_MASK_INF {}, // GGML_OP_DIAG_MASK_ZERO @@ -164,6 +165,7 @@ constexpr const qnn_op_caps_t kOpCaps[] = { {}, // GGML_OP_POOL_2D_BACK {}, // GGML_OP_UPSCALE {}, // GGML_OP_PAD + {}, // GGML_OP_ROLL {}, // GGML_OP_PAD_REFLECT_1D {}, // GGML_OP_ARANGE @@ -194,6 +196,7 @@ constexpr const qnn_op_caps_t kOpCaps[] = { {}, // GGML_OP_CROSS_ENTROPY_LOSS {}, // 
GGML_OP_CROSS_ENTROPY_LOSS_BACK {}, // GGML_OP_OPT_STEP_ADAMW + {}, // GGML_OP_GLU // ggml_unary_op {}, // GGML_UNARY_OP_ABS @@ -320,6 +323,7 @@ constexpr const op_constructor_t kOpConstructors[] = { nullptr, // GGML_OP_TRANSPOSE nullptr, // GGML_OP_GET_ROWS nullptr, // GGML_OP_GET_ROWS_BACK + nullptr, // GGML_OP_SET_ROWS nullptr, // GGML_OP_DIAG nullptr, // GGML_OP_DIAG_MASK_INF nullptr, // GGML_OP_DIAG_MASK_ZERO @@ -338,6 +342,7 @@ constexpr const op_constructor_t kOpConstructors[] = { nullptr, // GGML_OP_POOL_2D_BACK nullptr, // GGML_OP_UPSCALE nullptr, // GGML_OP_PAD + nullptr, // GGML_OP_ROLL nullptr, // GGML_OP_PAD_REFLECT_1D nullptr, // GGML_OP_ARANGE nullptr, // GGML_OP_TIMESTEP_EMBEDDING @@ -367,6 +372,7 @@ constexpr const op_constructor_t kOpConstructors[] = { nullptr, // GGML_OP_CROSS_ENTROPY_LOSS nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK nullptr, // GGML_OP_OPT_STEP_ADAMW + nullptr, // GGML_OP_GLU // ggml_unary_op nullptr, // GGML_UNARY_OP_ABS From a29243e7a498e3eb55dda9e182d56738078769a1 Mon Sep 17 00:00:00 2001 From: nullname Date: Fri, 11 Jul 2025 16:58:45 +0800 Subject: [PATCH 162/166] feat: perf opt quant (#47) * feat: add mixed precision dot product implementation and function declaration * feat: implement mixed precision vector dot product and conversion functions * fix: update data type handling in matrix multiplication implementation * fix: adjust row count handling in matrix multiplication implementation for accurate slicing * fix: optimize matrix multiplication implementation by unroll loop * update performance tracking for matrix multiplication implementation * add fetching * wip * fix: support F16 * F32 multiplication in is_mul_mat_supported function * fix: improve src0 fetching logic in vec_dot_product_mixed_impl for better alignment handling * fix test failure for row width 67 * try fix failed test * fix: rename aligned_address to align_down for clarity in vector alignment handling * wip * qnn fix: update device capabilities for quantized types in qnn-lib to improve compatibility * fix test failure at width == 193 * fix: replace zero vector initialization with previous vector in mixed dot product implementation * wip * fix: improve handling of last vector in mixed dot product implementation * wip * wip * wip * wip * Enhance mul_mat_f32 function to support quantized types and improve static assertions * rename * Refactor dequantization functions to use npu_device_fp16_t and improve type handling * Optimize dequantization in dequantize_row_q8_0 by replacing qf32 multiplication with qf16 * Optimize dequantization in dequantize_row_q4_0 by replacing qf32 multiplication with qf16 * Add hvx_vsf_convert_vhf function for improved vector conversion * add perf logs * Refactor dequantize_row_q4_0 for alignment * Update logging in supports_op_impl and supports_op to use ggml_op_desc for better clarity * Add support for ROPE operation in NPU capabilities and related functions * Implement ROPE operation in tensor and op_rope, including cache initialization and correction dimension calculations * enable ROPE by adding operation validation * add support to freq is null case * wip * Refactor rope_f32 to improve indexing by introducing total_planes calculation * reformat * Refactor rope_f32 to optimize data access patterns by introducing row and plane pointers * Add performance tracking to rope_f32 function for enhanced profiling * Refactor rope_f32 to use a templated implementation * Refactor rope_impl to replace loop with memcpy for improved performance * Refactor mul_mat_impl 
to support quantization as a template parameter * wip * wip * Refactor rope_impl to optimize plane indexing in the processing loop * Add aligned vector dot product implementation for mixed precision types * wip * Enhance matrix multiplication for F32 and F16 types with alignment checks * Optimize vec_dot_product_mix_aligned_impl for improved performance with additional vector sums * Add alignment checks for matrix multiplication and vector dot products * Refactor matrix multiplication to use function pointers for improved readability and maintainability * Fix alignment check in is_dot_product_aligned to ensure correct vector size handling * Remove unused f16_to_f32_table parameter from quantization and dequantization functions * wip * Add L2 fetch for src1 plane rows in matrix multiplication implementation * wip * Refactor hvx_vsf_convert_vhf to accept an additional parameter for flexibility in vector multiplication * Refactor vec_dot_product_mix_aligned_impl to improve variable naming for clarity * Refactor load_dual_block_generic and dequantize_row_q4_0 to improve performance * Refactor vector operation functions to improve clarity and consistency in variable usage * wip * wip * Refactor dequantize_row_q4_0_impl for improved clarity and performance in vector operations * wip * Update load_dual_block_generic to use intrinsics * Refactor load_dual_block_generic and load_qual_block_generic for improved performance and clarity * wip * wip * Optimize dequantize_row_q8_0 for improved performance by unrolling for loop * wip * wip * fix typo --- .../src/ggml-qnn/npu/device/op_flash_attn.cpp | 9 +- ggml/src/ggml-qnn/npu/device/op_impl.cpp | 40 +- ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp | 208 +++++++--- ggml/src/ggml-qnn/npu/device/op_rope.cpp | 368 ++++++++++++++++++ ggml/src/ggml-qnn/npu/device/op_rope.hpp | 11 + ggml/src/ggml-qnn/npu/device/tensor.hpp | 9 + ggml/src/ggml-qnn/npu/device/type_traits.cpp | 262 ++++++++----- ggml/src/ggml-qnn/npu/device/type_traits.hpp | 14 +- ggml/src/ggml-qnn/npu/device/util.hpp | 16 + ggml/src/ggml-qnn/npu/device/vec_ops.cpp | 193 ++++++++- ggml/src/ggml-qnn/npu/device/vec_ops.hpp | 90 ++++- ggml/src/ggml-qnn/npu/host/graph.cpp | 12 +- ggml/src/ggml-qnn/npu/host/host_device.cpp | 10 +- ggml/src/ggml-qnn/npu/host/tensor.hpp | 33 +- ggml/src/ggml-qnn/npu/host/util.cpp | 10 + ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl | 8 +- ggml/src/ggml-qnn/shared/profiler.hpp | 2 + 17 files changed, 1096 insertions(+), 199 deletions(-) create mode 100644 ggml/src/ggml-qnn/npu/device/op_rope.cpp create mode 100644 ggml/src/ggml-qnn/npu/device/op_rope.hpp diff --git a/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp b/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp index 0c1ac778ba6f0..af0a122a7eefe 100644 --- a/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp @@ -39,7 +39,6 @@ void flash_attn_impl(hexagon::tensor * out, const hexagon::tensor * q, const hex const auto q_to_vec_dot = hexagon::get_type_traits(k->get_type()).from_float; // TODO: fix this const auto kq_vec_dot = hexagon::get_type_traits(k->get_type()).vec_dot; - const auto v_to_float = hexagon::get_type_traits(v->get_type()).to_float; if (!q_to_vec_dot || !kq_vec_dot) { DEVICE_LOG_ERROR("flash_attn_impl: unsupported data type for q, k, or v\n"); return; @@ -95,7 +94,6 @@ void flash_attn_impl(hexagon::tensor * out, const hexagon::tensor * q, const hex float M = -INFINITY; // maximum KQ value float * VKQ32 = reinterpret_cast(cache_ptr); // FP32 VKQ accumulator - float * 
V32 = VKQ32 + aligned_dv; // (temporary) FP32 V buffer auto * VKQ16 = reinterpret_cast(VKQ32 + aligned_dv); // (temporary) FP16 VKQ accumulator auto * Q_q = reinterpret_cast( VKQ32 + 2 * aligned_dv); // (temporary) buffer for Q converted to quantized/FP16 @@ -122,7 +120,7 @@ void flash_attn_impl(hexagon::tensor * out, const hexagon::tensor * q, const hex hexagon::l2fetch_row(q_data + q->get_nb(1), row_bytes_q); } - q_to_vec_dot(reinterpret_cast(q_data), Q_q, DK, params->f16_to_f32_table); + q_to_vec_dot(reinterpret_cast(q_data), Q_q, DK); // online softmax / attention // loop over n_kv and n_head_kv @@ -192,10 +190,7 @@ void flash_attn_impl(hexagon::tensor * out, const hexagon::tensor * q, const hex // V += v*expf(s - M) DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(flash_attn, 2, mad); - if (v_to_float) { - v_to_float(v_data, V32, DV, params->f16_to_f32_table); - hexagon::vec_mad_f32(V32, vs, VKQ32, DV); - } else { + { // V is F32 hexagon::vec_mad_f32(reinterpret_cast(v_data), vs, VKQ32, DV); } diff --git a/ggml/src/ggml-qnn/npu/device/op_impl.cpp b/ggml/src/ggml-qnn/npu/device/op_impl.cpp index 4d271a4899aec..6f89f454598ba 100644 --- a/ggml/src/ggml-qnn/npu/device/op_impl.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_impl.cpp @@ -6,6 +6,7 @@ #include "op_flash_attn.hpp" #include "op_mul_mat.hpp" +#include "op_rope.hpp" #include "type_traits.hpp" #include "vec_ops.hpp" @@ -62,7 +63,7 @@ inline void vec_op_impl(const _TyData * src0, const _TyData * src1, size_t count (leftover_bytes + hexagon::unaligned_bytes(iptr1) > hexagon::kBytesPerVector) ? *iptr1 : prev1; curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); - q6op_vstu_variable_ARV(optr, leftover_bytes, _OpIntrinsic(curr0, curr1)); + hexagon::q6op_vstu_variable_ARV(optr, leftover_bytes, _OpIntrinsic(curr0, curr1)); } } @@ -179,16 +180,6 @@ template bool element_wise_op(hexagon::tensor * out, hexagon::co return true; } -bool is_same_shape(const npu_device_tensor_spec & src, const npu_device_tensor_spec & dst) { - for (size_t i = 0; i < DEVICE_TENSOR_MAX_DIMS; ++i) { - if (src.ne[i] != dst.ne[i]) { - return false; - } - } - - return true; -} - bool is_element_wise_op_supported(npu_device_tensor_op op, const npu_device_tensor_spec * dst, const npu_device_tensor_spec * srcs, size_t src_len) { if (op != NPU_OP_ADD && op != NPU_OP_SUB && op != NPU_OP_MUL) { @@ -228,7 +219,7 @@ bool is_element_wise_op_supported(npu_device_tensor_op op, const npu_device_tens return false; } - if (!is_same_shape(src0, *dst)) { + if (!hexagon::is_same_shape(src0, *dst)) { DEVICE_LOG_DEBUG("[%s]src0 and dst have different shape\n", hexagon::op_get_name(op)); return false; } @@ -271,7 +262,7 @@ void rms_norm_vec_f32(const float * src, size_t count, float eps, float * dst) { Q6_V_valign_VVR(Q6_Vqf32_vmpy_VsfVsf(curr, curr), Q6_V_vzero(), leftover_bytes)); } - const float mean = hexagon::vec_reduction_qf32_f32(sum) / count; // TODO: figure out how to do division in vector + const float mean = hexagon::vec_reduction_f32_qf32(sum) / count; // TODO: figure out how to do division in vector const float scale = 1.0f / sqrtf(mean + eps); // TODO: use buildin blas sqrtf? 
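    // Scalar sketch of what this routine computes (the HVX intrinsics above are the vectorized form):
    //   sum    = sum_i src[i] * src[i]        (accumulated in qf32 above)
    //   scale  = 1.0f / sqrtf(sum / count + eps)
    //   dst[i] = src[i] * scale               (applied by vec_scale_f32 below)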
hexagon::vec_scale_f32(src, scale, dst, count); } @@ -354,7 +345,7 @@ bool is_unary_op_supported(npu_device_tensor_op op, const npu_device_tensor_spec return false; } - if (!is_same_shape(src0, *dst)) { + if (!hexagon::is_same_shape(src0, *dst)) { DEVICE_LOG_DEBUG("[%s]src0 and dst have different shape\n", hexagon::op_get_name(op)); return false; } @@ -396,7 +387,7 @@ constexpr const op_capabilities kOpCapabilities[] = { { element_wise_op>, // NPU_DATA_TYPE_F32 element_wise_op>, // NPU_DATA_TYPE_F16 - }, false, // requires_thread_barrier + }, false, // requires_thread_barrier }, { NPU_OP_RMS_NORM, is_unary_op_supported, @@ -412,6 +403,13 @@ constexpr const op_capabilities kOpCapabilities[] = { nullptr, // NPU_DATA_TYPE_F16 }, true, // requires_thread_barrier }, + { + NPU_OP_ROPE, hexagon::is_rope_supported, + { + hexagon::rope_f32, // NPU_DATA_TYPE_F32 + nullptr, // NPU_DATA_TYPE_F16 + }, false, // requires_thread_barrier + }, }; static_assert(kOpCapabilities[NPU_OP_MUL_MAT].compute_funcs[NPU_DATA_TYPE_F32] == hexagon::mul_mat_f32, @@ -424,6 +422,7 @@ static_assert(kOpCapabilities[NPU_OP_RMS_NORM].op == NPU_OP_RMS_NORM, "kOpArray[NPU_OP_RMS_NORM].op != NPU_OP_RMS_NORM"); static_assert(kOpCapabilities[NPU_OP_FLASH_ATTN].op == NPU_OP_FLASH_ATTN, "kOpArray[NPU_OP_FLASH_ATTN].op != NPU_OP_FLASH_ATTN"); +static_assert(kOpCapabilities[NPU_OP_ROPE].op == NPU_OP_ROPE, "kOpArray[NPU_OP_ROPE].op != NPU_OP_ROPE"); hexagon::compute_func_type get_compute_func_impl(npu_device_tensor_op op, npu_device_tensor_data_type type) { if (op >= NPU_OP_COUNT) { @@ -451,17 +450,18 @@ bool requires_thread_barrier(npu_device_tensor_op op) { bool support_op(npu_device_tensor_op op, const npu_device_tensor_spec * dst, const npu_device_tensor_spec * srcs, size_t src_len) { - if (get_compute_func_impl(op, dst->type) == nullptr) { - DEVICE_LOG_ERROR("[%s]unsupported, get_compute_func failed\n", op_get_name(op)); - return false; - } - auto is_supported_func = kOpCapabilities[op].is_supported; if (!is_supported_func || !is_supported_func(op, dst, srcs, src_len)) { DEVICE_LOG_DEBUG("[%s]unsupported, is_supported_func return false\n", op_get_name(op)); return false; } + if (get_compute_func_impl(op, dst->type) == nullptr) { + DEVICE_LOG_DEBUG("[%s]unsupported, get_compute_func failed, type: %s\n", op_get_name(op), + get_type_name(dst->type)); + return false; + } + return true; } diff --git a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp index 449f0edee1544..f8b4da8a21bcc 100644 --- a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp @@ -9,19 +9,24 @@ namespace { template struct get_data_type {}; -template struct get_data_type { - using type = _TyData; +template +struct get_data_type { + using data_type0 = _TyData0; + using data_type1 = _TyData1; }; -template +template void mul_mat_impl(hexagon::tensor * src0, hexagon::tensor * src1, hexagon::tensor * dst, hexagon::compute_params * params) { - using data_type = typename get_data_type::type; + using data_type0 = typename get_data_type::data_type0; + using data_type1 = typename get_data_type::data_type1; + + static_assert(!_IsQuantized || std::is_same_v, + "data_type0 must be the same as hexagon::dequant_target_type"); - const bool is_quantized = hexagon::is_quantized_type(src0->get_type()); const auto src0_actual_row_size = hexagon::get_dequantized_row_size(src0); auto * dequantize_row_func = hexagon::get_type_traits(src0->get_type()).to_float; - if (is_quantized && dequantize_row_func == 
nullptr) { + if (_IsQuantized && dequantize_row_func == nullptr) { DEVICE_LOG_ERROR("Unsupported quantized src0 type: %d, dequantize_row_func is null\n", src0->get_type()); return; } @@ -36,10 +41,10 @@ void mul_mat_impl(hexagon::tensor * src0, hexagon::tensor * src1, hexagon::tenso if (total_planes >= params->get_thread_count()) { start_end_plane = params->get_work_slice(total_planes); - } else if (dst->get_ne(1) >= params->get_thread_count()) { - start_end_row = params->get_work_slice(dst->get_ne(1)); - } else { + } else if (dst->get_ne(0) >= params->get_thread_count()) { start_end_element = params->get_work_slice(dst->get_ne(0)); + } else { + start_end_row = params->get_work_slice(dst->get_ne(1)); } if (start_end_plane.second <= start_end_plane.first || start_end_row.second <= start_end_row.first || @@ -57,30 +62,29 @@ void mul_mat_impl(hexagon::tensor * src0, hexagon::tensor * src1, hexagon::tenso size_t src0_plane_cache_size = 0; uint8_t * src0_plane_cache_ptr = nullptr; const uint8_t * last_cached_plane_ptr = nullptr; - bool is_mem_cache = false; - if (is_quantized) { + if constexpr (_IsQuantized) { src0_plane_slice_row_count = std::min(params->get_vtcm_quota_size() / src0_actual_row_size, src0_plane_slice_row_count); src0_plane_cache_size = src0_actual_row_size * src0_plane_slice_row_count; src0_plane_cache_ptr = params->get_vtcm_cache(src0_plane_cache_size); if (src0_plane_cache_ptr == nullptr) { - DEVICE_LOG_DEBUG( + DEVICE_LOG_ERROR( "mul_mat_impl: failed to get VTCM cache for src0, size: %zu, src0_plane_slice_row_count: %zu, " "src0_actual_row_size: %zu, will fallback to mem cache\n", src0_plane_cache_size, src0_plane_slice_row_count, src0_actual_row_size); - src0_plane_cache_ptr = params->get_mem_cache(src0_plane_cache_size); - is_mem_cache = true; + return; } } DEVICE_LOG_DEBUG( "mul_mat_impl src0_actual_row_size: %zu, src0_plane_slice_row_count: %zu, is_quantized: %d, vtcm_mem: " "%p(%zu)\n", - src0_actual_row_size, src0_plane_slice_row_count, is_quantized, (void *) src0_plane_cache_ptr, + src0_actual_row_size, src0_plane_slice_row_count, _IsQuantized, (void *) src0_plane_cache_ptr, src0_plane_cache_size); - const size_t valid_row_bytes = src1->get_ne(0) * sizeof(data_type); - DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_WITH_SUB_PROC(dst, params->get_thread_index(), dequant); + const size_t valid_row0_bytes = src0->get_ne(0) * sizeof(data_type0); + const size_t valid_row1_bytes = src1->get_ne(0) * sizeof(data_type1); + DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_WITH_MULTI_SUB_PROC(dst, params->get_thread_index(), mul_mat); uint8_t * dst_ptr = dst->get_write_buffer(); if (!dst_ptr) { @@ -89,8 +93,9 @@ void mul_mat_impl(hexagon::tensor * src0, hexagon::tensor * src1, hexagon::tenso return; } - const uint8_t * src0_ptr = src0->get_read_buffer(); - const uint8_t * src1_ptr = src1->get_read_buffer(); + constexpr bool should_fetch_src0_row = !_IsQuantized; + const uint8_t * src0_ptr = src0->get_read_buffer(); + const uint8_t * src1_ptr = src1->get_read_buffer(); for (int64_t ip = start_end_plane.first; ip < start_end_plane.second; ip++) { const auto i3 = ip / dst->get_ne(2); const auto i2 = ip - i3 * dst->get_ne(2); @@ -98,21 +103,25 @@ void mul_mat_impl(hexagon::tensor * src0, hexagon::tensor * src1, hexagon::tenso auto * dst_plane = dst_ptr + i3 * dst->get_nb(3) + i2 * dst->get_nb(2); for (int64_t col_idx = start_end_element.first; col_idx < start_end_element.second; col_idx += src0_plane_slice_row_count) { - const auto * src0_plane = + const auto actual_row_count = + 
std::min(src0_plane_slice_row_count, + start_end_element.second - col_idx); // number of rows in this slice + const uint8_t * src0_plane = src0_ptr + i3 / r03 * src0->get_nb(3) + i2 / r02 * src0->get_nb(2) + col_idx * src0->get_nb(1); - if (src0_plane_cache_ptr) { + if constexpr (_IsQuantized) { if (last_cached_plane_ptr != src0_plane) { - DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_SUB_PROC(dequant); + DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(mul_mat, 0, dequant); - for (int64_t ir = 0; ir < (int64_t) src0_plane_slice_row_count; ir++) { + for (int64_t ir = 0; ir < (int64_t) actual_row_count; ir++) { auto * src0_row = src0_plane + ir * src0->get_nb(1); - if (ir + 1 < src0_plane_slice_row_count) { + if (ir + 1 < actual_row_count) { hexagon::l2fetch_row(src0_row + src0->get_nb(1), src0->get_nb(1)); } - auto * dst_row = reinterpret_cast(src0_plane_cache_ptr + ir * src0_actual_row_size); - dequantize_row_func(src0_row, reinterpret_cast(dst_row), src0->get_ne(0), - params->f16_to_f32_table); + auto * dst_row = reinterpret_cast(src0_plane_cache_ptr + + ir * src0_actual_row_size); + dequantize_row_func(src0_row, reinterpret_cast(dst_row), + src0->get_ne(0)); } last_cached_plane_ptr = src0_plane; @@ -121,22 +130,43 @@ void mul_mat_impl(hexagon::tensor * src0, hexagon::tensor * src1, hexagon::tenso src0_plane = src0_plane_cache_ptr; } + if (start_end_row.second > start_end_row.first) { + hexagon::l2fetch_row(src1_plane + start_end_row.first * src1->get_nb(1), valid_row1_bytes); + } + for (int64_t i1 = start_end_row.first; i1 < start_end_row.second; i1++) { - auto * src1_row = src1_plane + i1 * src1->get_nb(1); - auto * dst_row = reinterpret_cast(dst_plane + i1 * dst->get_nb(1)) + col_idx; - for (int64_t i0 = 0; i0 < (int64_t) src0_plane_slice_row_count; i0++) { + DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(mul_mat, 1, vec_dot); + auto * src1_row = src1_plane + i1 * src1->get_nb(1); + auto * dst_row = reinterpret_cast(dst_plane + i1 * dst->get_nb(1)) + col_idx; + int64_t i0 = 0; + for (; i0 + 1 < (int64_t) actual_row_count; i0 += 2) { auto * src0_row = src0_plane + i0 * src0_actual_row_size; - if (i0 + 1 < src0_plane_slice_row_count) { - if (!src0_plane_cache_ptr || is_mem_cache) { - hexagon::l2fetch_row(src0_row + src0_actual_row_size, valid_row_bytes); - } - } else if (ip + 1 < start_end_plane.second) { - hexagon::l2fetch_row(src1_row + src1->get_nb(1), valid_row_bytes); + if constexpr (should_fetch_src0_row) { + hexagon::l2fetch_row(src0_row + src0_actual_row_size, valid_row0_bytes); + } + + // TODO: figure dst how to handle a entire row + dst_row[i0] = _DotFunc(reinterpret_cast(src0_row), + reinterpret_cast(src1_row), (size_t) src0->get_ne(0)); + + if (should_fetch_src0_row && i0 + 2 < (int64_t) actual_row_count) { + hexagon::l2fetch_row(src0_row + src0_actual_row_size + src0_actual_row_size, valid_row0_bytes); } // TODO: figure dst how to handle a entire row - dst_row[i0] = _DotFunc(reinterpret_cast(src0_row), - reinterpret_cast(src1_row), (size_t) src0->get_ne(0)); + dst_row[i0 + 1] = + _DotFunc(reinterpret_cast(src0_row + src0_actual_row_size), + reinterpret_cast(src1_row), (size_t) src0->get_ne(0)); + } + + if (ip + 1 < start_end_plane.second) { + hexagon::l2fetch_row(src1_row + src1->get_nb(1), valid_row1_bytes); + } + + if (i0 < (int64_t) actual_row_count) { + auto * src0_row = src0_plane + i0 * src0_actual_row_size; + dst_row[i0] = _DotFunc(reinterpret_cast(src0_row), + reinterpret_cast(src1_row), (size_t) src0->get_ne(0)); } } } @@ -146,7 +176,7 @@ void 
mul_mat_impl(hexagon::tensor * src0, hexagon::tensor * src1, hexagon::tenso } bool is_quantized_mul_mat_supported(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1) { - if (src1.type != NPU_DATA_TYPE_F32) { + if (src1.type != NPU_DATA_TYPE_F32 && src1.type != NPU_DATA_TYPE_F16) { DEVICE_LOG_DEBUG("[MUL_MAT]src0.type(%s) and src1.type(%s) mismatch and src1 is not F32\n", hexagon::get_type_name(src0.type), hexagon::get_type_name(src1.type)); return false; @@ -166,7 +196,7 @@ bool is_quantized_mul_mat_supported(const npu_device_tensor_spec & src0, const n } const auto vtcm_thread_quota_size = hexagon::default_thread_pool::get_per_thread_vtcm_quota(); - if (src0.ne[0] * sizeof(hexagon::dequantized_element_type) > vtcm_thread_quota_size) { + if (src0.ne[0] * sizeof(hexagon::dequant_target_type) > vtcm_thread_quota_size) { DEVICE_LOG_DEBUG("[MUL_MAT]src0.type(%s) ne[0] is too large: %ld, vtcm_thread_quota_size: %zu\n", hexagon::get_type_name(src0.type), (long) src0.ne[0], vtcm_thread_quota_size); return false; @@ -177,29 +207,113 @@ bool is_quantized_mul_mat_supported(const npu_device_tensor_spec & src0, const n return true; } +bool is_mul_mat_f16_f32_src_tensors_aligned(hexagon::tensor * src0, hexagon::tensor * src1, bool is_src0_quantized) { + const auto * src1_ptr = src1->get_read_buffer_as(); + const auto * src0_ptr = is_src0_quantized ? + src1->get_read_buffer_as() : + src0->get_read_buffer_as(); // skip src0 for quantized tensors + + if (!hexagon::is_f16_f32_dot_product_aligned(src0_ptr, src1_ptr, src0->get_ne(0))) { + DEVICE_LOG_DEBUG("[MUL_MAT]src_tensors_unaligned: ne[0]: %ld\n", (long) src0->get_ne(0)); + return false; + } + + DEVICE_LOG_DEBUG("[MUL_MAT]src_tensors_aligned: ne[0]: %ld\n", (long) src0->get_ne(0)); + return true; +} + +bool is_mul_mat_f16_f16_src_tensors_aligned(hexagon::tensor * src0, hexagon::tensor * src1, bool is_src0_quantized) { + const auto * src1_ptr = src1->get_read_buffer_as(); + const auto * src0_ptr = is_src0_quantized ? 
src1_ptr : src0->get_read_buffer_as(); + + if (!hexagon::is_f16_f16_dot_product_aligned(src0_ptr, src1_ptr, src0->get_ne(0))) { + DEVICE_LOG_DEBUG("[MUL_MAT]src_tensors_unaligned: ne[0]: %ld\n", (long) src0->get_ne(0)); + return false; + } + + DEVICE_LOG_DEBUG("[MUL_MAT]src_tensors_aligned: ne[0]: %ld\n", (long) src0->get_ne(0)); + return true; +} + +bool is_mul_mat_f32_f32_src_tensors_aligned(hexagon::tensor * src0, hexagon::tensor * src1) { + const auto * src1_ptr = src1->get_read_buffer_as(); + const auto * src0_ptr = src0->get_read_buffer_as(); + + if (!hexagon::is_f32_f32_dot_product_aligned(src0_ptr, src1_ptr, src0->get_ne(0))) { + DEVICE_LOG_DEBUG("[MUL_MAT]src_tensors_unaligned: ne[0]: %ld\n", (long) src0->get_ne(0)); + return false; + } + + DEVICE_LOG_DEBUG("[MUL_MAT]src_tensors_aligned: ne[0]: %ld\n", (long) src0->get_ne(0)); + return true; +} + +typedef void (*mul_mat_func_type)(hexagon::tensor * src0, hexagon::tensor * src1, hexagon::tensor * dst, + hexagon::compute_params * params); + +constexpr const mul_mat_func_type kMulMatF16F32Funcs[2][2] = { + { + // non-quantized + mul_mat_impl, // F32 * F32 unaligned + mul_mat_impl, // F32 * F32 aligned + }, + { + // quantized + mul_mat_impl, // F32 * F32 quantized unaligned + mul_mat_impl, // F32 * F32 quantized aligned + }, +}; + +constexpr const mul_mat_func_type kMulMatF16Funcs[2][2] = { + { + // non-quantized + mul_mat_impl, // F16 * F16 unaligned + mul_mat_impl, // F16 * F16 aligned + }, + { + // quantized + mul_mat_impl, // F16 * F16 quantized unaligned + mul_mat_impl, // F16 * F16 quantized aligned + }, +}; + } // namespace namespace hexagon { bool mul_mat_f32(hexagon::tensor * out, compute_params * params) { + static_assert(DEVICE_TENSOR_MAX_DIMS == 4, "mul_mat_f32 requires max dims 4"); + static_assert(std::is_same::value || + std::is_same::value, + "dequant_target_type must be float or npu_device_fp16_t"); + if (!out) { return false; } - static_assert(DEVICE_TENSOR_MAX_DIMS == 4, "mul_mat_f32 requires max dims 4"); auto * src0 = out->get_src(0); auto * src1 = out->get_src(1); if (!src0 || !src1) { return true; // skip if no src } + const bool is_src0_quantized = is_quantized_type(src0->get_type()); switch (src1->get_type()) { case NPU_DATA_TYPE_F32: - mul_mat_impl(src0, src1, out, params); + if (is_src0_quantized || src0->get_type() == NPU_DATA_TYPE_F16) { + kMulMatF16F32Funcs[is_src0_quantized][is_mul_mat_f16_f32_src_tensors_aligned( + src0, src1, is_src0_quantized)](src0, src1, out, params); + } else { + if (is_mul_mat_f32_f32_src_tensors_aligned(src0, src1)) { + mul_mat_impl(src0, src1, out, params); + } else { + mul_mat_impl(src0, src1, out, params); + } + } return true; - case NPU_DATA_TYPE_F16: - mul_mat_impl(src0, src1, out, params); + kMulMatF16Funcs[is_src0_quantized][is_mul_mat_f16_f16_src_tensors_aligned(src0, src1, is_src0_quantized)]( + src0, src1, out, params); return true; default: break; @@ -229,6 +343,12 @@ bool is_mul_mat_supported(npu_device_tensor_op op, const npu_device_tensor_spec const auto & src0 = srcs[0]; const auto & src1 = srcs[1]; if (src0.type != src1.type) { + if (src1.type == NPU_DATA_TYPE_F32 && src0.type == NPU_DATA_TYPE_F16) { + DEVICE_LOG_DEBUG("[%s]src0.type(%s) and src1.type(%s) mismatch, but src0 is F16 and src1 is F32\n", + op_get_name(op), get_type_name(src0.type), get_type_name(src1.type)); + return true; // F16 * F32 is supported + } + #ifdef GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS if (!is_quantized_mul_mat_supported(src0, src1)) { return false; diff --git 
a/ggml/src/ggml-qnn/npu/device/op_rope.cpp b/ggml/src/ggml-qnn/npu/device/op_rope.cpp new file mode 100644 index 0000000000000..514c445290ef2 --- /dev/null +++ b/ggml/src/ggml-qnn/npu/device/op_rope.cpp @@ -0,0 +1,368 @@ +#include "op_rope.hpp" + +#include "type_traits.hpp" + +#ifndef M_PI +# define M_PI (3.14159265358979323846) +#endif + +namespace { + +// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get +// `corr_dim(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))` +float rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) { + return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float) M_PI)) / (2 * logf(base)); +} + +void rope_yarn_corr_dims(int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]) { + // start and end correction dims + float start = floorf(rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base)); + float end = ceilf(rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base)); + dims[0] = std::max(0, start); + dims[1] = std::min(n_dims - 1, end); +} + +float rope_yarn_ramp(const float low, const float high, const int i0) { + const float y = (i0 / 2 - low) / std::max(0.001f, high - low); + return 1 - std::min(1, std::max(0, y)); +} + +// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn +// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng. +void rope_yarn(float theta_extrap, float freq_scale, float corr_dims[2], int64_t i0, float ext_factor, float mscale, + float * cos_theta, float * sin_theta) { + // Get n-d rotational scaling corrected for extrapolation + float theta_interp = freq_scale * theta_extrap; + float theta = theta_interp; + if (ext_factor != 0.0f) { + float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor; + theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; + + // Get n-d magnitude scaling corrected for interpolation + mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale); + } + *cos_theta = cosf(theta) * mscale; + *sin_theta = sinf(theta) * mscale; +} + +void rope_cache_init(float theta_base, float freq_scale, const float * freq_factors, float corr_dims[2], int64_t ne0, + float ext_factor, float mscale, float * cache, float sin_sign, float theta_scale) { + // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py + float theta = theta_base; + for (int64_t i0 = 0; i0 < ne0; i0 += 2) { + const float ff = freq_factors ? freq_factors[i0 / 2] : 1.0f; + rope_yarn(theta / ff, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1]); + cache[i0 + 1] *= sin_sign; + + theta *= theta_scale; + } +} + +void mrope_cache_init(float theta_base_t, float theta_base_h, float theta_base_w, float theta_base_e, + const int sections[4], bool indep_sects, float freq_scale, const float * freq_factors, + float corr_dims[2], int64_t ne0, float ext_factor, float mscale, float * cache, float sin_sign, + float theta_scale) { + // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py + float theta_t = theta_base_t; + float theta_h = theta_base_h; + float theta_w = theta_base_w; + float theta_e = theta_base_e; // extra position id for vision encoder + int sect_dims = sections[0] + sections[1] + sections[2] + sections[3]; + int sec_w = sections[1] + sections[0]; + int sec_e = sections[2] + sec_w; + + for (int64_t i0 = 0; i0 < ne0; i0 += 2) { + const float ff = freq_factors ? 
freq_factors[i0 / 2] : 1.0f; + + int sector = (i0 / 2) % sect_dims; + if (indep_sects) { + // compute theta independently for each dim sections + // (i.e. reset corresponding theta when `i0` go from one section to another) + if (sector == 0) { + theta_t = theta_base_t; + } else if (sector == sections[0]) { + theta_h = theta_base_h; + } else if (sector == sec_w) { + theta_w = theta_base_w; + } else if (sector == sec_e) { + theta_e = theta_base_e; + } + } + + float theta = theta_t; + if (sector >= sections[0] && sector < sec_w) { + theta = theta_h; + } else if (sector >= sec_w && sector < sec_w + sections[2]) { + theta = theta_w; + } else if (sector >= sec_w + sections[2]) { + theta = theta_e; + } + + rope_yarn(theta / ff, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1]); + cache[i0 + 1] *= sin_sign; + + theta_t *= theta_scale; + theta_w *= theta_scale; + theta_h *= theta_scale; + theta_e *= theta_scale; + } +} + +template +bool rope_impl(hexagon::tensor * out, hexagon::compute_params * params) { + const auto * src0 = out->get_src(0); + const auto * src1 = out->get_src(1); + const auto * src2 = out->get_src(2); + + const int n_dims = out->get_op_param(1); + const int n_ctx_orig = out->get_op_param(4); + const int sections[4] = { + out->get_op_param(11), + out->get_op_param(12), + out->get_op_param(13), + out->get_op_param(14), + }; + + const float freq_base = out->get_op_param(5); + const float freq_scale = out->get_op_param(6); + const float ext_factor = out->get_op_param(7); + const float attn_factor = out->get_op_param(8); + const float beta_fast = out->get_op_param(9); + const float beta_slow = out->get_op_param(10); + const float theta_scale = powf(freq_base, -2.0f / n_dims); + + float corr_dims[2]; + rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); + + if (_IsMrope && sections[0] <= 0 && sections[1] <= 0 && sections[2] <= 0) { + DEVICE_LOG_ERROR("[ROPE]invalid sections for MROPE: %d, %d, %d\n", sections[0], sections[1], sections[2]); + return false; // invalid sections for MROPE + } + + if (n_dims % 2 || (_IsVision && n_dims != out->get_ne(0) / 2)) { + DEVICE_LOG_ERROR("[ROPE]invalid n_dims for vision ROPE: %d, expected: %d\n", n_dims, out->get_ne(0) / 2); + return false; // invalid n_dims for vision ROPE + } + + // cache size is (ne0 + CACHE_LINE_SIZE_F32) + const size_t total_cache_size = hexagon::get_aligned_size(out->get_ne(0) * sizeof(float)); + auto * cache_ptr = params->get_vtcm_cache(total_cache_size); + if (!cache_ptr) { + DEVICE_LOG_ERROR("[ROPE]Failed to allocate VTCM cache for flash_attn: %zu bytes\n", total_cache_size); + return false; // failed to allocate cache + } + + const float * freq_factors = nullptr; + if (src2 != nullptr) { + if (src2->get_type() != NPU_DATA_TYPE_F32 || src2->get_ne(0) < n_dims / 2) { + DEVICE_LOG_ERROR("[ROPE]src2 type is not F32 or F16: %s\n", hexagon::get_type_name(src2->get_type())); + return false; // unsupported src2 type + } + + freq_factors = src2->get_read_buffer_as(); + } + + const int64_t total_rows = out->get_ne(3) * out->get_ne(2) * out->get_ne(1); + const auto start_end_row = params->get_work_slice(total_rows); + const auto start_end_plane = + std::pair{ start_end_row.first / out->get_ne(1), + (start_end_row.second + out->get_ne(1) - 1) / out->get_ne(1) }; + + DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_WITH_MULTI_SUB_PROC(out, params->get_thread_index(), rope); + + const float sin_sign = 1.0f; + const int32_t * pos = src1->get_read_buffer_as(); + const uint8_t * 
src0_data_ptr = src0->get_read_buffer(); + uint8_t * dst_data_ptr = out->get_write_buffer(); + for (int64_t ip = start_end_plane.first; ip < start_end_plane.second; ip++) { + int64_t i3 = ip / out->get_ne(2); // batch + int64_t i2 = ip % out->get_ne(2); // seq-len + float * cache = reinterpret_cast(cache_ptr); + if constexpr (!_IsMrope) { + DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(rope, 0, cache); + const int64_t p = pos[i2]; + rope_cache_init(p, freq_scale, freq_factors, corr_dims, out->get_ne(0), ext_factor, attn_factor, cache, + sin_sign, theta_scale); + } else { + DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(rope, 0, cache); + const int64_t p_t = pos[i2]; + const int64_t p_h = pos[i2 + out->get_ne(2)]; + const int64_t p_w = pos[i2 + out->get_ne(2) * 2]; + const int64_t p_e = pos[i2 + out->get_ne(2) * 3]; + mrope_cache_init(p_t, p_h, p_w, p_e, sections, _IsVision, freq_scale, freq_factors, corr_dims, + out->get_ne(0), ext_factor, attn_factor, cache, sin_sign, theta_scale); + } + + DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(rope, 1, loop); + const uint8_t * src0_plane = src0_data_ptr + i3 * src0->get_nb(3) + i2 * src0->get_nb(2); + uint8_t * dst_plane = dst_data_ptr + i3 * out->get_nb(3) + i2 * out->get_nb(2); + const int64_t start_row = ip == start_end_plane.first ? (start_end_row.first % out->get_ne(1)) : 0; + const int64_t end_row = ip == start_end_plane.second ? (start_end_row.second % out->get_ne(1)) : + out->get_ne(1); // end row is exclusive + for (int64_t i1 = start_row; i1 < end_row; i1++) { // attn-heads + const uint8_t * src0_row = src0_plane + i1 * src0->get_nb(1); + uint8_t * dst_row = dst_plane + i1 * out->get_nb(1); + if constexpr (_IsNeoX || _IsMrope) { + if constexpr (_IsVision) { + for (int64_t i0 = 0; i0 < n_dims; i0 += 2) { + const int64_t ic = i0 / 2; + + const float cos_theta = cache[i0 + 0]; + const float sin_theta = cache[i0 + 1]; + + const float * const src = (float *) (src0_row + ic * src0->get_nb(0)); + float * dst_data = (float *) (dst_row + ic * out->get_nb(0)); + + const float x0 = src[0]; + const float x1 = src[n_dims]; + + dst_data[0] = x0 * cos_theta - x1 * sin_theta; + dst_data[n_dims] = x0 * sin_theta + x1 * cos_theta; + } + } else { + for (int64_t i0 = 0; i0 < n_dims; i0 += 2) { + const int64_t ic = i0 / 2; + + const float cos_theta = cache[i0 + 0]; + const float sin_theta = cache[i0 + 1]; + + const float * const src = (float *) (src0_row + ic * src0->get_nb(0)); + float * dst_data = (float *) (dst_row + ic * out->get_nb(0)); + + const float x0 = src[0]; + const float x1 = src[n_dims / 2]; + + dst_data[0] = x0 * cos_theta - x1 * sin_theta; + dst_data[n_dims / 2] = x0 * sin_theta + x1 * cos_theta; + } + } + } else { + for (int64_t i0 = 0; i0 < n_dims; i0 += 2) { + const float cos_theta = cache[i0 + 0]; + const float sin_theta = cache[i0 + 1]; + + const float * const src = (float *) (src0_row + i0 * src0->get_nb(0)); + float * dst_data = (float *) (dst_row + i0 * out->get_nb(0)); + + const float x0 = src[0]; + const float x1 = src[1]; + + dst_data[0] = x0 * cos_theta - x1 * sin_theta; + dst_data[1] = x0 * sin_theta + x1 * cos_theta; + } + } + + if constexpr (_IsVision) { + for (int64_t i0 = n_dims; i0 < out->get_ne(0); i0 += 2) { + const int64_t ic = i0 / 2; + + const float cos_theta = cache[i0 + 0]; + const float sin_theta = cache[i0 + 1]; + + const float * const src = (float *) (src0_row + ic * src0->get_nb(0)); + float * dst_data = (float *) (dst_row + ic * out->get_nb(0)); + + const float x0 = src[0]; + const float x1 
= src[n_dims]; + + dst_data[0] = x0 * cos_theta - x1 * sin_theta; + dst_data[n_dims] = x0 * sin_theta + x1 * cos_theta; + } + } else { + // fill the remain channels with data from src tensor + memcpy(dst_row + n_dims * out->get_nb(0), src0_row + n_dims * src0->get_nb(0), + (out->get_ne(0) - n_dims) * sizeof(float)); + } + } + } + + out->release_write_buffer(); + return true; +} + +typedef bool (*rope_impl_func)(hexagon::tensor * out, hexagon::compute_params * params); + +constexpr const rope_impl_func kRopeImplFuncs[8] = { + rope_impl, // IsNotNeoX, IsNotMrope, IsNotVision + rope_impl, // IsNotNeoX, IsNotMrope, IsVision + rope_impl, // IsNotNeoX, IsMrope, IsNotVision + rope_impl, // IsNotNeoX, IsMrope, IsVision + rope_impl, // IsNeoX, IsNotMrope, IsNotVision + rope_impl, // IsNeoX, IsNotMrope, IsVision + rope_impl, // IsNeoX, IsMrope, IsNotVision + rope_impl, // IsNeoX, IsMrope, IsVision +}; + +} // namespace + +namespace hexagon { + +bool rope_f32(tensor * out, compute_params * params) { + const int mode = out->get_op_param(2); + const bool is_neox = mode & NPU_ROPE_TYPE_NEOX; + const bool is_mrope = mode & NPU_ROPE_TYPE_MROPE; // ggml_rope_multi, multimodal rotary position embedding + const bool is_vision = mode == NPU_ROPE_TYPE_VISION; + + size_t impl_index = is_neox ? 4 : 0; + impl_index += is_mrope ? 2 : 0; + impl_index += is_vision ? 1 : 0; + + if (impl_index >= sizeof(kRopeImplFuncs) / sizeof(kRopeImplFuncs[0])) { + DEVICE_LOG_ERROR("[ROPE]invalid impl_index: %zu\n", impl_index); + return false; // invalid impl index + } + + return kRopeImplFuncs[impl_index](out, params); +} + +bool is_rope_supported(npu_device_tensor_op op, const npu_device_tensor_spec * dst, const npu_device_tensor_spec * srcs, + size_t src_len) { + if (op != NPU_OP_ROPE) { + DEVICE_LOG_DEBUG("[%s]op is not ROPE\n", op_get_name(op)); + return false; + } + + if (src_len < 2 || !dst || !srcs) { + // freq can be optional, but we require at least 2 srcs: src0 and src1 + DEVICE_LOG_DEBUG("[%s]invalid dst or srcs\n", op_get_name(op)); + return false; + } + + if (dst->type != NPU_DATA_TYPE_F32) { + DEVICE_LOG_DEBUG("[%s]dst type is not F32: %s\n", op_get_name(op), get_type_name(dst->type)); + return false; // add more dst type if needed + } + + const auto & src0 = srcs[0]; + if (src0.type != dst->type) { + DEVICE_LOG_DEBUG("[%s]src0 type is not the same as dst type: %s vs %s\n", op_get_name(op), + get_type_name(src0.type), get_type_name(dst->type)); + return false; // unsupported src0 type + } + + const auto & src1 = srcs[1]; + if (src1.type != NPU_DATA_TYPE_I32) { + DEVICE_LOG_DEBUG("[%s]src1 type is not I32: %s\n", op_get_name(op), get_type_name(src1.type)); + return false; // unsupported src1 type + } + + if (src_len > 2) { + const auto & src2 = srcs[2]; + if (src2.type != NPU_DATA_TYPE_F32) { + DEVICE_LOG_DEBUG("[%s]src2 type is not F32: %s\n", op_get_name(op), get_type_name(src2.type)); + return false; // unsupported src2 type + } + + DEVICE_LOG_DEBUG("[%s]freq is present\n", op_get_name(op)); + } + + if (!is_same_shape(src0, *dst)) { + DEVICE_LOG_DEBUG("[%s]src0 and dst have different shape\n", op_get_name(op)); + return false; + } + + // TODO: check the params for ROPE operation + return true; // ROPE operation is not supported yet +} + +} // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/device/op_rope.hpp b/ggml/src/ggml-qnn/npu/device/op_rope.hpp new file mode 100644 index 0000000000000..f2be465ae1a3d --- /dev/null +++ b/ggml/src/ggml-qnn/npu/device/op_rope.hpp @@ -0,0 +1,11 @@ +#pragma once + +#include 
"op_types.hpp" + +namespace hexagon { + +bool rope_f32(tensor * out, compute_params * params); +bool is_rope_supported(npu_device_tensor_op op, const npu_device_tensor_spec * dst, const npu_device_tensor_spec * srcs, + size_t src_len); + +} // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/device/tensor.hpp b/ggml/src/ggml-qnn/npu/device/tensor.hpp index bad260e5e50c2..3bf834f826f4c 100644 --- a/ggml/src/ggml-qnn/npu/device/tensor.hpp +++ b/ggml/src/ggml-qnn/npu/device/tensor.hpp @@ -110,6 +110,15 @@ class tensor { return _data + _info.offset; } + template const _Ty * get_read_buffer_as() const { + const auto * buffer = get_read_buffer(); + if (!buffer) { + return nullptr; + } + + return reinterpret_cast(buffer); + } + uint8_t * get_write_buffer() const { if (_info.is_constant) { DEVICE_LOG_ERROR("Attempt to write to a constant tensor: %p", (void *) this); diff --git a/ggml/src/ggml-qnn/npu/device/type_traits.cpp b/ggml/src/ggml-qnn/npu/device/type_traits.cpp index 87d361819d620..704607167fec5 100644 --- a/ggml/src/ggml-qnn/npu/device/type_traits.cpp +++ b/ggml/src/ggml-qnn/npu/device/type_traits.cpp @@ -29,24 +29,38 @@ inline npu_device_fp16_t to_fp16(const float src) { } template inline HVX_Vector load_block_generic(const _TBlock & src) { - uint8_t buffer[hexagon::kBytesPerVector]; + static_assert(hexagon::kBytesPerVector >= sizeof(_TBlock), "wrong q4_0 block size/padding"); - static_assert(sizeof(buffer) == sizeof(HVX_Vector), "wrong cvt size/padding"); - static_assert(sizeof(buffer) >= sizeof(src.qs), "wrong q4_0 block size/padding"); + const HVX_Vector * qs0 = reinterpret_cast(src.qs); + const HVX_Vector * qs1 = qs0 + 1; + return Q6_V_valign_VVR(*qs1, *qs0, (size_t) src.qs); +} + +template inline HVX_Vector load_dual_block_generic(const _TBlock * srcs) { + static_assert(hexagon::kBytesPerVector >= sizeof(_TBlock) * 2, "wrong q4_0 block size/padding"); + constexpr const uint32_t kSizeOfQs = sizeof(_TBlock::qs); - memcpy(&buffer[0], src.qs, sizeof(src.qs)); - return *reinterpret_cast(buffer); + const HVX_Vector * qs0 = reinterpret_cast(srcs->qs); + const HVX_Vector * qs1 = qs0 + 1; + HVX_Vector blocks = Q6_V_valign_VVR(*qs1, *qs0, (size_t) srcs->qs); + HVX_Vector block1 = Q6_V_valign_VVR(Q6_V_vzero(), blocks, sizeof(_TBlock)); + return Q6_V_lo_W(Q6_W_vshuff_VVR(block1, blocks, kSizeOfQs)); } -template inline HVX_Vector load_dual_block_generic(const _TBlock & src1, const _TBlock & src2) { - uint8_t buffer[hexagon::kBytesPerVector]; +template inline HVX_Vector load_qual_block_generic(const _TBlock * srcs) { + static_assert(hexagon::kBytesPerVector >= sizeof(_TBlock) * 4, "wrong q4_0 block size/padding"); + constexpr const uint32_t kSizeOfQs = sizeof(_TBlock::qs); - static_assert(sizeof(buffer) == sizeof(HVX_Vector), "wrong cvt size/padding"); - static_assert(sizeof(buffer) >= sizeof(src1.qs) * 2, "wrong q4_0 block size/padding"); + const HVX_Vector * qs0 = reinterpret_cast(srcs->qs); + const HVX_Vector * qs1 = qs0 + 1; + HVX_Vector blocks = Q6_V_valign_VVR(*qs1, *qs0, (size_t) srcs->qs); + HVX_Vector block1 = Q6_V_valign_VVR(Q6_V_vzero(), blocks, sizeof(_TBlock)); + HVX_Vector block2 = Q6_V_valign_VVR(Q6_V_vzero(), blocks, sizeof(_TBlock) * 2); + HVX_Vector block3 = Q6_V_valign_VVR(Q6_V_vzero(), blocks, sizeof(_TBlock) * 3); - memcpy(&buffer[0], src1.qs, sizeof(src1.qs)); - memcpy(&buffer[sizeof(src1.qs)], src2.qs, sizeof(src2.qs)); - return *reinterpret_cast(buffer); + HVX_VectorPair qp0 = Q6_W_vshuff_VVR(block1, blocks, kSizeOfQs); + HVX_VectorPair qp1 = Q6_W_vshuff_VVR(block3, 
block2, kSizeOfQs); + return Q6_V_lo_W(Q6_W_vshuff_VVR(Q6_V_lo_W(qp1), Q6_V_lo_W(qp0), kSizeOfQs * 2)); } inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t * d, uint8_t * m) { @@ -148,7 +162,7 @@ float make_qkx2_quants(int n, int nmax, const float * x, const float * weights, return scale; } -void quantize_row_fp16(const float * src, void * dst, size_t count, const float * f16_to_f32_table) { +void quantize_row_fp16(const float * src, void * dst, size_t count) { auto * out = reinterpret_cast(dst); // TODO: use hvx intrinsics for better performance for (size_t i = 0; i < count; i++) { @@ -156,7 +170,7 @@ void quantize_row_fp16(const float * src, void * dst, size_t count, const float } } -void quantize_row_q8_0(const float * src, void * dst, size_t count, const float * f16_to_f32_table) { +void quantize_row_q8_0(const float * src, void * dst, size_t count) { const int nb = count / QUANT_BLOCK_SIZE; auto * out = reinterpret_cast(dst); @@ -181,7 +195,7 @@ void quantize_row_q8_0(const float * src, void * dst, size_t count, const float } } -void quantize_row_q4_0(const float * src, void * dst, size_t count, const float * f16_to_f32_table) { +void quantize_row_q4_0(const float * src, void * dst, size_t count) { constexpr const int qk = QUANT_BLOCK_SIZE; const int nb = count / qk; @@ -217,7 +231,7 @@ void quantize_row_q4_0(const float * src, void * dst, size_t count, const float } } -void quantize_row_q4_K(const float * src, void * dst, size_t count, const float * f16_to_f32_table) { +void quantize_row_q4_K(const float * src, void * dst, size_t count) { const int nb = count / QUANT_K_BLOCK_SIZE; auto * out = reinterpret_cast(dst); @@ -274,11 +288,11 @@ void quantize_row_q4_K(const float * src, void * dst, size_t count, const float uint8_t sc, m; for (int j = 0; j < QUANT_K_BLOCK_SIZE / 32; ++j) { get_scale_min_k4(j, out[i].scales, &sc, &m); - const float d = f16_to_f32_table[out[i].d] * sc; + const float d = to_float(out[i].d) * sc; if (!d) { continue; } - const float dm = f16_to_f32_table[out[i].dmin] * m; + const float dm = to_float(out[i].dmin) * m; for (int ii = 0; ii < 32; ++ii) { int l = nearest_int((src[32 * j + ii] + dm) / d); l = std::max(0, std::min(15, l)); @@ -298,90 +312,158 @@ void quantize_row_q4_K(const float * src, void * dst, size_t count, const float } } -void dequantize_row_q8_0(const void * src, float * dst, size_t count, const float * f16_to_f32_table) { +void dequantize_row_q8_0(const void * src, hexagon::dequant_target_type * dst, size_t count) { constexpr const int qk = QUANT_BLOCK_SIZE; static_assert(QUANT_BLOCK_SIZE == hexagon::kBytesPerVector / sizeof(float)); - const int nb = count / qk; - const auto * src_ptr = reinterpret_cast(src); - HVX_UVector * out = ((HVX_UVector *) dst); // TODO: opt for aligned access + const int nb = count / qk; + const auto * src_ptr = reinterpret_cast(src); + auto * dst_ptr = ((hexagon::dequant_target_type *) dst); // TODO: opt for aligned access - for (int i = 0; i < nb; i++) { + int i = 0; + for (; i + 1 < nb; i += 2) { + const auto & src0 = src_ptr[i]; + const auto & src1 = src_ptr[i + 1]; + + HVX_Vector scales01 = + Q6_V_valign_VVR(Q6_Vh_vsplat_R(src1.d), Q6_Vh_vsplat_R(src0.d), hexagon::kBytesPerVector / 2); + + HVX_Vector qs = load_dual_block_generic(src_ptr + i); + HVX_Vector q_lo = Q6_Vhf_equals_Vh(Q6_V_lo_W(Q6_Wh_vunpack_Vb(qs))); + HVX_Vector result = Q6_Vqf16_vmpy_VhfVhf(q_lo, scales01); + + *reinterpret_cast(dst_ptr) = Q6_Vhf_equals_Vqf16(result); + dst_ptr += qk * 2; + } + + if (i < nb) { const auto & src = 
src_ptr[i]; - HVX_Vector d = Q6_Vh_vsplat_R(src.d); - - HVX_Vector q_lo = load_block_generic(src); - HVX_VectorPair q = Q6_Wh_vunpack_Vb(q_lo); - q = Q6_Wh_vunpack_Vb(Q6_V_lo_W(q)); - q_lo = Q6_Vhf_equals_Vh(Q6_V_lo_W(q)); - q = Q6_Wqf32_vmpy_VhfVhf(q_lo, d); - out[i] = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(q)); + + HVX_Vector scales = Q6_Vh_vsplat_R(src.d); + + HVX_Vector q_lo = load_block_generic(src); + q_lo = Q6_Vhf_equals_Vh(Q6_V_lo_W(Q6_Wh_vunpack_Vb(q_lo))); + HVX_Vector result = Q6_Vqf16_vmpy_VhfVhf(q_lo, scales); + hexagon::q6op_vstu_variable_ARV( + dst_ptr, + Q6_Vhf_equals_Vqf16(result)); // TODO: opt the store } } -void dequantize_row_q4_0(const void * src, float * dst, size_t count, const float * f16_to_f32_table) { +template +void dequantize_row_q4_0_impl(const void * src, hexagon::dequant_target_type * dst, size_t count) { constexpr const int qk = QUANT_BLOCK_SIZE; static_assert(qk % 2 == 0, "qk must be even"); static_assert(QUANT_BLOCK_SIZE == hexagon::kBytesPerVector / sizeof(float)); constexpr const uint32_t kSizeOfQs = sizeof(npu_device_block_q4_0::qs); - const int nb = count / qk; - const auto * src_ptr = reinterpret_cast(src); - HVX_Vector mask = Q6_Vb_vsplat_R(0x0F); - HVX_Vector minus = Q6_Vb_vsplat_R(8); - HVX_UVector * out = ((HVX_UVector *) dst); // TODO: opt for aligned access - - const int loop_count = nb - (nb % 2); - for (int i = 0; i < loop_count; i += 2) { - const auto & src1 = src_ptr[i]; - const auto & src2 = src_ptr[i + 1]; - - HVX_Vector d1 = Q6_Vh_vsplat_R(src1.d); - HVX_Vector d2 = Q6_Vh_vsplat_R(src2.d); - HVX_Vector d = Q6_Vh_vshuff_Vh(Q6_V_valign_VVR(d2, d1, hexagon::kBytesPerVector / 2)); - - HVX_Vector q_lo = load_dual_block_generic(src1, src2); - HVX_Vector q_hi = Q6_Vub_vlsr_VubR(q_lo, 4); - HVX_VectorPair q = Q6_W_vshuff_VVR(q_hi, Q6_V_vand_VV(q_lo, mask), kSizeOfQs); - q_lo = Q6_V_valign_VVR(Q6_V_lo_W(q), Q6_V_vzero(), hexagon::kBytesPerVector / 2); - q_lo = Q6_V_valign_VVR(Q6_V_hi_W(q), q_lo, hexagon::kBytesPerVector / 2); - q_lo = Q6_Vb_vshuff_Vb(q_lo); - q_lo = Q6_Vb_vsub_VbVb(q_lo, minus); - q = Q6_Wh_vunpack_Vb(q_lo); - q_lo = Q6_Vhf_equals_Vh(Q6_V_lo_W(q)); - q = Q6_Wqf32_vmpy_VhfVhf(q_lo, d); - out[i] = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(q)); - out[i + 1] = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(q)); + const int nb = count / qk; + const auto * src_ptr = reinterpret_cast(src); + const HVX_Vector mask = Q6_Vb_vsplat_R(0x0F); + const HVX_Vector minus = Q6_Vb_vsplat_R(8); + hexagon::dequant_target_type * dst_ptr = dst; // TODO: opt for aligned access + + int i = 0; + for (; i + 3 < nb; i += 4) { + const auto & src0 = src_ptr[i]; + const auto & src1 = src_ptr[i + 1]; + const auto & src2 = src_ptr[i + 2]; + const auto & src3 = src_ptr[i + 3]; + + HVX_Vector scales01 = + Q6_V_valign_VVR(Q6_Vh_vsplat_R(src1.d), Q6_Vh_vsplat_R(src0.d), hexagon::kBytesPerVector / 2); + HVX_Vector scales23 = + Q6_V_valign_VVR(Q6_Vh_vsplat_R(src3.d), Q6_Vh_vsplat_R(src2.d), hexagon::kBytesPerVector / 2); + + HVX_Vector qs = load_qual_block_generic(src_ptr + i); + HVX_Vector q_lo = Q6_V_vand_VV(qs, mask); + HVX_Vector q_hi = Q6_Vub_vlsr_VubR(qs, 4); + HVX_VectorPair qp0 = Q6_W_vshuff_VVR(q_hi, q_lo, kSizeOfQs * (1 + 2 + 4)); + q_lo = Q6_Vb_vsub_VbVb(Q6_V_lo_W(qp0), minus); + qp0 = Q6_Wh_vunpack_Vb(q_lo); + q_lo = Q6_Vhf_equals_Vh(Q6_V_lo_W(qp0)); + q_hi = Q6_Vhf_equals_Vh(Q6_V_hi_W(qp0)); + q_lo = Q6_Vqf16_vmpy_VhfVhf(q_lo, scales01); + q_hi = Q6_Vqf16_vmpy_VhfVhf(q_hi, scales23); + + if constexpr (_IsDstAligned) { + reinterpret_cast(dst_ptr)[0] = Q6_Vhf_equals_Vqf16(q_lo); + 
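// A hedged scalar reference for what this q4_0 path computes per block (fields as in
// npu_device_block_q4_0, output type hexagon::dequant_target_type, i.e. fp16),
// following the standard ggml Q4_0 layout:
//
//   __fp16 d = blk.d;                                     // per-block scale
//   for (int j = 0; j < QUANT_BLOCK_SIZE / 2; ++j) {
//       dst[j]                        = ((blk.qs[j] & 0x0F) - 8) * d;  // low nibbles
//       dst[j + QUANT_BLOCK_SIZE / 2] = ((blk.qs[j] >> 4) - 8) * d;    // high nibbles
//   }
//
// The HVX code in this function performs this for two or four blocks per 128-byte vector.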
reinterpret_cast(dst_ptr)[1] = Q6_Vhf_equals_Vqf16(q_hi); + } else { + reinterpret_cast(dst_ptr)[0] = Q6_Vhf_equals_Vqf16(q_lo); + reinterpret_cast(dst_ptr)[1] = Q6_Vhf_equals_Vqf16(q_hi); + } + + dst_ptr += hexagon::kBytesPerVector / sizeof(hexagon::dequant_target_type) * 2; + } + + for (; i + 1 < nb; i += 2) { + const auto & src0 = src_ptr[i]; + const auto & src1 = src_ptr[i + 1]; + + HVX_Vector scales01 = + Q6_V_valign_VVR(Q6_Vh_vsplat_R(src1.d), Q6_Vh_vsplat_R(src0.d), hexagon::kBytesPerVector / 2); + + HVX_Vector qs = load_dual_block_generic(src_ptr + i); + HVX_Vector q_lo = Q6_V_vand_VV(qs, mask); + HVX_Vector q_hi = Q6_Vub_vlsr_VubR(qs, 4); + HVX_VectorPair qp0 = Q6_W_vshuff_VVR(q_hi, q_lo, kSizeOfQs * (1 + 2)); + q_lo = Q6_Vb_vsub_VbVb(Q6_V_lo_W(qp0), minus); + qp0 = Q6_Wh_vunpack_Vb(q_lo); + q_lo = Q6_Vhf_equals_Vh(Q6_V_lo_W(qp0)); + q_lo = Q6_Vqf16_vmpy_VhfVhf(q_lo, scales01); + + if constexpr (_IsDstAligned) { + *reinterpret_cast(dst_ptr) = Q6_Vhf_equals_Vqf16(q_lo); + } else { + *reinterpret_cast(dst_ptr) = Q6_Vhf_equals_Vqf16(q_lo); + } + + dst_ptr += hexagon::kBytesPerVector / sizeof(hexagon::dequant_target_type); } - if (loop_count < nb) { + if (i < nb) { const auto & curr_blk = src_ptr[nb - 1]; - HVX_Vector d = Q6_Vh_vsplat_R(curr_blk.d); - - HVX_Vector q_lo = load_block_generic(curr_blk); - HVX_Vector q_hi = Q6_Vub_vlsr_VubR(q_lo, 4); - q_lo = Q6_V_valign_VVR(Q6_V_vand_VV(q_lo, mask), Q6_V_vzero(), sizeof(curr_blk.qs)); - q_lo = Q6_V_valign_VVR(q_hi, q_lo, hexagon::kBytesPerVector - sizeof(curr_blk.qs)); - q_lo = Q6_Vb_vsub_VbVb(q_lo, minus); - - HVX_VectorPair q = Q6_Wh_vunpack_Vb(q_lo); - q = Q6_Wh_vunpack_Vb(Q6_V_lo_W(q)); - q_lo = Q6_Vhf_equals_Vh(Q6_V_lo_W(q)); - q = Q6_Wqf32_vmpy_VhfVhf(q_lo, d); - out[nb - 1] = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(q)); + HVX_Vector scales = Q6_Vh_vsplat_R(curr_blk.d); + + HVX_Vector qs = load_block_generic(curr_blk); + HVX_Vector q_lo = Q6_V_vand_VV(qs, mask); + HVX_Vector q_hi = Q6_Vub_vlsr_VubR(qs, 4); + HVX_VectorPair qp0 = Q6_W_vshuff_VVR(q_hi, q_lo, kSizeOfQs); + q_lo = Q6_Vb_vsub_VbVb(Q6_V_lo_W(qp0), minus); + qp0 = Q6_Wh_vunpack_Vb(q_lo); + q_lo = Q6_Vhf_equals_Vh(Q6_V_lo_W(qp0)); + q_lo = Q6_Vqf16_vmpy_VhfVhf(q_lo, scales); + if constexpr (_IsDstAligned) { + hexagon::q6op_vstu_variable_aligned(dst_ptr, Q6_Vhf_equals_Vqf16(q_lo)); + } else { + hexagon::q6op_vstu_variable_ARV( + dst_ptr, + Q6_Vhf_equals_Vqf16(q_lo)); // TODO: opt the store + } } } -void dequantize_row_q4_K(const void * src, float * dst, size_t count, const float * f16_to_f32_table) { +void dequantize_row_q4_0(const void * src, hexagon::dequant_target_type * dst, size_t count) { + const bool dst_aligned = hexagon::is_addr_aligned(dst); + if (dst_aligned) { + dequantize_row_q4_0_impl(src, dst, count); + } else { + dequantize_row_q4_0_impl(src, dst, count); + } +} + +void dequantize_row_q4_K(const void * src, hexagon::dequant_target_type * dst, size_t count) { const int nb = count / QUANT_K_BLOCK_SIZE; const auto * src_ptr = reinterpret_cast(src); + auto * dst_ptr = reinterpret_cast<__fp16 *>(dst); // TODO: use intrinsics for (int i = 0; i < nb; i++) { const uint8_t * q = src_ptr[i].qs; - const float d = f16_to_f32_table[src_ptr[i].d]; - const float min = f16_to_f32_table[src_ptr[i].dmin]; + const __fp16 d = reinterpret_cast(src_ptr[i].d); + const __fp16 min = reinterpret_cast(src_ptr[i].dmin); int is = 0; uint8_t sc = 0; @@ -389,17 +471,17 @@ void dequantize_row_q4_K(const void * src, float * dst, size_t count, const floa const auto * scales = src_ptr[i].scales; for 
(int j = 0; j < QUANT_K_BLOCK_SIZE; j += 64) { get_scale_min_k4(is + 0, scales, &sc, &m); - const float d1 = d * sc; - const float m1 = min * m; + const __fp16 d1 = d * sc; + const __fp16 m1 = min * m; get_scale_min_k4(is + 1, scales, &sc, &m); - const float d2 = d * sc; - const float m2 = min * m; + const __fp16 d2 = d * sc; + const __fp16 m2 = min * m; for (int l = 0; l < 32; ++l) { - dst[0] = d1 * (q[l] & 0xF) - m1; - dst[32] = d2 * ((q[l] >> 4) & 0xF) - m2; - dst++; + dst_ptr[0] = d1 * (q[l] & 0xF) - m1; + dst_ptr[32] = d2 * ((q[l] >> 4) & 0xF) - m2; + dst_ptr++; } - dst += 32; + dst_ptr += 32; q += 32; is += 2; } @@ -412,9 +494,12 @@ template struct dot_func_traits>; }; -template float wrap_dot_func(const void * src0, const void * src1, size_t count) { - using param_type = typename dot_func_traits::param_type; - return _Func(reinterpret_cast(src0), reinterpret_cast(src1), count); +template float wrap_dot_func(const void * src0, const void * src1, size_t count) { + using param_type = typename dot_func_traits::param_type; + + auto * src0_typed = reinterpret_cast(src0); + auto * src1_typed = reinterpret_cast(src1); + return _DotFunc(src0_typed, src1_typed, count); } constexpr const hexagon::device_type_traits kDeviceTypeTraits[] = { @@ -422,6 +507,7 @@ constexpr const hexagon::device_type_traits kDeviceTypeTraits[] = { wrap_dot_func }, { NPU_DATA_TYPE_F16, "F16", 1, sizeof(npu_device_fp16_t), false, nullptr, quantize_row_fp16, wrap_dot_func }, + { NPU_DATA_TYPE_I32, "I32", 1, sizeof(int32_t), false, nullptr, nullptr, nullptr }, { NPU_DATA_TYPE_Q8_0, "Q8_0", QUANT_BLOCK_SIZE, sizeof(npu_device_block_q8_0), true, dequantize_row_q8_0, quantize_row_q8_0 }, { NPU_DATA_TYPE_Q4_0, "Q4_0", QUANT_BLOCK_SIZE, sizeof(npu_device_block_q4_0), true, dequantize_row_q4_0, @@ -436,6 +522,8 @@ static_assert(kDeviceTypeTraits[NPU_DATA_TYPE_F32].type == NPU_DATA_TYPE_F32, "kDeviceTypeTraits F32 type mismatch with npu_device_tensor_data_type enum"); static_assert(kDeviceTypeTraits[NPU_DATA_TYPE_F16].type == NPU_DATA_TYPE_F16, "kDeviceTypeTraits F16 type mismatch with npu_device_tensor_data_type enum"); +static_assert(kDeviceTypeTraits[NPU_DATA_TYPE_I32].type == NPU_DATA_TYPE_I32, + "kDeviceTypeTraits I32 type mismatch with npu_device_tensor_data_type enum"); static_assert(kDeviceTypeTraits[NPU_DATA_TYPE_Q8_0].type == NPU_DATA_TYPE_Q8_0, "kDeviceTypeTraits Q8_0 type mismatch with npu_device_tensor_data_type enum"); static_assert(kDeviceTypeTraits[NPU_DATA_TYPE_Q4_0].type == NPU_DATA_TYPE_Q4_0, diff --git a/ggml/src/ggml-qnn/npu/device/type_traits.hpp b/ggml/src/ggml-qnn/npu/device/type_traits.hpp index 1a0b1665aeaad..aa6e7d11ed500 100644 --- a/ggml/src/ggml-qnn/npu/device/type_traits.hpp +++ b/ggml/src/ggml-qnn/npu/device/type_traits.hpp @@ -5,10 +5,12 @@ namespace hexagon { +using dequant_target_type = npu_device_fp16_t; + bool init_f16_f32_table(float * table, size_t count); -typedef void (*quantize_row_type)(const float * src, void * dst, size_t count, const float * f16_to_f32_table); -typedef void (*dequantize_row_type)(const void * src, float * dst, size_t count, const float * f16_to_f32_table); +typedef void (*quantize_row_type)(const float * src, void * dst, size_t count); +typedef void (*dequantize_row_type)(const void * src, dequant_target_type * dst, size_t count); typedef float (*vec_dot_type)(const void * src0, const void * src1, size_t count); struct device_type_traits { @@ -29,15 +31,13 @@ inline bool is_quantized_type(npu_device_tensor_data_type type) { return 
get_type_traits(type).is_quantized; } -using dequantized_element_type = float; - inline size_t get_dequantized_row_size(const tensor * tensor) { if (!is_quantized_type(tensor->get_type())) { return tensor->get_nb(1); // for f32 and f16 } auto row_elems_count = tensor->get_ne(0); - return row_elems_count * sizeof(dequantized_element_type); // currently only f32 is supported + return row_elems_count * sizeof(dequant_target_type); // currently only f32 is supported } inline const char * get_type_name(npu_device_tensor_data_type type) { @@ -77,14 +77,14 @@ inline auto make_scoped_op_perf_timer(tensor * op, size_t tidx) { # define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_SUB_PROC(sub_prefix) \ hexagon::npu_sub_process_scoped_timer \ - __npu_op_sub_timer##sub_prefix(__npu_op_timer_##sub_prefix, #sub_prefix) + __npu_op_sub_timer##sub_prefix(__npu_op_timer_##sub_prefix, #sub_prefix) # define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_WITH_MULTI_SUB_PROC(op, tidx, tracker_name) \ auto __npu_op_timer_##tracker_name = hexagon::make_scoped_op_perf_timer(op, tidx) # define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(tracker_name, idx, sub_prefix) \ hexagon::npu_sub_process_scoped_timer \ - __npu_op_sub_timer##sub_prefix(__npu_op_timer_##tracker_name, #sub_prefix) + __npu_op_sub_timer##sub_prefix(__npu_op_timer_##tracker_name, #sub_prefix) #else # define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER(op, tidx) ((void) 0) diff --git a/ggml/src/ggml-qnn/npu/device/util.hpp b/ggml/src/ggml-qnn/npu/device/util.hpp index 8c819fe5838b2..86da92b9a3130 100644 --- a/ggml/src/ggml-qnn/npu/device/util.hpp +++ b/ggml/src/ggml-qnn/npu/device/util.hpp @@ -54,6 +54,8 @@ inline constexpr const char * op_get_name(npu_device_tensor_op op) { return "RMS_NORM"; case NPU_OP_FLASH_ATTN: return "FLASH_ATTN_EXT"; + case NPU_OP_ROPE: + return "ROPE"; default: return "UNKNOWN"; } @@ -64,6 +66,20 @@ inline bool is_transposed_or_permuted(const npu_device_nb_type & nb) { return (nb[0] > nb[1]) || (nb[1] > nb[2]) || (nb[2] > nb[3]); } +inline bool is_same_shape(const npu_device_ne_type & src, const npu_device_ne_type & dst) { + for (size_t i = 0; i < DEVICE_TENSOR_MAX_DIMS; ++i) { + if (src[i] != dst[i]) { + return false; + } + } + + return true; +} + +inline bool is_same_shape(const npu_device_tensor_spec & src, const npu_device_tensor_spec & dst) { + return is_same_shape(src.ne, dst.ne); +} + class power_utils { public: power_utils() { diff --git a/ggml/src/ggml-qnn/npu/device/vec_ops.cpp b/ggml/src/ggml-qnn/npu/device/vec_ops.cpp index 5bdf183e5b185..4375bb7d5b7ae 100644 --- a/ggml/src/ggml-qnn/npu/device/vec_ops.cpp +++ b/ggml/src/ggml-qnn/npu/device/vec_ops.cpp @@ -1,7 +1,5 @@ #include "vec_ops.hpp" -#include - #include "util.hpp" namespace { @@ -100,15 +98,20 @@ inline float vec_dot_product_aligned_impl(const _TElem * src0, const _TElem * sr HVX_Vector sum1 = Q6_V_vzero(); while (src0_vec_ptr_end - src0_vec_ptr > 1) { - HVX_Vector curr0_lo = src0_vec_ptr[0]; - HVX_Vector curr0_hi = src0_vec_ptr[1]; - HVX_Vector curr1_lo = src1_vec_ptr[0]; - HVX_Vector curr1_hi = src1_vec_ptr[1]; + HVX_VectorPair curr0 = reinterpret_cast(src0_vec_ptr)[0]; + HVX_VectorPair curr1 = reinterpret_cast(src1_vec_ptr)[0]; src0_vec_ptr += 2; src1_vec_ptr += 2; - sum0 = _AddFunc(_MpyFunc(curr0_lo, curr1_lo), sum0); - sum1 = _AddFunc(_MpyFunc(curr0_hi, curr1_hi), sum1); + sum0 = _AddFunc(_MpyFunc(Q6_V_lo_W(curr0), Q6_V_lo_W(curr1)), sum0); + sum1 = _AddFunc(_MpyFunc(Q6_V_hi_W(curr0), Q6_V_hi_W(curr1)), sum1); + } + + if (src0_vec_ptr_end - src0_vec_ptr > 0) { + 
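// Tail handling for the unrolled loop above: the pair-wise loop consumes two aligned
// vectors per iteration, so at most one full vector can remain here; a single extra
// multiply-accumulate into sum0 is enough before the final reduction of sum0 + sum1.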
HVX_Vector curr0 = src0_vec_ptr[0]; + HVX_Vector curr1 = src1_vec_ptr[0]; + + sum0 = _AddFunc(_MpyFunc(curr0, curr1), sum0); } return _ReduceFunc(_AddFunc(sum0, sum1)); @@ -130,27 +133,189 @@ inline HVX_Vector vec_add_qf16(HVX_Vector sum, HVX_Vector result) { return Q6_Vqf16_vadd_Vqf16Vqf16(sum, result); } +template +inline float vec_dot_product_mixed_impl(const _TElem0 * src0, const _TElem1 * src1, size_t count) { + static_assert(sizeof(_TElem0) < sizeof(_TElem1), "Element size mismatch: _TElem0 must be smaller than _TElem1"); + static_assert((sizeof(_TElem1) / sizeof(_TElem0)) == 2, + "Element size mismatch: _TElem1 must be twice the size of _TElem0"); + static_assert((sizeof(_TElem1) % sizeof(_TElem0)) == 0, + "Element size mismatch: _TElem1 must be a multiple of _TElem0"); + + constexpr const size_t kElementsPerVector0 = hexagon::kBytesPerVector / sizeof(_TElem0); + constexpr const size_t kElementsPerVector1 = hexagon::kBytesPerVector / sizeof(_TElem1); + + constexpr const __fp16 kOne = 1.0f; + const HVX_Vector kOneV = Q6_Vh_vsplat_R(reinterpret_cast(kOne)); + + const _TElem0 * const src0_ptr_end = src0 + count; + HVX_Vector * src0_vec_ptr = ((HVX_Vector *) src0); + HVX_Vector * src1_vec_ptr = ((HVX_Vector *) src1); + HVX_Vector * const src1_vec_ptr_end = ((HVX_Vector *) src1) + count / kElementsPerVector1; + HVX_Vector prev0 = *src0_vec_ptr++; + HVX_Vector prev1 = *src1_vec_ptr++; + HVX_Vector sum = Q6_V_vzero(); + HVX_Vector sum0 = Q6_V_vzero(); + HVX_Vector sum1 = Q6_V_vzero(); + + while (src1_vec_ptr_end - src1_vec_ptr > 1) { + HVX_Vector curr0 = src0_vec_ptr[0]; + HVX_VectorPair curr1 = reinterpret_cast(src1_vec_ptr)[0]; + + HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); + HVX_Vector l1 = Q6_V_valign_VVR(Q6_V_lo_W(curr1), prev1, (size_t) src1); + HVX_Vector h1 = Q6_V_valign_VVR(Q6_V_hi_W(curr1), Q6_V_lo_W(curr1), (size_t) src1); + HVX_VectorPair s0_pair = _ExpandFunc(s0, kOneV); + prev0 = curr0; + prev1 = Q6_V_hi_W(curr1); + src0_vec_ptr++; + src1_vec_ptr += 2; + + sum0 = _AddFunc(_MpyFunc(Q6_V_lo_W(s0_pair), l1), sum0); + sum1 = _AddFunc(_MpyFunc(Q6_V_hi_W(s0_pair), h1), sum1); + } + + sum = _AddFunc(sum0, sum1); + const size_t leftover1 = count % kElementsPerVector1; + if ((src1_vec_ptr_end - ((HVX_Vector *) src1)) > 0) { + // handle the last vector + const bool should_fetch_src0 = + reinterpret_cast(hexagon::align_down(src0_vec_ptr)) < src0_ptr_end; + HVX_Vector curr0 = should_fetch_src0 ? *src0_vec_ptr : prev0; + src0_vec_ptr += should_fetch_src0 ? 1 : 0; + + HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); + HVX_VectorPair s0_pair = _ExpandFunc(s0, kOneV); + + const bool has_remaining_src1_vector = src1_vec_ptr_end - src1_vec_ptr > 0; + if (has_remaining_src1_vector) { + HVX_Vector curr1 = *src1_vec_ptr++; + HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); + prev1 = curr1; + + // should_handle_last_vector will be always true here + sum = _AddFunc(_MpyFunc(Q6_V_lo_W(s0_pair), s1), sum); + } + + bool should_fetch_src1 = leftover1 != 0 || !hexagon::is_addr_aligned(src1_vec_ptr); + HVX_Vector curr1 = should_fetch_src1 ? *src1_vec_ptr : prev1; + src1_vec_ptr += should_fetch_src1 ? 1 : 0; + HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); + prev0 = curr0; + prev1 = curr1; + + sum = _AddFunc(_MpyFunc(has_remaining_src1_vector ? 
Q6_V_hi_W(s0_pair) : Q6_V_lo_W(s0_pair), s1), sum); + } + + const size_t leftover0 = count % kElementsPerVector0; + const size_t leftover_bytes1 = leftover1 * sizeof(_TElem1); + if (leftover1 > 0) { + // handle the leftover elements + HVX_Vector curr0 = + reinterpret_cast(hexagon::align_down(src0_vec_ptr)) < src0_ptr_end ? *src0_vec_ptr : prev0; + HVX_Vector curr1 = (leftover_bytes1 + hexagon::unaligned_bytes(src1_vec_ptr) > hexagon::kBytesPerVector) ? + *src1_vec_ptr : + prev1; + curr0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); + curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); + HVX_VectorPair curr0_pair = _ExpandFunc(curr0, kOneV); + + curr0 = leftover1 == leftover0 ? Q6_V_lo_W(curr0_pair) : Q6_V_hi_W(curr0_pair); + sum = _AddFunc(Q6_V_valign_VVR(_MpyFunc(curr0, curr1), Q6_V_vzero(), leftover_bytes1), sum); + } + + return _ReduceFunc(sum); +} + +template +inline float vec_dot_product_mix_aligned_impl(const _TElem0 * src0, const _TElem1 * src1, size_t count) { + static_assert(sizeof(_TElem0) < sizeof(_TElem1), "Element size mismatch: _TElem0 must be smaller than _TElem1"); + static_assert((sizeof(_TElem1) / sizeof(_TElem0)) == 2, + "Element size mismatch: _TElem1 must be twice the size of _TElem0"); + static_assert((sizeof(_TElem1) % sizeof(_TElem0)) == 0, + "Element size mismatch: _TElem1 must be a multiple of _TElem0"); + + constexpr const size_t kElementsPerVector1 = hexagon::kBytesPerVector / sizeof(_TElem1); + + constexpr const __fp16 kOne = 1.0f; + const HVX_Vector kOneV = Q6_Vh_vsplat_R(reinterpret_cast(kOne)); + + HVX_Vector * src0_vec_ptr = ((HVX_Vector *) src0); + HVX_Vector * src1_vec_ptr = ((HVX_Vector *) src1); + HVX_Vector * const src1_vec_ptr_end = ((HVX_Vector *) src1) + count / kElementsPerVector1; + HVX_Vector sum0 = Q6_V_vzero(); + HVX_Vector sum1 = Q6_V_vzero(); + + { + HVX_Vector sum2 = Q6_V_vzero(); + HVX_Vector sum3 = Q6_V_vzero(); + + while (src1_vec_ptr_end - src1_vec_ptr > 3) { + HVX_VectorPair curr0 = reinterpret_cast(src0_vec_ptr)[0]; + HVX_VectorPair curr10 = reinterpret_cast(src1_vec_ptr)[0]; + HVX_VectorPair curr11 = reinterpret_cast(src1_vec_ptr)[1]; + + HVX_VectorPair curr00 = _ExpandFunc(Q6_V_lo_W(curr0), kOneV); + HVX_VectorPair curr01 = _ExpandFunc(Q6_V_hi_W(curr0), kOneV); + src0_vec_ptr += 2; + src1_vec_ptr += 4; + + sum0 = _AddFunc(_MpyFunc(Q6_V_lo_W(curr00), Q6_V_lo_W(curr10)), sum0); + sum1 = _AddFunc(_MpyFunc(Q6_V_hi_W(curr00), Q6_V_hi_W(curr10)), sum1); + sum2 = _AddFunc(_MpyFunc(Q6_V_lo_W(curr01), Q6_V_lo_W(curr11)), sum2); + sum3 = _AddFunc(_MpyFunc(Q6_V_hi_W(curr01), Q6_V_hi_W(curr11)), sum3); + } + + sum0 = _AddFunc(sum0, sum2); + sum1 = _AddFunc(sum1, sum3); + } + + if (src1_vec_ptr_end - src1_vec_ptr > 1) { + HVX_Vector curr0 = src0_vec_ptr[0]; + HVX_VectorPair curr1 = reinterpret_cast(src1_vec_ptr)[0]; + + HVX_VectorPair s0_pair = _ExpandFunc(curr0, kOneV); + + sum0 = _AddFunc(_MpyFunc(Q6_V_lo_W(s0_pair), Q6_V_lo_W(curr1)), sum0); + sum1 = _AddFunc(_MpyFunc(Q6_V_hi_W(s0_pair), Q6_V_hi_W(curr1)), sum1); + } + + return _ReduceFunc(_AddFunc(sum0, sum1)); +} + } // namespace namespace hexagon { float vec_dot_product_f32_f32(const float * src0, const float * src1, size_t count) { - return vec_dot_product_impl(src0, src1, count); + return vec_dot_product_impl(src0, src1, count); } float vec_dot_product_aligned_f32_f32(const float * src0, const float * src1, size_t count) { - return vec_dot_product_aligned_impl(src0, src1, - count); + return vec_dot_product_aligned_impl(src0, src1, count); } float vec_dot_product_f16_f16(const 
npu_device_fp16_t * src0, const npu_device_fp16_t * src1, size_t count) { - return vec_dot_product_impl( - src0, src1, count); + return vec_dot_product_impl(src0, src1, + count); } float vec_dot_product_aligned_f16_f16(const npu_device_fp16_t * src0, const npu_device_fp16_t * src1, size_t count) { - return vec_dot_product_aligned_impl( + return vec_dot_product_aligned_impl( src0, src1, count); } +float vec_dot_product_f16_f32(const npu_device_fp16_t * src0, const float * src1, size_t count) { + return vec_dot_product_mixed_impl(src0, src1, count); +} + +float vec_dot_product_aligned_f16_f32(const npu_device_fp16_t * src0, const float * src1, size_t count) { + return vec_dot_product_mix_aligned_impl(src0, src1, count); +} + } // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/device/vec_ops.hpp b/ggml/src/ggml-qnn/npu/device/vec_ops.hpp index 406075fc9cdde..220dc8f77c02d 100644 --- a/ggml/src/ggml-qnn/npu/device/vec_ops.hpp +++ b/ggml/src/ggml-qnn/npu/device/vec_ops.hpp @@ -1,7 +1,6 @@ #pragma once #include -#include #include @@ -12,10 +11,22 @@ namespace hexagon { constexpr const size_t kBytesPerVector = sizeof(HVX_Vector); // 128 for v73 constexpr const size_t kAlignMask = kBytesPerVector - 1; +inline size_t get_aligned_size(size_t size) { + return (size + kAlignMask) & ~kAlignMask; +} + inline size_t unaligned_bytes(const void * addr) { return ((size_t) addr) & kAlignMask; } +template inline const _TyData * align_down(const _TyData * addr) { + return reinterpret_cast(reinterpret_cast(addr) - unaligned_bytes(addr)); +} + +inline size_t bytes_to_vector_boundary(const void * addr) { + return kBytesPerVector - unaligned_bytes(addr); +} + inline bool is_addr_aligned(const void * addr) { return unaligned_bytes(addr) == 0; } @@ -109,6 +120,50 @@ inline HVX_VectorPair qhmath_hvx_vqf32_convert_vqf16(HVX_Vector vxl) { return Q6_W_vcombine_VV(vxh_w, vxl_w); } +template inline void q6op_vstu_variable_ARV(void * addr, HVX_Vector vin) { + vin = Q6_V_vlalign_VVR(vin, vin, (size_t) addr); //rotate as needed. + uint32_t left_off = unaligned_bytes(addr); + uint32_t right_off = left_off + _TyBytes; + HVX_VectorPred qL_not = Q6_Q_vsetq_R((size_t) addr); + HVX_VectorPred qR = Q6_Q_vsetq2_R(right_off); + if (right_off > 128) { + Q6_vmaskedstoreq_QAV(qR, (HVX_Vector *) addr + 1, vin); + qR = Q6_Q_vcmp_eq_VbVb(vin, vin); // all 1's + } + qL_not = Q6_Q_or_QQn(qL_not, qR); + Q6_vmaskedstorenq_QAV(qL_not, (HVX_Vector *) addr, vin); +} + +template inline void q6op_vstu_variable_aligned(void * addr, HVX_Vector vin) { + HVX_VectorPred qR = Q6_Q_vsetq2_R(_TyBytes); + Q6_vmaskedstorenq_QAV(qR, (HVX_Vector *) addr, vin); +} + +inline void q6op_vstu_variable_ARV(void * addr, int n, HVX_Vector vin) { + vin = Q6_V_vlalign_VVR(vin, vin, (size_t) addr); //rotate as needed. 
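// Descriptive note for this variable-length store: the value has just been rotated to
// the target alignment, and the predicated stores below write only the requested span.
// qL_not masks off the bytes before the unaligned start address, qR bounds the bytes
// past the requested length, and when the span crosses the 128-byte vector boundary the
// spill-over is written to the following vector with an extra masked store, so no byte
// outside [addr, addr + n) is touched.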
+ unsigned left_off = unaligned_bytes(addr); + unsigned right_off = left_off + n; + HVX_VectorPred qL_not = Q6_Q_vsetq_R((size_t) addr); + HVX_VectorPred qR = Q6_Q_vsetq2_R(right_off); + if (right_off > 128) { + Q6_vmaskedstoreq_QAV(qR, (HVX_Vector *) addr + 1, vin); + qR = Q6_Q_vcmp_eq_VbVb(vin, vin); // all 1's + } + qL_not = Q6_Q_or_QQn(qL_not, qR); + Q6_vmaskedstorenq_QAV(qL_not, (HVX_Vector *) addr, vin); +} + +inline HVX_VectorPair hvx_vqf32_convert_vhf(HVX_Vector vxl) { + return qhmath_hvx_vqf32_convert_vqf16(qhmath_hvx_vqf16_convert_vhf(vxl)); +} + +inline HVX_VectorPair hvx_vsf_convert_vhf(HVX_Vector vxl, HVX_Vector one) { + HVX_VectorPair res = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(vxl), one); + HVX_Vector vxl_w = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(res)); + HVX_Vector vxh_w = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(res)); + return Q6_W_vcombine_VV(vxh_w, vxl_w); +} + inline HVX_Vector vec_reduction_qf32(HVX_Vector sums) { constexpr const size_t kFloatsPerVector = hexagon::kBytesPerVector / sizeof(float); static_assert(kFloatsPerVector == 32 || kFloatsPerVector == 16, "kFloatsPerVector should be 16 or 32"); @@ -130,7 +185,7 @@ inline HVX_Vector vec_reduction_qf32(HVX_Vector sums) { return sums; } -inline float vec_reduction_qf32_f32(HVX_Vector sums) { +inline float vec_reduction_f32_qf32(HVX_Vector sums) { return get_flt0_from_fltv(Q6_Vsf_equals_Vqf32(vec_reduction_qf32(sums))); } @@ -265,10 +320,41 @@ inline void vec_mad_f16(const npu_device_fp16_t * src, float scale, npu_device_f vec_scale_impl(src, scale, dst, count); } +template +inline bool is_dot_product_aligned(const _TElem0 * src0, const _TElem1 * src1, size_t count) { + static_assert(sizeof(_TElem0) <= sizeof(_TElem1), "src0 should be smaller than src1"); + + if (!hexagon::is_addr_aligned(src0) || !hexagon::is_addr_aligned(src1)) { + return false; + } + + if (count % (hexagon::kBytesPerVector / sizeof(_TElem0)) != 0) { + return false; + } + + return true; +} + float vec_dot_product_f32_f32(const float * src0, const float * src1, size_t count); float vec_dot_product_aligned_f32_f32(const float * src0, const float * src1, size_t count); +inline bool is_f32_f32_dot_product_aligned(const float * src0, const float * src1, size_t count) { + return is_dot_product_aligned(src0, src1, count); +} + float vec_dot_product_f16_f16(const npu_device_fp16_t * src0, const npu_device_fp16_t * src1, size_t count); float vec_dot_product_aligned_f16_f16(const npu_device_fp16_t * src0, const npu_device_fp16_t * src1, size_t count); +inline bool is_f16_f16_dot_product_aligned(const npu_device_fp16_t * src0, const npu_device_fp16_t * src1, + size_t count) { + return is_dot_product_aligned(src0, src1, count); +} + +float vec_dot_product_f16_f32(const npu_device_fp16_t * src0, const float * src1, size_t count); +float vec_dot_product_aligned_f16_f32(const npu_device_fp16_t * src0, const float * src1, size_t count); + +inline bool is_f16_f32_dot_product_aligned(const npu_device_fp16_t * src0, const float * src1, size_t count) { + return is_dot_product_aligned(src0, src1, count); +} + } // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/host/graph.cpp b/ggml/src/ggml-qnn/npu/host/graph.cpp index 9ac69924d3f49..1d40fe0dd5176 100644 --- a/ggml/src/ggml-qnn/npu/host/graph.cpp +++ b/ggml/src/ggml-qnn/npu/host/graph.cpp @@ -29,8 +29,7 @@ bool host_graph::update(ggml_cgraph * cgraph) { return false; } - LOG_DEBUG("[%p]host_graph::update started\n", (void *) this); - + PROFILER_LOG_DEBUG("[%p]host_graph::update started\n", (void *) this); 
SCOPED_PERFORMANCE_TRACKER("[hexagon-npu][%p]update, handle(%p)", (void *) this, (void *) _graph_handle); _tensor_handles.clear(); @@ -57,10 +56,11 @@ bool host_graph::update(ggml_cgraph * cgraph) { _tensor_handles.push_back(tensor_obj->get_device_tensor_handle()); _tensor_update_configs.push_back(tensor_obj->update_hosts_params_only(node)); - LOG_DEBUG("node[%d]%s(%s), addr: %p, type: %s, dims: %ldx%ldx%ldx%ld, tensor_handle: %p\n", i, - ggml_get_name(node), ggml_op_desc(node), (void *) node, ggml_type_name(node->type), - (long) tensor_obj->get_ne(0), (long) tensor_obj->get_ne(1), (long) tensor_obj->get_ne(2), - (long) tensor_obj->get_ne(3), (void *) tensor_obj->get_device_tensor_handle()); + + PROFILER_LOG_DEBUG("node[%d]%s(%s), addr(%p), %s_%ldx%ldx%ldx%ld, handle(%p)\n", i, ggml_get_name(node), + ggml_op_desc(node), (void *) tensor_obj, ggml_type_name(node->type), + (long) tensor_obj->get_ne(0), (long) tensor_obj->get_ne(1), (long) tensor_obj->get_ne(2), + (long) tensor_obj->get_ne(3), (void *) tensor_obj->get_device_tensor_handle()); } GGML_ASSERT(_tensor_handles.size() == _tensor_update_configs.size()); diff --git a/ggml/src/ggml-qnn/npu/host/host_device.cpp b/ggml/src/ggml-qnn/npu/host/host_device.cpp index e88ef002bf0c1..7b9a13c82f59e 100644 --- a/ggml/src/ggml-qnn/npu/host/host_device.cpp +++ b/ggml/src/ggml-qnn/npu/host/host_device.cpp @@ -199,7 +199,7 @@ bool npu_device::supports_op_impl(const ggml_tensor * op) { #ifndef NDEBUG auto * src0_type = i ? ggml_type_name(op->src[0]->type) : "null"; auto * src1_type = (i > 1) ? ggml_type_name(op->src[1]->type) : "null"; - LOG_DEBUG("[%s][%s]unsupported %s(%s,%s), ret: 0x%x, supported: %d\n", get_name(), ggml_op_name(op->op), + LOG_DEBUG("[%s][%s]unsupported %s(%s,%s), ret: 0x%x, supported: %d\n", get_name(), ggml_op_desc(op), ggml_type_name(op->type), src0_type, src1_type, ret, supported); #endif return false; @@ -275,16 +275,16 @@ bool npu_device::supports_op(const ggml_tensor * op) { if (op->op != GGML_OP_NONE && op->op != GGML_OP_VIEW && op->op != GGML_OP_RESHAPE && op->op != GGML_OP_PERMUTE) { _supported_op++; - LOG_DEBUG("[%s][%s]supported, %s, supported/unsupported: %u/%u\n", get_name(), ggml_op_name(op->op), - op_desc, _supported_op.load(), _unsupported_op.load()); + LOG_DEBUG("[%s][%s][%s]supported, %s, supported/unsupported: %u/%u\n", get_name(), ggml_op_desc(op), + ggml_get_name(op), op_desc, _supported_op.load(), _unsupported_op.load()); } return true; } _unsupported_op++; - LOG_DEBUG("[%s][%s]unsupported, %s, supported/unsupported: %u/%u\n", get_name(), ggml_op_name(op->op), op_desc, - _supported_op.load(), _unsupported_op.load()); + LOG_DEBUG("[%s][%s][%s]unsupported, %s, supported/unsupported: %u/%u\n", get_name(), ggml_op_desc(op), + ggml_get_name(op), op_desc, _supported_op.load(), _unsupported_op.load()); return false; } #else diff --git a/ggml/src/ggml-qnn/npu/host/tensor.hpp b/ggml/src/ggml-qnn/npu/host/tensor.hpp index 07e092049ce14..7e8ee8f34cc09 100644 --- a/ggml/src/ggml-qnn/npu/host/tensor.hpp +++ b/ggml/src/ggml-qnn/npu/host/tensor.hpp @@ -48,10 +48,14 @@ class host_tensor { tensor->extra = this; _ggml_tensor = tensor; - LOG_DEBUG("host_tensor(%p), ggml_tensor(%s[%ldx%ldx%ldx%ld], nb[%ld][%ld][%ld][%ld], %s, %p), handle(%p)\n", - (void *) this, tensor->name, (long) tensor->ne[0], (long) tensor->ne[1], (long) tensor->ne[2], - (long) tensor->ne[3], (long) tensor->nb[0], (long) tensor->nb[1], (long) tensor->nb[2], - (long) tensor->nb[3], ggml_type_name(tensor->type), (void *) tensor, (void *) 
_device_tensor_handle); + +#ifndef NDEBUG + { + char desc[1024]; + get_desc(desc, sizeof(desc)); + LOG_DEBUG("host_tensor(%s)\n", desc); + } +#endif } ~host_tensor() { @@ -99,7 +103,11 @@ class host_tensor { auto * ggml_src = _ggml_tensor->src[j]; auto * src = host_tensor::from_ggml_tensor(ggml_src); src_tensor_handles[j] = src->get_device_tensor_handle(); - LOG_DEBUG("host_tensor(%p) set_src[%zu]: %p(%s)\n", (void *) this, j, (void *) src, ggml_src->name); +#ifndef NDEBUG + char desc[1024]; + src->get_desc(desc, sizeof(desc)); + LOG_DEBUG("host_tensor(%p) set_src[%zu]: (%s)\n", (void *) this, j, desc); +#endif } if (memcmp(_info_update.src_handles, src_tensor_handles, sizeof(_info_update.src_handles)) != 0) { @@ -136,7 +144,11 @@ class host_tensor { auto * ggml_src = _ggml_tensor->src[j]; auto * src = host_tensor::from_ggml_tensor(ggml_src); _info_update.src_handles[j] = src->get_device_tensor_handle(); - LOG_DEBUG("host_tensor(%p) set_src[%zu]: %p(%s)\n", (void *) this, j, (void *) src, ggml_src->name); +#ifndef NDEBUG + char desc[1024]; + src->get_desc(desc, sizeof(desc)); + LOG_DEBUG("host_tensor(%p) set_src[%zu]: (%s)\n", (void *) this, j, desc); +#endif } LOG_DEBUG("host_tensor(%p) update_params, op: %s, params: [%x, %x, %x, %x]\n", (void *) this, @@ -156,6 +168,15 @@ class host_tensor { return _info.ne[index]; } + int get_desc(char * buffer, size_t size) const { + return snprintf(buffer, size, "%s[%ldx%ldx%ldx%ld], nb[%ld,%ld,%ld,%ld], %s, addr: %p, ggml: %p, handle:%p", + _ggml_tensor->name, (long) _ggml_tensor->ne[0], (long) _ggml_tensor->ne[1], + (long) _ggml_tensor->ne[2], (long) _ggml_tensor->ne[3], (long) _ggml_tensor->nb[0], + (long) _ggml_tensor->nb[1], (long) _ggml_tensor->nb[2], (long) _ggml_tensor->nb[3], + ggml_type_name(_ggml_tensor->type), (void *) this, (void *) _ggml_tensor, + (void *) _device_tensor_handle); + } + private: remote_handle64 _device_handle = 0; npu_device_tensor_handle_t _device_tensor_handle = 0; diff --git a/ggml/src/ggml-qnn/npu/host/util.cpp b/ggml/src/ggml-qnn/npu/host/util.cpp index 0b005123333d2..a07b4d0ed6007 100644 --- a/ggml/src/ggml-qnn/npu/host/util.cpp +++ b/ggml/src/ggml-qnn/npu/host/util.cpp @@ -13,6 +13,10 @@ static_assert(QUANT_K_SCALE_SIZE == K_SCALE_SIZE, "QUANT_K_SCALE_SIZE size misma static_assert(QUANT_K_BLOCK_SIZE == QK_K, "QUANT_K_BLOCK_SIZE size mismatch"); static_assert(QUANT_BLOCK_SIZE == QK4_0, "QUANT_BLOCK_SIZE size mismatch"); +static_assert(NPU_ROPE_TYPE_NEOX == GGML_ROPE_TYPE_NEOX, "NPU_ROPE_TYPE_NEOX mismatch"); +static_assert(NPU_ROPE_TYPE_MROPE == GGML_ROPE_TYPE_MROPE, "NPU_ROPE_TYPE_MROPE mismatch"); +static_assert(NPU_ROPE_TYPE_VISION == GGML_ROPE_TYPE_VISION, "NPU_ROPE_TYPE_VISION mismatch"); + namespace hexagon { enum npu_device_tensor_op op_to_npu_op(ggml_op op) { @@ -29,6 +33,8 @@ enum npu_device_tensor_op op_to_npu_op(ggml_op op) { return NPU_OP_RMS_NORM; case GGML_OP_FLASH_ATTN_EXT: return NPU_OP_FLASH_ATTN; + case GGML_OP_ROPE: + return NPU_OP_ROPE; default: return NPU_OP_COUNT; } @@ -48,6 +54,8 @@ const char * get_npu_op_desc(enum npu_device_tensor_op op) { return ggml_op_name(GGML_OP_RMS_NORM); case NPU_OP_FLASH_ATTN: return ggml_op_name(GGML_OP_FLASH_ATTN_EXT); + case NPU_OP_ROPE: + return ggml_op_name(GGML_OP_ROPE); default: return "UNKNOWN"; } @@ -59,6 +67,8 @@ enum npu_device_tensor_data_type type_to_npu_type(ggml_type type) { return NPU_DATA_TYPE_F32; case GGML_TYPE_F16: return NPU_DATA_TYPE_F16; + case GGML_TYPE_I32: + return NPU_DATA_TYPE_I32; case GGML_TYPE_Q4_K: return NPU_DATA_TYPE_Q4_K; case 
GGML_TYPE_Q4_0: diff --git a/ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl b/ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl index 70626c90cbbed..0aa8d8e8ab48b 100644 --- a/ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl +++ b/ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl @@ -4,11 +4,15 @@ const uint32_t DEVICE_TENSOR_MAX_DIMS = 4; const uint32_t DEVICE_TENSOR_MAX_SRC = 4; -const uint32_t DEVICE_TENSOR_MAX_OP_PARAMS = 4; +const uint32_t DEVICE_TENSOR_MAX_OP_PARAMS = 16; const uint32_t QUANT_BLOCK_SIZE = 32; const uint32_t QUANT_K_BLOCK_SIZE = 256; const uint32_t QUANT_K_SCALE_SIZE = 12; +const uint32_t NPU_ROPE_TYPE_NEOX = 2; +const uint32_t NPU_ROPE_TYPE_MROPE = 8; +const uint32_t NPU_ROPE_TYPE_VISION = 24; + interface npu_device : remote_handle64{ typedef int64_t ne_type[DEVICE_TENSOR_MAX_DIMS]; @@ -42,12 +46,14 @@ interface npu_device : remote_handle64{ NPU_OP_MUL, NPU_OP_RMS_NORM, NPU_OP_FLASH_ATTN, + NPU_OP_ROPE, NPU_OP_COUNT }; enum tensor_data_type { NPU_DATA_TYPE_F32, NPU_DATA_TYPE_F16, + NPU_DATA_TYPE_I32, NPU_DATA_TYPE_Q8_0, NPU_DATA_TYPE_Q4_0, NPU_DATA_TYPE_Q4_K, diff --git a/ggml/src/ggml-qnn/shared/profiler.hpp b/ggml/src/ggml-qnn/shared/profiler.hpp index 7180dc02957bc..0e9e54051e7d5 100644 --- a/ggml/src/ggml-qnn/shared/profiler.hpp +++ b/ggml/src/ggml-qnn/shared/profiler.hpp @@ -56,6 +56,8 @@ inline scoped_timer make_scope_perf_timer(const char * format, ...) { #ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING # define SCOPED_PERFORMANCE_TRACKER(fmt, ...) \ auto __scoped_timer_##__LINE__ = profiler::make_scope_perf_timer(fmt, __VA_ARGS__) +# define PROFILER_LOG_DEBUG(fmt, ...) GGML_LOG_INFO("[profiler]" fmt, __VA_ARGS__) #else # define SCOPED_PERFORMANCE_TRACKER(fmt, ...) ((void) 0) +# define PROFILER_LOG_DEBUG(...) ((void) 0) #endif From b720e476068f2251275979747d4e582e3a887209 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 11 Jul 2025 17:31:21 +0800 Subject: [PATCH 163/166] fix compiling error --- ggml/src/ggml-qnn/qnn/backend-ops.cpp | 1 + ggml/src/ggml-qnn/qnn/op-config-caps.cpp | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-qnn/qnn/backend-ops.cpp b/ggml/src/ggml-qnn/qnn/backend-ops.cpp index 669e5bd854bf8..f3e06cf09e326 100644 --- a/ggml/src/ggml-qnn/qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/qnn/backend-ops.cpp @@ -113,6 +113,7 @@ constexpr const bool kQnnSupportedOps[] = { false, // GGML_OP_CONV_TRANSPOSE_1D false, // GGML_OP_IM2COL false, // GGML_OP_IM2COL_BACK + false, // GGML_OP_CONV_2D false, // GGML_OP_CONV_2D_DW false, // GGML_OP_CONV_TRANSPOSE_2D false, // GGML_OP_POOL_1D diff --git a/ggml/src/ggml-qnn/qnn/op-config-caps.cpp b/ggml/src/ggml-qnn/qnn/op-config-caps.cpp index 9a5abef8e9d8b..95c4067655e5b 100644 --- a/ggml/src/ggml-qnn/qnn/op-config-caps.cpp +++ b/ggml/src/ggml-qnn/qnn/op-config-caps.cpp @@ -158,6 +158,7 @@ constexpr const qnn_op_caps_t kOpCaps[] = { {}, // GGML_OP_CONV_TRANSPOSE_1D {}, // GGML_OP_IM2COL {}, // GGML_OP_IM2COL_BACK + {}, // GGML_OP_CONV_2D {}, // GGML_OP_CONV_2D_DW {}, // GGML_OP_CONV_TRANSPOSE_2D {}, // GGML_OP_POOL_1D @@ -335,6 +336,7 @@ constexpr const op_constructor_t kOpConstructors[] = { nullptr, // GGML_OP_CONV_TRANSPOSE_1D nullptr, // GGML_OP_IM2COL nullptr, // GGML_OP_IM2COL_BACK + nullptr, // GGML_OP_CONV_2D nullptr, // GGML_OP_CONV_2D_DW nullptr, // GGML_OP_CONV_TRANSPOSE_2D nullptr, // GGML_OP_POOL_1D @@ -389,7 +391,7 @@ constexpr const op_constructor_t kOpConstructors[] = { nullptr, // GGML_UNARY_OP_HARDSWISH nullptr, // GGML_UNARY_OP_HARDSIGMOID nullptr, // GGML_UNARY_OP_EXP - nullptr, // 
GGML_UNARY_OP_GELU_ERF + nullptr, // GGML_UNARY_OP_GELU_ERF }; static_assert(kOpConstructors[GGML_OP_NONE] == nullptr, "GGML_OP_NONE does not match the nullptr function"); From 560729ed6fa0f335c6388d6dbab365ef805c8ffb Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sat, 12 Jul 2025 00:39:14 +0800 Subject: [PATCH 164/166] fix unit test failure --- ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp b/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp index af0a122a7eefe..1c5ccd9001549 100644 --- a/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp @@ -105,7 +105,9 @@ void flash_attn_impl(hexagon::tensor * out, const hexagon::tensor * q, const hex } const npu_device_fp16_t * mp = - mask_ptr ? reinterpret_cast(mask_ptr + iq1 * mask->get_nb(1)) : nullptr; + mask_ptr ? reinterpret_cast(mask_ptr + iq1 * mask->get_nb(1) + + (iq3 % mask->get_ne(2)) * mask->get_nb(2)) : + nullptr; // k indices const int ik3 = iq3 / rk3; From 4a3a87409b244ae11984f0e1465aa87f5acd0503 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sat, 12 Jul 2025 11:46:02 +0800 Subject: [PATCH 165/166] disable broadcast on flash_attn_ext --- ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp b/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp index 1c5ccd9001549..9c264654c1c9e 100644 --- a/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_flash_attn.cpp @@ -312,6 +312,14 @@ bool is_flash_attn_supported(npu_device_tensor_op op, const npu_device_tensor_sp return false; } + if (q->ne[2] != k->ne[2] || q->ne[3] != k->ne[3] || q->ne[3] != 1) { + // TODO: add broadcast support + DEVICE_LOG_DEBUG("[%s]q and k shapes do not match: q ne: %ld, %ld, %ld, %ld, k ne: %ld, %ld, %ld, %ld\n", + op_get_name(op), q->ne[0], q->ne[1], q->ne[2], q->ne[3], k->ne[0], k->ne[1], k->ne[2], + k->ne[3]); + return false; + } + return true; } From 9a43a23e0ba4c9b740dd4f6e6916c4bcf8e06b61 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Tue, 15 Jul 2025 10:17:52 +0800 Subject: [PATCH 166/166] fix compiling error at new hexagon sdk --- ggml/src/ggml-qnn/npu/CMakeLists.txt | 20 ++++++----- ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp | 1 - ggml/src/ggml-qnn/npu/device/thread_pool.hpp | 5 +-- ggml/src/ggml-qnn/npu/device/vtcm_mem.hpp | 37 +++++++++++++++++--- 4 files changed, 46 insertions(+), 17 deletions(-) diff --git a/ggml/src/ggml-qnn/npu/CMakeLists.txt b/ggml/src/ggml-qnn/npu/CMakeLists.txt index e8ce255fec6a0..1723fd3d4fb73 100644 --- a/ggml/src/ggml-qnn/npu/CMakeLists.txt +++ b/ggml/src/ggml-qnn/npu/CMakeLists.txt @@ -10,6 +10,15 @@ else() message(FATAL_ERROR "HEXAGON_SDK_ROOT not defined") endif() +if(${CMAKE_SYSTEM_NAME} MATCHES "Android") + set(PREBUILT_LIB_DIR "android_aarch64") +elseif(${CMAKE_SYSTEM_NAME} MATCHES "Linux") + set(PREBUILT_LIB_DIR "UbuntuARM_aarch64") +elseif(${CMAKE_SYSTEM_NAME} MATCHES "Windows") + # Windows + set(PREBUILT_LIB_DIR "windows_aarch64") +endif() + if(HEXAGON_SDK_ROOT) include(${HEXAGON_SDK_ROOT}/build/cmake/hexagon_fun.cmake) else() @@ -138,21 +147,13 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Android|Linux|Windows") link_options(hexagon-npu-host) - if(${CMAKE_SYSTEM_NAME} MATCHES "Android") - set(PREBUILT_LIB_DIR "android_aarch64") - elseif(${CMAKE_SYSTEM_NAME} MATCHES "Linux") - set(PREBUILT_LIB_DIR "UbuntuARM_aarch64") - else() - # Windows - 
set(PREBUILT_LIB_DIR "windows_aarch64") - endif() - choose_dsprpc("3" dsprpc) # cdsprpc link_custom_library(hexagon-npu-host ${dsprpc}) cmake_host_system_information(RESULT BUILD_CPU_COUNT QUERY NUMBER_OF_PHYSICAL_CORES) add_dsp_targets_for_host(hexagon-npu-host "v73" ${BUILD_CPU_COUNT}) add_dsp_targets_for_host(hexagon-npu-host "v75" ${BUILD_CPU_COUNT}) + add_dsp_targets_for_host(hexagon-npu-host "v79" ${BUILD_CPU_COUNT}) list(APPEND NPU_RUNTIME_LIBS "${HEXAGON_SDK_ROOT}/tools/utils/sysmon/sysMonApp") list(APPEND NPU_RUNTIME_LIBS "${HEXAGON_SDK_ROOT}/tools/utils/sysmon/sysMonAppLE") @@ -249,6 +250,7 @@ else() target_link_libraries(hexagon_npu_skel ${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc++abi.a ${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc++.a + ${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc.a ) set_target_properties(hexagon_npu_skel PROPERTIES OUTPUT_NAME "hexagon_npu_skel_${HEXAGON_ARCH}") target_link_libraries(hexagon_npu_skel qprintf_static) diff --git a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp index f8b4da8a21bcc..ff1335ace2731 100644 --- a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp @@ -3,7 +3,6 @@ #include "thread_pool.hpp" // TODO: remove this dependency #include "type_traits.hpp" #include "vec_ops.hpp" -#include "vtcm_mem.hpp" namespace { diff --git a/ggml/src/ggml-qnn/npu/device/thread_pool.hpp b/ggml/src/ggml-qnn/npu/device/thread_pool.hpp index 9661c006707c3..455d4eec301b5 100644 --- a/ggml/src/ggml-qnn/npu/device/thread_pool.hpp +++ b/ggml/src/ggml-qnn/npu/device/thread_pool.hpp @@ -2,6 +2,7 @@ #include +#include #include #include #include @@ -237,8 +238,8 @@ template class thread_pool { DEVICE_LOG_DEBUG("thread_func_impl.end: %zu", param->tidx); } - std::atomic_bool _thread_exit = false; - std::array _threads; + std::atomic_bool _thread_exit = false; + std::array _threads = {}; qurt_barrier_t _pending = {}; qurt_barrier_t _completed = {}; thread_params _thread_params[kMaxThreadCount] = {}; diff --git a/ggml/src/ggml-qnn/npu/device/vtcm_mem.hpp b/ggml/src/ggml-qnn/npu/device/vtcm_mem.hpp index ab1041f626205..b66ea7f348bbc 100644 --- a/ggml/src/ggml-qnn/npu/device/vtcm_mem.hpp +++ b/ggml/src/ggml-qnn/npu/device/vtcm_mem.hpp @@ -1,5 +1,6 @@ #pragma once +#include #include #include "util.hpp" @@ -9,15 +10,28 @@ namespace hexagon { class vtcm_mem { public: explicit vtcm_mem(size_t size, bool single_page) { + constexpr const unsigned int kTimeoutUs = 10000; // 10ms timeout + size_t avail_size = single_page ? get_avail_page_size() : get_avail_block_size(); if (size > avail_size) { DEVICE_LOG_ERROR("Requested VTCM size %zu exceeds available size %zu\n", size, avail_size); return; } - _vtcm_mem = HAP_request_VTCM((unsigned int) size, single_page ? 1 : 0); + compute_res_attr_t compute_res; + HAP_compute_res_attr_init(&compute_res); + HAP_compute_res_attr_set_serialize(&compute_res, false); + HAP_compute_res_attr_set_vtcm_param(&compute_res, size, single_page ? 
1 : 0); + + _vtcm_context_id = HAP_compute_res_acquire(&compute_res, kTimeoutUs); // 10ms timeout + if (_vtcm_context_id == 0) { + DEVICE_LOG_ERROR("Failed to acquire VTCM context: %zu bytes, timeout %zu us\n", size, kTimeoutUs); + return; + } + + _vtcm_mem = HAP_compute_res_attr_get_vtcm_ptr(&compute_res); if (_vtcm_mem == nullptr) { - DEVICE_LOG_ERROR("Failed to allocate VTCM memory: %zu bytes\n", size); + DEVICE_LOG_ERROR("Failed to allocate VTCM memory: %zu bytes, timeout %zu us\n", size, kTimeoutUs); return; } @@ -26,7 +40,18 @@ class vtcm_mem { } explicit vtcm_mem(size_t size, bool single_page, size_t timeout_us) { - _vtcm_mem = HAP_request_async_VTCM((unsigned int) size, single_page ? 1 : 0, (unsigned int) timeout_us); + compute_res_attr_t compute_res; + HAP_compute_res_attr_init(&compute_res); + HAP_compute_res_attr_set_serialize(&compute_res, false); + HAP_compute_res_attr_set_vtcm_param(&compute_res, size, single_page ? 1 : 0); + + _vtcm_context_id = HAP_compute_res_acquire(&compute_res, timeout_us); + if (_vtcm_context_id == 0) { + DEVICE_LOG_ERROR("Failed to acquire VTCM context: %zu bytes, timeout %zu us\n", size, timeout_us); + return; + } + + _vtcm_mem = HAP_compute_res_attr_get_vtcm_ptr(&compute_res); if (_vtcm_mem == nullptr) { DEVICE_LOG_ERROR("Failed to allocate VTCM memory: %zu bytes, timeout %zu us\n", size, timeout_us); return; @@ -37,8 +62,8 @@ class vtcm_mem { } ~vtcm_mem() { - if (is_valid()) { - auto ret = HAP_release_VTCM(_vtcm_mem); + if (_vtcm_context_id != 0) { + auto ret = HAP_compute_res_release(_vtcm_context_id); if (ret != AEE_SUCCESS) { DEVICE_LOG_ERROR("Failed to release VTCM memory: %d\n", ret); } @@ -95,6 +120,8 @@ class vtcm_mem { void * _vtcm_mem = nullptr; size_t _vtcm_size = 0; + unsigned int _vtcm_context_id = 0; + DISABLE_COPY_AND_MOVE(vtcm_mem); };
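For reference, the compute-resource pattern that the reworked vtcm_mem wraps reduces to the following minimal sketch, using only the calls already shown above (error handling and the availability queries are omitted; the 10000 us timeout matches the constant used in the constructor):

    compute_res_attr_t attr;
    HAP_compute_res_attr_init(&attr);
    HAP_compute_res_attr_set_serialize(&attr, false);
    HAP_compute_res_attr_set_vtcm_param(&attr, size, /*single_page=*/1);

    unsigned int ctx = HAP_compute_res_acquire(&attr, 10000);   // acquire VTCM, 10 ms timeout
    if (ctx != 0) {
        void * vtcm = HAP_compute_res_attr_get_vtcm_ptr(&attr); // pointer to the VTCM window
        // ... use vtcm as scratch memory ...
        HAP_compute_res_release(ctx);                            // give the context back
    }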