Skip to content

Bgemm for arm64 #5287

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 9 commits into from
Closed
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions cblas.h
Original file line number Diff line number Diff line change
@@ -1,3 +1,31 @@
/***************************************************************************
* Copyright (c) 2025, The OpenBLAS Project
* All rights reserved.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* 3. Neither the name of the OpenBLAS project nor the names of
* its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
* *****************************************************************************/

#ifndef CBLAS_H
#define CBLAS_H

Expand Down Expand Up @@ -446,6 +474,9 @@ void cblas_sbgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum C
/*
 * Batched variant of sbgemm. Every per-problem argument is supplied as an
 * array (the *_array parameters); group_count and group_size describe how the
 * batch is partitioned. A and B hold bfloat16 data, while alpha, beta and C
 * are float.
 * NOTE(review): the exact indexing of the *_array arguments by group is not
 * visible in this header -- confirm against the implementation.
 */
void cblas_sbgemm_batch(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransA_array, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransB_array, OPENBLAS_CONST blasint * M_array, OPENBLAS_CONST blasint * N_array, OPENBLAS_CONST blasint * K_array,
OPENBLAS_CONST float * alpha_array, OPENBLAS_CONST bfloat16 ** A_array, OPENBLAS_CONST blasint * lda_array, OPENBLAS_CONST bfloat16 ** B_array, OPENBLAS_CONST blasint * ldb_array, OPENBLAS_CONST float * beta_array, float ** C_array, OPENBLAS_CONST blasint * ldc_array, OPENBLAS_CONST blasint group_count, OPENBLAS_CONST blasint * group_size);

/*
 * cblas_bgemm: GEMM entry point whose scalars (alpha, beta) and matrices
 * (A, B, C) are all bfloat16. A and B are read-only; C is the only
 * non-const pointer and therefore the output operand.
 *
 *   Order          storage order selector (enum CBLAS_ORDER)
 *   TransA/TransB  transpose selectors for A and B
 *   M, N, K        problem dimensions
 *   lda, ldb, ldc  leading dimensions of A, B and C
 *
 * NOTE(review): presumably follows the usual BLAS GEMM contract
 * C = alpha * op(A) * op(B) + beta * C -- confirm against the interface code.
 */
void cblas_bgemm(OPENBLAS_CONST enum CBLAS_ORDER Order,
                 OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA,
                 OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB,
                 OPENBLAS_CONST blasint M,
                 OPENBLAS_CONST blasint N,
                 OPENBLAS_CONST blasint K,
                 OPENBLAS_CONST bfloat16 alpha,
                 OPENBLAS_CONST bfloat16 *A, OPENBLAS_CONST blasint lda,
                 OPENBLAS_CONST bfloat16 *B, OPENBLAS_CONST blasint ldb,
                 OPENBLAS_CONST bfloat16 beta,
                 bfloat16 *C, OPENBLAS_CONST blasint ldc);

#ifdef __cplusplus
}
#endif /* __cplusplus */
Expand Down
8 changes: 8 additions & 0 deletions common.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2025 The OpenBLAS Project. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
Expand Down Expand Up @@ -306,6 +307,13 @@ typedef int blasint;
#define SIZE 8
#define BASE_SHIFT 3
#define ZBASE_SHIFT 4
#elif defined(BFLOAT16_ONLY)
Copy link
Contributor

@annop-w annop-w May 28, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I do not think we need to introduce a BFLOAT16_ONLY build flag.
The type of FLOAT is the only difference. Can we simply use XFLOAT instead of FLOAT for the C matrix, for example in kernel/arm64/bgemm_beta.c?

/*
 * BFLOAT16_ONLY build: the input type (IFLOAT), XFLOAT and the working/output
 * type (FLOAT) are all bfloat16.
 * SIZE is 2 and BASE_SHIFT is 1 (2 == 1 << 1); ZBASE_SHIFT is BASE_SHIFT + 1,
 * matching the SIZE 8 / BASE_SHIFT 3 / ZBASE_SHIFT 4 pattern of the preceding
 * branch.
 * NOTE(review): SIZE is presumably the element size in bytes -- confirm.
 */
#define IFLOAT bfloat16
#define XFLOAT IFLOAT
#define FLOAT bfloat16
#define SIZE 2
#define BASE_SHIFT 1
#define ZBASE_SHIFT 2
#elif defined(BFLOAT16)
#define IFLOAT bfloat16
#define XFLOAT IFLOAT
Expand Down
86 changes: 86 additions & 0 deletions common_b.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
/***************************************************************************
* Copyright (c) 2025, The OpenBLAS Project
* All rights reserved.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* 3. Neither the name of the OpenBLAS project nor the names of
* its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
* *****************************************************************************/

#ifndef COMMON_B_H
#define COMMON_B_H

/*
 * Name mapping for the bfloat16-only GEMM ("bgemm") routines.
 *
 * Low-level helpers (pack/copy routines, beta scaling, compute kernel):
 * with DYNAMIC_ARCH they are reached through the `gotoblas` pointer,
 * otherwise they resolve directly to the plain symbols. Per the original
 * author's note, only the non-DYNAMIC_ARCH build is supported for now.
 */
#ifdef DYNAMIC_ARCH

#define BGEMM_INCOPY gotoblas -> bgemm_incopy
#define BGEMM_ITCOPY gotoblas -> bgemm_itcopy
#define BGEMM_ONCOPY gotoblas -> bgemm_oncopy
#define BGEMM_OTCOPY gotoblas -> bgemm_otcopy
#define BGEMM_BETA   gotoblas -> bgemm_beta
#define BGEMM_KERNEL gotoblas -> bgemm_kernel

#else

#define BGEMM_INCOPY bgemm_incopy
#define BGEMM_ITCOPY bgemm_itcopy
#define BGEMM_ONCOPY bgemm_oncopy
#define BGEMM_OTCOPY bgemm_otcopy
#define BGEMM_BETA   bgemm_beta
#define BGEMM_KERNEL bgemm_kernel

#endif

/*
 * Level-3 drivers. All sixteen transpose combinations are provided, but the
 * 'R' variants alias to 'N' and the 'C' variants alias to 'T', so only four
 * distinct routines exist (presumably because bfloat16 is real-valued and
 * conjugation is a no-op -- confirm).
 */

/* -> bgemm_nn */
#define BGEMM_NN bgemm_nn
#define BGEMM_NR bgemm_nn
#define BGEMM_RN bgemm_nn
#define BGEMM_RR bgemm_nn

/* -> bgemm_nt */
#define BGEMM_NT bgemm_nt
#define BGEMM_NC bgemm_nt
#define BGEMM_RT bgemm_nt
#define BGEMM_RC bgemm_nt

/* -> bgemm_tn */
#define BGEMM_TN bgemm_tn
#define BGEMM_CN bgemm_tn
#define BGEMM_TR bgemm_tn
#define BGEMM_CR bgemm_tn

/* -> bgemm_tt */
#define BGEMM_TT bgemm_tt
#define BGEMM_TC bgemm_tt
#define BGEMM_CT bgemm_tt
#define BGEMM_CC bgemm_tt

/* Threaded drivers, aliased the same way as the drivers above. */

/* -> bgemm_thread_nn */
#define BGEMM_THREAD_NN bgemm_thread_nn
#define BGEMM_THREAD_NR bgemm_thread_nn
#define BGEMM_THREAD_RN bgemm_thread_nn
#define BGEMM_THREAD_RR bgemm_thread_nn

/* -> bgemm_thread_nt */
#define BGEMM_THREAD_NT bgemm_thread_nt
#define BGEMM_THREAD_NC bgemm_thread_nt
#define BGEMM_THREAD_RT bgemm_thread_nt
#define BGEMM_THREAD_RC bgemm_thread_nt

/* -> bgemm_thread_tn */
#define BGEMM_THREAD_TN bgemm_thread_tn
#define BGEMM_THREAD_CN bgemm_thread_tn
#define BGEMM_THREAD_TR bgemm_thread_tn
#define BGEMM_THREAD_CR bgemm_thread_tn

/* -> bgemm_thread_tt */
#define BGEMM_THREAD_TT bgemm_thread_tt
#define BGEMM_THREAD_TC bgemm_thread_tt
#define BGEMM_THREAD_CT bgemm_thread_tt
#define BGEMM_THREAD_CC bgemm_thread_tt

#endif
4 changes: 3 additions & 1 deletion common_interface.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2025 The OpenBLAS Project. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
Expand Down Expand Up @@ -480,7 +481,8 @@ void BLASFUNC(xhbmv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint
xdouble *, blasint *, xdouble *, xdouble *, blasint *);

/* Level 3 routines */

/*
 * bgemm declared through BLASFUNC: every argument is passed by pointer, and
 * all floating-point operands are bfloat16.
 * NOTE(review): the argument order presumably mirrors cblas_bgemm without the
 * Order argument (transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c,
 * ldc) -- confirm against the interface implementation.
 */
void BLASFUNC(bgemm)(char *, char *,
                     blasint *, blasint *, blasint *,
                     bfloat16 *,
                     bfloat16 *, blasint *,
                     bfloat16 *, blasint *,
                     bfloat16 *,
                     bfloat16 *, blasint *);
void BLASFUNC(sbgemm)(char *, char *, blasint *, blasint *, blasint *, float *,
bfloat16 *, blasint *, bfloat16 *, blasint *, float *, float *, blasint *);
void BLASFUNC(sgemm)(char *, char *, blasint *, blasint *, blasint *, float *,
Expand Down
21 changes: 20 additions & 1 deletion common_level3.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,8 @@ void sgemm_direct(BLASLONG M, BLASLONG N, BLASLONG K,

int sgemm_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K);


/*
 * bgemm beta routine: same parameter layout as sbgemm_beta, except that the
 * scalar and the last pointer operand are bfloat16 instead of float.
 * NOTE(review): presumably scales C by beta before accumulation -- confirm
 * against the kernel implementation.
 */
int bgemm_beta(BLASLONG, BLASLONG, BLASLONG,
               bfloat16,
               bfloat16 *, BLASLONG,
               bfloat16 *, BLASLONG,
               bfloat16 *, BLASLONG);
int sbgemm_beta(BLASLONG, BLASLONG, BLASLONG, float,
bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG);
int sgemm_beta(BLASLONG, BLASLONG, BLASLONG, float,
Expand All @@ -78,6 +79,12 @@ int xgemm_beta(BLASLONG, BLASLONG, BLASLONG, xdouble *,
xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
#endif

/*
 * bgemm copy routines, with signatures identical to the sbgemm_* copy
 * routines: an m-by-n bfloat16 source `a` with leading dimension `lda`, and a
 * bfloat16 destination `b`.
 * NOTE(review): the i/o and n/t prefixes presumably select the inner/outer
 * operand and the non-transposed/transposed layout -- confirm.
 */
int bgemm_incopy(BLASLONG m, BLASLONG n,
                 bfloat16 *a, BLASLONG lda, bfloat16 *b);
int bgemm_itcopy(BLASLONG m, BLASLONG n,
                 bfloat16 *a, BLASLONG lda, bfloat16 *b);
int bgemm_oncopy(BLASLONG m, BLASLONG n,
                 bfloat16 *a, BLASLONG lda, bfloat16 *b);
int bgemm_otcopy(BLASLONG m, BLASLONG n,
                 bfloat16 *a, BLASLONG lda, bfloat16 *b);

int sbgemm_incopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b);
int sbgemm_itcopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b);
int sbgemm_oncopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b);
Expand Down Expand Up @@ -505,6 +512,8 @@ int xher2k_kernel_UC(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdoubl
int xher2k_kernel_LN(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag);
int xher2k_kernel_LC(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag);

/*
 * bgemm compute kernel: same shape as sbgemm_kernel, but the scalar and the
 * last pointer operand are bfloat16 rather than float.
 */
int bgemm_kernel(BLASLONG, BLASLONG, BLASLONG,
                 bfloat16,
                 bfloat16 *, bfloat16 *, bfloat16 *,
                 BLASLONG);
int sbgemm_kernel(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, float *, BLASLONG);
int sgemm_kernel(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG);
int dgemm_kernel(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG);
Expand Down Expand Up @@ -657,6 +666,11 @@ int cgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float
int zgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG);
int xgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG);

/*
 * bgemm level-3 drivers, one per transpose combination (nn, nt, tn, tt).
 * They share one signature: a blas_arg_t argument block, two BLASLONG
 * pointers, two bfloat16 pointers and a trailing BLASLONG.
 * NOTE(review): presumably range_m, range_n, buffers sa/sb and a thread id,
 * as for the other gemm drivers -- confirm.
 */
int bgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *,
             bfloat16 *, bfloat16 *, BLASLONG);
int bgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *,
             bfloat16 *, bfloat16 *, BLASLONG);
int bgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *,
             bfloat16 *, bfloat16 *, BLASLONG);
int bgemm_tt(blas_arg_t *, BLASLONG *, BLASLONG *,
             bfloat16 *, bfloat16 *, BLASLONG);

int sbgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
int sbgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
int sbgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
Expand Down Expand Up @@ -754,6 +768,11 @@ int xgemm_cr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLON
int xgemm_cc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
#endif

/*
 * Threaded bgemm drivers, one per transpose combination (nn, nt, tn, tt).
 * Their signature is identical to that of the bgemm_nn/nt/tn/tt drivers.
 */
int bgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *,
                    bfloat16 *, bfloat16 *, BLASLONG);
int bgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *,
                    bfloat16 *, bfloat16 *, BLASLONG);
int bgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *,
                    bfloat16 *, bfloat16 *, BLASLONG);
int bgemm_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *,
                    bfloat16 *, bfloat16 *, BLASLONG);

int sbgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
int sbgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
int sbgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
Expand Down
51 changes: 50 additions & 1 deletion common_macro.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2025 The OpenBLAS Project. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
Expand Down Expand Up @@ -39,6 +40,7 @@
#ifndef COMMON_MACRO
#define COMMON_MACRO

#include "common_b.h"
#include "common_sb.h"
#include "common_s.h"
#include "common_d.h"
Expand Down Expand Up @@ -657,8 +659,52 @@
#define GEMM_SMALL_KERNEL_B0_TN DGEMM_SMALL_KERNEL_B0_TN
#define GEMM_SMALL_KERNEL_B0_TT DGEMM_SMALL_KERNEL_B0_TT

#elif defined(BFLOAT16)
#elif defined(BFLOAT16_ONLY)
/*
 * BFLOAT16_ONLY build: route the type-generic GEMM_* names to the bgemm
 * (BGEMM_*) implementations.
 */

/* Beta scaling and compute kernel; all four kernel variants share BGEMM_KERNEL. */
#define GEMM_BETA     BGEMM_BETA
#define GEMM_KERNEL_N BGEMM_KERNEL
#define GEMM_KERNEL_L BGEMM_KERNEL
#define GEMM_KERNEL_R BGEMM_KERNEL
#define GEMM_KERNEL_B BGEMM_KERNEL

/* Packing / copy routines. */
#define GEMM_INCOPY BGEMM_INCOPY
#define GEMM_ITCOPY BGEMM_ITCOPY
#define GEMM_ONCOPY BGEMM_ONCOPY
#define GEMM_OTCOPY BGEMM_OTCOPY

/* Drivers: 'R' collapses onto 'N' and 'C' collapses onto 'T'. */
#define GEMM_NN BGEMM_NN
#define GEMM_NR BGEMM_NN
#define GEMM_RN BGEMM_NN
#define GEMM_RR BGEMM_NN

#define GEMM_NT BGEMM_NT
#define GEMM_NC BGEMM_NT
#define GEMM_RT BGEMM_NT
#define GEMM_RC BGEMM_NT

#define GEMM_TN BGEMM_TN
#define GEMM_CN BGEMM_TN
#define GEMM_TR BGEMM_TN
#define GEMM_CR BGEMM_TN

#define GEMM_TT BGEMM_TT
#define GEMM_TC BGEMM_TT
#define GEMM_CT BGEMM_TT
#define GEMM_CC BGEMM_TT

/* Threaded drivers, collapsed the same way. */
#define GEMM_THREAD_NN BGEMM_THREAD_NN
#define GEMM_THREAD_NR BGEMM_THREAD_NN
#define GEMM_THREAD_RN BGEMM_THREAD_NN
#define GEMM_THREAD_RR BGEMM_THREAD_NN

#define GEMM_THREAD_NT BGEMM_THREAD_NT
#define GEMM_THREAD_NC BGEMM_THREAD_NT
#define GEMM_THREAD_RT BGEMM_THREAD_NT
#define GEMM_THREAD_RC BGEMM_THREAD_NT

#define GEMM_THREAD_TN BGEMM_THREAD_TN
#define GEMM_THREAD_CN BGEMM_THREAD_TN
#define GEMM_THREAD_TR BGEMM_THREAD_TN
#define GEMM_THREAD_CR BGEMM_THREAD_TN

#define GEMM_THREAD_TT BGEMM_THREAD_TT
#define GEMM_THREAD_TC BGEMM_THREAD_TT
#define GEMM_THREAD_CT BGEMM_THREAD_TT
#define GEMM_THREAD_CC BGEMM_THREAD_TT

#elif defined(BFLOAT16)
#define D_TO_BF16_K SBDTOBF16_K
#define D_BF16_TO_K DBF16TOD_K
#define S_TO_BF16_K SBSTOBF16_K
Expand Down Expand Up @@ -2618,6 +2664,9 @@
|| defined(ARCH_LOONGARCH64) || defined(ARCH_E2K) || defined(ARCH_ALPHA))
extern BLASLONG gemm_offset_a;
extern BLASLONG gemm_offset_b;
/*
 * bgemm counterparts of the sbgemm_p/q/r externs declared next to them.
 * NOTE(review): the definitions of these variables are not visible here --
 * confirm they exist for every architecture selected by the enclosing #if.
 */
extern BLASLONG bgemm_p;
extern BLASLONG bgemm_q;
extern BLASLONG bgemm_r;
extern BLASLONG sbgemm_p;
extern BLASLONG sbgemm_q;
extern BLASLONG sbgemm_r;
Expand Down
51 changes: 50 additions & 1 deletion common_param.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project. */
/* Copyright 2023, 2025 The OpenBLAS Project. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
Expand Down Expand Up @@ -49,6 +49,21 @@ typedef struct {
int switch_ratio;
int offsetA, offsetB, align;

/* bgemm (bfloat16-only GEMM) members; present only when BUILD_BFLOAT16_ONLY == 1. */
#if BUILD_BFLOAT16_ONLY == 1
/* Parameters read through the BGEMM_P/Q/R and BGEMM_UNROLL_* accessor macros. */
int bgemm_p, bgemm_q, bgemm_r;
int bgemm_unroll_m, bgemm_unroll_n, bgemm_unroll_mn;
/* NOTE(review): no BGEMM_ALIGN_K accessor is visible in the accompanying hunks -- confirm this field is populated and used. */
int bgemm_align_k;

/* Compute kernel and beta routine; signatures match the bgemm_kernel / bgemm_beta prototypes. */
int (*bgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, bfloat16, bfloat16 *, bfloat16 *, bfloat16 *, BLASLONG);
int (*bgemm_beta )(BLASLONG, BLASLONG, BLASLONG, bfloat16, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG);

/* Copy routines; signatures match the bgemm_{i,o}{n,t}copy prototypes. */
int (*bgemm_incopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *);
int (*bgemm_itcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *);
int (*bgemm_oncopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *);
int (*bgemm_otcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *);

#endif

#if BUILD_BFLOAT16 == 1
int sbgemm_p, sbgemm_q, sbgemm_r;
int sbgemm_unroll_m, sbgemm_unroll_n, sbgemm_unroll_mn;
Expand Down Expand Up @@ -1229,6 +1244,15 @@ extern gotoblas_t *gotoblas;

#define HAVE_EX_L2 gotoblas -> exclusive_cache

/* bgemm parameters, read at run time from the gotoblas structure. */
#if BUILD_BFLOAT16_ONLY == 1
#define BGEMM_P         gotoblas->bgemm_p
#define BGEMM_Q         gotoblas->bgemm_q
#define BGEMM_R         gotoblas->bgemm_r
#define BGEMM_UNROLL_M  gotoblas->bgemm_unroll_m
#define BGEMM_UNROLL_N  gotoblas->bgemm_unroll_n
#define BGEMM_UNROLL_MN gotoblas->bgemm_unroll_mn
#endif

#if (BUILD_BFLOAT16==1)
#define SBGEMM_P gotoblas -> sbgemm_p
#define SBGEMM_Q gotoblas -> sbgemm_q
Expand Down Expand Up @@ -1357,6 +1381,19 @@ extern gotoblas_t *gotoblas;
#define HAVE_EX_L2 0
#endif

/* bgemm parameters taken from the compile-time BGEMM_DEFAULT_* values. */
#if BUILD_BFLOAT16_ONLY == 1
#define BGEMM_P         BGEMM_DEFAULT_P
#define BGEMM_Q         BGEMM_DEFAULT_Q
#define BGEMM_R         BGEMM_DEFAULT_R
#define BGEMM_UNROLL_M  BGEMM_DEFAULT_UNROLL_M
#define BGEMM_UNROLL_N  BGEMM_DEFAULT_UNROLL_N
/* Without an explicit default, UNROLL_MN is the larger of the two unroll factors. */
#ifndef BGEMM_DEFAULT_UNROLL_MN
#define BGEMM_UNROLL_MN MAX((BGEMM_UNROLL_M), (BGEMM_UNROLL_N))
#else
#define BGEMM_UNROLL_MN BGEMM_DEFAULT_UNROLL_MN
#endif
#endif

#if (BUILD_BFLOAT16 == 1)
#define SBGEMM_P SBGEMM_DEFAULT_P
#define SBGEMM_Q SBGEMM_DEFAULT_Q
Expand Down Expand Up @@ -1517,6 +1554,18 @@ extern gotoblas_t *gotoblas;
#define GEMM_DEFAULT_R SBGEMM_DEFAULT_R
#define GEMM_DEFAULT_UNROLL_M SBGEMM_DEFAULT_UNROLL_M
#define GEMM_DEFAULT_UNROLL_N SBGEMM_DEFAULT_UNROLL_N
#elif defined(BFLOAT16_ONLY)
/*
 * BFLOAT16_ONLY build: map the type-generic GEMM_* parameters onto the bgemm
 * (BGEMM_*) ones.
 *
 * The guard previously tested the misspelled macro BFLOAFT16_ONLY, which is
 * never defined, so this branch could not be selected and the build fell
 * through to the SGEMM_* parameters in the #else branch.
 *
 * NOTE(review): this test comes after the BFLOAT16 branch of the same chain,
 * whereas the other BFLOAT16_ONLY tests in this change come before the
 * BFLOAT16 test. If both macros can be defined together, this branch is still
 * shadowed -- confirm the intended order.
 */
#define GEMM_P BGEMM_P
#define GEMM_Q BGEMM_Q
#define GEMM_R BGEMM_R
#define GEMM_UNROLL_M BGEMM_UNROLL_M
#define GEMM_UNROLL_N BGEMM_UNROLL_N
#define GEMM_UNROLL_MN BGEMM_UNROLL_MN
#define GEMM_DEFAULT_P BGEMM_DEFAULT_P
#define GEMM_DEFAULT_Q BGEMM_DEFAULT_Q
#define GEMM_DEFAULT_R BGEMM_DEFAULT_R
#define GEMM_DEFAULT_UNROLL_M BGEMM_DEFAULT_UNROLL_M
#define GEMM_DEFAULT_UNROLL_N BGEMM_DEFAULT_UNROLL_N
#else
#define GEMM_P SGEMM_P
#define GEMM_Q SGEMM_Q
Expand Down
Loading
Loading