Skip to content

Commit 28b5334

Browse files
committed
Complete implementation of GEMV forwarding
1 parent 3db5dbc commit 28b5334

File tree

1 file changed

+47
-42
lines changed

1 file changed

+47
-42
lines changed

interface/gemm.c

Lines changed: 47 additions & 42 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
/*********************************************************************/
2+
/* Copyright 2024 The OpenBLAS Project */
23
/* Copyright 2009, 2010 The University of Texas at Austin. */
34
/* All rights reserved. */
45
/* */
@@ -63,13 +64,10 @@
6364
#ifndef GEMM3M
6465
#ifdef XDOUBLE
6566
#define ERROR_NAME "XGEMM "
66-
#define GEMV BLASFUNC(xgemv)
6767
#elif defined(DOUBLE)
6868
#define ERROR_NAME "ZGEMM "
69-
#define GEMV BLASFUNC(zgemv)
7069
#else
7170
#define ERROR_NAME "CGEMM "
72-
#define GEMV BLASFUNC(cgemv)
7371
#endif
7472
#else
7573
#ifdef XDOUBLE
@@ -492,42 +490,54 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
492490
}
493491
#endif
494492
#endif // defined(__linux__) && defined(__x86_64__) && defined(BFLOAT16)
495-
// fprintf(stderr,"G E M M interface m n k %d %d %d\n",args.m,args.n,args.k);
496493

497494
if ((args.m == 0) || (args.n == 0)) return;
498495

499-
#if 1
500-
#ifndef GEMM3M
501-
if (args.m == 1) {
502-
char *NT=(char*)malloc(2*sizeof(char));
503-
if (transb&1)strcpy(NT,"T");
504-
else NT="N";
505-
// fprintf(stderr,"G E M V\n");
506-
GEMV(NT, &args.n ,&args.k, args.alpha, args.b, &args.ldb, args.a, &args.m, args.beta, args.c, &args.m);
507-
//SUBROUTINE SGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
508-
return;
509-
} else {
510-
if (args.n == 1) {
511-
#ifndef CBLAS
512-
char *NT=(char*)malloc(2*sizeof(char));
513-
strcpy(NT,"N");
514-
#else
515-
char *NT=(char*)malloc(2*sizeof(char));
516-
if (transb&1)strcpy(NT,"T");
517-
else strcpy(NT,"N");
518-
#endif
519-
// fprintf(stderr,"G E M V ! ! ! lda=%d ldb=%d ldc=%d\n",args.lda,args.ldb,args.ldc);
520-
GEMV(NT, &args.m ,&args.k, args.alpha, args.a, &args.lda, args.b, &args.n, args.beta, args.c, &args.n);
521-
//SUBROUTINE SGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
522-
return;
496+
#if !defined(GEMM3M) && !defined(COMPLEX)
497+
// Check if we can convert GEMM -> GEMV
498+
if (args.k != 0) {
499+
if (args.n == 1) {
500+
blasint inc_x = 1;
501+
blasint inc_y = 1;
502+
// These were passed in as blasint, but the args struct stores them as BLASLONG, so copy them back into blasint locals for GEMV
503+
blasint m = args.m;
504+
blasint n = args.k;
505+
blasint lda = args.lda;
506+
// Create new transpose parameters
507+
char NT = 'N';
508+
if (transa & 1) {
509+
NT = 'T';
510+
m = args.k;
511+
n = args.m;
512+
}
513+
if (transb & 1) {
514+
inc_x = args.ldb;
515+
}
516+
GEMV(&NT, &m, &n, args.alpha, args.a, &lda, args.b, &inc_x, args.beta, args.c, &inc_y);
517+
return;
518+
}
519+
if (args.m == 1) {
520+
blasint inc_x = args.lda;
521+
blasint inc_y = args.ldc;
522+
// These were passed in as blasint, but the args struct stores them as BLASLONG, so copy them back into blasint locals for GEMV
523+
blasint m = args.k;
524+
blasint n = args.n;
525+
blasint ldb = args.ldb;
526+
// Create new transpose parameters
527+
char NT = 'T';
528+
if (transa & 1) {
529+
inc_x = 1;
530+
}
531+
if (transb & 1) {
532+
NT = 'N';
533+
m = args.n;
534+
n = args.k;
535+
}
536+
GEMV(&NT, &m, &n, args.alpha, args.b, &ldb, args.a, &inc_x, args.beta, args.c, &inc_y);
537+
return;
523538
}
524539
}
525540
#endif
526-
#endif
527-
#if 0
528-
fprintf(stderr, "m = %4d n = %d k = %d lda = %4d ldb = %4d ldc = %4d\n",
529-
args.m, args.n, args.k, args.lda, args.ldb, args.ldc);
530-
#endif
531541

532542
IDEBUG_START;
533543

@@ -557,15 +567,10 @@ return;
557567

558568
buffer = (XFLOAT *)blas_memory_alloc(0);
559569

560-
//For Loongson servers, like the 3C5000 (featuring 16 cores), applying an
561-
//offset to the buffer is essential for minimizing cache conflicts and optimizing performance.
562-
#if defined(LOONGSON3R5) && !defined(NO_AFFINITY)
563-
char model_name[128];
564-
get_cpu_model(model_name);
565-
if ((strstr(model_name, "3C5000") != NULL) || (strstr(model_name, "3D5000") != NULL))
566-
sa = (XFLOAT *)((BLASLONG)buffer + (WhereAmI() & 0xf) * GEMM_OFFSET_A);
567-
else
568-
sa = (XFLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A);
570+
//For LoongArch64 targets (such as LOONGSON3R5), applying an offset to the buffer is essential
571+
//for minimizing cache conflicts and optimizing performance.
572+
#if defined(ARCH_LOONGARCH64) && !defined(NO_AFFINITY)
573+
sa = (XFLOAT *)((BLASLONG)buffer + (WhereAmI() & 0xf) * GEMM_OFFSET_A);
569574
#else
570575
sa = (XFLOAT *)((BLASLONG)buffer +GEMM_OFFSET_A);
571576
#endif

0 commit comments

Comments
 (0)