|
1 | 1 | /*********************************************************************/
|
| 2 | +/* Copyright 2024 The OpenBLAS Project */ |
2 | 3 | /* Copyright 2009, 2010 The University of Texas at Austin. */
|
3 | 4 | /* All rights reserved. */
|
4 | 5 | /* */
|
|
63 | 64 | #ifndef GEMM3M
|
64 | 65 | #ifdef XDOUBLE
|
65 | 66 | #define ERROR_NAME "XGEMM "
|
66 |
| -#define GEMV BLASFUNC(xgemv) |
67 | 67 | #elif defined(DOUBLE)
|
68 | 68 | #define ERROR_NAME "ZGEMM "
|
69 |
| -#define GEMV BLASFUNC(zgemv) |
70 | 69 | #else
|
71 | 70 | #define ERROR_NAME "CGEMM "
|
72 |
| -#define GEMV BLASFUNC(cgemv) |
73 | 71 | #endif
|
74 | 72 | #else
|
75 | 73 | #ifdef XDOUBLE
|
@@ -492,42 +490,54 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
|
492 | 490 | }
|
493 | 491 | #endif
|
494 | 492 | #endif // defined(__linux__) && defined(__x86_64__) && defined(BFLOAT16)
|
495 |
| - // fprintf(stderr,"G E M M interface m n k %d %d %d\n",args.m,args.n,args.k); |
496 | 493 |
|
497 | 494 | if ((args.m == 0) || (args.n == 0)) return;
|
498 | 495 |
|
499 |
| -#if 1 |
500 |
| -#ifndef GEMM3M |
501 |
| - if (args.m == 1) { |
502 |
| - char *NT=(char*)malloc(2*sizeof(char)); |
503 |
| - if (transb&1)strcpy(NT,"T"); |
504 |
| - else NT="N"; |
505 |
| -// fprintf(stderr,"G E M V\n"); |
506 |
| - GEMV(NT, &args.n ,&args.k, args.alpha, args.b, &args.ldb, args.a, &args.m, args.beta, args.c, &args.m); |
507 |
| -//SUBROUTINE SGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY) |
508 |
| -return; |
509 |
| - } else { |
510 |
| - if (args.n == 1) { |
511 |
| -#ifndef CBLAS |
512 |
| - char *NT=(char*)malloc(2*sizeof(char)); |
513 |
| - strcpy(NT,"N"); |
514 |
| -#else |
515 |
| - char *NT=(char*)malloc(2*sizeof(char)); |
516 |
| - if (transb&1)strcpy(NT,"T"); |
517 |
| - else strcpy(NT,"N"); |
518 |
| -#endif |
519 |
| -// fprintf(stderr,"G E M V ! ! ! lda=%d ldb=%d ldc=%d\n",args.lda,args.ldb,args.ldc); |
520 |
| - GEMV(NT, &args.m ,&args.k, args.alpha, args.a, &args.lda, args.b, &args.n, args.beta, args.c, &args.n); |
521 |
| -//SUBROUTINE SGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY) |
522 |
| - return; |
| 496 | +#if !defined(GEMM3M) && !defined(COMPLEX) |
| 497 | + // Check if we can convert GEMM -> GEMV |
| 498 | + if (args.k != 0) { |
| 499 | + if (args.n == 1) { |
| 500 | + blasint inc_x = 1; |
| 501 | + blasint inc_y = 1; |
| 502 | + // These were passed in as blasint, but the args struct holds them as BLASLONG, so copy them into blasint locals whose addresses are passed to GEMV |
| 503 | + blasint m = args.m; |
| 504 | + blasint n = args.k; |
| 505 | + blasint lda = args.lda; |
| 506 | + // Create new transpose parameters |
| 507 | + char NT = 'N'; |
| 508 | + if (transa & 1) { |
| 509 | + NT = 'T'; |
| 510 | + m = args.k; |
| 511 | + n = args.m; |
| 512 | + } |
| 513 | + if (transb & 1) { |
| 514 | + inc_x = args.ldb; |
| 515 | + } |
| 516 | + GEMV(&NT, &m, &n, args.alpha, args.a, &lda, args.b, &inc_x, args.beta, args.c, &inc_y); |
| 517 | + return; |
| 518 | + } |
| 519 | + if (args.m == 1) { |
| 520 | + blasint inc_x = args.lda; |
| 521 | + blasint inc_y = args.ldc; |
| 522 | + // These were passed in as blasint, but the args struct holds them as BLASLONG, so copy them into blasint locals whose addresses are passed to GEMV |
| 523 | + blasint m = args.k; |
| 524 | + blasint n = args.n; |
| 525 | + blasint ldb = args.ldb; |
| 526 | + // Create new transpose parameters |
| 527 | + char NT = 'T'; |
| 528 | + if (transa & 1) { |
| 529 | + inc_x = 1; |
| 530 | + } |
| 531 | + if (transb & 1) { |
| 532 | + NT = 'N'; |
| 533 | + m = args.n; |
| 534 | + n = args.k; |
| 535 | + } |
| 536 | + GEMV(&NT, &m, &n, args.alpha, args.b, &ldb, args.a, &inc_x, args.beta, args.c, &inc_y); |
| 537 | + return; |
523 | 538 | }
|
524 | 539 | }
|
525 | 540 | #endif
|
526 |
| -#endif |
527 |
| -#if 0 |
528 |
| - fprintf(stderr, "m = %4d n = %d k = %d lda = %4d ldb = %4d ldc = %4d\n", |
529 |
| - args.m, args.n, args.k, args.lda, args.ldb, args.ldc); |
530 |
| -#endif |
531 | 541 |
|
532 | 542 | IDEBUG_START;
|
533 | 543 |
|
@@ -557,15 +567,10 @@ return;
|
557 | 567 |
|
558 | 568 | buffer = (XFLOAT *)blas_memory_alloc(0);
|
559 | 569 |
|
560 |
| -//For Loongson servers, like the 3C5000 (featuring 16 cores), applying an |
561 |
| -//offset to the buffer is essential for minimizing cache conflicts and optimizing performance. |
562 |
| -#if defined(LOONGSON3R5) && !defined(NO_AFFINITY) |
563 |
| - char model_name[128]; |
564 |
| - get_cpu_model(model_name); |
565 |
| - if ((strstr(model_name, "3C5000") != NULL) || (strstr(model_name, "3D5000") != NULL)) |
566 |
| - sa = (XFLOAT *)((BLASLONG)buffer + (WhereAmI() & 0xf) * GEMM_OFFSET_A); |
567 |
| - else |
568 |
| - sa = (XFLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); |
| 570 | +//For LoongArch64 targets such as LOONGSON3R5, applying an offset to the buffer is essential |
| 571 | +//for minimizing cache conflicts and optimizing performance. |
| 572 | +#if defined(ARCH_LOONGARCH64) && !defined(NO_AFFINITY) |
| 573 | + sa = (XFLOAT *)((BLASLONG)buffer + (WhereAmI() & 0xf) * GEMM_OFFSET_A); |
569 | 574 | #else
|
570 | 575 | sa = (XFLOAT *)((BLASLONG)buffer +GEMM_OFFSET_A);
|
571 | 576 | #endif
|
|
0 commit comments