 5 |  5 |     const int64_t *restrict Bi = B->i;
 6 |  6 |     const GB_A_TYPE *restrict Ax = (GB_A_TYPE *)A->x;
 7 |  7 |     const GB_B_TYPE *restrict Bx = (GB_B_TYPE *)B->x;
 8 |    | -   size_t vl = __riscv_vsetvl_e64m8(m);
   |  8 | +   size_t vl = VSETVL(m);
 9 |  9 |     GB_C_TYPE *restrict Cx = (GB_C_TYPE *)C->x;
10 | 10 |
11 | 11 |     #pragma omp parallel for num_threads(nthreads) schedule(dynamic, 1)
...
22 | 22 |         const int64_t pB_end = Bp[jB + 1];
23 | 23 |         for (int64_t i = 0; i < m && (m - i) >= vl; i += vl)
24 | 24 |         {
25 |    | -           vfloat64m8_t vc = __riscv_vle64_v_f64m8(Cxj + i, vl);
26 |    | -
   | 25 | +           VECTORTYPE vc = VLE(Cxj + i, vl);
27 | 26 |             for (int64_t pB = pB_start; pB < pB_end; pB++)
28 | 27 |             {
29 | 28 |                 const int64_t k = Bi[pB];
30 | 29 |                 const GB_B_TYPE bkj = Bx[pB];
31 |    | -               vfloat64m8_t va = __riscv_vle64_v_f64m8(Ax + i + k * m, vl);
32 |    | -               vc = __riscv_vfmacc_vf_f64m8(vc, bkj, va, vl);
   | 30 | +               VECTORTYPE va = VLE(Ax + i + k * m, vl);
   | 31 | +               vc = VFMACC(vc, bkj, va, vl);
33 | 32 |             }
34 | 33 |
35 |    | -           __riscv_vse64_v_f64m8(Cxj + i, vc, vl);
   | 34 | +           VSE(Cxj + i, vc, vl);
36 | 35 |         }
37 | 36 |         int64_t remaining = m % vl;
38 | 37 |         if (remaining > 0)
39 | 38 |         {
40 | 39 |             int64_t i = m - remaining;
41 |    | -           vfloat64m8_t vc = __riscv_vle64_v_f64m8(Cxj + i, remaining);
42 |    | -
   | 40 | +           VECTORTYPE vc = VLE(Cxj + i, remaining);
43 | 41 |             for (int64_t pB = pB_start; pB < pB_end; pB++)
44 | 42 |             {
45 | 43 |                 const int64_t k = Bi[pB];
46 | 44 |                 const GB_B_TYPE bkj = Bx[pB];
47 |    | -               vfloat64m8_t va = __riscv_vle64_v_f64m8(Ax + i + k * m, remaining);
48 |    | -               vc = __riscv_vfmacc_vf_f64m8(vc, bkj, va, remaining);
   | 45 | +               VECTORTYPE va = VLE(Ax + i + k * m, remaining);
   | 46 | +               vc = VFMACC(vc, bkj, va, remaining);
49 | 47 |             }
50 | 48 |
51 |    | -           __riscv_vse64_v_f64m8(Cxj + i, vc, remaining);
   | 49 | +           VSE(Cxj + i, vc, remaining);
52 | 50 |         }
53 | 51 |     }
54 | 52 | }
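For context, `VECTORTYPE`, `VSETVL`, `VLE`, `VFMACC`, and `VSE` are defined outside this hunk. A minimal sketch of definitions that would reproduce the removed double-precision (e64, LMUL=8) intrinsics, with the macro spellings below being assumptions rather than the actual header, looks like:

    #include <riscv_vector.h>

    // Hypothetical definitions of the generic vector macros, fixed to the
    // f64 / LMUL=8 configuration used by the removed intrinsics; the real
    // header presumably parameterizes these per element type and LMUL.
    #define VECTORTYPE                  vfloat64m8_t
    #define VSETVL(n)                   __riscv_vsetvl_e64m8 (n)
    #define VLE(addr, vl)               __riscv_vle64_v_f64m8 (addr, vl)
    #define VFMACC(acc, scalar, v, vl)  __riscv_vfmacc_vf_f64m8 (acc, scalar, v, vl)
    #define VSE(addr, v, vl)            __riscv_vse64_v_f64m8 (addr, v, vl)

With definitions along these lines, the `+` lines keep the same semantics as the code they replace: each inner iteration performs vc = bkj * va + vc over `vl` (or `remaining`) elements, a dense saxpy-style column update, while the kernel body itself no longer hard-codes the element type or LMUL.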