Skip to content

Commit 8ecd80a

Browse files
authored
Merge pull request #14 from xianyi/develop
rebase
2 parents 86a5f98 + 4ba53db commit 8ecd80a

File tree

8 files changed: +583 additions, -31 deletions

8 files changed: +583 additions, -31 deletions

c_check

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
#!/usr/bin/perl
1+
#!/usr/bin/env perl
22

33
#use File::Basename;
44
# use File::Temp qw(tempfile);

driver/others/memory.c

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1619,10 +1619,12 @@ static int on_process_term(void)
16191619
#else
16201620
#pragma data_seg(".CRT$XLB")
16211621
#endif
1622-
static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain;
1622+
16231623
#ifdef _WIN64
1624+
static const PIMAGE_TLS_CALLBACK dll_callback(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain;
16241625
#pragma const_seg()
16251626
#else
1627+
static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain;
16261628
#pragma data_seg()
16271629
#endif
16281630

@@ -1631,10 +1633,12 @@ static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOI
16311633
#else
16321634
#pragma data_seg(".CRT$XTU")
16331635
#endif
1634-
static int(*p_process_term)(void) = on_process_term;
1636+
16351637
#ifdef _WIN64
1638+
static const int(*p_process_term)(void) = on_process_term;
16361639
#pragma const_seg()
16371640
#else
1641+
static int(*p_process_term)(void) = on_process_term;
16381642
#pragma data_seg()
16391643
#endif
16401644
#endif

exports/gensymbol

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
#!/usr/bin/perl
1+
#!/usr/bin/env perl
22

33
# Changelog
44
# 2017/09/03 staticfloat

f_check

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
#!/usr/bin/perl
1+
#!/usr/bin/env perl
22

33
$hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
44

interface/create

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
#!/usr/bin/perl
1+
#!/usr/bin/env perl
22

33
$count = 0;
44

kernel/arm/omatcopy_rt.c

Lines changed: 197 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/***************************************************************************
2-
Copyright (c) 2013, The OpenBLAS Project
2+
Copyright (c) 2021, The OpenBLAS Project
33
All rights reserved.
44
Redistribution and use in source and binary forms, with or without
55
modification, are permitted provided that the following conditions are
@@ -27,36 +27,208 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2727

2828
#include "common.h"
2929

30-
/*****************************************************
31-
* 2014/06/09 Saar
32-
*
33-
* Order rowMajor
34-
* Trans
35-
*
36-
******************************************************/
37-
3830
int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
3931
{
40-
BLASLONG i,j;
41-
FLOAT *aptr,*bptr;
32+
BLASLONG i, j;
33+
FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4;
34+
FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3, *b_offset4;
4235

43-
if ( rows <= 0 ) return(0);
44-
if ( cols <= 0 ) return(0);
36+
if (rows <= 0) return 0;
37+
if (cols <= 0) return 0;
4538

46-
aptr = a;
39+
a_offset = a;
40+
b_offset = b;
4741

48-
for ( i=0; i<rows ; i++ )
49-
{
50-
bptr = &b[i];
51-
for(j=0; j<cols; j++)
52-
{
53-
bptr[j*ldb] = alpha * aptr[j];
54-
}
55-
aptr += lda;
56-
}
42+
i = (rows >> 2);
43+
if (i > 0) {
44+
do {
45+
a_offset1 = a_offset;
46+
a_offset2 = a_offset1 + lda;
47+
a_offset3 = a_offset2 + lda;
48+
a_offset4 = a_offset3 + lda;
49+
a_offset += 4 * lda;
5750

58-
return(0);
51+
b_offset1 = b_offset;
52+
b_offset2 = b_offset1 + ldb;
53+
b_offset3 = b_offset2 + ldb;
54+
b_offset4 = b_offset3 + ldb;
55+
b_offset += 4;
56+
57+
j = (cols >> 2);
58+
if (j > 0) {
59+
do {
60+
/* Column 1 of MAT_B */
61+
*(b_offset1 + 0) = *(a_offset1 + 0)*alpha; // Row 1 of MAT_A
62+
*(b_offset2 + 0) = *(a_offset1 + 1)*alpha;
63+
*(b_offset3 + 0) = *(a_offset1 + 2)*alpha;
64+
*(b_offset4 + 0) = *(a_offset1 + 3)*alpha;
65+
66+
/* Column 2 of MAT_B */
67+
*(b_offset1 + 1) = *(a_offset2 + 0)*alpha; // Row 2 of MAT_A
68+
*(b_offset2 + 1) = *(a_offset2 + 1)*alpha;
69+
*(b_offset3 + 1) = *(a_offset2 + 2)*alpha;
70+
*(b_offset4 + 1) = *(a_offset2 + 3)*alpha;
71+
72+
/* Column 3 of MAT_B */
73+
*(b_offset1 + 2) = *(a_offset3 + 0)*alpha; // Row 3 of MAT_A
74+
*(b_offset2 + 2) = *(a_offset3 + 1)*alpha;
75+
*(b_offset3 + 2) = *(a_offset3 + 2)*alpha;
76+
*(b_offset4 + 2) = *(a_offset3 + 3)*alpha;
77+
78+
/* Column 4 of MAT_B */
79+
*(b_offset1 + 3) = *(a_offset4 + 0)*alpha; // Row 4 of MAT_A
80+
*(b_offset2 + 3) = *(a_offset4 + 1)*alpha;
81+
*(b_offset3 + 3) = *(a_offset4 + 2)*alpha;
82+
*(b_offset4 + 3) = *(a_offset4 + 3)*alpha;
83+
84+
a_offset1 += 4;
85+
a_offset2 += 4;
86+
a_offset3 += 4;
87+
a_offset4 += 4;
88+
b_offset1 += ldb * 4;
89+
b_offset2 += ldb * 4;
90+
b_offset3 += ldb * 4;
91+
b_offset4 += ldb * 4;
92+
93+
j--;
94+
} while (j > 0);
95+
} // if(j > 0)
96+
97+
98+
if (cols & 2) {
99+
*(b_offset1 + 0) = *(a_offset1 + 0)*alpha;
100+
*(b_offset2 + 0) = *(a_offset1 + 1)*alpha;
101+
102+
*(b_offset1 + 1) = *(a_offset2 + 0)*alpha;
103+
*(b_offset2 + 1) = *(a_offset2 + 1)*alpha;
104+
105+
*(b_offset1 + 2) = *(a_offset3 + 0)*alpha;
106+
*(b_offset2 + 2) = *(a_offset3 + 1)*alpha;
107+
108+
*(b_offset1 + 3) = *(a_offset4 + 0)*alpha;
109+
*(b_offset2 + 3) = *(a_offset4 + 1)*alpha;
110+
111+
a_offset1 += 2;
112+
a_offset2 += 2;
113+
a_offset3 += 2;
114+
a_offset4 += 2;
115+
116+
b_offset1 += ldb*2;
117+
118+
}
119+
120+
if (cols & 1) {
121+
*(b_offset1 + 0) = *(a_offset1 + 0)*alpha;
122+
123+
*(b_offset1 + 1) = *(a_offset2 + 0)*alpha;
124+
125+
*(b_offset1 + 2) = *(a_offset3 + 0)*alpha;
126+
127+
*(b_offset1 + 3) = *(a_offset4 + 0)*alpha;
128+
}
129+
130+
i--;
131+
} while (i > 0);
132+
}
59133

60-
}
61134

135+
if (rows & 2) {
136+
a_offset1 = a_offset;
137+
a_offset2 = a_offset1 + lda;
138+
a_offset += 2 * lda;
139+
140+
b_offset1 = b_offset;
141+
b_offset2 = b_offset1 + ldb;
142+
b_offset3 = b_offset2 + ldb;
143+
b_offset4 = b_offset3 + ldb;
144+
b_offset += 2;
145+
146+
j = (cols >> 2);
147+
if (j > 0){
148+
do {
149+
*(b_offset1 + 0) = *(a_offset1 + 0)*alpha;
150+
*(b_offset2 + 0) = *(a_offset1 + 1)*alpha;
151+
*(b_offset3 + 0) = *(a_offset1 + 2)*alpha;
152+
*(b_offset4 + 0) = *(a_offset1 + 3)*alpha;
153+
154+
*(b_offset1 + 1) = *(a_offset2 + 0)*alpha;
155+
*(b_offset2 + 1) = *(a_offset2 + 1)*alpha;
156+
*(b_offset3 + 1) = *(a_offset2 + 2)*alpha;
157+
*(b_offset4 + 1) = *(a_offset2 + 3)*alpha;
158+
159+
a_offset1 += 4;
160+
a_offset2 += 4;
161+
b_offset1 += ldb * 4;
162+
b_offset2 += ldb * 4;
163+
b_offset3 += ldb * 4;
164+
b_offset4 += ldb * 4;
165+
166+
j--;
167+
} while (j > 0);
168+
}
169+
170+
171+
if (cols & 2){
172+
*(b_offset1 + 0) = *(a_offset1 + 0)*alpha;
173+
*(b_offset2 + 0) = *(a_offset1 + 1)*alpha;
174+
175+
*(b_offset1 + 1) = *(a_offset2 + 0)*alpha;
176+
*(b_offset2 + 1) = *(a_offset2 + 1)*alpha;
177+
178+
a_offset1 += 2;
179+
a_offset2 += 2;
180+
b_offset1 += ldb*2;
181+
182+
}
183+
184+
185+
if (cols & 1){
186+
*(b_offset1 + 0) = *(a_offset1 + 0)*alpha;
187+
*(b_offset1 + 1) = *(a_offset2 + 0)*alpha;
188+
}
189+
} // if (rows & 2)
190+
191+
192+
if (rows & 1) {
193+
a_offset1 = a_offset;
194+
a_offset += lda;
195+
196+
b_offset1 = b_offset;
197+
b_offset2 = b_offset1 + ldb;
198+
b_offset3 = b_offset2 + ldb;
199+
b_offset4 = b_offset3 + ldb;
200+
201+
j = (cols >> 2);
202+
if (j > 0){
203+
do {
204+
*(b_offset1 + 0) = *(a_offset1 + 0)*alpha;
205+
*(b_offset2 + 0) = *(a_offset1 + 1)*alpha;
206+
*(b_offset3 + 0) = *(a_offset1 + 2)*alpha;
207+
*(b_offset4 + 0) = *(a_offset1 + 3)*alpha;
208+
209+
a_offset1 += 4;
210+
b_offset1 += ldb * 4;
211+
b_offset2 += ldb * 4;
212+
b_offset3 += ldb * 4;
213+
b_offset4 += ldb * 4;
214+
215+
j--;
216+
} while (j > 0);
217+
}
218+
219+
if (cols & 2){
220+
*(b_offset1 + 0) = *(a_offset1 + 0)*alpha;
221+
*(b_offset2 + 0) = *(a_offset1 + 1)*alpha;
222+
223+
a_offset1 += 2;
224+
b_offset1 += ldb * 2;
225+
}
226+
227+
if (cols & 1){
228+
*(b_offset1 + 0) = *(a_offset1 + 0)*alpha;
229+
}
230+
}
231+
232+
return 0;
233+
}
62234

kernel/x86_64/KERNEL

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -489,3 +489,6 @@ XGEMM3MKERNEL = xgemm3m_kernel_2x2.S
489489

490490
SSUMKERNEL = ../arm/sum.c
491491
DSUMKERNEL = ../arm/sum.c
492+
493+
SOMATCOPY_RT = omatcopy_rt.c
494+
DOMATCOPY_RT = omatcopy_rt.c

0 commit comments

Comments
 (0)