Skip to content

Commit 73128f3

Browse files
authored
Merge pull request #2310 from martin-frbg/ppc440
Fix PPC440 big-endian support and disable the QCDOC qalloc routine by default
2 parents 3e67017 + 0c07c35 commit 73128f3

File tree

2 files changed

+338
-10
lines changed

2 files changed

+338
-10
lines changed

driver/others/memory_qalloc.c

Lines changed: 311 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -38,29 +38,37 @@
3838

3939
#include <stdio.h>
4040
#include "common.h"
41+
#ifdef OS_LINUX
42+
#include <sys/sysinfo.h>
43+
#include <sched.h>
44+
#include <errno.h>
45+
#include <linux/unistd.h>
46+
#include <sys/syscall.h>
47+
#include <sys/time.h>
48+
#include <sys/resource.h>
49+
#endif
4150

42-
#ifndef SMP
43-
#define blas_cpu_number 1
44-
#else
45-
46-
int blas_cpu_number = 1;
47-
48-
int blas_get_cpu_number(void){
51+
#ifdef OS_HAIKU
52+
#include <unistd.h>
53+
#endif
4954

50-
return blas_cpu_number;
51-
}
55+
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
56+
#include <sys/sysctl.h>
57+
#include <sys/resource.h>
5258
#endif
5359

60+
5461
#define FIXED_PAGESIZE 4096
5562

63+
5664
void *sa = NULL;
5765
void *sb = NULL;
5866
static double static_buffer[BUFFER_SIZE/sizeof(double)];
5967

6068
void *blas_memory_alloc(int numproc){
6169

6270
if (sa == NULL){
63-
#if 1
71+
#if 0
6472
sa = (void *)qalloc(QFAST, BUFFER_SIZE);
6573
#else
6674
sa = (void *)malloc(BUFFER_SIZE);
@@ -75,3 +83,296 @@ void blas_memory_free(void *free_area){
7583
return;
7684
}
7785

86+
87+
88+
extern void openblas_warning(int verbose, const char * msg);
89+
90+
#ifndef SMP
91+
92+
#define blas_cpu_number 1
93+
#define blas_num_threads 1
94+
95+
/* Dummy Function */
96+
/* Single-threaded (non-SMP) build: always reports exactly one processor. */
int goto_get_num_procs (void) { return 1; }
97+
/* Single-threaded (non-SMP) build: the requested thread count is ignored. */
void goto_set_num_threads(int num_threads) { (void)num_threads; }
98+
99+
#else
100+
101+
#if defined(OS_LINUX) || defined(OS_SUNOS)
102+
#ifndef NO_AFFINITY
103+
int get_num_procs(void);
104+
#else
105+
/*
 * get_num_procs -- processor count for Linux/SunOS builds where NO_AFFINITY
 * is defined (this definition sits in the #else branch of #ifndef NO_AFFINITY).
 *
 * The configured processor count is queried once through
 * sysconf(_SC_NPROCESSORS_CONF) and cached in a function-local static.
 *   - When OS_LINUX is not defined, that cached value is returned.
 *   - When OS_LINUX is defined, the function falls through to the final
 *     `return 1;` because the sched_getaffinity()-based refinement below is
 *     entirely commented out.
 *
 * NOTE(review): with the affinity block disabled, the locals cpuset, cpusetp,
 * size, ret, i and n are declared but never used.
 */
int get_num_procs(void) {
106+
107+
static int nums = 0;
108+
cpu_set_t cpuset,*cpusetp;
109+
size_t size;
110+
int ret;
111+
112+
#if defined(__GLIBC_PREREQ)
113+
#if !__GLIBC_PREREQ(2, 7)
114+
int i;
115+
#if !__GLIBC_PREREQ(2, 6)
116+
int n;
117+
#endif
118+
#endif
119+
#endif
120+
121+
/* Query the configured processor count only on the first call. */
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
122+
#if !defined(OS_LINUX)
123+
return nums;
124+
#endif
125+
126+
/*
127+
#if !defined(__GLIBC_PREREQ)
128+
return nums;
129+
#else
130+
#if !__GLIBC_PREREQ(2, 3)
131+
return nums;
132+
#endif
133+
134+
#if !__GLIBC_PREREQ(2, 7)
135+
ret = sched_getaffinity(0,sizeof(cpuset), &cpuset);
136+
if (ret!=0) return nums;
137+
n=0;
138+
#if !__GLIBC_PREREQ(2, 6)
139+
for (i=0;i<nums;i++)
140+
if (CPU_ISSET(i,&cpuset)) n++;
141+
nums=n;
142+
#else
143+
nums = CPU_COUNT(sizeof(cpuset),&cpuset);
144+
#endif
145+
return nums;
146+
#else
147+
if (nums >= CPU_SETSIZE) {
148+
cpusetp = CPU_ALLOC(nums);
149+
if (cpusetp == NULL) {
150+
return nums;
151+
}
152+
size = CPU_ALLOC_SIZE(nums);
153+
ret = sched_getaffinity(0,size,cpusetp);
154+
if (ret!=0) {
155+
CPU_FREE(cpusetp);
156+
return nums;
157+
}
158+
ret = CPU_COUNT_S(size,cpusetp);
159+
if (ret > 0 && ret < nums) nums = ret;
160+
CPU_FREE(cpusetp);
161+
return nums;
162+
} else {
163+
ret = sched_getaffinity(0,sizeof(cpuset),&cpuset);
164+
if (ret!=0) {
165+
return nums;
166+
}
167+
ret = CPU_COUNT(&cpuset);
168+
if (ret > 0 && ret < nums) nums = ret;
169+
return nums;
170+
}
171+
#endif
172+
#endif
173+
*/
174+
/* Reached only when OS_LINUX is defined: the cached sysconf value is ignored
   and a single processor is reported. */
return 1;
175+
}
176+
#endif
177+
#endif
178+
179+
#ifdef OS_ANDROID
180+
/*
 * get_num_procs (OS_ANDROID): returns the configured processor count from
 * sysconf(_SC_NPROCESSORS_CONF). The value is cached in a function-local
 * static, so sysconf is queried only while the cached value is still 0.
 */
int get_num_procs(void) {
181+
static int nums = 0;
182+
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
183+
return nums;
184+
}
185+
#endif
186+
187+
#ifdef OS_HAIKU
188+
/*
 * get_num_procs (OS_HAIKU): returns the configured processor count from
 * sysconf(_SC_NPROCESSORS_CONF). The value is cached in a function-local
 * static, so sysconf is queried only while the cached value is still 0.
 */
int get_num_procs(void) {
189+
static int nums = 0;
190+
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
191+
return nums;
192+
}
193+
#endif
194+
195+
#ifdef OS_AIX
196+
/*
 * get_num_procs (OS_AIX): returns the configured processor count from
 * sysconf(_SC_NPROCESSORS_CONF). The value is cached in a function-local
 * static, so sysconf is queried only while the cached value is still 0.
 */
int get_num_procs(void) {
197+
static int nums = 0;
198+
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
199+
return nums;
200+
}
201+
#endif
202+
203+
#ifdef OS_WINDOWS
204+
205+
/*
 * get_num_procs (OS_WINDOWS): returns the processor count reported by
 * GetSystemInfo() in SYSTEM_INFO.dwNumberOfProcessors. The value is cached in
 * a function-local static and only queried while the cached value is 0.
 */
int get_num_procs(void) {
206+
207+
static int nums = 0;
208+
209+
if (nums == 0) {
210+
211+
SYSTEM_INFO sysinfo;
212+
213+
GetSystemInfo(&sysinfo);
214+
215+
nums = sysinfo.dwNumberOfProcessors;
216+
}
217+
218+
return nums;
219+
}
220+
221+
#endif
222+
223+
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY)
224+
225+
/*
 * get_num_procs (FreeBSD/OpenBSD/NetBSD/DragonFly): returns the CPU count
 * obtained from sysctl() with the MIB { CTL_HW, HW_NCPU }. The value is cached
 * in a function-local static and only queried while the cached value is 0.
 *
 * NOTE(review): the return value of sysctl() is not checked; if the call
 * fails, nums keeps whatever it held (initially 0) and that is returned.
 */
int get_num_procs(void) {
226+
227+
static int nums = 0;
228+
229+
int m[2];
230+
size_t len;
231+
232+
if (nums == 0) {
233+
m[0] = CTL_HW;
234+
m[1] = HW_NCPU;
235+
len = sizeof(int);
236+
sysctl(m, 2, &nums, &len, NULL, 0);
237+
}
238+
239+
return nums;
240+
}
241+
242+
#endif
243+
244+
#if defined(OS_DARWIN)
245+
/*
 * get_num_procs (OS_DARWIN): returns the value of the "hw.physicalcpu" sysctl
 * read through sysctlbyname(). The value is cached in a function-local static
 * and only queried while the cached value is 0.
 *
 * NOTE(review): the return value of sysctlbyname() is not checked; on failure
 * nums keeps whatever it held (initially 0) and that is returned.
 */
int get_num_procs(void) {
246+
static int nums = 0;
247+
size_t len;
248+
if (nums == 0){
249+
len = sizeof(int);
250+
sysctlbyname("hw.physicalcpu", &nums, &len, NULL, 0);
251+
}
252+
return nums;
253+
}
254+
/*
255+
void set_stack_limit(int limitMB){
256+
int result=0;
257+
struct rlimit rl;
258+
rlim_t StackSize;
259+
260+
StackSize=limitMB*1024*1024;
261+
result=getrlimit(RLIMIT_STACK, &rl);
262+
if(result==0){
263+
if(rl.rlim_cur < StackSize){
264+
rl.rlim_cur=StackSize;
265+
result=setrlimit(RLIMIT_STACK, &rl);
266+
if(result !=0){
267+
fprintf(stderr, "OpenBLAS: set stack limit error =%d\n", result);
268+
}
269+
}
270+
}
271+
}
272+
*/
273+
#endif
274+
275+
276+
/*
277+
OpenBLAS uses the number of CPU cores for multithreading.
278+
It can be set by openblas_set_num_threads(int num_threads);
279+
*/
280+
int blas_cpu_number = 0;
281+
/*
282+
The number of threads in the thread pool.
283+
This value is equal to or larger than blas_cpu_number, which means some threads may be sleeping.
284+
*/
285+
int blas_num_threads = 0;
286+
287+
/*
 * goto_get_num_procs (SMP build): returns the current value of the global
 * blas_cpu_number. It does not initialise that value itself, so the result is
 * 0 until something else (e.g. blas_get_cpu_number()) has assigned it.
 */
int goto_get_num_procs (void) {
288+
return blas_cpu_number;
289+
}
290+
291+
/*
 * openblas_fork_handler -- install a fork handler for the OpenBLAS thread pool.
 *
 * When SMP_SERVER is defined and the target is neither native Windows
 * (Windows without OS_CYGWIN_NT) nor Android, BLASFUNC(blas_thread_shutdown)
 * is passed as the first argument of pthread_atfork(); the other two handler
 * slots are NULL. If registration fails, a warning is issued through
 * openblas_warning(). In every other configuration the body is empty.
 */
void openblas_fork_handler()
292+
{
293+
// This handler shuts down the OpenBLAS-managed PTHREAD pool when OpenBLAS is
294+
// built with "make USE_OPENMP=0".
295+
// Hanging can still happen when OpenBLAS is built against the libgomp
296+
// implementation of OpenMP. The problem is tracked at:
297+
// http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035
298+
// In the mean time build with USE_OPENMP=0 or link against another
299+
// implementation of OpenMP.
300+
#if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER)
301+
int err;
302+
err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL);
303+
if(err != 0)
304+
openblas_warning(0, "OpenBLAS Warning ... cannot install fork handler. You may meet hang after fork.\n");
305+
#endif
306+
}
307+
308+
extern int openblas_num_threads_env();
309+
extern int openblas_goto_num_threads_env();
310+
extern int openblas_omp_num_threads_env();
311+
312+
/*
 * blas_get_cpu_number -- determine, cache and return the thread count
 * (SMP build).
 *
 * If blas_num_threads is already non-zero it is returned unchanged.
 * Otherwise the count is chosen in this order:
 *   1. openblas_num_threads_env(), then openblas_goto_num_threads_env()
 *      (both consulted only when USE_OPENMP is not defined);
 *   2. openblas_omp_num_threads_env();
 *   3. MAX_CPU_NUMBER.
 * Negative results from any of these are treated as 0, i.e. "not set".
 * On the operating systems listed in the #if guards the result is then capped
 * at get_num_procs(), and on all systems at MAX_CPU_NUMBER.
 *
 * Side effects: stores the result in both blas_num_threads and
 * blas_cpu_number.
 */
int blas_get_cpu_number(void){
313+
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
314+
int max_num;
315+
#endif
316+
int blas_goto_num = 0;
317+
int blas_omp_num = 0;
318+
319+
/* Already computed on an earlier call: reuse the cached value. */
if (blas_num_threads) return blas_num_threads;
320+
321+
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
322+
max_num = get_num_procs();
323+
#endif
324+
325+
// blas_goto_num = 0;
326+
#ifndef USE_OPENMP
327+
blas_goto_num=openblas_num_threads_env();
328+
if (blas_goto_num < 0) blas_goto_num = 0;
329+
330+
/* Fall back to the second environment query only if the first gave nothing. */
if (blas_goto_num == 0) {
331+
blas_goto_num=openblas_goto_num_threads_env();
332+
if (blas_goto_num < 0) blas_goto_num = 0;
333+
}
334+
335+
#endif
336+
337+
// blas_omp_num = 0;
338+
blas_omp_num=openblas_omp_num_threads_env();
339+
if (blas_omp_num < 0) blas_omp_num = 0;
340+
341+
/* Precedence: goto/openblas setting, then OMP setting, then the compile-time maximum. */
if (blas_goto_num > 0) blas_num_threads = blas_goto_num;
342+
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
343+
else blas_num_threads = MAX_CPU_NUMBER;
344+
345+
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
346+
if (blas_num_threads > max_num) blas_num_threads = max_num;
347+
#endif
348+
349+
if (blas_num_threads > MAX_CPU_NUMBER) blas_num_threads = MAX_CPU_NUMBER;
350+
351+
#ifdef DEBUG
352+
printf( "Adjusted number of threads : %3d\n", blas_num_threads);
353+
#endif
354+
355+
blas_cpu_number = blas_num_threads;
356+
357+
return blas_num_threads;
358+
}
359+
#endif
360+
361+
362+
/*
 * openblas_get_num_procs -- returns 1 when SMP is not defined; otherwise
 * forwards to get_num_procs().
 */
int openblas_get_num_procs(void) {
363+
#ifndef SMP
364+
return 1;
365+
#else
366+
return get_num_procs();
367+
#endif
368+
}
369+
370+
/*
 * openblas_get_num_threads -- returns 1 when SMP is not defined; otherwise
 * calls blas_get_cpu_number() (which assigns blas_cpu_number unless
 * blas_num_threads is already set) and returns blas_cpu_number.
 */
int openblas_get_num_threads(void) {
371+
#ifndef SMP
372+
return 1;
373+
#else
374+
// init blas_cpu_number if needed
375+
blas_get_cpu_number();
376+
return blas_cpu_number;
377+
#endif
378+
}

kernel/power/KERNEL.PPC440

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,23 @@ ZASUMKERNEL = zasum_ppc440.S
1515

1616
SAXPYKERNEL = axpy_ppc440.S
1717
DAXPYKERNEL = axpy_ppc440.S
18+
ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
19+
CAXPYKERNEL = ../arm/zaxpy.c
20+
ZAXPYKERNEL = ../arm/zaxpy.c
21+
else
1822
CAXPYKERNEL = zaxpy_ppc440.S
1923
ZAXPYKERNEL = zaxpy_ppc440.S
24+
endif
2025

2126
SDOTKERNEL = dot_ppc440.S
2227
DDOTKERNEL = dot_ppc440.S
28+
ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
2329
CDOTKERNEL = zdot_ppc440.S
2430
ZDOTKERNEL = zdot_ppc440.S
31+
else
32+
CDOTKERNEL = ../arm/zdot.c
33+
ZDOTKERNEL = ../arm/zdot.c
34+
endif
2535

2636
ISAMAXKERNEL = iamax_ppc440.S
2737
IDAMAXKERNEL = iamax_ppc440.S
@@ -52,8 +62,13 @@ ZNRM2KERNEL = znrm2_ppc440.S
5262

5363
SROTKERNEL = rot_ppc440.S
5464
DROTKERNEL = rot_ppc440.S
65+
ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
5566
CROTKERNEL = zrot_ppc440.S
5667
ZROTKERNEL = zrot_ppc440.S
68+
else
69+
CROTKERNEL = ../arm/zrot.c
70+
ZROTKERNEL = ../arm/zrot.c
71+
endif
5772

5873
SSCALKERNEL = scal_ppc440.S
5974
DSCALKERNEL = scal_ppc440.S
@@ -116,3 +131,15 @@ ZTRSMKERNEL_LN = ztrsm_kernel_ppc440_LN.S
116131
ZTRSMKERNEL_LT = ztrsm_kernel_ppc440_LT.S
117132
ZTRSMKERNEL_RN = ztrsm_kernel_ppc440_LT.S
118133
ZTRSMKERNEL_RT = ztrsm_kernel_ppc440_RT.S
134+
135+
ifeq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
136+
SGEMVNKERNEL = ../arm/gemv_n.c
137+
DGEMVNKERNEL = ../arm/gemv_n.c
138+
SGEMVTKERNEL = ../arm/gemv_t.c
139+
DGEMVTKERNEL = ../arm/gemv_t.c
140+
CGEMVNKERNEL = ../arm/zgemv_n.c
141+
ZGEMVNKERNEL = ../arm/zgemv_n.c
142+
CGEMVTKERNEL = ../arm/zgemv_t.c
143+
ZGEMVTKERNEL = ../arm/zgemv_t.c
144+
endif
145+

0 commit comments

Comments
 (0)