50
50
51
51
/* This is a lazy thread implementation for Win32. */
52
52
53
+ #if defined (__GNUC__ ) && (__GNUC__ < 6 )
54
+ #define WIN_CAS (dest , exch , comp ) __sync_val_compare_and_swap(dest, comp, exch)
55
+ #else
56
+ #if defined(_WIN64 )
57
+ #define WIN_CAS (dest , exch , comp ) InterlockedCompareExchange64(dest, exch, comp)
58
+ #else
59
+ #define WIN_CAS (dest , exch , comp ) InterlockedCompareExchange(dest, exch, comp)
60
+ #endif
61
+ #endif
62
+
53
63
/* Thread server common information */
54
64
typedef struct {
55
- CRITICAL_SECTION lock ;
56
- HANDLE filled ;
57
- HANDLE killed ;
65
+ HANDLE taskSemaphore ;
58
66
59
67
blas_queue_t * queue ; /* Parameter Pointer */
60
68
int shutdown ; /* server shutdown flag */
@@ -71,8 +79,6 @@ static blas_pool_t pool;
71
79
static HANDLE blas_threads [MAX_CPU_NUMBER ];
72
80
static DWORD blas_threads_id [MAX_CPU_NUMBER ];
73
81
74
-
75
-
76
82
static void legacy_exec (void * func , int mode , blas_arg_t * args , void * sb ){
77
83
78
84
if (!(mode & BLAS_COMPLEX )){
@@ -198,7 +204,6 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
198
204
199
205
/* This is the main routine of the threads. Each thread waits until a job is */
200
206
/* queued. */
201
-
202
207
static DWORD WINAPI blas_thread_server (void * arg ){
203
208
204
209
/* Thread identifier */
@@ -207,9 +212,7 @@ static DWORD WINAPI blas_thread_server(void *arg){
207
212
#endif
208
213
209
214
void * buffer , * sa , * sb ;
210
- blas_queue_t * queue ;
211
- DWORD action ;
212
- HANDLE handles [] = {pool .filled , pool .killed };
215
+ volatile blas_queue_t * queue ;
213
216
214
217
/* Each server needs each buffer */
215
218
buffer = blas_memory_alloc (2 );
@@ -226,28 +229,32 @@ static DWORD WINAPI blas_thread_server(void *arg){
226
229
fprintf (STDERR , "Server[%2ld] Waiting for Queue.\n" , cpu );
227
230
#endif
228
231
229
- do {
230
- action = WaitForMultipleObjects (2 , handles , FALSE, INFINITE );
231
- } while ((action != WAIT_OBJECT_0 ) && (action != WAIT_OBJECT_0 + 1 ));
232
-
233
- if (action == WAIT_OBJECT_0 + 1 ) break ;
232
+ // all worker threads wait on the semaphore
233
+ WaitForSingleObject (pool .taskSemaphore , INFINITE );
234
234
235
+ // kill the thread if we are shutting down the server
236
+ if (pool .shutdown )
237
+ break ;
238
+
235
239
#ifdef SMP_DEBUG
236
240
fprintf (STDERR , "Server[%2ld] Got it.\n" , cpu );
237
241
#endif
238
242
239
- EnterCriticalSection (& pool .lock );
240
-
241
- queue = pool .queue ;
242
- if (queue ) pool .queue = queue -> next ;
243
+ // grab a queued task and update the list
244
+ volatile blas_queue_t * queue_next ;
245
+ INT_PTR prev_value ;
246
+ do {
247
+ queue = (volatile blas_queue_t * )pool .queue ;
248
+ if (!queue )
249
+ break ;
243
250
244
- LeaveCriticalSection (& pool .lock );
251
+ queue_next = (volatile blas_queue_t * )queue -> next ;
252
+ prev_value = WIN_CAS ((INT_PTR * )& pool .queue , (INT_PTR )queue_next , (INT_PTR )queue );
253
+ } while (prev_value != queue );
245
254
246
255
if (queue ) {
247
256
int (* routine )(blas_arg_t * , void * , void * , void * , void * , BLASLONG ) = queue -> routine ;
248
257
249
- if (pool .queue ) SetEvent (pool .filled );
250
-
251
258
sa = queue -> sa ;
252
259
sb = queue -> sb ;
253
260
@@ -332,13 +339,8 @@ static DWORD WINAPI blas_thread_server(void *arg){
332
339
fprintf (STDERR , "Server[%2ld] Finished!\n" , cpu );
333
340
#endif
334
341
335
- EnterCriticalSection (& queue -> lock );
336
-
337
- queue -> status = BLAS_STATUS_FINISHED ;
338
-
339
- LeaveCriticalSection (& queue -> lock );
340
-
341
- SetEvent (queue -> finish );
342
+ // mark our sub-task as complete
343
+ InterlockedDecrement (& queue -> status );
342
344
}
343
345
344
346
/* Shutdown procedure */
@@ -353,7 +355,7 @@ static DWORD WINAPI blas_thread_server(void *arg){
353
355
}
354
356
355
357
/* Initializing routine */
356
- int blas_thread_init (void ){
358
+ int blas_thread_init (void ){
357
359
BLASLONG i ;
358
360
359
361
if (blas_server_avail || (blas_cpu_number <= 1 )) return 0 ;
@@ -367,9 +369,7 @@ int blas_thread_init(void){
367
369
368
370
if (!blas_server_avail ){
369
371
370
- InitializeCriticalSection (& pool .lock );
371
- pool .filled = CreateEvent (NULL , FALSE, FALSE, NULL );
372
- pool .killed = CreateEvent (NULL , TRUE, FALSE, NULL );
372
+ pool .taskSemaphore = CreateSemaphore (NULL , 0 , blas_cpu_number - 1 , NULL );
373
373
374
374
pool .shutdown = 0 ;
375
375
pool .queue = NULL ;
@@ -391,11 +391,10 @@ int blas_thread_init(void){
391
391
/*
392
392
The user can call one of two routines.
393
393
394
- exec_blas_async ... immediately returns after jobs are queued.
394
+ exec_blas_async ... immediately returns after jobs are queued.
395
395
396
- exec_blas ... returns after jobs are finished.
396
+ exec_blas ... returns after jobs are finished.
397
397
*/
398
-
399
398
int exec_blas_async (BLASLONG pos , blas_queue_t * queue ){
400
399
401
400
#if defined(SMP_SERVER )
@@ -409,8 +408,7 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
409
408
current = queue ;
410
409
411
410
while (current ) {
412
- InitializeCriticalSection (& current -> lock );
413
- current -> finish = CreateEvent (NULL , FALSE, FALSE, NULL );
411
+ current -> status = 1 ;
414
412
current -> position = pos ;
415
413
416
414
#ifdef CONSISTENT_FPCSR
@@ -422,19 +420,10 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
422
420
pos ++ ;
423
421
}
424
422
425
- EnterCriticalSection (& pool .lock );
426
-
427
- if (pool .queue ) {
428
- current = pool .queue ;
429
- while (current -> next ) current = current -> next ;
430
- current -> next = queue ;
431
- } else {
432
- pool .queue = queue ;
433
- }
434
-
435
- LeaveCriticalSection (& pool .lock );
423
+ pool .queue = queue ;
436
424
437
- SetEvent (pool .filled );
425
+ // start up worker threads
426
+ ReleaseSemaphore (pool .taskSemaphore , pos - 1 , NULL );
438
427
439
428
return 0 ;
440
429
}
@@ -450,10 +439,9 @@ int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){
450
439
fprintf (STDERR , "Waiting Queue ..\n" );
451
440
#endif
452
441
453
- WaitForSingleObject (queue -> finish , INFINITE );
454
-
455
- CloseHandle (queue -> finish );
456
- DeleteCriticalSection (& queue -> lock );
442
+ // spin-wait on each sub-task to finish
443
+ while (* ((volatile int * )& queue -> status ))
444
+ YIELDING ;
457
445
458
446
queue = queue -> next ;
459
447
num -- ;
@@ -501,18 +489,21 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
501
489
502
490
/* Shutdown procedure; the user does not have to call this routine. The */
503
491
/* kernel automatically kills the threads. */
504
-
505
492
int BLASFUNC (blas_thread_shutdown )(void ){
506
493
507
494
int i ;
508
495
496
+ #ifdef SMP_DEBUG
497
+ fprintf (STDERR , "blas_thread_shutdown..\n" );
498
+ #endif
499
+
509
500
if (!blas_server_avail ) return 0 ;
510
501
511
502
LOCK_COMMAND (& server_lock );
512
503
513
504
if (blas_server_avail ){
514
505
515
- SetEvent ( pool .killed ) ;
506
+ pool .shutdown = 1 ;
516
507
517
508
for (i = 0 ; i < blas_num_threads - 1 ; i ++ ){
518
509
// Could also just use WaitForMultipleObjects
@@ -528,8 +519,7 @@ int BLASFUNC(blas_thread_shutdown)(void){
528
519
CloseHandle (blas_threads [i ]);
529
520
}
530
521
531
- CloseHandle (pool .filled );
532
- CloseHandle (pool .killed );
522
+ CloseHandle (pool .taskSemaphore );
533
523
534
524
blas_server_avail = 0 ;
535
525
}
@@ -559,16 +549,14 @@ void goto_set_num_threads(int num_threads)
559
549
//increased_threads = 1;
560
550
if (!blas_server_avail ){
561
551
562
- InitializeCriticalSection (& pool .lock );
563
- pool .filled = CreateEvent (NULL , FALSE, FALSE, NULL );
564
- pool .killed = CreateEvent (NULL , TRUE, FALSE, NULL );
552
+ pool .taskSemaphore = CreateSemaphore (NULL , 0 , blas_cpu_number - 1 , NULL );
565
553
566
554
pool .shutdown = 0 ;
567
555
pool .queue = NULL ;
568
556
blas_server_avail = 1 ;
569
557
}
570
558
571
- for (i = blas_num_threads - 1 ; i < num_threads - 1 ; i ++ ){
559
+ for (i = blas_num_threads ; i < num_threads - 1 ; i ++ ){
572
560
573
561
blas_threads [i ] = CreateThread (NULL , 0 ,
574
562
blas_thread_server , (void * )i ,
0 commit comments