@@ -355,51 +355,78 @@ upsample_trilinear(x; size, align_corners::Bool = true) = upsample_linear(x; si
function upsample_linear_kernel!(
    y::AbstractArray{T, N}, x::AbstractArray{T, N}; align_corners::Bool = true,
) where {T, N}
-    # ndrange = size(y)[1:N - 2]
-    ndrange = size(y)[N - 1:end]
+    backend = KernelAbstractions.get_backend(x)
+    ndrange = backend isa CPU ?
+        size(y)[N - 1:end] : # Parallelization along channel x batch.
+        size(y)[1:N - 2] # Parallelization along WHD.
    ratios = align_corners ?
        ntuple(i -> real(T)((size(x, i) - 1) / (size(y, i) - 1)), N - 2) :
        ntuple(i -> real(T)(size(x, i) / size(y, i)), N - 2)
-
-    backend = KernelAbstractions.get_backend(x)
-    _upsample_linear_kernel!(backend)(y, x, ratios..., Val(align_corners); ndrange)
+    _upsample_linear_kernel!(backend)(backend, y, x, ratios..., Val(align_corners); ndrange)
    return y
end

function ∇upsample_linear_kernel!(
    dx::AbstractArray{T, N}, Δ::AbstractArray{T, N}; align_corners::Bool = true,
) where {T, N}
-    ndrange = size(Δ)[1:N - 2]
+    backend = KernelAbstractions.get_backend(dx)
+    ndrange = backend isa CPU ?
+        size(Δ)[N - 1:end] : # Parallelization along channel x batch.
+        size(Δ)[1:N - 2] # Parallelization along WHD.
    ratios = align_corners ?
        ntuple(i -> real(T)((size(dx, i) - 1) / (size(Δ, i) - 1)), N - 2) :
        ntuple(i -> real(T)(size(dx, i) / size(Δ, i)), N - 2)
-
-    backend = KernelAbstractions.get_backend(dx)
-    _∇upsample_linear_kernel!(backend)(dx, Δ, ratios..., Val(align_corners); ndrange)
+    _∇upsample_linear_kernel!(backend)(backend, dx, Δ, ratios..., Val(align_corners); ndrange)
    return dx
end
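
# Usage sketch (illustrative only, not part of this change): linearly upsample a
# 1-D signal of width 8 with 2 channels and a single batch entry to width 16,
# writing into a preallocated output:
#
#     x = rand(Float32, 8, 2, 1)
#     y = similar(x, 16, 2, 1)
#     upsample_linear_kernel!(y, x; align_corners = true)
#
# The gradient wrapper is called the same way, e.g. ∇upsample_linear_kernel!(zero(x), Δ)
# with Δ of the output's size; dx must start zeroed because the kernels accumulate with +=.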

-# Linear.
+# Linear (CPU): parallelization along channel x batch dimensions.

-@kernel function _upsample_linear_kernel!(y::T, x::T, rwidth, align::Val{A}) where {
-    T <: AbstractArray{<: Any, 3}, A,
+@kernel function _upsample_linear_kernel!(::CPU, y::T, x::T, rwidth, align::Val{A}) where {
+    T <: AbstractArray{<:Any, 3}, A,
}
    @uniform in_width::UInt32, channels::UInt32, batch::UInt32 = size(x)
+    @uniform out_width::UInt32 = size(y, 1)
+    c::UInt32, n::UInt32 = @index(Global, NTuple)
+    @inbounds for i in UnitRange{UInt32}(1, out_width)
+        iw0, iw1, w0lambda, w1lambda = source_index_and_lambda(rwidth, i - 0x1, align, in_width)
+        y[i, c, n] = w0lambda * x[iw0, c, n] + w1lambda * x[iw1, c, n]
+    end
+end
+
+@kernel function _∇upsample_linear_kernel!(::CPU, dx::T1, Δ::T2, rwidth, align::Val{A}) where {
+    T1 <: AbstractArray{<:Any, 3}, T2 <: AbstractArray{<:Any, 3}, A,
+}
+    @uniform in_width::UInt32, channels::UInt32, batch::UInt32 = size(Δ)
+    @uniform out_width::UInt32 = size(dx, 1)
+    c::UInt32, n::UInt32 = @index(Global, NTuple)
+    @inbounds for i in UnitRange{UInt32}(1, in_width)
+        ow0, ow1, w0lambda, w1lambda = source_index_and_lambda(rwidth, i - 0x1, align, out_width)
+        val = Δ[i, c, n]
+        @atomic dx[ow0, c, n] += w0lambda * val
+        @atomic dx[ow1, c, n] += w1lambda * val
+    end
+end
+
+# Linear (GPU): parallelization along width dimension.
+# TODO replace AbstractArray -> AbstractGPUArray once device arrays subtype it.

+@kernel function _upsample_linear_kernel!(::B, y::T, x::T, rwidth, align::Val{A}) where {
+    B <: GPU, T <: AbstractArray{<:Any, 3}, A,
+}
+    @uniform in_width::UInt32, channels::UInt32, batch::UInt32 = size(x)
    i::UInt32 = @index(Global)
-    iw0, iw1, w0lambda, w1lambda = source_index_and_lambda( rwidth, i - 0x1, align, in_width)
+    iw0, iw1, w0lambda, w1lambda = source_index_and_lambda(rwidth, i - 0x1, align, in_width)
    @inbounds for n in 1:batch, c in 1:channels
        y[i, c, n] = w0lambda * x[iw0, c, n] + w1lambda * x[iw1, c, n]
    end
end

-@kernel function _∇upsample_linear_kernel!(dx::T1, Δ::T2, rwidth, align::Val{A}) where {
-    T1 <: AbstractArray{<: Any, 3},
-    T2 <: AbstractArray{<: Any, 3}, A,
+@kernel function _∇upsample_linear_kernel!(::B, dx::T, Δ::T, rwidth, align::Val{A}) where {
+    B <: GPU, T <: AbstractArray{<:Any, 3}, A,
}
    @uniform in_width::UInt32, channels::UInt32, batch::UInt32 = size(Δ)
    @uniform out_width::UInt32 = size(dx, 1)
-
    i::UInt32 = @index(Global)
    ow0, ow1, w0lambda, w1lambda = source_index_and_lambda(rwidth, i - 0x1, align, out_width)
    @inbounds for n in 1:batch, c in 1:channels
@@ -409,16 +436,14 @@ end
    end
end

-# Bilinear.
+# Bilinear (CPU): parallelization along channel x batch dimensions.

-@kernel function _upsample_linear_kernel!(y::T, x::T, rwidth, rheight, align::Val{A}) where {
-    T <: AbstractArray{<: Any, 4}, A,
+@kernel function _upsample_linear_kernel!(::CPU, y::T, x::T, rwidth, rheight, align::Val{A}) where {
+    T <: AbstractArray{<:Any, 4}, A,
}
    @uniform in_width::UInt32, in_height::UInt32, channels::UInt32, batch::UInt32 = size(x)
    @uniform out_width::UInt32, out_height::UInt32 = size(y)[1:2]
-
    c::UInt32, n::UInt32 = @index(Global, NTuple)
-
    for j in UnitRange{UInt32}(1, out_height)
        ih0, ih1, h0lambda, h1lambda = source_index_and_lambda(rheight, j - 0x1, align, in_height)
        for i in UnitRange{UInt32}(1, out_width)
@@ -428,48 +453,51 @@ end
                h1lambda * (w0lambda * x[iw0, ih1, c, n] + w1lambda * x[iw1, ih1, c, n])
        end
    end
-
-    # i::UInt32, j::UInt32 = @index(Global, NTuple)
-
-    # iw0, iw1, w0lambda, w1lambda = source_index_and_lambda(rwidth, i - 0x1, align, in_width)
-    # ih0, ih1, h0lambda, h1lambda = source_index_and_lambda(rheight, j - 0x1, align, in_height)
-
-    # @inbounds for n in 1:batch, c in 1:channels
-    #     y[i, j, c, n] =
-    #         h0lambda * (w0lambda * x[iw0, ih0, c, n] + w1lambda * x[iw1, ih0, c, n]) +
-    #         h1lambda * (w0lambda * x[iw0, ih1, c, n] + w1lambda * x[iw1, ih1, c, n])
-    # end
end

-# @kernel function _upsample_linear_kernel!(y::T, x::T, rwidth, rheight, align::Val{A}) where {
-#     T <: AbstractArray{<: Any, 4}, A,
-# }
-#     @uniform in_width::UInt32, in_height::UInt32, channels::UInt32, batch::UInt32 = size(x)
-
-#     i::UInt32, j::UInt32 = @index(Global, NTuple)
+@kernel function _∇upsample_linear_kernel!(::CPU, dx::T1, Δ::T2, rwidth, rheight, align::Val{A}) where {
+    T1 <: AbstractArray{<:Any, 4}, T2 <: AbstractArray{<:Any, 4}, A,
+}
+    @uniform in_width::UInt32, in_height::UInt32, channels::UInt32, batch::UInt32 = size(Δ)
+    @uniform out_width::UInt32, out_height::UInt32 = size(dx)[1:2]
+    c::UInt32, n::UInt32 = @index(Global, NTuple)
+    for j in UnitRange{UInt32}(1, in_height)
+        oh0, oh1, h0lambda, h1lambda = source_index_and_lambda(rheight, j - 0x1, align, out_height)
+        for i in UnitRange{UInt32}(1, in_width)
+            ow0, ow1, w0lambda, w1lambda = source_index_and_lambda(rwidth, i - 0x1, align, out_width)
+            val = Δ[i, j, c, n]
+            @atomic dx[ow0, oh0, c, n] += w0lambda * h0lambda * val
+            @atomic dx[ow1, oh0, c, n] += w1lambda * h0lambda * val
+            @atomic dx[ow0, oh1, c, n] += w0lambda * h1lambda * val
+            @atomic dx[ow1, oh1, c, n] += w1lambda * h1lambda * val
+        end
+    end
+end

-#     iw0, iw1, w0lambda, w1lambda = source_index_and_lambda(rwidth, i - 0x1, align, in_width)
-#     ih0, ih1, h0lambda, h1lambda = source_index_and_lambda(rheight, j - 0x1, align, in_height)
+# Bilinear (GPU): parallelization along width, height dimensions.

-#     @inbounds for n in 1:batch, c in 1:channels
-#         y[i, j, c, n] =
-#             h0lambda * (w0lambda * x[iw0, ih0, c, n] + w1lambda * x[iw1, ih0, c, n]) +
-#             h1lambda * (w0lambda * x[iw0, ih1, c, n] + w1lambda * x[iw1, ih1, c, n])
-#         end
-# end
+@kernel function _upsample_linear_kernel!(::B, y::T, x::T, rwidth, rheight, align::Val{A}) where {
+    B <: GPU, T <: AbstractArray{<:Any, 4}, A,
+}
+    @uniform in_width::UInt32, in_height::UInt32, channels::UInt32, batch::UInt32 = size(x)
+    i::UInt32, j::UInt32 = @index(Global, NTuple)
+    iw0, iw1, w0lambda, w1lambda = source_index_and_lambda(rwidth, i - 0x1, align, in_width)
+    ih0, ih1, h0lambda, h1lambda = source_index_and_lambda(rheight, j - 0x1, align, in_height)
+    @inbounds for n in 1:batch, c in 1:channels
+        y[i, j, c, n] =
+            h0lambda * (w0lambda * x[iw0, ih0, c, n] + w1lambda * x[iw1, ih0, c, n]) +
+            h1lambda * (w0lambda * x[iw0, ih1, c, n] + w1lambda * x[iw1, ih1, c, n])
+    end
+end

-@kernel function _∇upsample_linear_kernel!(dx::T1, Δ::T2, rwidth, rheight, align::Val{A}) where {
-    T1 <: AbstractArray{<: Any, 4},
-    T2 <: AbstractArray{<: Any, 4}, A,
+@kernel function _∇upsample_linear_kernel!(::B, dx::T, Δ::T, rwidth, rheight, align::Val{A}) where {
+    B <: GPU, T <: AbstractArray{<:Any, 4}, A,
}
    @uniform in_width::UInt32, in_height::UInt32, channels::UInt32, batch::UInt32 = size(Δ)
    @uniform out_width::UInt32, out_height::UInt32 = size(dx)[1:2]
-
    i::UInt32, j::UInt32 = @index(Global, NTuple)
-
    ow0, ow1, w0lambda, w1lambda = source_index_and_lambda(rwidth, i - 0x1, align, out_width)
    oh0, oh1, h0lambda, h1lambda = source_index_and_lambda(rheight, j - 0x1, align, out_height)
-
    @inbounds for n in 1:batch, c in 1:channels
        val = Δ[i, j, c, n]
        @atomic dx[ow0, oh0, c, n] += w0lambda * h0lambda * val
@@ -479,20 +507,72 @@ end
    end
end

-# Trilinear.
+# Trilinear (CPU): parallelization along channel x batch dimensions.

-@kernel function _upsample_linear_kernel!(y::T, x::T, rwidth, rheight, rdepth, align::Val{A}) where {
-    T <: AbstractArray{<: Any, 5}, A,
+@kernel function _upsample_linear_kernel!(::CPU, y::T, x::T, rwidth, rheight, rdepth, align::Val{A}) where {
+    T <: AbstractArray{<:Any, 5}, A,
}
    @uniform in_width::UInt32, in_height::UInt32, in_depth::UInt32 = size(x)[1:3]
    @uniform channels::UInt32, batch::UInt32 = size(x, 4), size(x, 5)
+    @uniform out_width::UInt32, out_height::UInt32, out_depth::UInt32 = size(y)[1:3]
+    c::UInt32, n::UInt32 = @index(Global, NTuple)
+    for k in UnitRange{UInt32}(1, out_depth)
+        id0, id1, d0lambda, d1lambda = source_index_and_lambda(rdepth, k - 0x1, align, in_depth)
+        for j in UnitRange{UInt32}(1, out_height)
+            ih0, ih1, h0lambda, h1lambda = source_index_and_lambda(rheight, j - 0x1, align, in_height)
+            for i in UnitRange{UInt32}(1, out_width)
+                iw0, iw1, w0lambda, w1lambda = source_index_and_lambda(rwidth, i - 0x1, align, in_width)
+                @inbounds y[i, j, k, c, n] =
+                    d0lambda * (
+                        h0lambda * (w0lambda * x[iw0, ih0, id0, c, n] + w1lambda * x[iw1, ih0, id0, c, n]) +
+                        h1lambda * (w0lambda * x[iw0, ih1, id0, c, n] + w1lambda * x[iw1, ih1, id0, c, n])) +
+                    d1lambda * (
+                        h0lambda * (w0lambda * x[iw0, ih0, id1, c, n] + w1lambda * x[iw1, ih0, id1, c, n]) +
+                        h1lambda * (w0lambda * x[iw0, ih1, id1, c, n] + w1lambda * x[iw1, ih1, id1, c, n]))
+            end
+        end
+    end
+end

-    i::UInt32, j::UInt32, k::UInt32 = @index(Global, NTuple)
+@kernel function _∇upsample_linear_kernel!(::CPU, dx::T1, Δ::T2, rwidth, rheight, rdepth, align::Val{A}) where {
+    T1 <: AbstractArray{<:Any, 5}, T2 <: AbstractArray{<:Any, 5}, A,
+}
+    @uniform in_width::UInt32, in_height::UInt32, in_depth::UInt32 = size(Δ)[1:3]
+    @uniform channels::UInt32, batch::UInt32 = size(Δ, 4), size(Δ, 5)
+    @uniform out_width::UInt32, out_height::UInt32, out_depth::UInt32 = size(dx)[1:3]
+    c::UInt32, n::UInt32 = @index(Global, NTuple)
+    for k in UnitRange{UInt32}(1, in_depth)
+        od0, od1, d0lambda, d1lambda = source_index_and_lambda(rdepth, k - 0x1, align, out_depth)
+        for j in UnitRange{UInt32}(1, in_height)
+            oh0, oh1, h0lambda, h1lambda = source_index_and_lambda(rheight, j - 0x1, align, out_height)
+            @inbounds for i in UnitRange{UInt32}(1, in_width)
+                ow0, ow1, w0lambda, w1lambda = source_index_and_lambda(rwidth, i - 0x1, align, out_width)
+                val = Δ[i, j, k, c, n]
+                @atomic dx[ow0, oh0, od0, c, n] += w0lambda * h0lambda * d0lambda * val
+                @atomic dx[ow1, oh0, od0, c, n] += w1lambda * h0lambda * d0lambda * val
+                @atomic dx[ow0, oh1, od0, c, n] += w0lambda * h1lambda * d0lambda * val
+                @atomic dx[ow1, oh1, od0, c, n] += w1lambda * h1lambda * d0lambda * val
+
+                @atomic dx[ow0, oh0, od1, c, n] += w0lambda * h0lambda * d1lambda * val
+                @atomic dx[ow1, oh0, od1, c, n] += w1lambda * h0lambda * d1lambda * val
+                @atomic dx[ow0, oh1, od1, c, n] += w0lambda * h1lambda * d1lambda * val
+                @atomic dx[ow1, oh1, od1, c, n] += w1lambda * h1lambda * d1lambda * val
+            end
+        end
+    end
+end
+
+# Trilinear (GPU): parallelization along width x height x depth dimensions.

+@kernel function _upsample_linear_kernel!(::B, y::T, x::T, rwidth, rheight, rdepth, align::Val{A}) where {
+    B <: GPU, T <: AbstractArray{<:Any, 5}, A,
+}
+    @uniform in_width::UInt32, in_height::UInt32, in_depth::UInt32 = size(x)[1:3]
+    @uniform channels::UInt32, batch::UInt32 = size(x, 4), size(x, 5)
+    i::UInt32, j::UInt32, k::UInt32 = @index(Global, NTuple)
    iw0, iw1, w0lambda, w1lambda = source_index_and_lambda(rwidth, i - 0x1, align, in_width)
    ih0, ih1, h0lambda, h1lambda = source_index_and_lambda(rheight, j - 0x1, align, in_height)
    id0, id1, d0lambda, d1lambda = source_index_and_lambda(rdepth, k - 0x1, align, in_depth)
-
    @inbounds for n in 1:batch, c in 1:channels
        y[i, j, k, c, n] =
            d0lambda * (
@@ -504,20 +584,16 @@ end
    end
end

-@kernel function _∇upsample_linear_kernel!(dx::T1, Δ::T2, rwidth, rheight, rdepth, align::Val{A}) where {
-    T1 <: AbstractArray{<: Any, 5},
-    T2 <: AbstractArray{<: Any, 5}, A,
+@kernel function _∇upsample_linear_kernel!(::B, dx::T, Δ::T, rwidth, rheight, rdepth, align::Val{A}) where {
+    B <: GPU, T <: AbstractArray{<:Any, 5}, A,
}
    @uniform in_width::UInt32, in_height::UInt32, in_depth::UInt32 = size(Δ)[1:3]
    @uniform channels::UInt32, batch::UInt32 = size(Δ, 4), size(Δ, 5)
    @uniform out_width::UInt32, out_height::UInt32, out_depth::UInt32 = size(dx)[1:3]
-
    i::UInt32, j::UInt32, k::UInt32 = @index(Global, NTuple)
-
    ow0, ow1, w0lambda, w1lambda = source_index_and_lambda(rwidth, i - 0x1, align, out_width)
    oh0, oh1, h0lambda, h1lambda = source_index_and_lambda(rheight, j - 0x1, align, out_height)
    od0, od1, d0lambda, d1lambda = source_index_and_lambda(rdepth, k - 0x1, align, out_depth)
-
    @inbounds for n in 1:batch, c in 1:channels
        val = Δ[i, j, k, c, n]
        @atomic dx[ow0, oh0, od0, c, n] += w0lambda * h0lambda * d0lambda * val

@@ … @@

    w1lambda = real_index - iw0
    w0lambda = T(1) - w1lambda
-
    return iw0 + 0x1, iw1, w0lambda, w1lambda
end
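
# Worked example (illustrative; assumes iw0 = floor(real_index), computed earlier
# in source_index_and_lambda): for real_index = 2.25 this gives iw0 = 2,
# w1lambda = 0.25 and w0lambda = 0.75, and the returned iw0 + 0x1 shifts the
# 0-based source index to Julia's 1-based indexing; the two weights always sum to one.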