Skip to content

Commit f53aa9b

Browse files
Merge remote-tracking branch 'upstream/master' into QR_views
2 parents 48e465f + cd237a4 commit f53aa9b

File tree

6 files changed

+28
-47
lines changed

6 files changed

+28
-47
lines changed

.buildkite/pipeline.yml

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -58,10 +58,6 @@ steps:
5858
Pkg.develop(; path=pwd())
5959
Pkg.develop(; name="Metal")
6060
61-
println("+++ :julia: Building support library")
62-
include(joinpath(Pkg.devdir(), "Metal", "deps", "build_ci.jl"))
63-
Pkg.activate()
64-
6561
println("+++ :julia: Running tests")
6662
Pkg.test("Metal"; coverage=true)'
6763
agents:

Project.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
name = "GPUArrays"
22
uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
3-
version = "8.6.3"
3+
version = "8.6.6"
44

55
[deps]
66
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
@@ -16,6 +16,6 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
1616
[compat]
1717
Adapt = "2.0, 3.0"
1818
GPUArraysCore = "= 0.1.4"
19-
LLVM = "3.9, 4"
19+
LLVM = "3.9, 4, 5"
2020
Reexport = "1"
2121
julia = "1.6"

src/host/broadcast.jl

Lines changed: 8 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -51,53 +51,22 @@ end
5151
bc′ = Broadcast.preprocess(dest, bc)
5252

5353
# grid-stride kernel
54-
function broadcast_kernel(ctx, dest, ::Val{Is}, bc′, nelem) where Is
55-
j = 0
56-
while j < nelem
57-
j += 1
58-
59-
i = @linearidx(dest, j)
60-
61-
# cartesian indexing is slow, so avoid it if possible
62-
if isa(IndexStyle(dest), IndexCartesian) || isa(IndexStyle(bc′), IndexCartesian)
63-
# this performs an integer division, which is expensive. to make it possible
64-
# for the compiler to optimize it away, we put the iterator in the type
65-
# domain so that the indices are available at compile time. note that LLVM
66-
# only seems to replace pow2 divisions (with bitshifts), but other back-ends
67-
# may be smarter and replace arbitrary divisions by bit operations.
68-
#
69-
# also see maleadt/StaticCartesian.jl, which implements this in Julia,
70-
# but does not result in an additional speed-up on tested back-ends.
71-
#
72-
# in addition, we use @inbounds to avoid bounds checks, but we also need to
73-
# inform the compiler about the bounds that we are assuming. this is done
74-
using the assume intrinsic, and in case of Metal yields an 8x speed-up.
75-
assume(1 <= i <= length(Is))
76-
I = @inbounds Is[i]
77-
end
78-
79-
val = if isa(IndexStyle(bc′), IndexCartesian)
80-
@inbounds bc′[I]
81-
else
82-
@inbounds bc′[i]
83-
end
84-
85-
if isa(IndexStyle(dest), IndexCartesian)
86-
@inbounds dest[I] = val
87-
else
88-
@inbounds dest[i] = val
89-
end
54+
function broadcast_kernel(ctx, dest, bc′, nelem)
55+
i = 0
56+
while i < nelem
57+
i += 1
58+
I = @cartesianidx(dest, i)
59+
@inbounds dest[I] = bc′[I]
9060
end
9161
return
9262
end
9363
elements = length(dest)
9464
elements_per_thread = typemax(Int)
95-
Is = CartesianIndices(dest)
96-
heuristic = launch_heuristic(backend(dest), broadcast_kernel, dest, Val(Is), bc′, 1;
65+
heuristic = launch_heuristic(backend(dest), broadcast_kernel, dest, bc′, 1;
9766
elements, elements_per_thread)
9867
config = launch_configuration(backend(dest), heuristic;
9968
elements, elements_per_thread)
100-
gpu_call(broadcast_kernel, dest, Val(Is), bc′, config.elements_per_thread;
69+
gpu_call(broadcast_kernel, dest, bc′, config.elements_per_thread;
10170
threads=config.threads, blocks=config.blocks)
10271

10372
return dest

src/host/math.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
function Base.clamp!(A::AnyGPUArray, low, high)
44
gpu_call(A, low, high) do ctx, A, low, high
5-
I = @linearidx A
5+
I = @cartesianidx A
66
A[I] = clamp(A[I], low, high)
77
return
88
end

test/testsuite/math.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ end
1717
@testsuite "math/power" (AT, eltypes)->begin
1818
for ET in eltypes
1919
for p in 0:5
20-
compare(x->x^p, AT, rand(ET, 2,2))
20+
@test compare(x->x^p, AT, rand(ET, 2,2))
2121
end
2222
end
2323
end

test/testsuite/reductions.jl

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,22 @@
1515
end
1616
end
1717

18+
@testsuite "reductions/mapreducedim!_large" (AT, eltypes)->begin
19+
@testset "$ET" for ET in eltypes
20+
# Skip smaller floating types due to precision issues
21+
if ET in (Float16, ComplexF16)
22+
continue
23+
end
24+
25+
range = ET <: Real ? (ET(1):ET(10)) : ET
26+
# Reduce larger array sizes to test multiple-element reading in certain implementations
27+
for (sz,red) in [(1000000,)=>(1,), (5000,500)=>(1,1), (500,5000)=>(1,1),
28+
(500,5000)=>(500,1), (5000,500)=>(1,500)]
29+
@test compare((A,R)->Base.mapreducedim!(identity, +, R, A), AT, rand(range, sz), zeros(ET, red))
30+
end
31+
end
32+
end
33+
1834
@testsuite "reductions/reducedim!" (AT, eltypes)->begin
1935
@testset "$ET" for ET in eltypes
2036
range = ET <: Real ? (ET(1):ET(10)) : ET

0 commit comments

Comments
 (0)