Allow regular convolution for AMDGPU (#473)

pxl-th · web-flow · commit c3cc520520a0 · 2023-02-18T09:58:32.000+01:00
* Do not error on regular convolutions

* Add regular convolution test

* Update docs

* Flip kernel automatically
diff --git a/Project.toml b/Project.toml
@@ -18,7 +18,7 @@ AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 AMDGPUExt = "AMDGPU"
 
 [compat]
-AMDGPU = "0.4.7"
+AMDGPU = "0.4.8"
 Adapt = "2, 3.2"
 ChainRulesCore = "1.13"
 Requires = "0.5, 1.0"
diff --git a/docs/src/reference.md b/docs/src/reference.md
@@ -76,6 +76,13 @@ pad_zeros
 
 `Flux`'s `Conv` and `CrossCor` layers use `NNlib.DenseConvDims` and `NNlib.conv` internally. 
 
+!!! warning "AMDGPU"
+    MIOpen supports only cross-correlation (`flipkernel=true`), so for every
+    regular convolution (`flipkernel=false`) the kernel is flipped before the calculation.
+    For better performance, use cross-correlation (`flipkernel=true`)
+    and flip the kernel manually before calling `NNlib.conv`.
+    `Flux` handles this automatically; this is only required for direct calls.
+
 ```@docs
 conv
 ConvDims
diff --git a/ext/AMDGPUExt/conv.jl b/ext/AMDGPUExt/conv.jl
@@ -1,8 +1,19 @@
 function NNlib.conv!(
     y::ROCArray{T, N}, x::ROCArray{T, N}, w::ROCArray{T, N}, cdims::DenseConvDims,
 ) where {T <: MIOPENFloat, N}
-    NNlib.flipkernel(cdims) || throw(ArgumentError(
-        "MIOpen supports only cross-correlation as its convolution implementation."))
+    if !NNlib.flipkernel(cdims)
+        @warn """
+        MIOpen supports only cross-correlation (flipkernel=true).
+        Therefore for every regular convolution (flipkernel=false)
+        kernel is flipped before calculation.
+        For better performance, use cross-correlation (flipkernel=true)
+        and manually flip the kernel before `NNlib.conv` call.
+        """ maxlog=1
+        flip_dims = ntuple(
+            i -> (i ≤ ndims(w) - 2) ? (size(w, i):-1:1) : Colon(),
+            ndims(w))
+        w = w[flip_dims...]
+    end
 
     nd = max(0, 4 - N)
     ncdims = NNlib.insert_singleton_spatial_dimension(cdims, nd)
@@ -18,8 +29,19 @@ end
 function NNlib.∇conv_data!(
     dx::ROCArray{T, N}, dy::ROCArray{T, N}, w::ROCArray{T, N}, cdims::DenseConvDims,
 ) where {T <: MIOPENFloat, N}
-    NNlib.flipkernel(cdims) || throw(ArgumentError(
-        "MIOpen supports only cross-correlation as its convolution implementation."))
+    if !NNlib.flipkernel(cdims)
+        @warn """
+        MIOpen supports only cross-correlation (flipkernel=true).
+        Therefore for every regular convolution (flipkernel=false)
+        kernel is flipped before calculation.
+        For better performance, use cross-correlation (flipkernel=true)
+        and manually flip the kernel before `NNlib.conv` call.
+        """ maxlog=1
+        flip_dims = ntuple(
+            i -> (i ≤ ndims(w) - 2) ? (size(w, i):-1:1) : Colon(),
+            ndims(w))
+        w = w[flip_dims...]
+    end
 
     nd = max(0, 4 - N)
     ncdims = NNlib.insert_singleton_spatial_dimension(cdims, nd)
@@ -35,9 +57,6 @@ end
 function NNlib.∇conv_filter!(
     dw::ROCArray{T, N}, x::ROCArray{T, N}, dy::ROCArray{T, N}, cdims::DenseConvDims,
 ) where {T <: MIOPENFloat, N}
-    NNlib.flipkernel(cdims) || throw(ArgumentError(
-        "MIOpen supports only cross-correlation as its convolution implementation."))
-
     nd = max(0, 4 - N)
     ncdims = NNlib.insert_singleton_spatial_dimension(cdims, nd)
     MIOpen.∇convolution_weight!(
@@ -46,5 +65,19 @@ function NNlib.∇conv_filter!(
         NNlib.insert_singleton_spatial_dimension(x, nd);
         padding=nnlib_padding(ncdims), stride=NNlib.stride(ncdims),
         dilation=NNlib.dilation(ncdims), groups=NNlib.groupcount(ncdims))
+
+    if !NNlib.flipkernel(cdims)
+        @warn """
+        MIOpen supports only cross-correlation (flipkernel=true).
+        Therefore for every regular convolution (flipkernel=false)
+        kernel is flipped before calculation.
+        For better performance, use cross-correlation (flipkernel=true)
+        and manually flip the kernel before `NNlib.conv` call.
+        """ maxlog=1
+        flip_dims = ntuple(
+            i -> (i ≤ ndims(dw) - 2) ? (size(dw, i):-1:1) : Colon(),
+            ndims(dw))
+        dw = dw[flip_dims...]
+    end
     return dw
 end
diff --git a/test/amd/conv.jl b/test/amd/conv.jl
@@ -3,7 +3,12 @@
     for T in (Float16, Float32), nd in (1, 2, 3)
         x = rand(Float32, fill(4, nd)..., 3, 1)
         w = rand(Float32, fill(2, nd)..., channels, 4)
+
         cdims = DenseConvDims(x, w, flipkernel=true)
         gputest((x, w) -> NNlib.conv(x, w, cdims), x, w; atol=1e-4)
+
+        # Regular convolution: the AMDGPU backend flips the kernel internally.
+        cdims = DenseConvDims(x, w)
+        gputest((x, w) -> NNlib.conv(x, w, cdims), x, w; atol=1e-4)
     end
 end