diff --git a/Project.toml b/Project.toml
index 42f7bdc3..fabcb63b 100644
--- a/Project.toml
+++ b/Project.toml
@@ -10,7 +10,6 @@ CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 CpuId = "adafc99b-e345-5852-983c-f28acb93d879"
 FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
 GeoStatsBase = "323cb8eb-fbf6-51c0-afd0-f8fba70507b2"
-GeoStatsImages = "7cd16168-b42c-5e7d-a585-4f59d326662d"
 Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6"
 GraphsFlows = "06909019-6f44-4949-96fc-b9d9aaa02889"
 ImageFiltering = "6a3955dd-da59-5b1f-98d4-e7296123deb5"
@@ -33,12 +32,10 @@ Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
 CUDA = "3.11"
 CpuId = "0.2, 0.3"
 FFTW = "1.0"
-GeoStatsBase = "0.26"
 Graphs = "1.4"
 GraphsFlows = "0.1"
 ImageFiltering = "0.6, 0.7"
 ImageMorphology = "0.2, 0.3, 0.4"
-Meshes = "0.22"
 ProgressMeter = "1.1"
 RecipesBase = "1.0"
 StatsBase = "0.32, 0.33"
diff --git a/run_experiment_app.sh b/run_experiment_app.sh
new file mode 100755
index 00000000..65544e55
--- /dev/null
+++ b/run_experiment_app.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+for turn in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
+do
+   echo $turn.adhoc
+   git switch iq-sim-refact-adhoc
+   PATH_IQ="." $JULIA_PATH/julia ./run_sample_app.jl $turn >> output.iq.app.adhoc
+   echo $turn.structured
+   git switch iq-sim-refact
+   PATH_IQ="." $JULIA_PATH/julia ./run_sample_app.jl $turn >> output.iq.app.structured
+done
diff --git a/run_experiment_kernel.sh b/run_experiment_kernel.sh
new file mode 100755
index 00000000..2733d0a5
--- /dev/null
+++ b/run_experiment_kernel.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+
+
+for size in 64 128 256
+do
+   for version in 2
+   do
+      for turn in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
+      do
+         echo $size.$version.$turn.adhoc        
+         $JULIA_PATH/julia ./run_sample.jl $version $size $turn >> output.iq.adhoc.$version.$size
+         echo $size.$version.$turn.structured
+         PLATFORM_DESCRIPTION=Platform.$version.toml $JULIA_PATH/julia ./run_sample.jl -$version $size $turn >> output.iq.structured.$version.$size     
+      done
+   done
+done
diff --git a/run_sample_app.jl b/run_sample_app.jl
new file mode 100644
index 00000000..52ba33a4
--- /dev/null
+++ b/run_sample_app.jl
@@ -0,0 +1,37 @@
+path_iq = get(ENV,"PATH_IQ",".")
+import Pkg; Pkg.activate(path_iq)
+using GeoStats
+using GeoStatsImages
+using ImageQuilting
+
+function main(args)
+
+      @info args
+
+      i = parse(Int64,args[1])
+
+      @info "small $i"
+
+      # small
+      TIₛ = geostatsimage("WalkerLake")
+      iqsim(asarray(TIₛ, :Z), (30, 30))
+      @time iqsim(asarray(TIₛ, :Z), (30, 30))
+
+      @info "medium $i"
+
+      # medium
+      TIₘ = geostatsimage("StanfordV")
+      iqsim(asarray(TIₘ, :K), (30, 30, 30))
+      @time iqsim(asarray(TIₘ, :K), (30, 30, 30))
+
+      @info "large $i"
+
+      # large
+      TIₗ = geostatsimage("Fluvsim")
+      iqsim(asarray(TIₗ, :facies), (30, 30, 30))
+      @time iqsim(asarray(TIₗ, :facies), (30, 30, 30))
+
+end
+
+main(ARGS)
+
diff --git a/run_sample_kernel.jl b/run_sample_kernel.jl
new file mode 100644
index 00000000..6dd95b18
--- /dev/null
+++ b/run_sample_kernel.jl
@@ -0,0 +1,48 @@
+import Pkg; Pkg.activate(".")
+using ImageQuilting
+using CUDA
+
+function main(args)
+
+        @info args
+
+        v = parse(Int64,args[1])
+        size = parse(Int64,args[2])
+        i = parse(Int64,args[3])
+
+        img = rand(Float32,(size,size,size))
+        krn = rand(Float32,(10,10,10))
+
+        if (v < 0)
+           @info "$i: structured / $v"
+           img = ImageQuilting.array_kernel(img)
+           ImageQuilting.init_imfilter_kernel()
+           @time ImageQuilting.imfilter_kernel(img,krn)
+           ImageQuilting.imfilter_kernel(img,krn)
+           ImageQuilting.imfilter_kernel(img,krn)
+           @time ImageQuilting.imfilter_kernel(img,krn)
+        elseif (v == 1)
+           @info "$i: ad-hoc / default"
+           @time ImageQuilting.imfilter_cpu(img, krn)
+           @time ImageQuilting.imfilter_cpu(img, krn)
+        elseif (v == 2)
+           @info "$i: ad-hoc / CUDA"
+           img = CuArray(img)
+           @time ImageQuilting.imfilter_cuda(img, krn)
+           ImageQuilting.imfilter_cuda(img, krn)
+           ImageQuilting.imfilter_cuda(img, krn)
+           @time ImageQuilting.imfilter_cuda(img, krn)
+        elseif (v == 3)
+           @info "$i: ad-hoc / OpenCL"
+           ImageQuilting.init_opencl_context()
+           @time ImageQuilting.imfilter_opencl(img, krn)
+           ImageQuilting.imfilter_opencl(img, krn)
+           ImageQuilting.imfilter_opencl(img, krn)
+           @time ImageQuilting.imfilter_opencl(img, krn)
+        else
+           @info "wrong selection"
+        end
+end
+
+main(ARGS)
+
diff --git a/src/ImageQuilting.jl b/src/ImageQuilting.jl
index 4afbcece..f26b79b1 100644
--- a/src/ImageQuilting.jl
+++ b/src/ImageQuilting.jl
@@ -17,23 +17,20 @@ using ProgressMeter: Progress, next!
 using FFTW: set_num_threads
 using CpuId: cpucores
 using RecipesBase
-using Primes
+
 using CUDA
 using OpenCL
 using CLFFT
-const clfft = CLFFT
+using PlatformAware
 
 using Base: @nexprs, @nloops, @nref
 using SparseArrays: spzeros
 using Statistics: mean, std
 using Random
  
-using PlatformAware
-
 import GeoStatsBase: preprocess, solvesingle
 
 include("utils.jl")
-include("utils_gpu.jl")
 include("plotrecipes.jl")
 include("relaxation.jl")
 include("taumodel.jl")
@@ -42,15 +39,15 @@ include("iqsim.jl")
 include("voxelreuse.jl")
 include("geostats.jl")
 
-include("test_imfilter.jl")
-
-function __init__()
-  include(pkgdir(@__MODULE__) * "/src/kernels.jl")
-  include(pkgdir(@__MODULE__) * "/src/imfilter.default.jl")
-  include(pkgdir(@__MODULE__) * "/src/imfilter.cuda.jl")
-  include(pkgdir(@__MODULE__) * "/src/imfilter.opencl.jl")
+function __init__()    
+  include("src/kernels.jl")
+  include("src/kernel/imfilter_default.jl")
+  include("src/kernel/imfilter_cuda.jl")
+  include("src/kernel/imfilter_opencl.jl")
 end
 
+include("test_imfilter.jl")
+
 export
   # functions
   iqsim,
diff --git a/src/imfilter.cuda.jl b/src/imfilter.cuda.jl
deleted file mode 100644
index d4cc852a..00000000
--- a/src/imfilter.cuda.jl
+++ /dev/null
@@ -1,34 +0,0 @@
-# ------------------------------------------------------------------
-# Licensed under the MIT License. See LICENCE in the project root.
-# ------------------------------------------------------------------
-
-@platform aware function init_imfilter_kernel({accelerator_count::(@atleast 1), accelerator_api::CUDA_API})
-  println("Running on CUDA GPU")
-end
-
-@platform aware function array_kernel({accelerator_count::(@atleast 1), accelerator_api::CUDA_API}, array) CuArray(array) end
-
-@platform aware function view_kernel({accelerator_count::(@atleast 1), accelerator_api::CUDA_API}, array, I) Array(array[I]) end
-
-@platform aware function imfilter_kernel({accelerator_count::(@atleast 1), accelerator_api::CUDA_API}, img, krn)
-    imfilter_cuda(img,krn)
- end
-
-
-function imfilter_cuda(img, krn)
- 
-   # pad kernel to common size with image
-   padkrn = CUDA.zeros(size(img))
-   copyto!(padkrn, CartesianIndices(krn), CuArray(krn), CartesianIndices(krn))
- 
-   # perform ifft(fft(img) .* conj.(fft(krn)))
-   fftimg = img |> CUFFT.fft
-   fftkrn = padkrn |> CuArray |> CUFFT.fft
-   result = (fftimg .* conj.(fftkrn)) |> CUFFT.ifft
- 
-   # recover result
-   finalsize = size(img) .- (size(krn) .- 1)
-   real.(result[CartesianIndices(finalsize)]) |> Array
-
-end
-
diff --git a/src/imfilter.opencl.jl b/src/imfilter.opencl.jl
deleted file mode 100644
index c9c1f005..00000000
--- a/src/imfilter.opencl.jl
+++ /dev/null
@@ -1,111 +0,0 @@
-# ------------------------------------------------------------------
-# Licensed under the MIT License. See LICENCE in the project root.
-# ------------------------------------------------------------------
-
-@platform aware function init_imfilter_kernel({accelerator_count::(@atleast 1), accelerator_api::OpenCL_API})
-  println("Running on OpenCL GPU")
-  global GPU = gpu_setup()
-end
-
-@platform aware function array_kernel({accelerator_count::(@atleast 1), accelerator_api::OpenCL_API}, array) array end
-
-@platform aware function view_kernel({accelerator_count::(@atleast 1), accelerator_api::OpenCL_API}, array, I) view(array, I) end
-
-@platform aware function imfilter_kernel({accelerator_count::(@atleast 1), accelerator_api::OpenCL_API}, img, kern)
-    imfilter_opencl(img,kern)
-end
-
-
-function imfilter_opencl(img, kern)
-   # retrieve basic info
-   N = ndims(img)
-   T = ComplexF64
-
-   # GPU metadata
-   ctx = GPU.ctx; queue = GPU.queue
-   mult_kernel = GPU.mult_kernel
- 
-   # operations with complex type
-   img  = T.(img)
-   kern = T.(kern)
-   
-   # kernel may require padding
-   prepad  = ntuple(d->(size(kern,d)-1) ÷ 2, N)
-   postpad = ntuple(d->(size(kern,d)  ) ÷ 2, N)
- 
-   # OpenCL FFT expects powers of 2, 3, 5, 7, 11 or 13
-   clpad = clfftpad(img)
-   A = padarray(img, Pad(:symmetric, zeros(Int, ndims(img)), clpad))
-   A = parent(A)
- 
-   krn = zeros(T, size(A))
-   indexesK = ntuple(d->[size(A,d)-prepad[d]+1:size(A,d);1:size(kern,d)-prepad[d]], N)
-   krn[indexesK...] = reflect(kern) 
-
-   # plan FFT
-   p = clfft.Plan(T, ctx, size(A))
-   clfft.set_layout!(p, :interleaved, :interleaved)
-   clfft.set_result!(p, :inplace)
-   clfft.bake!(p, queue)
- 
-   # populate GPU memory
-   bufA   = cl.Buffer(T, ctx, :copy, hostbuf=A)
-   bufkrn = cl.Buffer(T, ctx, :copy, hostbuf=krn)
-   bufRES = cl.Buffer(T, ctx, length(A))
-   
-   # compute ifft(fft(A).*fft(kern))
-   clfft.enqueue_transform(p, :forward, [queue], bufA, nothing)
-   clfft.enqueue_transform(p, :forward, [queue], bufkrn, nothing)
-   queue(mult_kernel, length(A), nothing, bufA, bufkrn, bufRES)
-
-   clfft.enqueue_transform(p, :backward, [queue], bufRES, nothing)
-
-   # get result back
-   AF = reshape(cl.read(queue, bufRES), size(A))
- 
-   # undo OpenCL FFT paddings
-   AF = view(AF, ntuple(d->1:size(AF,d)-clpad[d], N)...)
-  
-   out = Array{realtype(eltype(AF))}(undef, ntuple(d->size(img,d) - prepad[d] - postpad[d], N)...)
-   indexesA = ntuple(d->postpad[d]+1:size(img,d)-prepad[d], N)
-   copyreal!(out, AF, indexesA)
-   
-   out
-
-  end
-
- @generated function reflect(A::AbstractArray{T,N}) where {T,N}
-    quote
-        B = Array{T,N}(undef,size(A))
-        @nexprs $N d->(n_d = size(A, d)+1)
-        @nloops $N i A d->(j_d = n_d - i_d) begin
-            @nref($N, B, j) = @nref($N, A, i)
-        end
-        B
-    end
- end
-
-
-for N = 1:5
-    @eval begin
-        function copyreal!(dst::Array{T,$N}, src, I::Tuple{Vararg{UnitRange{Int}}}) where {T<:Real}
-            @nexprs $N d->(I_d = I[d])
-            @nloops $N i dst d->(j_d = first(I_d)+i_d-1) begin
-                (@nref $N dst i) = real(@nref $N src j)
-            end
-            dst
-        end
-        function copyreal!(dst::Array{T,$N}, src, I::Tuple{Vararg{UnitRange{Int}}}) where {T<:Complex}
-            @nexprs $N d->I_d = I[d]
-            @nloops $N i dst d->(j_d = first(I_d)+i_d-1) begin
-                (@nref $N dst i) = @nref $N src j
-            end
-            dst
-        end
-    end
-end
-
-realtype(::Type{R}) where {R<:Real} = R
-realtype(::Type{Complex{R}}) where {R<:Real} = R
-
-
diff --git a/src/kernel/imfilter_cuda.jl b/src/kernel/imfilter_cuda.jl
new file mode 100644
index 00000000..40514f33
--- /dev/null
+++ b/src/kernel/imfilter_cuda.jl
@@ -0,0 +1,44 @@
+# ------------------------------------------------------------------
+# Licensed under the MIT License. See LICENCE in the project root.
+# ------------------------------------------------------------------
+
+@platform aware function init_imfilter_kernel({accelerator_count::(@atleast 1), 
+                                               accelerator_manufacturer::NVIDIA, 
+                                               accelerator_api::(@api CUDA)})
+  @info "Running on NVIDIA/CUDA GPU"
+end
+
+@platform aware array_kernel({accelerator_count::(@atleast 1), 
+                              accelerator_manufacturer::NVIDIA,
+                              accelerator_api::(@api CUDA)}, array) =CuArray{Float32}(array)
+
+@platform aware view_kernel({accelerator_count::(@atleast 1), 
+                             accelerator_manufacturer::NVIDIA, 
+                             accelerator_api::(@api CUDA)}, array, I)  = Array(array[I])
+
+counter = Ref{Int}(0)
+
+
+@platform aware function imfilter_kernel({accelerator_count::(@atleast 1), 
+                                          accelerator_manufacturer::NVIDIA, 
+                                          accelerator_api::(@api CUDA)}, img, krn)
+
+   counter[] = counter[] + 1
+   imfilter_cuda(img,krn)
+end
+
+function imfilter_cuda(img, krn)
+
+   # pad kernel to common size with image
+   padkrn = CUDA.zeros(Float32, size(img))
+   copyto!(padkrn, CartesianIndices(krn), CuArray{Float32}(krn), CartesianIndices(krn))
+ 
+   # perform ifft(fft(img) .* conj.(fft(krn)))
+   fftimg = img |> CUFFT.fft
+   fftkrn = padkrn |> CUFFT.fft
+   result = (fftimg .* conj.(fftkrn)) |> CUFFT.ifft
+ 
+   # recover result
+   finalsize = size(img) .- (size(krn) .- 1)
+   real.(result[CartesianIndices(finalsize)]) |> Array
+ end
diff --git a/src/imfilter.default.jl b/src/kernel/imfilter_default.jl
similarity index 65%
rename from src/imfilter.default.jl
rename to src/kernel/imfilter_default.jl
index ea125534..bc67755d 100644
--- a/src/imfilter.default.jl
+++ b/src/kernel/imfilter_default.jl
@@ -3,15 +3,17 @@
 # ------------------------------------------------------------------
 
 @platform default function init_imfilter_kernel()
-  println("Running on DEFAULT PLATFORM")
+  @info "Running on DEFAULT PLATFORM"
 end
 
-@platform default function array_kernel(array) array end
+@platform default array_kernel(array) = array
 
-@platform default function view_kernel(array, I) view(array, I) end
+@platform default view_kernel(array, I) = view(array, I)
 
 @platform default function imfilter_kernel(img, krn)
-  imfilter(img, centered(krn), Inner(), Algorithm.FFT())
+  imfilter_cpu(img, krn)
 end
 
-
+function imfilter_cpu(img, krn)
+  imfilter(img, centered(krn), Inner(), Algorithm.FFT())
+end
diff --git a/src/kernel/imfilter_opencl.jl b/src/kernel/imfilter_opencl.jl
new file mode 100644
index 00000000..3e6c644c
--- /dev/null
+++ b/src/kernel/imfilter_opencl.jl
@@ -0,0 +1,110 @@
+# ------------------------------------------------------------------
+# Licensed under the MIT License. See LICENCE in the project root.
+# ------------------------------------------------------------------
+
+@platform aware function init_imfilter_kernel({accelerator_count::(@atleast 1), accelerator_api::(@api OpenCL)})
+  @info "Running on OpenCL device"
+  init_opencl_context()
+end
+
+function init_opencl_context()
+  # retrieve global OpenCL info
+  device_, ctx_, queue_ = cl.create_compute_context()
+  global device = device_
+  global ctx = ctx_
+  global queue = queue_
+end
+
+@platform aware array_kernel({accelerator_count::(@atleast 1), accelerator_api::(@api OpenCL)}, array) = array
+
+@platform aware view_kernel({accelerator_count::(@atleast 1), accelerator_api::(@api OpenCL)}, array, I) = view(array, I)
+
+@platform aware function imfilter_kernel({accelerator_count::(@atleast 1), accelerator_api::(@api OpenCL)}, img, krn)
+  imfilter_opencl(img, krn)
+end
+
+
+function imfilter_opencl(img, krn)
+  # retrieve basic info
+  T = ComplexF32
+
+  # build OpenCL program kernels
+  conj_kernel = build_conj_kernel(ctx)
+  mult_kernel = build_mult_kernel(ctx)
+
+  # pad img to support CLFFT operations
+  padimg = pad_opencl_img(img)
+
+  # pad krn to common size with padimg
+  padkrn = zeros(eltype(img), size(padimg))
+  padkrn[CartesianIndices(krn)] = krn
+
+  # convert to Complex
+  fftimg = T.(padimg)
+  fftkrn = T.(padkrn)
+
+  # OpenCl setup
+  plan = CLFFT.Plan(T, ctx, size(fftimg))
+  CLFFT.set_layout!(plan, :interleaved, :interleaved)
+  CLFFT.set_result!(plan, :inplace)
+  CLFFT.bake!(plan, queue)
+
+  # populate device memory
+  bufimg = cl.Buffer(T, ctx, :copy, hostbuf=fftimg)
+  bufkrn = cl.Buffer(T, ctx, :copy, hostbuf=fftkrn)
+  bufres = cl.Buffer(T, ctx, :w, length(fftimg))
+
+  # transform img and krn to FFT representation 
+  CLFFT.enqueue_transform(plan, :forward, [queue], bufimg, nothing)
+  CLFFT.enqueue_transform(plan, :forward, [queue], bufkrn, nothing)
+
+  # compute ifft(fft(A).*conj.(fft(krn)))
+  queue(conj_kernel, length(fftimg), nothing, bufkrn)
+  queue(mult_kernel, length(fftimg), nothing, bufimg, bufkrn, bufres)
+  CLFFT.enqueue_transform(plan, :backward, [queue], bufres, nothing)
+
+  # recover result
+  result = reshape(cl.read(queue, bufres), size(fftimg))
+  real_result = real.(result)
+
+  finalsize = size(img) .- (size(krn) .- 1)
+
+  real_result[CartesianIndices(finalsize)]
+end
+
+function build_mult_kernel(ctx)
+  mult_kernel = "
+  __kernel void mult(__global const float2 *a,
+                      __global const float2 *b,
+                      __global float2 *c)
+  {
+    int gid = get_global_id(0);
+    c[gid].x = a[gid].x*b[gid].x - a[gid].y*b[gid].y;
+    c[gid].y = a[gid].x*b[gid].y + a[gid].y*b[gid].x;
+  }
+  "
+  prog = cl.Program(ctx, source=mult_kernel) |> cl.build!
+  cl.Kernel(prog, "mult")
+end
+
+function build_conj_kernel(ctx)
+  conj_kernel = "
+  __kernel void conj(__global float2 *a)
+  {
+    int gid = get_global_id(0);
+    a[gid].y = -a[gid].y;
+  }
+  "
+  prog = cl.Program(ctx, source=conj_kernel) |> cl.build!
+  cl.Kernel(prog, "conj")
+end
+
+function pad_opencl_img(img)
+  # OpenCL FFT expects products of powers of 2, 3, 5, 7, 11 or 13
+  radices = CLFFT.supported_radices()
+  newsize = map(dim -> nextprod(radices, dim), size(img))
+  
+  padimg = zeros(eltype(img), newsize)
+  padimg[CartesianIndices(img)] = img
+  padimg
+end
diff --git a/src/kernels.jl b/src/kernels.jl
index e51bbe79..a47debe8 100644
--- a/src/kernels.jl
+++ b/src/kernels.jl
@@ -4,3 +4,4 @@
 @platform parameter clear
 @platform parameter accelerator_count
 @platform parameter accelerator_api
+@platform parameter accelerator_manufacturer
diff --git a/src/test_imfilter.jl b/src/test_imfilter.jl
index 5f46d24c..b3253597 100644
--- a/src/test_imfilter.jl
+++ b/src/test_imfilter.jl
@@ -19,22 +19,23 @@ function test_imfilter(N)
 
     @info "start"
 
-    img = rand(Float64,(N,N,N))
-    krn = rand(Float64,(10,10,10))
+    img = rand(Float32,(N,N,N))
+    krn = rand(Float32,(10,10,10))
 
     X = Int(round(N/2))
 
     @info "FIRST RUN - GPU - CUDA"
-    r1 = @btime imfilter_cuda(CuArray($img), $krn)
+    r1 = @time imfilter_cuda(CuArray(img), krn)
     @info "check = ", r1[X,X,X]
 
     @info("SECOND RUN - GPU - OpenCL")
-    global GPU = gpu_setup()
-    r2 = @btime imfilter_opencl($img, $krn)
+    #global GPU = gpu_setup()
+    init_opencl_context()
+    r2 = @time imfilter_opencl(img, krn)
     @info "check = ", r2[X,X,X]
 
     @info "THIRD RUN -  CPU - ImageFiltering"
-    r3 = @btime imfilter($img, centered($krn), Inner(), Algorithm.FFT())
+    r3 = @time imfilter(img, centered(krn), Inner(), Algorithm.FFT())
     @info "check = ", r3[X,X,X]
 
     @info "r1 & r2", allclose(r1, r2; rtol=1e-2, atol=1e-3)
@@ -43,4 +44,4 @@ function test_imfilter(N)
 
     @info "end"
 
-end
\ No newline at end of file
+end
diff --git a/src/utils_gpu.jl b/src/utils_gpu.jl
deleted file mode 100644
index 3086feca..00000000
--- a/src/utils_gpu.jl
+++ /dev/null
@@ -1,93 +0,0 @@
-# ------------------------------------------------------------------
-# Licensed under the MIT License. See LICENCE in the project root.
-# ------------------------------------------------------------------
-
-struct GPUmeta
-  dev
-  ctx
-  queue
-  mult_kernel
-end
-
-function gpu_setup()
-  @assert cl ≠ nothing "OpenCL.jl not installed, cannot use GPU"
-  @assert clfft ≠ nothing "CLFFT.jl not installed, cannot use GPU"
-
-  devs = cl.devices(:gpu)
-
-  if isempty(devs)
-    @warn "GPU not found, falling back to other OpenCL devices"
-    devs = cl.devices()
-  end
-  @assert !isempty(devs) "OpenCL device not found, make sure drivers are installed"
-
-  dev = []
-  devnames = map(d -> d[:platform][:name], devs)
-  for vendor in ["NVIDIA","AMD","Intel"], (idx,name) in enumerate(devnames)
-    if occursin(vendor, name)
-      dev = devs[idx]
-      break
-    end
-  end
-
-  devtype = uppercase(string(dev[:device_type]))
-  devname = dev[:name]
-
-  @info "using $devtype $devname"
-
-  ctx = cl.Context(dev)
-  queue = cl.CmdQueue(ctx)
-  mult_kernel = basic_kernels(ctx)
-
-  GPUmeta(dev, ctx, queue, mult_kernel)
-end
-
-function basic_kernels(ctx)
-  mult_kernel = "
-    __kernel void mult(__global const double2 *a,
-                       __global const double2 *b,
-                       __global double2 *c)
-    {
-      int gid = get_global_id(0);
-      c[gid].x = a[gid].x*b[gid].x - a[gid].y*b[gid].y;
-      c[gid].y = a[gid].x*b[gid].y + a[gid].y*b[gid].x;
-    }
-  "
-  
-  # build OpenCL program
-  prog = cl.Program(ctx, source=mult_kernel) |> cl.build!
-
-  cl.Kernel(prog, "mult")
-end
-
-
-
-function clfftpad(A::AbstractArray)
-  # clFFT releases support powers of 2, 3, 5, ...
-  radices = [2,3,5]
-  v = clfft.version()
-  v ≥ v"2.8.0"  && push!(radices, 7)
-  v ≥ v"2.12.0" && push!(radices, 11, 13)
-
-  result = Int[]
-  for s in size(A)
-    fs = keys(factor(s))
-    if fs ⊆ radices
-      push!(result, 0)
-    else
-      # Try a closer number that has prime factors of 2 and 3.
-      # Use the next power of 2 (say N) to get multiple new
-      # candidates.
-      N = nextpow(2,s)
-
-      # fractions of N: 100%, 93%, 84%, 75%, 56%
-      candidates = [N, 15(N÷16), 27(N÷32), 3(N÷4), 9(N÷16)]
-      candidates = candidates[candidates .> s]
-      n = minimum(candidates)
-
-      push!(result, n - s)
-    end
-  end
-
-  result
-end