From 77f0b20a8612647359a2c6cb0334eceb73a5cba8 Mon Sep 17 00:00:00 2001
From: ClaroHenrique <clarohenriques@gmail.com>
Date: Mon, 31 Oct 2022 23:15:19 -0300
Subject: [PATCH 01/20] Refact imfilter code

---
 src/ImageQuilting.jl                          |  18 +--
 src/imfilter.default.jl                       |  17 ---
 src/imfilter.opencl.jl                        | 111 ------------------
 .../imfilter_cuda.jl}                         |   5 +-
 src/kernel/imfilter_default.jl                |  17 +++
 src/kernel/imfilter_opencl.jl                 | 107 +++++++++++++++++
 src/test_imfilter.jl                          |  46 --------
 src/utils_gpu.jl                              |  93 ---------------
 8 files changed, 131 insertions(+), 283 deletions(-)
 delete mode 100644 src/imfilter.default.jl
 delete mode 100644 src/imfilter.opencl.jl
 rename src/{imfilter.cuda.jl => kernel/imfilter_cuda.jl} (97%)
 create mode 100644 src/kernel/imfilter_default.jl
 create mode 100644 src/kernel/imfilter_opencl.jl
 delete mode 100644 src/test_imfilter.jl
 delete mode 100644 src/utils_gpu.jl

diff --git a/src/ImageQuilting.jl b/src/ImageQuilting.jl
index 4afbcece..82fe122a 100644
--- a/src/ImageQuilting.jl
+++ b/src/ImageQuilting.jl
@@ -17,23 +17,19 @@ using ProgressMeter: Progress, next!
 using FFTW: set_num_threads
 using CpuId: cpucores
 using RecipesBase
-using Primes
 using CUDA
 using OpenCL
 using CLFFT
-const clfft = CLFFT
+using PlatformAware
 
 using Base: @nexprs, @nloops, @nref
 using SparseArrays: spzeros
 using Statistics: mean, std
 using Random
  
-using PlatformAware
-
 import GeoStatsBase: preprocess, solvesingle
 
 include("utils.jl")
-include("utils_gpu.jl")
 include("plotrecipes.jl")
 include("relaxation.jl")
 include("taumodel.jl")
@@ -42,14 +38,10 @@ include("iqsim.jl")
 include("voxelreuse.jl")
 include("geostats.jl")
 
-include("test_imfilter.jl")
-
-function __init__()
-  include(pkgdir(@__MODULE__) * "/src/kernels.jl")
-  include(pkgdir(@__MODULE__) * "/src/imfilter.default.jl")
-  include(pkgdir(@__MODULE__) * "/src/imfilter.cuda.jl")
-  include(pkgdir(@__MODULE__) * "/src/imfilter.opencl.jl")
-end
+include("kernels.jl")
+include("kernel/imfilter_default.jl")
+include("kernel/imfilter_cuda.jl")
+include("kernel/imfilter_opencl.jl")
 
 export
   # functions
diff --git a/src/imfilter.default.jl b/src/imfilter.default.jl
deleted file mode 100644
index ea125534..00000000
--- a/src/imfilter.default.jl
+++ /dev/null
@@ -1,17 +0,0 @@
-# ------------------------------------------------------------------
-# Licensed under the MIT License. See LICENCE in the project root.
-# ------------------------------------------------------------------
-
-@platform default function init_imfilter_kernel()
-  println("Running on DEFAULT PLATFORM")
-end
-
-@platform default function array_kernel(array) array end
-
-@platform default function view_kernel(array, I) view(array, I) end
-
-@platform default function imfilter_kernel(img, krn)
-  imfilter(img, centered(krn), Inner(), Algorithm.FFT())
-end
-
-
diff --git a/src/imfilter.opencl.jl b/src/imfilter.opencl.jl
deleted file mode 100644
index c9c1f005..00000000
--- a/src/imfilter.opencl.jl
+++ /dev/null
@@ -1,111 +0,0 @@
-# ------------------------------------------------------------------
-# Licensed under the MIT License. See LICENCE in the project root.
-# ------------------------------------------------------------------
-
-@platform aware function init_imfilter_kernel({accelerator_count::(@atleast 1), accelerator_api::OpenCL_API})
-  println("Running on OpenCL GPU")
-  global GPU = gpu_setup()
-end
-
-@platform aware function array_kernel({accelerator_count::(@atleast 1), accelerator_api::OpenCL_API}, array) array end
-
-@platform aware function view_kernel({accelerator_count::(@atleast 1), accelerator_api::OpenCL_API}, array, I) view(array, I) end
-
-@platform aware function imfilter_kernel({accelerator_count::(@atleast 1), accelerator_api::OpenCL_API}, img, kern)
-    imfilter_opencl(img,kern)
-end
-
-
-function imfilter_opencl(img, kern)
-   # retrieve basic info
-   N = ndims(img)
-   T = ComplexF64
-
-   # GPU metadata
-   ctx = GPU.ctx; queue = GPU.queue
-   mult_kernel = GPU.mult_kernel
- 
-   # operations with complex type
-   img  = T.(img)
-   kern = T.(kern)
-   
-   # kernel may require padding
-   prepad  = ntuple(d->(size(kern,d)-1) ÷ 2, N)
-   postpad = ntuple(d->(size(kern,d)  ) ÷ 2, N)
- 
-   # OpenCL FFT expects powers of 2, 3, 5, 7, 11 or 13
-   clpad = clfftpad(img)
-   A = padarray(img, Pad(:symmetric, zeros(Int, ndims(img)), clpad))
-   A = parent(A)
- 
-   krn = zeros(T, size(A))
-   indexesK = ntuple(d->[size(A,d)-prepad[d]+1:size(A,d);1:size(kern,d)-prepad[d]], N)
-   krn[indexesK...] = reflect(kern) 
-
-   # plan FFT
-   p = clfft.Plan(T, ctx, size(A))
-   clfft.set_layout!(p, :interleaved, :interleaved)
-   clfft.set_result!(p, :inplace)
-   clfft.bake!(p, queue)
- 
-   # populate GPU memory
-   bufA   = cl.Buffer(T, ctx, :copy, hostbuf=A)
-   bufkrn = cl.Buffer(T, ctx, :copy, hostbuf=krn)
-   bufRES = cl.Buffer(T, ctx, length(A))
-   
-   # compute ifft(fft(A).*fft(kern))
-   clfft.enqueue_transform(p, :forward, [queue], bufA, nothing)
-   clfft.enqueue_transform(p, :forward, [queue], bufkrn, nothing)
-   queue(mult_kernel, length(A), nothing, bufA, bufkrn, bufRES)
-
-   clfft.enqueue_transform(p, :backward, [queue], bufRES, nothing)
-
-   # get result back
-   AF = reshape(cl.read(queue, bufRES), size(A))
- 
-   # undo OpenCL FFT paddings
-   AF = view(AF, ntuple(d->1:size(AF,d)-clpad[d], N)...)
-  
-   out = Array{realtype(eltype(AF))}(undef, ntuple(d->size(img,d) - prepad[d] - postpad[d], N)...)
-   indexesA = ntuple(d->postpad[d]+1:size(img,d)-prepad[d], N)
-   copyreal!(out, AF, indexesA)
-   
-   out
-
-  end
-
- @generated function reflect(A::AbstractArray{T,N}) where {T,N}
-    quote
-        B = Array{T,N}(undef,size(A))
-        @nexprs $N d->(n_d = size(A, d)+1)
-        @nloops $N i A d->(j_d = n_d - i_d) begin
-            @nref($N, B, j) = @nref($N, A, i)
-        end
-        B
-    end
- end
-
-
-for N = 1:5
-    @eval begin
-        function copyreal!(dst::Array{T,$N}, src, I::Tuple{Vararg{UnitRange{Int}}}) where {T<:Real}
-            @nexprs $N d->(I_d = I[d])
-            @nloops $N i dst d->(j_d = first(I_d)+i_d-1) begin
-                (@nref $N dst i) = real(@nref $N src j)
-            end
-            dst
-        end
-        function copyreal!(dst::Array{T,$N}, src, I::Tuple{Vararg{UnitRange{Int}}}) where {T<:Complex}
-            @nexprs $N d->I_d = I[d]
-            @nloops $N i dst d->(j_d = first(I_d)+i_d-1) begin
-                (@nref $N dst i) = @nref $N src j
-            end
-            dst
-        end
-    end
-end
-
-realtype(::Type{R}) where {R<:Real} = R
-realtype(::Type{Complex{R}}) where {R<:Real} = R
-
-
diff --git a/src/imfilter.cuda.jl b/src/kernel/imfilter_cuda.jl
similarity index 97%
rename from src/imfilter.cuda.jl
rename to src/kernel/imfilter_cuda.jl
index d4cc852a..4700f0d7 100644
--- a/src/imfilter.cuda.jl
+++ b/src/kernel/imfilter_cuda.jl
@@ -11,9 +11,8 @@ end
 @platform aware function view_kernel({accelerator_count::(@atleast 1), accelerator_api::CUDA_API}, array, I) Array(array[I]) end
 
 @platform aware function imfilter_kernel({accelerator_count::(@atleast 1), accelerator_api::CUDA_API}, img, krn)
-    imfilter_cuda(img,krn)
- end
-
+   imfilter_cuda(img,krn)
+end
 
 function imfilter_cuda(img, krn)
  
diff --git a/src/kernel/imfilter_default.jl b/src/kernel/imfilter_default.jl
new file mode 100644
index 00000000..623e2a31
--- /dev/null
+++ b/src/kernel/imfilter_default.jl
@@ -0,0 +1,17 @@
+# ------------------------------------------------------------------
+# Licensed under the MIT License. See LICENCE in the project root.
+# ------------------------------------------------------------------
+
+# @platform default function init_imfilter_kernel()
+#   println("Running on DEFAULT PLATFORM")
+# end
+
+# @platform default function array_kernel(array) array end
+
+# @platform default function view_kernel(array, I) view(array, I) end
+
+# @platform default function imfilter_kernel(img, krn)
+#   imfilter(img, centered(krn), Inner(), Algorithm.FFT())
+# end
+
+
diff --git a/src/kernel/imfilter_opencl.jl b/src/kernel/imfilter_opencl.jl
new file mode 100644
index 00000000..06c9a535
--- /dev/null
+++ b/src/kernel/imfilter_opencl.jl
@@ -0,0 +1,107 @@
+# ------------------------------------------------------------------
+# Licensed under the MIT License. See LICENCE in the project root.
+# ------------------------------------------------------------------
+
+#@platform aware function init_imfilter_kernel({accelerator_api::OpenCL_API})
+@platform default function init_imfilter_kernel()
+  println("Running on OpenCL device")
+  global cl_device, cl_ctx, cl_queue = cl.create_compute_context()
+end
+
+#@platform aware function array_kernel({accelerator_api::OpenCL_API}, array) array end
+@platform default function array_kernel(array) array end
+
+#@platform aware function view_kernel({accelerator_api::OpenCL_API}, array, I) view(array, I) end
+@platform default function view_kernel(array, I) view(array, I) end
+
+#@platform aware function imfilter_kernel({accelerator_api::OpenCL_API}, img, kern)
+@platform default function imfilter_kernel(img, krn)
+  imfilter_opencl(img, krn)
+end
+
+function imfilter_opencl(img, krn)
+  # retrieve basic info
+  T = ComplexF64
+
+  # retrieve global OpenCL info
+  ctx, queue = cl_ctx, cl_queue
+
+  # build OpenCL program kernels
+  conj_kernel = build_conj_kernel(ctx)
+  mult_kernel = build_mult_kernel(ctx)
+
+  # pad img to support CLFFT operations
+  padimg = pad_opencl_img(img)
+
+  # pad krn to common size with padimg
+  padkrn = zeros(eltype(img), size(padimg))
+  padkrn[CartesianIndices(krn)] = krn
+
+  # convert to Complex
+  fftimg = T.(padimg)
+  fftkrn = T.(padkrn)
+
+  # OpenCl setup
+  plan = CLFFT.Plan(T, ctx, size(fftimg))
+  CLFFT.set_layout!(plan, :interleaved, :interleaved)
+  CLFFT.set_result!(plan, :inplace)
+  CLFFT.bake!(plan, queue)
+
+  # populate device memory
+  bufimg = cl.Buffer(T, ctx, :copy, hostbuf=fftimg)
+  bufkrn = cl.Buffer(T, ctx, :copy, hostbuf=fftkrn)
+  bufres = cl.Buffer(T, ctx, :w, length(fftimg))
+
+  # transform img and krn to FFT representation 
+  CLFFT.enqueue_transform(plan, :forward, [queue], bufimg, nothing)
+  CLFFT.enqueue_transform(plan, :forward, [queue], bufkrn, nothing)
+
+  # compute ifft(fft(A).*conj.(fft(krn)))
+  queue(conj_kernel, length(fftimg), nothing, bufkrn)
+  queue(mult_kernel, length(fftimg), nothing, bufimg, bufkrn, bufres)
+  CLFFT.enqueue_transform(plan, :backward, [queue], bufres, nothing)
+
+  # recover result
+  result = reshape(cl.read(queue, bufres), size(fftimg))
+  real_result = real.(result)
+
+  finalsize = size(img) .- (size(krn) .- 1)
+  real_result[CartesianIndices(finalsize)]
+end
+
+function build_mult_kernel(ctx)
+  mult_kernel = "
+  __kernel void mult(__global const double2 *a,
+                      __global const double2 *b,
+                      __global double2 *c)
+  {
+    int gid = get_global_id(0);
+    c[gid].x = a[gid].x*b[gid].x - a[gid].y*b[gid].y;
+    c[gid].y = a[gid].x*b[gid].y + a[gid].y*b[gid].x;
+  }
+  "
+  prog = cl.Program(ctx, source=mult_kernel) |> cl.build!
+  cl.Kernel(prog, "mult")
+end
+
+function build_conj_kernel(ctx)
+  conj_kernel = "
+  __kernel void conj(__global double2 *a)
+  {
+    int gid = get_global_id(0);
+    a[gid].y = -a[gid].y;
+  }
+  "
+  prog = cl.Program(ctx, source=conj_kernel) |> cl.build!
+  cl.Kernel(prog, "conj")
+end
+
+function pad_opencl_img(img)
+  # OpenCL FFT expects products of powers of 2, 3, 5, 7, 11 or 13
+  radices = CLFFT.supported_radices()
+  newsize = map(dim -> nextprod(radices, dim), size(img))
+  
+  padimg = zeros(eltype(img), newsize)
+  padimg[CartesianIndices(img)] = img
+  padimg
+end
diff --git a/src/test_imfilter.jl b/src/test_imfilter.jl
deleted file mode 100644
index 5f46d24c..00000000
--- a/src/test_imfilter.jl
+++ /dev/null
@@ -1,46 +0,0 @@
-using BenchmarkTools
-
-const TOLERANCE = 1e-3
-
-function allclose(x::AbstractArray{T}, y::AbstractArray{T}; rtol=1e-5, atol=1e-8) where {T}
-    @assert length(x) == length(y)
-    @inbounds begin
-        for i in 1:length(x)
-            xx, yy = x[i], y[i]
-            if !(isapprox(xx, yy; rtol=rtol, atol=atol))
-                return false
-            end
-        end
-    end
-    return true
-end
-
-function test_imfilter(N)
-
-    @info "start"
-
-    img = rand(Float64,(N,N,N))
-    krn = rand(Float64,(10,10,10))
-
-    X = Int(round(N/2))
-
-    @info "FIRST RUN - GPU - CUDA"
-    r1 = @btime imfilter_cuda(CuArray($img), $krn)
-    @info "check = ", r1[X,X,X]
-
-    @info("SECOND RUN - GPU - OpenCL")
-    global GPU = gpu_setup()
-    r2 = @btime imfilter_opencl($img, $krn)
-    @info "check = ", r2[X,X,X]
-
-    @info "THIRD RUN -  CPU - ImageFiltering"
-    r3 = @btime imfilter($img, centered($krn), Inner(), Algorithm.FFT())
-    @info "check = ", r3[X,X,X]
-
-    @info "r1 & r2", allclose(r1, r2; rtol=1e-2, atol=1e-3)
-    @info "r1 & r3", allclose(r1, r3; rtol=1e-2, atol=1e-3)
-    @info "r2 & r3", allclose(r2, r3; rtol=1e-2, atol=1e-3)
-
-    @info "end"
-
-end
\ No newline at end of file
diff --git a/src/utils_gpu.jl b/src/utils_gpu.jl
deleted file mode 100644
index 3086feca..00000000
--- a/src/utils_gpu.jl
+++ /dev/null
@@ -1,93 +0,0 @@
-# ------------------------------------------------------------------
-# Licensed under the MIT License. See LICENCE in the project root.
-# ------------------------------------------------------------------
-
-struct GPUmeta
-  dev
-  ctx
-  queue
-  mult_kernel
-end
-
-function gpu_setup()
-  @assert cl ≠ nothing "OpenCL.jl not installed, cannot use GPU"
-  @assert clfft ≠ nothing "CLFFT.jl not installed, cannot use GPU"
-
-  devs = cl.devices(:gpu)
-
-  if isempty(devs)
-    @warn "GPU not found, falling back to other OpenCL devices"
-    devs = cl.devices()
-  end
-  @assert !isempty(devs) "OpenCL device not found, make sure drivers are installed"
-
-  dev = []
-  devnames = map(d -> d[:platform][:name], devs)
-  for vendor in ["NVIDIA","AMD","Intel"], (idx,name) in enumerate(devnames)
-    if occursin(vendor, name)
-      dev = devs[idx]
-      break
-    end
-  end
-
-  devtype = uppercase(string(dev[:device_type]))
-  devname = dev[:name]
-
-  @info "using $devtype $devname"
-
-  ctx = cl.Context(dev)
-  queue = cl.CmdQueue(ctx)
-  mult_kernel = basic_kernels(ctx)
-
-  GPUmeta(dev, ctx, queue, mult_kernel)
-end
-
-function basic_kernels(ctx)
-  mult_kernel = "
-    __kernel void mult(__global const double2 *a,
-                       __global const double2 *b,
-                       __global double2 *c)
-    {
-      int gid = get_global_id(0);
-      c[gid].x = a[gid].x*b[gid].x - a[gid].y*b[gid].y;
-      c[gid].y = a[gid].x*b[gid].y + a[gid].y*b[gid].x;
-    }
-  "
-  
-  # build OpenCL program
-  prog = cl.Program(ctx, source=mult_kernel) |> cl.build!
-
-  cl.Kernel(prog, "mult")
-end
-
-
-
-function clfftpad(A::AbstractArray)
-  # clFFT releases support powers of 2, 3, 5, ...
-  radices = [2,3,5]
-  v = clfft.version()
-  v ≥ v"2.8.0"  && push!(radices, 7)
-  v ≥ v"2.12.0" && push!(radices, 11, 13)
-
-  result = Int[]
-  for s in size(A)
-    fs = keys(factor(s))
-    if fs ⊆ radices
-      push!(result, 0)
-    else
-      # Try a closer number that has prime factors of 2 and 3.
-      # Use the next power of 2 (say N) to get multiple new
-      # candidates.
-      N = nextpow(2,s)
-
-      # fractions of N: 100%, 93%, 84%, 75%, 56%
-      candidates = [N, 15(N÷16), 27(N÷32), 3(N÷4), 9(N÷16)]
-      candidates = candidates[candidates .> s]
-      n = minimum(candidates)
-
-      push!(result, n - s)
-    end
-  end
-
-  result
-end

From a1087b6113da643d47c1e0a39e8444c6eafa96bb Mon Sep 17 00:00:00 2001
From: ClaroHenrique <clarohenriques@gmail.com>
Date: Tue, 1 Nov 2022 14:43:57 -0300
Subject: [PATCH 02/20] Update PlatformAware syntax

---
 src/kernel/imfilter_default.jl | 19 +++++++++++--------
 src/kernel/imfilter_opencl.jl  | 12 ++++--------
 2 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/src/kernel/imfilter_default.jl b/src/kernel/imfilter_default.jl
index 623e2a31..c270622e 100644
--- a/src/kernel/imfilter_default.jl
+++ b/src/kernel/imfilter_default.jl
@@ -2,16 +2,19 @@
 # Licensed under the MIT License. See LICENCE in the project root.
 # ------------------------------------------------------------------
 
-# @platform default function init_imfilter_kernel()
-#   println("Running on DEFAULT PLATFORM")
-# end
+@platform default function init_imfilter_kernel()
+  println("Running on DEFAULT PLATFORM")
+end
 
-# @platform default function array_kernel(array) array end
+@platform default function array_kernel(array) array end
 
-# @platform default function view_kernel(array, I) view(array, I) end
+@platform default function view_kernel(array, I) view(array, I) end
 
-# @platform default function imfilter_kernel(img, krn)
-#   imfilter(img, centered(krn), Inner(), Algorithm.FFT())
-# end
+@platform default function imfilter_kernel(img, krn)
+  imfilter_cpu(img, krn)
+end
 
+function imfilter_cpu(img, krn)
+  imfilter(img, centered(krn), Inner(), Algorithm.FFT())
+end
 
diff --git a/src/kernel/imfilter_opencl.jl b/src/kernel/imfilter_opencl.jl
index 06c9a535..b5ad98a0 100644
--- a/src/kernel/imfilter_opencl.jl
+++ b/src/kernel/imfilter_opencl.jl
@@ -2,20 +2,16 @@
 # Licensed under the MIT License. See LICENCE in the project root.
 # ------------------------------------------------------------------
 
-#@platform aware function init_imfilter_kernel({accelerator_api::OpenCL_API})
-@platform default function init_imfilter_kernel()
+@platform aware function init_imfilter_kernel({accelerator_count::(@atleast 1), accelerator_api::(@api OpenCL)})
   println("Running on OpenCL device")
   global cl_device, cl_ctx, cl_queue = cl.create_compute_context()
 end
 
-#@platform aware function array_kernel({accelerator_api::OpenCL_API}, array) array end
-@platform default function array_kernel(array) array end
+@platform aware function array_kernel({accelerator_count::(@atleast 1), accelerator_api::(@api OpenCL)}, array) array end
 
-#@platform aware function view_kernel({accelerator_api::OpenCL_API}, array, I) view(array, I) end
-@platform default function view_kernel(array, I) view(array, I) end
+@platform aware function view_kernel({accelerator_count::(@atleast 1), accelerator_api::(@api OpenCL)}, array, I) view(array, I) end
 
-#@platform aware function imfilter_kernel({accelerator_api::OpenCL_API}, img, kern)
-@platform default function imfilter_kernel(img, krn)
+@platform aware function imfilter_kernel({accelerator_count::(@atleast 1), accelerator_api::(@api OpenCL)}, img, krn)
   imfilter_opencl(img, krn)
 end
 

From a24757234f3a39e20e83e679f89c0133f1122761 Mon Sep 17 00:00:00 2001
From: ClaroHenrique <clarohenriques@gmail.com>
Date: Wed, 2 Nov 2022 11:16:24 -0300
Subject: [PATCH 03/20] Build OpenCL context locally

---
 src/kernel/imfilter_cuda.jl    | 2 --
 src/kernel/imfilter_default.jl | 1 -
 src/kernel/imfilter_opencl.jl  | 3 +--
 3 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/src/kernel/imfilter_cuda.jl b/src/kernel/imfilter_cuda.jl
index 4700f0d7..0ec82886 100644
--- a/src/kernel/imfilter_cuda.jl
+++ b/src/kernel/imfilter_cuda.jl
@@ -28,6 +28,4 @@ function imfilter_cuda(img, krn)
    # recover result
    finalsize = size(img) .- (size(krn) .- 1)
    real.(result[CartesianIndices(finalsize)]) |> Array
-
 end
-
diff --git a/src/kernel/imfilter_default.jl b/src/kernel/imfilter_default.jl
index c270622e..a998904d 100644
--- a/src/kernel/imfilter_default.jl
+++ b/src/kernel/imfilter_default.jl
@@ -17,4 +17,3 @@ end
 function imfilter_cpu(img, krn)
   imfilter(img, centered(krn), Inner(), Algorithm.FFT())
 end
-
diff --git a/src/kernel/imfilter_opencl.jl b/src/kernel/imfilter_opencl.jl
index b5ad98a0..460d1011 100644
--- a/src/kernel/imfilter_opencl.jl
+++ b/src/kernel/imfilter_opencl.jl
@@ -4,7 +4,6 @@
 
 @platform aware function init_imfilter_kernel({accelerator_count::(@atleast 1), accelerator_api::(@api OpenCL)})
   println("Running on OpenCL device")
-  global cl_device, cl_ctx, cl_queue = cl.create_compute_context()
 end
 
 @platform aware function array_kernel({accelerator_count::(@atleast 1), accelerator_api::(@api OpenCL)}, array) array end
@@ -20,7 +19,7 @@ function imfilter_opencl(img, krn)
   T = ComplexF64
 
   # retrieve global OpenCL info
-  ctx, queue = cl_ctx, cl_queue
+  device, ctx, queue = cl.create_compute_context()
 
   # build OpenCL program kernels
   conj_kernel = build_conj_kernel(ctx)

From df8479506cf0c50349fc9ab9dcfce48790f67813 Mon Sep 17 00:00:00 2001
From: Francisco Heron de Carvalho Junior <heron@dc.ufc.br>
Date: Thu, 3 Nov 2022 11:05:38 -0400
Subject: [PATCH 04/20] adjust in CUDA kernel

---
 src/ImageQuilting.jl           | 11 +++++++----
 src/kernel/imfilter_cuda.jl    | 18 +++++++++++++-----
 src/kernel/imfilter_default.jl |  4 ++--
 src/kernel/imfilter_opencl.jl  |  4 ++--
 src/kernels.jl                 |  1 +
 5 files changed, 25 insertions(+), 13 deletions(-)

diff --git a/src/ImageQuilting.jl b/src/ImageQuilting.jl
index 82fe122a..d6f8b01f 100644
--- a/src/ImageQuilting.jl
+++ b/src/ImageQuilting.jl
@@ -17,6 +17,7 @@ using ProgressMeter: Progress, next!
 using FFTW: set_num_threads
 using CpuId: cpucores
 using RecipesBase
+
 using CUDA
 using OpenCL
 using CLFFT
@@ -38,10 +39,12 @@ include("iqsim.jl")
 include("voxelreuse.jl")
 include("geostats.jl")
 
-include("kernels.jl")
-include("kernel/imfilter_default.jl")
-include("kernel/imfilter_cuda.jl")
-include("kernel/imfilter_opencl.jl")
+function __init__()    
+  include("src/kernels.jl")
+  include("src/kernel/imfilter_default.jl")
+  include("src/kernel/imfilter_cuda.jl")
+  include("src/kernel/imfilter_opencl.jl")
+end
 
 export
   # functions
diff --git a/src/kernel/imfilter_cuda.jl b/src/kernel/imfilter_cuda.jl
index 0ec82886..f75f42cf 100644
--- a/src/kernel/imfilter_cuda.jl
+++ b/src/kernel/imfilter_cuda.jl
@@ -2,15 +2,23 @@
 # Licensed under the MIT License. See LICENCE in the project root.
 # ------------------------------------------------------------------
 
-@platform aware function init_imfilter_kernel({accelerator_count::(@atleast 1), accelerator_api::CUDA_API})
-  println("Running on CUDA GPU")
+@platform aware function init_imfilter_kernel({accelerator_count::(@atleast 1), 
+                                               accelerator_manufacturer::NVIDIA, 
+                                               accelerator_api::(@api CUDA)})
+  println("Running on NVIDIA/CUDA GPU")
 end
 
-@platform aware function array_kernel({accelerator_count::(@atleast 1), accelerator_api::CUDA_API}, array) CuArray(array) end
+@platform aware array_kernel({accelerator_count::(@atleast 1), 
+                              accelerator_manufacturer::NVIDIA,
+                              accelerator_api::(@api CUDA)}, array) = CuArray(array)
 
-@platform aware function view_kernel({accelerator_count::(@atleast 1), accelerator_api::CUDA_API}, array, I) Array(array[I]) end
+@platform aware view_kernel({accelerator_count::(@atleast 1), 
+                             accelerator_manufacturer::NVIDIA, 
+                             accelerator_api::(@api CUDA)}, array, I)  = Array(array[I])
 
-@platform aware function imfilter_kernel({accelerator_count::(@atleast 1), accelerator_api::CUDA_API}, img, krn)
+@platform aware function imfilter_kernel({accelerator_count::(@atleast 1), 
+                                          accelerator_manufacturer::NVIDIA, 
+                                          accelerator_api::(@api CUDA)}, img, krn)
    imfilter_cuda(img,krn)
 end
 
diff --git a/src/kernel/imfilter_default.jl b/src/kernel/imfilter_default.jl
index a998904d..24792e6b 100644
--- a/src/kernel/imfilter_default.jl
+++ b/src/kernel/imfilter_default.jl
@@ -6,9 +6,9 @@
   println("Running on DEFAULT PLATFORM")
 end
 
-@platform default function array_kernel(array) array end
+@platform default array_kernel(array) = array
 
-@platform default function view_kernel(array, I) view(array, I) end
+@platform default view_kernel(array, I) = view(array, I)
 
 @platform default function imfilter_kernel(img, krn)
   imfilter_cpu(img, krn)
diff --git a/src/kernel/imfilter_opencl.jl b/src/kernel/imfilter_opencl.jl
index 460d1011..3541b958 100644
--- a/src/kernel/imfilter_opencl.jl
+++ b/src/kernel/imfilter_opencl.jl
@@ -6,9 +6,9 @@
   println("Running on OpenCL device")
 end
 
-@platform aware function array_kernel({accelerator_count::(@atleast 1), accelerator_api::(@api OpenCL)}, array) array end
+@platform aware array_kernel({accelerator_count::(@atleast 1), accelerator_api::(@api OpenCL)}, array) = array
 
-@platform aware function view_kernel({accelerator_count::(@atleast 1), accelerator_api::(@api OpenCL)}, array, I) view(array, I) end
+@platform aware view_kernel({accelerator_count::(@atleast 1), accelerator_api::(@api OpenCL)}, array, I) = view(array, I)
 
 @platform aware function imfilter_kernel({accelerator_count::(@atleast 1), accelerator_api::(@api OpenCL)}, img, krn)
   imfilter_opencl(img, krn)
diff --git a/src/kernels.jl b/src/kernels.jl
index e51bbe79..a47debe8 100644
--- a/src/kernels.jl
+++ b/src/kernels.jl
@@ -4,3 +4,4 @@
 @platform parameter clear
 @platform parameter accelerator_count
 @platform parameter accelerator_api
+@platform parameter accelerator_manufacturer

From 5d4200bdb96c6de122d19d6c55ce8999184fa8f9 Mon Sep 17 00:00:00 2001
From: Francisco Heron de Carvalho Junior <heron@dc.ufc.br>
Date: Thu, 3 Nov 2022 16:47:20 -0400
Subject: [PATCH 05/20] adjusts in CUDA kernel

---
 src/ImageQuilting.jl          | 2 ++
 src/kernel/imfilter_opencl.jl | 3 +++
 2 files changed, 5 insertions(+)

diff --git a/src/ImageQuilting.jl b/src/ImageQuilting.jl
index d6f8b01f..f26b79b1 100644
--- a/src/ImageQuilting.jl
+++ b/src/ImageQuilting.jl
@@ -46,6 +46,8 @@ function __init__()
   include("src/kernel/imfilter_opencl.jl")
 end
 
+include("test_imfilter.jl")
+
 export
   # functions
   iqsim,
diff --git a/src/kernel/imfilter_opencl.jl b/src/kernel/imfilter_opencl.jl
index 3541b958..5059a17c 100644
--- a/src/kernel/imfilter_opencl.jl
+++ b/src/kernel/imfilter_opencl.jl
@@ -61,6 +61,9 @@ function imfilter_opencl(img, krn)
   real_result = real.(result)
 
   finalsize = size(img) .- (size(krn) .- 1)
+
+  GC.gc()
+
   real_result[CartesianIndices(finalsize)]
 end
 

From 7b9f1221fc3a12ec8277785db4bfb4318ddcae8f Mon Sep 17 00:00:00 2001
From: Francisco Heron de Carvalho Junior <heron@dc.ufc.br>
Date: Thu, 3 Nov 2022 17:02:01 -0400
Subject: [PATCH 06/20] adding testing file

---
 src/test_imfilter.jl | 46 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)
 create mode 100644 src/test_imfilter.jl

diff --git a/src/test_imfilter.jl b/src/test_imfilter.jl
new file mode 100644
index 00000000..12b441b2
--- /dev/null
+++ b/src/test_imfilter.jl
@@ -0,0 +1,46 @@
+using BenchmarkTools
+
+const TOLERANCE = 1e-3
+
+function allclose(x::AbstractArray{T}, y::AbstractArray{T}; rtol=1e-5, atol=1e-8) where {T}
+    @assert length(x) == length(y)
+    @inbounds begin
+        for i in 1:length(x)
+            xx, yy = x[i], y[i]
+            if !(isapprox(xx, yy; rtol=rtol, atol=atol))
+                return false
+            end
+        end
+    end
+    return true
+end
+
+function test_imfilter(N)
+
+    @info "start"
+
+    img = rand(Float64,(N,N,N))
+    krn = rand(Float64,(10,10,10))
+
+    X = Int(round(N/2))
+
+    @info "FIRST RUN - GPU - CUDA"
+    r1 = @btime imfilter_cuda(CuArray($img), $krn)
+    @info "check = ", r1[X,X,X]
+
+    @info("SECOND RUN - GPU - OpenCL")
+    #global GPU = gpu_setup()
+    r2 = @btime imfilter_opencl($img, $krn)
+    @info "check = ", r2[X,X,X]
+
+    @info "THIRD RUN -  CPU - ImageFiltering"
+    r3 = @btime imfilter($img, centered($krn), Inner(), Algorithm.FFT())
+    @info "check = ", r3[X,X,X]
+
+    @info "r1 & r2", allclose(r1, r2; rtol=1e-2, atol=1e-3)
+    @info "r1 & r3", allclose(r1, r3; rtol=1e-2, atol=1e-3)
+    @info "r2 & r3", allclose(r2, r3; rtol=1e-2, atol=1e-3)
+
+    @info "end"
+
+end
\ No newline at end of file

From 806696f1852fd82b02deec2fa0acf69110328a48 Mon Sep 17 00:00:00 2001
From: Francisco Heron de Carvalho Junior <heron@dc.ufc.br>
Date: Thu, 3 Nov 2022 17:13:25 -0400
Subject: [PATCH 07/20] free resources

---
 src/kernel/imfilter_opencl.jl | 2 --
 src/test_imfilter.jl          | 3 +++
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/kernel/imfilter_opencl.jl b/src/kernel/imfilter_opencl.jl
index 5059a17c..088219b9 100644
--- a/src/kernel/imfilter_opencl.jl
+++ b/src/kernel/imfilter_opencl.jl
@@ -62,8 +62,6 @@ function imfilter_opencl(img, krn)
 
   finalsize = size(img) .- (size(krn) .- 1)
 
-  GC.gc()
-
   real_result[CartesianIndices(finalsize)]
 end
 
diff --git a/src/test_imfilter.jl b/src/test_imfilter.jl
index 12b441b2..691bb3f2 100644
--- a/src/test_imfilter.jl
+++ b/src/test_imfilter.jl
@@ -33,6 +33,9 @@ function test_imfilter(N)
     r2 = @btime imfilter_opencl($img, $krn)
     @info "check = ", r2[X,X,X]
 
+    CLFFT.api.clfftTeardown()
+    GC.gc()
+
     @info "THIRD RUN -  CPU - ImageFiltering"
     r3 = @btime imfilter($img, centered($krn), Inner(), Algorithm.FFT())
     @info "check = ", r3[X,X,X]

From 1a3c575e060b371ec565ab04689cceee9bb5c6ef Mon Sep 17 00:00:00 2001
From: ClaroHenrique <clarohenriques@gmail.com>
Date: Fri, 4 Nov 2022 01:09:36 +0000
Subject: [PATCH 08/20] Use OpenCL imfilter with single precision floats

---
 src/kernel/imfilter_opencl.jl | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/kernel/imfilter_opencl.jl b/src/kernel/imfilter_opencl.jl
index 088219b9..f525a888 100644
--- a/src/kernel/imfilter_opencl.jl
+++ b/src/kernel/imfilter_opencl.jl
@@ -16,7 +16,7 @@ end
 
 function imfilter_opencl(img, krn)
   # retrieve basic info
-  T = ComplexF64
+  T = ComplexF32
 
   # retrieve global OpenCL info
   device, ctx, queue = cl.create_compute_context()
@@ -67,9 +67,9 @@ end
 
 function build_mult_kernel(ctx)
   mult_kernel = "
-  __kernel void mult(__global const double2 *a,
-                      __global const double2 *b,
-                      __global double2 *c)
+  __kernel void mult(__global const float2 *a,
+                      __global const float2 *b,
+                      __global float2 *c)
   {
     int gid = get_global_id(0);
     c[gid].x = a[gid].x*b[gid].x - a[gid].y*b[gid].y;
@@ -82,7 +82,7 @@ end
 
 function build_conj_kernel(ctx)
   conj_kernel = "
-  __kernel void conj(__global double2 *a)
+  __kernel void conj(__global float2 *a)
   {
     int gid = get_global_id(0);
     a[gid].y = -a[gid].y;

From 07f421d573959be8d47f3b43100740d12f69a3d5 Mon Sep 17 00:00:00 2001
From: Francisco Heron de Carvalho Junior <heron@dc.ufc.br>
Date: Thu, 3 Nov 2022 21:43:26 -0400
Subject: [PATCH 09/20] opencl context outside kernel call

---
 src/kernel/imfilter_cuda.jl    |  2 +-
 src/kernel/imfilter_default.jl |  2 +-
 src/kernel/imfilter_opencl.jl  | 26 +++++++++++++++++---------
 src/test_imfilter.jl           |  8 +++-----
 4 files changed, 22 insertions(+), 16 deletions(-)

diff --git a/src/kernel/imfilter_cuda.jl b/src/kernel/imfilter_cuda.jl
index f75f42cf..910f6193 100644
--- a/src/kernel/imfilter_cuda.jl
+++ b/src/kernel/imfilter_cuda.jl
@@ -5,7 +5,7 @@
 @platform aware function init_imfilter_kernel({accelerator_count::(@atleast 1), 
                                                accelerator_manufacturer::NVIDIA, 
                                                accelerator_api::(@api CUDA)})
-  println("Running on NVIDIA/CUDA GPU")
+  @info "Running on NVIDIA/CUDA GPU"
 end
 
 @platform aware array_kernel({accelerator_count::(@atleast 1), 
diff --git a/src/kernel/imfilter_default.jl b/src/kernel/imfilter_default.jl
index 24792e6b..bc67755d 100644
--- a/src/kernel/imfilter_default.jl
+++ b/src/kernel/imfilter_default.jl
@@ -3,7 +3,7 @@
 # ------------------------------------------------------------------
 
 @platform default function init_imfilter_kernel()
-  println("Running on DEFAULT PLATFORM")
+  @info "Running on DEFAULT PLATFORM"
 end
 
 @platform default array_kernel(array) = array
diff --git a/src/kernel/imfilter_opencl.jl b/src/kernel/imfilter_opencl.jl
index 088219b9..9077f022 100644
--- a/src/kernel/imfilter_opencl.jl
+++ b/src/kernel/imfilter_opencl.jl
@@ -3,7 +3,16 @@
 # ------------------------------------------------------------------
 
 @platform aware function init_imfilter_kernel({accelerator_count::(@atleast 1), accelerator_api::(@api OpenCL)})
-  println("Running on OpenCL device")
+  @info "Running on OpenCL device"
+  init_opencl_context()
+end
+
+function init_opencl_context()
+  # retrieve global OpenCL info
+  device_, ctx_, queue_ = cl.create_compute_context()
+  global device = device_
+  global ctx = ctx_
+  global queue = queue_
 end
 
 @platform aware array_kernel({accelerator_count::(@atleast 1), accelerator_api::(@api OpenCL)}, array) = array
@@ -14,12 +23,10 @@ end
   imfilter_opencl(img, krn)
 end
 
+
 function imfilter_opencl(img, krn)
   # retrieve basic info
-  T = ComplexF64
-
-  # retrieve global OpenCL info
-  device, ctx, queue = cl.create_compute_context()
+  T = ComplexF32
 
   # build OpenCL program kernels
   conj_kernel = build_conj_kernel(ctx)
@@ -37,6 +44,7 @@ function imfilter_opencl(img, krn)
   fftkrn = T.(padkrn)
 
   # OpenCl setup
+
   plan = CLFFT.Plan(T, ctx, size(fftimg))
   CLFFT.set_layout!(plan, :interleaved, :interleaved)
   CLFFT.set_result!(plan, :inplace)
@@ -67,9 +75,9 @@ end
 
 function build_mult_kernel(ctx)
   mult_kernel = "
-  __kernel void mult(__global const double2 *a,
-                      __global const double2 *b,
-                      __global double2 *c)
+  __kernel void mult(__global const float2 *a,
+                      __global const float2 *b,
+                      __global float2 *c)
   {
     int gid = get_global_id(0);
     c[gid].x = a[gid].x*b[gid].x - a[gid].y*b[gid].y;
@@ -82,7 +90,7 @@ end
 
 function build_conj_kernel(ctx)
   conj_kernel = "
-  __kernel void conj(__global double2 *a)
+  __kernel void conj(__global float2 *a)
   {
     int gid = get_global_id(0);
     a[gid].y = -a[gid].y;
diff --git a/src/test_imfilter.jl b/src/test_imfilter.jl
index 691bb3f2..ea01f806 100644
--- a/src/test_imfilter.jl
+++ b/src/test_imfilter.jl
@@ -19,8 +19,8 @@ function test_imfilter(N)
 
     @info "start"
 
-    img = rand(Float64,(N,N,N))
-    krn = rand(Float64,(10,10,10))
+    img = rand(Float32,(N,N,N))
+    krn = rand(Float32,(10,10,10))
 
     X = Int(round(N/2))
 
@@ -30,12 +30,10 @@ function test_imfilter(N)
 
     @info("SECOND RUN - GPU - OpenCL")
     #global GPU = gpu_setup()
+    init_opencl_context()
     r2 = @btime imfilter_opencl($img, $krn)
     @info "check = ", r2[X,X,X]
 
-    CLFFT.api.clfftTeardown()
-    GC.gc()
-
     @info "THIRD RUN -  CPU - ImageFiltering"
     r3 = @btime imfilter($img, centered($krn), Inner(), Algorithm.FFT())
     @info "check = ", r3[X,X,X]

From f1de845fddb1f391058b91c312bfaa022ef31b3c Mon Sep 17 00:00:00 2001
From: Francisco Heron de Carvalho Junior
 <102302676+decarvalhojunior-fh@users.noreply.github.com>
Date: Thu, 3 Nov 2022 21:49:34 -0400
Subject: [PATCH 10/20] OpenCL context outsilde the kernel function

---
 src/kernel/imfilter_opencl.jl | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/kernel/imfilter_opencl.jl b/src/kernel/imfilter_opencl.jl
index f525a888..a06f7ba5 100644
--- a/src/kernel/imfilter_opencl.jl
+++ b/src/kernel/imfilter_opencl.jl
@@ -3,7 +3,16 @@
 # ------------------------------------------------------------------
 
 @platform aware function init_imfilter_kernel({accelerator_count::(@atleast 1), accelerator_api::(@api OpenCL)})
-  println("Running on OpenCL device")
+  @info "Running on OpenCL device"
+  init_opencl_context()
+end
+
+function init_opencl_context()
+  # retrieve global OpenCL info
+  device_, ctx_, queue_ = cl.create_compute_context()
+  global device = device_
+  global ctx = ctx_
+  global queue = queue_
 end
 
 @platform aware array_kernel({accelerator_count::(@atleast 1), accelerator_api::(@api OpenCL)}, array) = array
@@ -18,9 +27,6 @@ function imfilter_opencl(img, krn)
   # retrieve basic info
   T = ComplexF32
 
-  # retrieve global OpenCL info
-  device, ctx, queue = cl.create_compute_context()
-
   # build OpenCL program kernels
   conj_kernel = build_conj_kernel(ctx)
   mult_kernel = build_mult_kernel(ctx)

From 60604c4d1207c88117ad34335c3b363964d14049 Mon Sep 17 00:00:00 2001
From: Francisco Heron de Carvalho Junior <heron@dc.ufc.br>
Date: Sun, 6 Nov 2022 19:16:33 -0500
Subject: [PATCH 11/20] updated Project.toml

---
 Project.toml | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/Project.toml b/Project.toml
index 42f7bdc3..fabcb63b 100644
--- a/Project.toml
+++ b/Project.toml
@@ -10,7 +10,6 @@ CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 CpuId = "adafc99b-e345-5852-983c-f28acb93d879"
 FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
 GeoStatsBase = "323cb8eb-fbf6-51c0-afd0-f8fba70507b2"
-GeoStatsImages = "7cd16168-b42c-5e7d-a585-4f59d326662d"
 Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6"
 GraphsFlows = "06909019-6f44-4949-96fc-b9d9aaa02889"
 ImageFiltering = "6a3955dd-da59-5b1f-98d4-e7296123deb5"
@@ -33,12 +32,10 @@ Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
 CUDA = "3.11"
 CpuId = "0.2, 0.3"
 FFTW = "1.0"
-GeoStatsBase = "0.26"
 Graphs = "1.4"
 GraphsFlows = "0.1"
 ImageFiltering = "0.6, 0.7"
 ImageMorphology = "0.2, 0.3, 0.4"
-Meshes = "0.22"
 ProgressMeter = "1.1"
 RecipesBase = "1.0"
 StatsBase = "0.32, 0.33"

From 2f7f35614f26481b6d2887c45339fe5fdd27d6fe Mon Sep 17 00:00:00 2001
From: Francisco Heron de Carvalho Junior <heron@dc.ufc.br>
Date: Sun, 6 Nov 2022 19:23:58 -0500
Subject: [PATCH 12/20] experiment scripts

---
 run_experiment_app.jl    | 17 ++++++++++++++
 run_experiment_kernel.jl | 17 ++++++++++++++
 run_sample.jl            | 48 ++++++++++++++++++++++++++++++++++++++++
 src/test_imfilter.jl     |  8 +++----
 4 files changed, 86 insertions(+), 4 deletions(-)
 create mode 100644 run_experiment_app.jl
 create mode 100644 run_experiment_kernel.jl
 create mode 100644 run_sample.jl

diff --git a/run_experiment_app.jl b/run_experiment_app.jl
new file mode 100644
index 00000000..2733d0a5
--- /dev/null
+++ b/run_experiment_app.jl
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+
+
+for size in 64 128 256
+do
+   for version in 2
+   do
+      for turn in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
+      do
+         echo $size.$version.$turn.adhoc        
+         $JULIA_PATH/julia ./run_sample.jl $version $size $turn >> output.iq.adhoc.$version.$size
+         echo $size.$version.$turn.structured
+         PLATFORM_DESCRIPTION=Platform.$version.toml $JULIA_PATH/julia ./run_sample.jl -$version $size $turn >> output.iq.structured.$version.$size     
+      done
+   done
+done
diff --git a/run_experiment_kernel.jl b/run_experiment_kernel.jl
new file mode 100644
index 00000000..2733d0a5
--- /dev/null
+++ b/run_experiment_kernel.jl
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+
+
+for size in 64 128 256
+do
+   for version in 2
+   do
+      for turn in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
+      do
+         echo $size.$version.$turn.adhoc        
+         $JULIA_PATH/julia ./run_sample.jl $version $size $turn >> output.iq.adhoc.$version.$size
+         echo $size.$version.$turn.structured
+         PLATFORM_DESCRIPTION=Platform.$version.toml $JULIA_PATH/julia ./run_sample.jl -$version $size $turn >> output.iq.structured.$version.$size     
+      done
+   done
+done
diff --git a/run_sample.jl b/run_sample.jl
new file mode 100644
index 00000000..6dd95b18
--- /dev/null
+++ b/run_sample.jl
@@ -0,0 +1,48 @@
+import Pkg; Pkg.activate(".")
+using ImageQuilting
+using CUDA
+
+function main(args)
+
+        @info args
+
+        v = parse(Int64,args[1])
+        size = parse(Int64,args[2])
+        i = parse(Int64,args[3])
+
+        img = rand(Float32,(size,size,size))
+        krn = rand(Float32,(10,10,10))
+
+        if (v < 0)
+           @info "$i: structured / $v"
+           img = ImageQuilting.array_kernel(img)
+           ImageQuilting.init_imfilter_kernel()
+           @time ImageQuilting.imfilter_kernel(img,krn)
+           ImageQuilting.imfilter_kernel(img,krn)
+           ImageQuilting.imfilter_kernel(img,krn)
+           @time ImageQuilting.imfilter_kernel(img,krn)
+        elseif (v == 1)
+           @info "$i: ad-hoc / default"
+           @time ImageQuilting.imfilter_cpu(img, krn)
+           @time ImageQuilting.imfilter_cpu(img, krn)
+        elseif (v == 2)
+           @info "$i: ad-hoc / CUDA"
+           img = CuArray(img)
+           @time ImageQuilting.imfilter_cuda(img, krn)
+           ImageQuilting.imfilter_cuda(img, krn)
+           ImageQuilting.imfilter_cuda(img, krn)
+           @time ImageQuilting.imfilter_cuda(img, krn)
+        elseif (v == 3)
+           @info "$i: ad-hoc / OpenCL"
+           ImageQuilting.init_opencl_context()
+           @time ImageQuilting.imfilter_opencl(img, krn)
+           ImageQuilting.imfilter_opencl(img, krn)
+           ImageQuilting.imfilter_opencl(img, krn)
+           @time ImageQuilting.imfilter_opencl(img, krn)
+        else
+           @info "wrong selection"
+        end
+end
+
+main(ARGS)
+
diff --git a/src/test_imfilter.jl b/src/test_imfilter.jl
index ea01f806..b3253597 100644
--- a/src/test_imfilter.jl
+++ b/src/test_imfilter.jl
@@ -25,17 +25,17 @@ function test_imfilter(N)
     X = Int(round(N/2))
 
     @info "FIRST RUN - GPU - CUDA"
-    r1 = @btime imfilter_cuda(CuArray($img), $krn)
+    r1 = @time imfilter_cuda(CuArray(img), krn)
     @info "check = ", r1[X,X,X]
 
     @info("SECOND RUN - GPU - OpenCL")
     #global GPU = gpu_setup()
     init_opencl_context()
-    r2 = @btime imfilter_opencl($img, $krn)
+    r2 = @time imfilter_opencl(img, krn)
     @info "check = ", r2[X,X,X]
 
     @info "THIRD RUN -  CPU - ImageFiltering"
-    r3 = @btime imfilter($img, centered($krn), Inner(), Algorithm.FFT())
+    r3 = @time imfilter(img, centered(krn), Inner(), Algorithm.FFT())
     @info "check = ", r3[X,X,X]
 
     @info "r1 & r2", allclose(r1, r2; rtol=1e-2, atol=1e-3)
@@ -44,4 +44,4 @@ function test_imfilter(N)
 
     @info "end"
 
-end
\ No newline at end of file
+end

From 3c08a03b2d6d3e5e863248057a41efdc1fdad61e Mon Sep 17 00:00:00 2001
From: Francisco Heron de Carvalho Junior <heron@dc.ufc.br>
Date: Sun, 6 Nov 2022 19:25:28 -0500
Subject: [PATCH 13/20] experiment scripts

---
 run_sample.jl => run_sample_app.jl |  0
 run_sample_kernel.jl               | 48 ++++++++++++++++++++++++++++++
 2 files changed, 48 insertions(+)
 rename run_sample.jl => run_sample_app.jl (100%)
 create mode 100644 run_sample_kernel.jl

diff --git a/run_sample.jl b/run_sample_app.jl
similarity index 100%
rename from run_sample.jl
rename to run_sample_app.jl
diff --git a/run_sample_kernel.jl b/run_sample_kernel.jl
new file mode 100644
index 00000000..6dd95b18
--- /dev/null
+++ b/run_sample_kernel.jl
@@ -0,0 +1,48 @@
+import Pkg; Pkg.activate(".")
+using ImageQuilting
+using CUDA
+
+function main(args)
+
+        @info args
+
+        v = parse(Int64,args[1])
+        size = parse(Int64,args[2])
+        i = parse(Int64,args[3])
+
+        img = rand(Float32,(size,size,size))
+        krn = rand(Float32,(10,10,10))
+
+        if (v < 0)
+           @info "$i: structured / $v"
+           img = ImageQuilting.array_kernel(img)
+           ImageQuilting.init_imfilter_kernel()
+           @time ImageQuilting.imfilter_kernel(img,krn)
+           ImageQuilting.imfilter_kernel(img,krn)
+           ImageQuilting.imfilter_kernel(img,krn)
+           @time ImageQuilting.imfilter_kernel(img,krn)
+        elseif (v == 1)
+           @info "$i: ad-hoc / default"
+           @time ImageQuilting.imfilter_cpu(img, krn)
+           @time ImageQuilting.imfilter_cpu(img, krn)
+        elseif (v == 2)
+           @info "$i: ad-hoc / CUDA"
+           img = CuArray(img)
+           @time ImageQuilting.imfilter_cuda(img, krn)
+           ImageQuilting.imfilter_cuda(img, krn)
+           ImageQuilting.imfilter_cuda(img, krn)
+           @time ImageQuilting.imfilter_cuda(img, krn)
+        elseif (v == 3)
+           @info "$i: ad-hoc / OpenCL"
+           ImageQuilting.init_opencl_context()
+           @time ImageQuilting.imfilter_opencl(img, krn)
+           ImageQuilting.imfilter_opencl(img, krn)
+           ImageQuilting.imfilter_opencl(img, krn)
+           @time ImageQuilting.imfilter_opencl(img, krn)
+        else
+           @info "wrong selection"
+        end
+end
+
+main(ARGS)
+

From 18ffbdc8f96f26179a586caa08916ccdf3d7359e Mon Sep 17 00:00:00 2001
From: Francisco Heron de Carvalho Junior <heron@dc.ufc.br>
Date: Sun, 6 Nov 2022 20:42:51 -0500
Subject: [PATCH 14/20] experiment scripts

---
 run_experiment_app.sh                         |  9 +++
 run_experiment_kernel.jl                      | 17 -----
 ...eriment_app.jl => run_experiment_kernel.sh |  0
 run_sample_app.jl                             | 69 ++++++++-----------
 4 files changed, 38 insertions(+), 57 deletions(-)
 create mode 100644 run_experiment_app.sh
 delete mode 100644 run_experiment_kernel.jl
 rename run_experiment_app.jl => run_experiment_kernel.sh (100%)

diff --git a/run_experiment_app.sh b/run_experiment_app.sh
new file mode 100644
index 00000000..8f178251
--- /dev/null
+++ b/run_experiment_app.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+for turn in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
+do
+   echo $turn.adhoc
+   PATH_IQ="/home/tcarneiropessoa/heron/ImageQuilting-adhoc" $JULIA_PATH/julia./run_sample.jl $version $size $turn >> output.iq.app.adhoc
+   echo $turn.structured
+   PATH_IQ="/home/tcarneiropessoa/heron/ImageQuilting-structured" PLATFORM_DESCRIPTION=Platform.$version.toml $JULIA_PATH/julia ./run_sample.jl $turn >> output.iq.app.structured
+done
diff --git a/run_experiment_kernel.jl b/run_experiment_kernel.jl
deleted file mode 100644
index 2733d0a5..00000000
--- a/run_experiment_kernel.jl
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/bin/bash
-
-
-
-for size in 64 128 256
-do
-   for version in 2
-   do
-      for turn in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
-      do
-         echo $size.$version.$turn.adhoc        
-         $JULIA_PATH/julia ./run_sample.jl $version $size $turn >> output.iq.adhoc.$version.$size
-         echo $size.$version.$turn.structured
-         PLATFORM_DESCRIPTION=Platform.$version.toml $JULIA_PATH/julia ./run_sample.jl -$version $size $turn >> output.iq.structured.$version.$size     
-      done
-   done
-done
diff --git a/run_experiment_app.jl b/run_experiment_kernel.sh
similarity index 100%
rename from run_experiment_app.jl
rename to run_experiment_kernel.sh
diff --git a/run_sample_app.jl b/run_sample_app.jl
index 6dd95b18..644f7cee 100644
--- a/run_sample_app.jl
+++ b/run_sample_app.jl
@@ -1,47 +1,36 @@
-import Pkg; Pkg.activate(".")
+path_iq = get(ENV,"PATH_IQ",".")
+import Pkg; Pkg.activate(path_iq)
+using GeoStats
+using GeoStatsImages
 using ImageQuilting
-using CUDA
 
 function main(args)
 
-        @info args
-
-        v = parse(Int64,args[1])
-        size = parse(Int64,args[2])
-        i = parse(Int64,args[3])
-
-        img = rand(Float32,(size,size,size))
-        krn = rand(Float32,(10,10,10))
-
-        if (v < 0)
-           @info "$i: structured / $v"
-           img = ImageQuilting.array_kernel(img)
-           ImageQuilting.init_imfilter_kernel()
-           @time ImageQuilting.imfilter_kernel(img,krn)
-           ImageQuilting.imfilter_kernel(img,krn)
-           ImageQuilting.imfilter_kernel(img,krn)
-           @time ImageQuilting.imfilter_kernel(img,krn)
-        elseif (v == 1)
-           @info "$i: ad-hoc / default"
-           @time ImageQuilting.imfilter_cpu(img, krn)
-           @time ImageQuilting.imfilter_cpu(img, krn)
-        elseif (v == 2)
-           @info "$i: ad-hoc / CUDA"
-           img = CuArray(img)
-           @time ImageQuilting.imfilter_cuda(img, krn)
-           ImageQuilting.imfilter_cuda(img, krn)
-           ImageQuilting.imfilter_cuda(img, krn)
-           @time ImageQuilting.imfilter_cuda(img, krn)
-        elseif (v == 3)
-           @info "$i: ad-hoc / OpenCL"
-           ImageQuilting.init_opencl_context()
-           @time ImageQuilting.imfilter_opencl(img, krn)
-           ImageQuilting.imfilter_opencl(img, krn)
-           ImageQuilting.imfilter_opencl(img, krn)
-           @time ImageQuilting.imfilter_opencl(img, krn)
-        else
-           @info "wrong selection"
-        end
+      @info args
+
+      i = parse(Int64,args[1])
+
+      @info "large $i"
+
+      # large
+      TIₗ = geostatsimage("Fluvsim")
+      iqsim(asarray(TIₗ, :facies), (30, 30, 30))
+      @time iqsim(asarray(TIₗ, :facies), (30, 30, 30))
+
+      @info "medium $i"
+
+      # medium
+      TIₘ = geostatsimage("StanfordV")
+      iqsim(asarray(TIₘ, :K), (30, 30, 30))
+      @time iqsim(asarray(TIₘ, :K), (30, 30, 30))
+
+      @info "small $i"
+
+      # small
+      TIₛ = geostatsimage("WalkerLake")
+      iqsim(asarray(TIₛ, :Z), (30, 30))
+      @time iqsim(asarray(TIₛ, :Z), (30, 30))
+
 end
 
 main(ARGS)

From 1dc2d288f0f97329c2faaa27a592ae04cb36d596 Mon Sep 17 00:00:00 2001
From: Francisco Heron de Carvalho Junior <heron@dc.ufc.br>
Date: Sun, 6 Nov 2022 21:14:54 -0500
Subject: [PATCH 15/20] experiment scripts

---
 run_experiment_app.sh | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/run_experiment_app.sh b/run_experiment_app.sh
index 8f178251..3ea261be 100644
--- a/run_experiment_app.sh
+++ b/run_experiment_app.sh
@@ -3,7 +3,8 @@
 for turn in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
 do
    echo $turn.adhoc
-   PATH_IQ="/home/tcarneiropessoa/heron/ImageQuilting-adhoc" $JULIA_PATH/julia./run_sample.jl $version $size $turn >> output.iq.app.adhoc
+   PATH_IQ="/home/tcarneiropessoa/heron/ImageQuilting-adhoc"      $JULIA_PATH/julia ./run_sample.jl $turn >> output.iq.app.adhoc
    echo $turn.structured
-   PATH_IQ="/home/tcarneiropessoa/heron/ImageQuilting-structured" PLATFORM_DESCRIPTION=Platform.$version.toml $JULIA_PATH/julia ./run_sample.jl $turn >> output.iq.app.structured
+#   PATH_IQ="/home/tcarneiropessoa/heron/ImageQuilting-structured" PLATFORM_DESCRIPTION=Platform.$version.toml $JULIA_PATH/julia ./run_sample.jl $turn >> output.iq.app.structured
+   PATH_IQ="/home/tcarneiropessoa/heron/ImageQuilting-structured" $JULIA_PATH/julia ./run_sample.jl $turn >> output.iq.app.structured
 done

From 470cc03c794345dc41c05c94da9a0ac6bf92035e Mon Sep 17 00:00:00 2001
From: Francisco Heron de Carvalho Junior <heron@dc.ufc.br>
Date: Sun, 6 Nov 2022 23:15:23 -0500
Subject: [PATCH 16/20] experiment scripts

---
 run_experiment_app.sh    |  7 ++++---
 run_experiment_kernel.sh |  0
 run_sample_app.jl        | 20 ++++++++++----------
 3 files changed, 14 insertions(+), 13 deletions(-)
 mode change 100644 => 100755 run_experiment_app.sh
 mode change 100644 => 100755 run_experiment_kernel.sh

diff --git a/run_experiment_app.sh b/run_experiment_app.sh
old mode 100644
new mode 100755
index 3ea261be..cc5ab35f
--- a/run_experiment_app.sh
+++ b/run_experiment_app.sh
@@ -3,8 +3,9 @@
 for turn in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
 do
    echo $turn.adhoc
-   PATH_IQ="/home/tcarneiropessoa/heron/ImageQuilting-adhoc"      $JULIA_PATH/julia ./run_sample.jl $turn >> output.iq.app.adhoc
+   PATH_IQ="/home/tcarneiropessoa/heron/ImageQuilting-adhoc"      $JULIA_PATH/julia ./run_sample_app.jl $turn >> output.iq.app.adhoc
+#   PATH_IQ="/home/heron/Dropbox/Copy/ufc_mdcc_hpc/JuliaEarth-work/PlatformAwareIQ-adhoc"      $JULIA_PATH/julia ./run_sample_app.jl $turn >> output.iq.app.adhoc
    echo $turn.structured
-#   PATH_IQ="/home/tcarneiropessoa/heron/ImageQuilting-structured" PLATFORM_DESCRIPTION=Platform.$version.toml $JULIA_PATH/julia ./run_sample.jl $turn >> output.iq.app.structured
-   PATH_IQ="/home/tcarneiropessoa/heron/ImageQuilting-structured" $JULIA_PATH/julia ./run_sample.jl $turn >> output.iq.app.structured
+   PATH_IQ="/home/tcarneiropessoa/heron/ImageQuilting-structured" $JULIA_PATH/julia ./run_sample_app.jl $turn >> output.iq.app.structured
+#   PATH_IQ="/home/heron/Dropbox/Copy/ufc_mdcc_hpc/JuliaEarth-work/PlatformAwareIQ" $JULIA_PATH/julia ./run_sample_app.jl $turn >> output.iq.app.structured
 done
diff --git a/run_experiment_kernel.sh b/run_experiment_kernel.sh
old mode 100644
new mode 100755
diff --git a/run_sample_app.jl b/run_sample_app.jl
index 644f7cee..52ba33a4 100644
--- a/run_sample_app.jl
+++ b/run_sample_app.jl
@@ -10,12 +10,12 @@ function main(args)
 
       i = parse(Int64,args[1])
 
-      @info "large $i"
+      @info "small $i"
 
-      # large
-      TIₗ = geostatsimage("Fluvsim")
-      iqsim(asarray(TIₗ, :facies), (30, 30, 30))
-      @time iqsim(asarray(TIₗ, :facies), (30, 30, 30))
+      # small
+      TIₛ = geostatsimage("WalkerLake")
+      iqsim(asarray(TIₛ, :Z), (30, 30))
+      @time iqsim(asarray(TIₛ, :Z), (30, 30))
 
       @info "medium $i"
 
@@ -24,12 +24,12 @@ function main(args)
       iqsim(asarray(TIₘ, :K), (30, 30, 30))
       @time iqsim(asarray(TIₘ, :K), (30, 30, 30))
 
-      @info "small $i"
+      @info "large $i"
 
-      # small
-      TIₛ = geostatsimage("WalkerLake")
-      iqsim(asarray(TIₛ, :Z), (30, 30))
-      @time iqsim(asarray(TIₛ, :Z), (30, 30))
+      # large
+      TIₗ = geostatsimage("Fluvsim")
+      iqsim(asarray(TIₗ, :facies), (30, 30, 30))
+      @time iqsim(asarray(TIₗ, :facies), (30, 30, 30))
 
 end
 

From 41edfb5a7f8d717392b01f71976ede317ab3887f Mon Sep 17 00:00:00 2001
From: Francisco Heron de Carvalho Junior <heron@dc.ufc.br>
Date: Sun, 6 Nov 2022 23:18:58 -0500
Subject: [PATCH 17/20] experiment scripts

---
 src/kernel/imfilter_opencl.jl | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/kernel/imfilter_opencl.jl b/src/kernel/imfilter_opencl.jl
index 9077f022..3e6c644c 100644
--- a/src/kernel/imfilter_opencl.jl
+++ b/src/kernel/imfilter_opencl.jl
@@ -44,7 +44,6 @@ function imfilter_opencl(img, krn)
   fftkrn = T.(padkrn)
 
   # OpenCl setup
-
   plan = CLFFT.Plan(T, ctx, size(fftimg))
   CLFFT.set_layout!(plan, :interleaved, :interleaved)
   CLFFT.set_result!(plan, :inplace)

From 0be2df4d247a27371dd93b44991afa3cc82336ef Mon Sep 17 00:00:00 2001
From: Francisco Heron de Carvalho Junior <heron@dc.ufc.br>
Date: Mon, 7 Nov 2022 11:18:46 -0500
Subject: [PATCH 18/20] adhoc version

---
 run_experiment_app.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/run_experiment_app.sh b/run_experiment_app.sh
index cc5ab35f..b359b77b 100755
--- a/run_experiment_app.sh
+++ b/run_experiment_app.sh
@@ -3,9 +3,9 @@
 for turn in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
 do
    echo $turn.adhoc
-   PATH_IQ="/home/tcarneiropessoa/heron/ImageQuilting-adhoc"      $JULIA_PATH/julia ./run_sample_app.jl $turn >> output.iq.app.adhoc
-#   PATH_IQ="/home/heron/Dropbox/Copy/ufc_mdcc_hpc/JuliaEarth-work/PlatformAwareIQ-adhoc"      $JULIA_PATH/julia ./run_sample_app.jl $turn >> output.iq.app.adhoc
+   git switch iq-sim-refact
+   PATH_IQ="." $JULIA_PATH/julia ./run_sample_app.jl $turn >> output.iq.app.adhoc
    echo $turn.structured
-   PATH_IQ="/home/tcarneiropessoa/heron/ImageQuilting-structured" $JULIA_PATH/julia ./run_sample_app.jl $turn >> output.iq.app.structured
-#   PATH_IQ="/home/heron/Dropbox/Copy/ufc_mdcc_hpc/JuliaEarth-work/PlatformAwareIQ" $JULIA_PATH/julia ./run_sample_app.jl $turn >> output.iq.app.structured
+   git switch iq-sim-refact-adhoc
+   PATH_IQ="." $JULIA_PATH/julia ./run_sample_app.jl $turn >> output.iq.app.structured
 done

From 9fb35664f54e89fea47602d64eedd698d7b2a2b5 Mon Sep 17 00:00:00 2001
From: Francisco Heron de Carvalho Junior <heron@dc.ufc.br>
Date: Mon, 7 Nov 2022 11:25:29 -0500
Subject: [PATCH 19/20] ...

---
 run_experiment_app.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/run_experiment_app.sh b/run_experiment_app.sh
index b359b77b..65544e55 100755
--- a/run_experiment_app.sh
+++ b/run_experiment_app.sh
@@ -3,9 +3,9 @@
 for turn in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
 do
    echo $turn.adhoc
-   git switch iq-sim-refact
+   git switch iq-sim-refact-adhoc
    PATH_IQ="." $JULIA_PATH/julia ./run_sample_app.jl $turn >> output.iq.app.adhoc
    echo $turn.structured
-   git switch iq-sim-refact-adhoc
+   git switch iq-sim-refact
    PATH_IQ="." $JULIA_PATH/julia ./run_sample_app.jl $turn >> output.iq.app.structured
 done

From 428d47c6a8a98a851d04ab7c451620a287e8f285 Mon Sep 17 00:00:00 2001
From: Francisco Heron de Carvalho Junior <heron@dc.ufc.br>
Date: Mon, 7 Nov 2022 15:46:22 -0500
Subject: [PATCH 20/20] CUDA kernel adjust

---
 src/kernel/imfilter_cuda.jl | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/src/kernel/imfilter_cuda.jl b/src/kernel/imfilter_cuda.jl
index 910f6193..40514f33 100644
--- a/src/kernel/imfilter_cuda.jl
+++ b/src/kernel/imfilter_cuda.jl
@@ -10,30 +10,35 @@ end
 
 @platform aware array_kernel({accelerator_count::(@atleast 1), 
                               accelerator_manufacturer::NVIDIA,
-                              accelerator_api::(@api CUDA)}, array) = CuArray(array)
+                              accelerator_api::(@api CUDA)}, array) =CuArray{Float32}(array)
 
 @platform aware view_kernel({accelerator_count::(@atleast 1), 
                              accelerator_manufacturer::NVIDIA, 
                              accelerator_api::(@api CUDA)}, array, I)  = Array(array[I])
 
+counter = Ref{Int}(0)
+
+
 @platform aware function imfilter_kernel({accelerator_count::(@atleast 1), 
                                           accelerator_manufacturer::NVIDIA, 
                                           accelerator_api::(@api CUDA)}, img, krn)
+
+   counter[] = counter[] + 1
    imfilter_cuda(img,krn)
 end
 
 function imfilter_cuda(img, krn)
- 
+
    # pad kernel to common size with image
-   padkrn = CUDA.zeros(size(img))
-   copyto!(padkrn, CartesianIndices(krn), CuArray(krn), CartesianIndices(krn))
+   padkrn = CUDA.zeros(Float32, size(img))
+   copyto!(padkrn, CartesianIndices(krn), CuArray{Float32}(krn), CartesianIndices(krn))
  
    # perform ifft(fft(img) .* conj.(fft(krn)))
    fftimg = img |> CUFFT.fft
-   fftkrn = padkrn |> CuArray |> CUFFT.fft
+   fftkrn = padkrn |> CUFFT.fft
    result = (fftimg .* conj.(fftkrn)) |> CUFFT.ifft
  
    # recover result
    finalsize = size(img) .- (size(krn) .- 1)
    real.(result[CartesianIndices(finalsize)]) |> Array
-end
+ end