diff --git a/src/CovarianceFunctions.jl b/src/CovarianceFunctions.jl
index b63ccc1..f2029eb 100644
--- a/src/CovarianceFunctions.jl
+++ b/src/CovarianceFunctions.jl
@@ -60,7 +60,7 @@ include("taylor.jl")
 include("gradient.jl")
 include("gradient_algebra.jl")
 include("hessian.jl")
-# include("taylor_gradient.jl") # fast MVM algorithm for isotropic GradientKernel Gramians
+include("taylor_gradient.jl") # fast MVM algorithm for isotropic GradientKernel Gramians
 include("separable.jl")
 
 end # CovarianceFunctions
diff --git a/src/barneshut.jl b/src/barneshut.jl
index 0e66749..e46e0d3 100644
--- a/src/barneshut.jl
+++ b/src/barneshut.jl
@@ -30,7 +30,7 @@ function BarnesHutFactorization(k, x, y = x, D = nothing; θ::Real = 1/4, leafsize
     # w = zeros(length(m))
     # i = zeros(Bool, m)
     # WT, BT = typeof(w), typeof(i)
-    T = gramian_eltype(k, xs, ys)
+    T = gramian_eltype(k, xs[1], ys[1])
     BarnesHutFactorization{T, KT, XT, YT, TT, DT, RT}(k, xs, ys, Tree, D, θ) #, w, i)
 end
 function BarnesHutFactorization(G::Gramian, θ::Real = 1/2; leafsize::Int = BARNES_HUT_DEFAULT_LEAFSIZE)
@@ -49,7 +49,7 @@ function LinearAlgebra.mul!(y::AbstractVector, F::BarnesHutFactorization, x::Abs
         taylor!(y, F, x, α, β)
     end
 end
-function Base.:*(F::BarnesHutFactorization, x::AbstractVector)
+function Base.:*(F::BarnesHutFactorization{<:Number}, x::AbstractVector{<:Number})
     T = promote_type(eltype(F), eltype(x))
     y = zeros(T, size(F, 1))
     mul!(y, F, x)
@@ -148,45 +148,45 @@ end
 
 ############################# centers of mass ##################################
 # this is a weighted sum, could be generalized to incorporate node_sums
-function compute_centers_of_mass(x::AbstractVector, w::AbstractVector, T::BallTree)
+function compute_centers_of_mass(w::AbstractVector, x::AbstractVector, T::BallTree)
     D = eltype(x) <: StaticVector ? length(eltype(x)) : length(x[1]) # if x is static vector
     com = [zero(MVector{D, Float64}) for _ in 1:length(T.hyper_spheres)]
-    compute_centers_of_mass!(com, x, w, T)
+    compute_centers_of_mass!(com, w, x, T)
 end
 
 function compute_centers_of_mass(F::BarnesHutFactorization, w::AbstractVector)
-    compute_centers_of_mass(F.y, w, F.Tree)
+    compute_centers_of_mass(w, F.y, F.Tree)
 end
 
-function compute_centers_of_mass!(com::AbstractVector, x::AbstractVector, w::AbstractVector, T::BallTree)
+function compute_centers_of_mass!(com::AbstractVector, w::AbstractVector, x::AbstractVector, T::BallTree)
     abs_w = abs.(w)
-    weighted_node_sums!(com, x, abs_w, T)
+    weighted_node_sums!(com, abs_w, x, T)
     sum_w = node_sums(abs_w, T)
     ε = eps(eltype(w)) # ensuring division by zero is not a problem
     @. com ./= sum_w + ε
 end
 
-node_sums(x::AbstractVector, T::BallTree) = weighted_node_sums(x, Ones(length(x)), T)
+node_sums(x::AbstractVector, T::BallTree) = weighted_node_sums(Ones(length(x)), x, T)
 
 function node_sums!(sums, x::AbstractVector, T::BallTree)
-    weighted_node_sums!(sums, x, Ones(length(x)), T)
+    weighted_node_sums!(sums, Ones(length(x)), x, T)
 end
 
-function weighted_node_sums(x::AbstractVector, w::AbstractVector, T::BallTree, index::Int = 1)
+function weighted_node_sums(w::AbstractVector, x::AbstractVector, T::BallTree, index::Int = 1)
     length(x) == 0 && return zero(eltype(x))
-    sums = zeros(typeof(w[1]'x[1]), length(T.hyper_spheres))
-    weighted_node_sums!(sums, x, w, T)
+    sums = fill(zero(w[1]'x[1]), length(T.hyper_spheres))
+    weighted_node_sums!(sums, w, x, T)
 end
 
 # NOTE: x should either be vector of numbers or vector of static arrays
-function weighted_node_sums!(sums::AbstractVector, x::AbstractVector,
-                             w::AbstractVector{<:Number}, T::BallTree, index::Int = 1)
+function weighted_node_sums!(sums::AbstractVector, w::AbstractVector,
+                             x::AbstractVector, T::BallTree, index::Int = 1)
     if isleaf(T.tree_data.n_internal_nodes, index)
         i = get_leaf_range(T.tree_data, index)
         wi, xi = @views w[T.indices[i]], x[T.indices[i]]
         sums[index] = wi'xi
     else
-        task = @spawn weighted_node_sums!(sums, x, w, T, getleft(index))
-        weighted_node_sums!(sums, x, w, T, getright(index))
+        task = @spawn weighted_node_sums!(sums, w, x, T, getleft(index))
+        weighted_node_sums!(sums, w, x, T, getright(index))
        wait(task)
         sums[index] = sums[getleft(index)] + sums[getright(index)]
     end
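A note on the `weighted_node_sums` changes above: the weights now lead the argument list, and the result buffer is built with `fill(zero(w[1]'x[1]), ...)` instead of `zeros(typeof(w[1]'x[1]), ...)`, so the zero element is derived from an instance rather than its type. This lets a node sum be array valued (e.g. a sum of outer products), which the new `src/taylor_gradient.jl` below relies on by passing `adjoint`ed vectors. A standalone toy sketch of that leaf-sum semantics (`StaticArrays` only, not the package's API; the variables are illustrative):

    using StaticArrays, LinearAlgebra

    w = [SVector(1.0, 2.0), SVector(3.0, 4.0)]   # vector-valued weights w_j
    y = [SVector(5.0, 6.0), SVector(7.0, 8.0)]   # source points y_j
    wr, yr = adjoint.(w), adjoint.(y)            # row-vector views, as passed by taylor!
    S = wr'yr                                    # the leaf sum wi'xi: Σ_j w_j * y_jᵀ, here a 2×2 static matrix
    @assert S == w[1] * y[1]' + w[2] * y[2]'     # a matrix-valued node sum, not a Number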
diff --git a/src/taylor_gradient.jl b/src/taylor_gradient.jl
new file mode 100644
index 0000000..9a9b42e
--- /dev/null
+++ b/src/taylor_gradient.jl
@@ -0,0 +1,130 @@
+############################### matrix valued version ##########################
+# function BarnesHutFactorization(k::GradientKernel, x, y = x, D = nothing;
+#                                 θ::Real = 1/4, leafsize::Int = BARNES_HUT_DEFAULT_LEAFSIZE)
+#     xs = vector_of_static_vectors(x)
+#     ys = x === y ? xs : vector_of_static_vectors(y)
+#     Tree = BallTree(ys, leafsize = leafsize)
+#     m = length(y)
+#     XT, YT, KT, TT, DT, RT = typeof.((xs, ys, k, Tree, D, θ))
+#     # w = zeros(length(m))
+#     # i = zeros(Bool, m)
+#     # WT, BT = typeof(w), typeof(i)
+#     T = gramian_eltype(k, xs, ys)
+#     F = BarnesHutFactorization{T, KT, XT, YT, TT, DT, RT}(k, xs, ys, Tree, D, θ) #, w, i)
+# end
+
+function LinearAlgebra.mul!(y::AbstractVector{<:Real}, F::BarnesHutFactorization{<:Any, <:GradientKernel},
+                            x::AbstractVector{<:Real}, α::Real = 1, β::Real = 0)
+    d = length(F.x[1])
+    X, Y = reshape(x, d, :), reshape(y, d, :) # converting vector of reals to vector of vectors
+    xx, yy = [c for c in eachcol(X)], [c for c in eachcol(Y)]
+    mul!(yy, F, xx, α, β)
+    return y
+end
+
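+# Example of the flat-vector convention used above: for d = 2 and n = 3 points,
+# a length-6 vector x is interpreted as reshape(x, 2, 3), i.e. the blocks
+# x[1:2], x[3:4], x[5:6], so each d×d kernel block acts on the block of x
+# belonging to a single input point, and the eachcol views write back into y.
+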
+function LinearAlgebra.mul!(y::AbstractVector{<:AbstractVector}, F::BarnesHutFactorization{<:Any, <:GradientKernel},
+                            x::AbstractVector{<:AbstractVector}, α::Real = 1, β::Real = 0)
+    taylor!(y, F, x, α, β)
+end
+
+# function Base.:*(F::BarnesHutFactorization{<:Number}, x::AbstractVector{<:Number})
+#     T = promote_type(eltype(F), eltype(x))
+#     y = zeros(T, size(F, 1))
+#     mul!(y, F, x)
+# end
+
+function taylor!(b::AbstractVector{<:AbstractVector}, F::BarnesHutFactorization{<:Any, <:GradientKernel},
+                 w::AbstractVector{<:AbstractVector}, α::Number = 1, β::Number = 0, θ::Real = F.θ)
+    size(F, 2) == length(w) || throw(DimensionMismatch("length of w does not match second dimension of F: $(length(w)) ≠ $(size(F, 2))"))
+    # eltype(b) == promote_type(eltype(F), eltype(w)) ||
+    #     throw(TypeError("eltype of target vector b not equal to eltype of matrix-vector product: $(eltype(b)) and $(promote_type(eltype(F), eltype(w)))"))
+    f_orders(r²) = derivatives(F.k.k, r², 3)
+    sums_w = node_sums(w, F.Tree) # IDEA: could pre-allocate; worth it? this is several orders of magnitude less expensive than the multiply
+    sums_w_r = weighted_node_sums(adjoint.(w), adjoint.(F.y), F.Tree) # need sum of outer products of F.y and w
+    centers = get_hyper_centers(F)
+    @. sums_w_r -= sums_w * adjoint(centers) # need to center the moments
+    Gijs = [F[1, 1] for _ in 1:Base.Threads.nthreads()]
+    for i in eachindex(F.x) # exactly 4 * length(y) allocations?
+        if β == 0
+            @. b[i] = 0 # this avoids trouble if b is initialized with NaNs, e.g. through "similar"
+        else
+            @. b[i] *= β
+        end
+        Gij = Gijs[Base.Threads.threadid()]
+        taylor_recursion!(b[i], Gij, 1, F.k, f_orders, F.x[i], F.y,
+                          w, sums_w, sums_w_r, θ, F.Tree, centers, α) # x[i] creates an allocation
+    end
+    if !isnothing(F.D) # if there is a diagonal correction, need to add it
+        mul!(b, F.D, w, α, 1)
+    end
+    return b
+end
+
+# Barnes-Hut recursion for matrix-valued kernels, could merge with scalar version
+# bi is the target vector corresponding to input point xi
+# Gij is temporary storage for the evaluation of k(xi, y[j]), important if it is
+# matrix valued, to avoid allocations
+function taylor_recursion!(bi::AbstractVector, Gij,
+                           index, k::GradientKernel, f_orders,
+                           xi, y::AbstractVector,
+                           w::AbstractVector{<:AbstractVector},
+                           sums_w::AbstractVector{<:AbstractVector},
+                           sums_w_r::AbstractVector{<:AbstractMatrix},
+                           θ::Real, T::BallTree, centers, α::Number)
+    h = T.hyper_spheres[index]
+    if isleaf(T.tree_data.n_internal_nodes, index) # do direct computation
+        for i in get_leaf_range(T.tree_data, index)
+            j = T.indices[i]
+            # @time Gij = evaluate_block!(Gij, k, xi, y[j]) # k(xi, y[j])
+            Gij = IsotropicGradientKernelElement{eltype(bi)}(k.k, xi, y[j])
+            wj = w[j]
+            mul!(bi, Gij, wj, α, 1)
+        end
+        return bi
+
+    elseif h.r < θ * euclidean(xi, centers[index]) # compress
+        S_index = sums_w_r[index]
+        ri = difference(xi, centers[index])
+        sum_abs2_ri = sum(abs2, ri)
+        # NOTE: this line is the only one that still allocates ~688 bytes
+        f0, f1, f2, f3 = f_orders(sum_abs2_ri) # contains derivative evaluations up to third order
+
+        # Gij = evaluate_block!(Gij, k, xi, centers[index]) # k(xi, centers_of_mass[index])
+        Gij = IsotropicGradientKernelElement{eltype(bi)}(k.k, xi, centers[index])
+
+        # zeroth order
+        mul!(bi, Gij, sums_w[index], α, 1) # bi .+= α * k(xi, centers_of_mass[index]) * sums_w[index]
+        # first order
+        # this block has zero allocations
+        mul!(bi, 2*f3*dot(ri, S_index, ri) + f2 * tr(S_index), ri, 4α, 1)
+        mul!(bi, S_index, ri, 4α*f2, 1)
+        mul!(bi, S_index', ri, 4α*f2, 1)
+        return bi
+    else # recurse; NOTE: parallelizing here is not as efficient as parallelizing over target points
+        taylor_recursion!(bi, Gij, getleft(index), k, f_orders, xi, y, w, sums_w, sums_w_r, θ, T, centers, α)
+        taylor_recursion!(bi, Gij, getright(index), k, f_orders, xi, y, w, sums_w, sums_w_r, θ, T, centers, α)
+    end
+end
+
+# function node_mapreduce(x::AbstractVector, w::AbstractVector, T::BallTree, index::Int = 1)
+#     length(x) == 0 && return zero(eltype(x))
+#     sums = fill(w[1]*x[1]', length(T.hyper_spheres))
+#     node_outer_products!(sums, x, w, T)
+# end
+#
+# # NOTE: x should either be vector of numbers or vector of static arrays
+# function node_mapreduce!(sums::AbstractVector{<:AbstractMatrix}, x::AbstractVector,
+#                          w::AbstractVector{<:Number}, T::BallTree, index::Int = 1)
+#     if isleaf(T.tree_data.n_internal_nodes, index)
+#         i = get_leaf_range(T.tree_data, index)
+#         wi, xi = @views w[T.indices[i]], x[T.indices[i]]
+#         sums[index] = wi * xi'
+#         # adjoint.(w)' * adjoint.(x)
+#     else
+#         task = @spawn weighted_node_sums!(sums, x, w, T, getleft(index))
+#         weighted_node_sums!(sums, x, w, T, getright(index))
+#         wait(task)
+#         sums[index] = sums[getleft(index)] + sums[getright(index)]
+#     end
+#     return sums
+# end
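For reference, a sketch of the far-field rule the compress branch of `taylor_recursion!` evaluates (the notation G, c, S is mine, not the patch's): with G(x, y) the d×d gradient-kernel block, c a node's center, and S = Σ_j w_j (y_j − c)ᵀ the centered moment assembled in `taylor!`, the node's contribution is replaced by a first-order Taylor expansion in the source argument,

    \sum_{j \in \text{node}} G(x_i, y_j)\, w_j
        \approx G(x_i, c) \sum_j w_j
        + \sum_j \Big[ \partial_y G(x_i, y) \big|_{y = c}\, (y_j - c) \Big] w_j .

The zeroth-order term is the `mul!` with `sums_w[index]`; the first-order term is bilinear in (y_j − c, w_j) and therefore depends on the node only through S, i.e. `sums_w_r[index]`. The `f2`/`f3` lines evaluate that contraction in closed form for isotropic k(x, y) = f(‖x − y‖²), without materializing the third-order derivative tensor.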
diff --git a/test/barneshut.jl b/test/barneshut.jl
index e982276..8dba3fe 100644
--- a/test/barneshut.jl
+++ b/test/barneshut.jl
@@ -1,12 +1,14 @@
 module TestBarnesHut
 using LinearAlgebra
+using WoodburyFactorizations
 using CovarianceFunctions
 using CovarianceFunctions: BarnesHutFactorization, barneshut!, vector_of_static_vectors,
-                node_sums, euclidean, GradientKernel, taylor!
+                node_sums, euclidean, GradientKernel, taylor!, IsotropicGradientKernelElement
 using NearestNeighbors
 using NearestNeighbors: isleaf, getleft, getright, get_leaf_range
 using Test
 
+# like Barnes-Hut, but sets the far-field contribution to 0
 function barneshut_no_far_field!(b::AbstractVector, F::BarnesHutFactorization, w::AbstractVector,
                                  α::Number = 1, β::Number = 0, θ::Real = F.θ)
     D = length(eltype(F.x))
@@ -95,10 +97,9 @@ verbose = false
         nexp = 16
         err = zeros(nexp)
         err_no_split = zeros(nexp)
-        err_hyp = zeros(nexp)
-        err_hyp_no_split = zeros(nexp)
         err_nff = zeros(nexp)
         err_taylor = zeros(nexp)
+        err_taylor_hyp = zeros(nexp)
         theta_array = range(1e-1, 1, length = nexp)
         for (i, θ) in enumerate(theta_array)
             barneshut!(b_bh, F, w, 1, 0, θ, split = true)
@@ -110,8 +111,11 @@ verbose = false
             barneshut_no_far_field!(b_bh, F, w, 1, 0, θ) # compare against pseudo barnes hut where far field = 0
             err_nff[i] = norm(b - b_bh)
 
-            taylor!(b_bh, F, w, 1, 0, θ) # compare against pseudo barnes hut where far field = 0
+            taylor!(b_bh, F, w, 1, 0, θ, use_com = true)
             err_taylor[i] = norm(b - b_bh)
+
+            taylor!(b_bh, F, w, 1, 0, θ, use_com = false)
+            err_taylor_hyp[i] = norm(b - b_bh)
         end
 
         rel_err = err / norm(b)
@@ -122,29 +126,59 @@ verbose = false
         # using Plots
         # plotly()
         #
-        # rel_err_nff = err_nff / norm(b)
-        # rel_err_no_split = err_no_split / norm(b)
-        # rel_err_taylor = err_taylor / norm(b)
-        #
+        # norm_b = norm(b)
+        # rel_err_nff = err_nff / norm_b
+        # rel_err_no_split = err_no_split / norm_b
+        # rel_err_taylor = err_taylor / norm_b
+        # rel_err_taylor_hyp = err_taylor_hyp / norm_b
+        #
        # # plot(theta_array, rel_err, yscale = :log10, label = "barneshut", ylabel = "relative error", xlabel = "θ")
         # plot!(theta_array, rel_err_no_split, yscale = :log10, label = "no split")
         # plot!(theta_array, rel_err_nff, yscale = :log10, label = "sparse")
         # plot!(theta_array, rel_err_taylor, yscale = :log10, label = "taylor")
+        # plot!(theta_array, rel_err_taylor_hyp, yscale = :log10, label = "taylor hyper-sphere centers")
         # gui()
     end # testset weight vectors
-
-    # @testset "gradient kernels" begin
-    #     k = CovarianceFunctions.Cauchy()
-    #     g = CovarianceFunctions.GradientKernel(k)
-    #
-    #     F = BarnesHutFactorization(g, x)
-    #     @test F isa BarnesHutFactorization
-    #     @test eltype(F) <: Diagonal
-    #     @test size(F) == (n, n)
-    #     @test size(F[1, 1]) == (d_out, d_out)
-    # end # testset matrix valued bh
+    @testset "gradient kernels" begin
+
+        n = 1024
+        d = 2
+        x = randn(d, n)
+        # k = CovarianceFunctions.Cauchy()
+        k = CovarianceFunctions.EQ()
+        g = CovarianceFunctions.GradientKernel(k)
+
+        F = BarnesHutFactorization(g, x)
+        @test F isa BarnesHutFactorization
+        @test eltype(F) <: IsotropicGradientKernelElement
+        @test size(F) == (n, n)
+        @test size(F[1, 1]) == (d, d)
+
+        a = [randn(d) for _ in 1:n]
+        b = [zeros(d) for _ in 1:n]
+        # F*a
+        G = gramian(g, x)
+        b_truth = deepcopy(b)
+        # @time b_truth = G.A * a
+        # @time b_truth = G.A * a
+        mul!(b_truth, G.A, a)
+        norm_b = sqrt(sum(sum.(abs2, b_truth)))
+
+        α, β = 1, 0
+        θ = 0 # θ = 0 never compresses, so the result should match the dense product to machine precision
+        taylor!(b, F, a, α, β, θ)
+        err = sqrt(sum(sum.(abs2, b - b_truth)))
+        rel_err = err / norm_b
+        @test rel_err < 1e-10
+
+        θ = 1/10
+        taylor!(b, F, a, α, β, θ)
+        err = sqrt(sum(sum.(abs2, b - b_truth)))
+        rel_err = err / norm_b
+        @test rel_err < 1e-3
+    end # testset matrix valued bh
end # testset
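Taken together, a condensed usage sketch of the new fast gradient-kernel MVM, distilled from the testset above (`EQ` is the package's exponentiated-quadratic kernel; larger θ compresses more aggressively, trading accuracy for speed):

    using LinearAlgebra, CovarianceFunctions
    using CovarianceFunctions: BarnesHutFactorization, GradientKernel, taylor!

    d, n = 2, 1024
    x = randn(d, n)                    # n points in d dimensions
    g = GradientKernel(CovarianceFunctions.EQ())
    F = BarnesHutFactorization(g, x)   # n × n factorization with d × d matrix-valued blocks

    w = [randn(d) for _ in 1:n]        # block weight vector
    b = [zeros(d) for _ in 1:n]        # block target vector
    taylor!(b, F, w, 1, 0, 1/10)       # b ≈ (gradient-kernel Gramian) * w with θ = 1/10

    # flat-vector interface from src/taylor_gradient.jl, same product on a length-(n·d) vector:
    yf = zeros(d * n)
    mul!(yf, F, randn(d * n))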