Commit 8f8641f (1 parent: e551682)

edits for product and sum kernels: support for custom kernels, and tests with input_trait and constant constituents

8 files changed: 76 additions, 41 deletions

README.md (9 additions, 1 deletion)

````diff
@@ -94,8 +94,16 @@ custom_rbf(x, y) = exp(-sum(abs2, x .- y)) # custom RBF implementation
 ```
 To take advantage of some specialized structure-aware algorithms, it is prudent to let CovarianceFunctions.jl know about the input type, in this case
 ```julia
-input_trait(::typeof(custom_rbf)) = IsotropicInput()
+CovarianceFunctions.input_trait(::typeof(custom_rbf)) = IsotropicInput()
 ```
+Other possible options include `DotProductInput` or `StationaryLinearFunctionalInput`.
+To enable efficient output type inference for custom kernels with parameters,
+extend `Base.eltype`.
+Since the custom kernel above does not have any parameters, we set the type to the bottom type `Union{}`:
+```julia
+Base.eltype(k::typeof(custom_rbf)) = Union{}
+```
+The type of the output of the kernel `k` with inputs `x` and `y` is then expected to be `promote_type(eltype.((k, x, y))...)`.

 ## Toeplitz Structure
````
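For a kernel that does carry parameters, the README's promotion rule implies `eltype` should return the parameter's type. A minimal sketch under that assumption; `ScaledRBF` is a hypothetical kernel for illustration, not part of this commit:

```julia
# hypothetical parametric kernel, illustrating the eltype convention only
struct ScaledRBF{T<:Real}
    ℓ::T # lengthscale parameter
end
(k::ScaledRBF)(x, y) = exp(-sum(abs2, x .- y) / k.ℓ^2)

CovarianceFunctions.input_trait(::ScaledRBF) = CovarianceFunctions.IsotropicInput()
Base.eltype(k::ScaledRBF{T}) where {T} = T # parameter type drives output promotion

# e.g. Float32 inputs with a Float64 lengthscale promote to Float64:
# promote_type(eltype(ScaledRBF(1.0)), Float32, Float32) == Float64
```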

src/algebra.jl (29 additions, 19 deletions)

```diff
@@ -1,19 +1,22 @@
 ############################# kernel algebra ###################################
-# IDEA: separable sum gramian
-# IDEA: (Separable) Sum and Product could be one definition with meta programming
+# NOTE: output type inference of product, sum, and power not supported for
+# user-defined kernels unless Base.eltype is defined for them
 ################################ Product #######################################
-# TODO: constructors which merge products and sums
-struct Product{T, AT<:Tuple{Vararg{AbstractKernel}}} <: AbstractKernel{T}
+# IDEA: constructors which merge products and sums
+struct Product{T, AT<:Union{Tuple, AbstractVector}} <: AbstractKernel{T}
     args::AT
-    function Product(k::Tuple{Vararg{AbstractKernel}})
-        T = promote_type(eltype.(k)...)
-        new{T, typeof(k)}(k)
-    end
+    # input_traits # could keep track of input_trait.(args)
+    # input_trait # could keep track of the overall input trait
 end
 @functor Product
-(P::Product)(τ) = prod(k->k(τ), P.args) # TODO could check for isotropy here
+function Product(k::Union{Tuple, AbstractVector})
+    T = promote_type(eltype.(k)...)
+    Product{T, typeof(k)}(k)
+end
+Product(k...) = Product(k)
+(P::Product)(τ) = prod(k->k(τ), P.args) # IDEA could check for isotropy here
 (P::Product)(x, y) = prod(k->k(x, y), P.args)
-# (P::Product)(x, y) = isstationary(P) ? P(difference(x, y)) : prod(k->k(x, y), P.args)
+# (P::Product)(x, y) = isisotropic(P) ? P(difference(x, y)) : prod(k->k(x, y), P.args)
 Product(k::AbstractKernel...) = Product(k)
 Product(k::AbstractVector{<:AbstractKernel}) = Product(k...)
 Base.prod(k::AbstractVector{<:AbstractKernel}) = Product(k)
@@ -23,32 +26,38 @@ Base.:*(c::Number, k::AbstractKernel) = Constant(c) * k
 Base.:*(k::AbstractKernel, c::Number) = Constant(c) * k

 ################################### Sum ########################################
-struct Sum{T, AT<:Tuple{Vararg{AbstractKernel}}} <: AbstractKernel{T}
+struct Sum{T, AT<:Union{Tuple, AbstractVector}} <: AbstractKernel{T}
     args::AT
-    function Sum(k::Tuple{Vararg{AbstractKernel}})
-        T = promote_type(eltype.(k)...)
-        new{T, typeof(k)}(k)
-    end
+    # input_trait # could keep track of the overall input trait
 end
 @functor Sum
+function Sum(k::Union{Tuple, AbstractVector})
+    T = promote_type(eltype.(k)...)
+    Sum{T, typeof(k)}(k)
+end
+Sum(k...) = Sum(k)
 (S::Sum)(τ) = sum(k->k(τ), S.args) # should only be called if S is stationary
 (S::Sum)(x, y) = sum(k->k(x, y), S.args)
 # (S::Sum)(τ) = isstationary(S) ? sum(k->k(τ), S.args) : error("One argument evaluation not possible for non-stationary kernel")
 # (S::Sum)(x, y) = isstationary(S) ? S(difference(x, y)) : sum(k->k(x, y), S.args)
-Sum(k::AbstractKernel...) = Sum(k)
-Sum(k::AbstractVector{<:AbstractKernel}) = Sum(k...)
+Sum(k...) = Sum(k)
 Base.sum(k::AbstractVector{<:AbstractKernel}) = Sum(k)

 Base.:+(k::AbstractKernel...) = Sum(k)
 Base.:+(k::AbstractKernel, c::Number) = k + Constant(c)
 Base.:+(c::Number, k::AbstractKernel) = k + Constant(c)

 ################################## Power #######################################
-struct Power{T, K<:AbstractKernel{T}, PT} <: AbstractKernel{T}
+struct Power{T, K<:AbstractKernel} <: AbstractKernel{T}
     k::K
-    p::PT
+    p::Int
+    # input_trait # could keep track of the overall input trait
 end
 @functor Power
+function Power(k, p::Int)
+    T = promote_type(eltype(k))
+    Power{T, typeof(k)}(k, p)
+end
 (P::Power)(τ) = P.k(τ)^P.p
 (P::Power)(x, y) = P.k(x, y)^P.p
 Base.:^(k::AbstractKernel, p::Number) = Power(k, p)
@@ -57,6 +66,7 @@ Base.:^(k::AbstractKernel, p::Number) = Power(k, p)
 # product kernel, but separately evaluates component kernels on different parts of the input
 struct SeparableProduct{T, K} <: AbstractKernel{T}
     args::K # kernel for input covariances
+    # input_trait # could keep track of the overall input trait
 end
 @functor SeparableProduct
 SeparableProduct(k...) = SeparableProduct(k)
```
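The switch from `Tuple{Vararg{AbstractKernel}}` to `Union{Tuple, AbstractVector}` is what lets plain functions participate in kernel algebra; the constructors then rely on `Base.eltype` being defined for each constituent, per the NOTE above. A standalone sketch of the promotion logic these constructors use, outside the package:

```julia
# standalone illustration of the output-type promotion in the constructors above
custom_rbf(x, y) = exp(-sum(abs2, x .- y)) # a plain function acting as a kernel
Base.eltype(::typeof(custom_rbf)) = Union{} # parameter-free: the bottom type

ks = (custom_rbf, custom_rbf)    # tuples of non-AbstractKernel callables are now allowed
T = promote_type(eltype.(ks)...) # Union{}, so the output type is driven entirely by the inputs
```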

src/gradient.jl (12 additions, 0 deletions)

```diff
@@ -442,6 +442,18 @@ function value_gradient_kernel(k, x, y, T::InputTrait = input_trait(G.k))
     value_gradient_kernel!(K, k, x, y, T)
 end

+# IDEA: specialize first_gradient!(g, k, x, y) = ForwardDiff.gradient!(g, z->k(z, y), x)
+# computes covariance between value and gradient
+# function value_gradient_covariance!(gx, gy, k, x, y, ::IsotropicInput)
+#     r² = sum(abs2, difference(x, y))
+#     g .= derivative(k, r²)
+# end
+#
+# function value_gradient_covariance!(gx, gy, k, x, y, ::GenericInput())
+#     r² = sum(abs2, difference(x, y))
+#     g .= derivative(k, r²)
+# end
+
 # IDEA: specialize evaluate for IsotropicInput, DotProductInput
 # returns block matrix
 function value_gradient_kernel!(K::DerivativeKernelElement, k, x, y, T::InputTrait)
```
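The commented-out stub above is incomplete: it writes to an undefined `g` and repeats the isotropic body for `GenericInput`. For the isotropic case, k(x, y) = f(r²) with r² = |x − y|², the chain rule gives ∇ₓk = 2 f′(r²)(x − y) and ∇ᵧk = −2 f′(r²)(x − y). A sketch of how the stub might be completed, assuming a scalar helper `derivative(k, r²)` that returns f′(r²) (its exact form is not shown in the commit):

```julia
# sketch only: one way the commented-out stub could be filled in for isotropic kernels
function value_gradient_covariance!(gx, gy, k, x, y, ::IsotropicInput)
    d = x - y              # difference vector
    r² = sum(abs2, d)
    f′ = derivative(k, r²) # assumed scalar derivative of the radial profile f
    @. gx = 2f′ * d        # ∇ₓ k(x, y) = 2 f′(r²) (x - y) by the chain rule
    @. gy = -2f′ * d       # ∇ᵧ k(x, y) = -∇ₓ k(x, y) for isotropic kernels
    return gx, gy
end
```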

src/gradient_algebra.jl (4 additions, 9 deletions)

```diff
@@ -49,30 +49,25 @@ function gradient_kernel!(W::Woodbury, k::Product, x::AbstractVector, y::Abstrac
     # k_vec(x, y) = [h(x, y) for h in k.args] # include in loop
     # ForwardDiff.jacobian!(W.U', z->k_vec(z, y), x) # this is actually less allocating than the gradient! option
     # ForwardDiff.jacobian!(W.V, z->k_vec(x, z), y)
-
+    # GradientConfig() # for generic version, this could be pre-computed for efficiency gains
     r = length(k.args)
     for i in 1:r # parallelize this?
         h, H = k.args[i], A.args[i]
         hxy = h(x, y)
         D = H.args[1]
         D.diag .= prod_k_j / hxy
+        # input_trait(h) could be pre-computed, or should not be passed here, because the factors might be composite kernels themselves
         H.args[2] = gradient_kernel!(H.args[2], h, x, y, input_trait(h))

         ui, vi = @views W.U[:, i], W.V[i, :]
-        ForwardDiff.gradient!(ui, z->h(z, y), x)
-        ForwardDiff.gradient!(vi, z->h(x, z), y) # these are bottlenecks
+        ForwardDiff.gradient!(ui, z->h(z, y), x) # these are bottlenecks
+        ForwardDiff.gradient!(vi, z->h(x, z), y) # TODO: replace by value_gradient_covariance!
         @. ui *= prod_k_j / hxy
         @. vi /= hxy
     end
     return W
 end

-# IDEA: specialize first_gradient!(g, k, x, y) = ForwardDiff.gradient!(g, z->k(z, y), x)
-# function first_gradient!(g, k, x, y, ::IsotropicInput)
-#     r² = sum(abs2, difference(x, y))
-#     g .= derivative(k, r²)
-# end
-
 ############################# Separable Product ################################
 # for product kernel with generic input
 function allocate_gradient_kernel(k::SeparableProduct, x::AbstractVector{<:Number},
```
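The loop encodes the product rule for mixed second derivatives: for k = ∏ᵢ hᵢ, ∂ₓ∂ᵧᵀk = Σᵢ (∏_{j≠i} hⱼ) ∂ₓ∂ᵧᵀhᵢ plus cross terms (∂ₓhᵢ)(∂ᵧhⱼ)ᵀ for i ≠ j, which is exactly a low-rank (Woodbury) correction with one `U` column and `V` row per factor. A standalone check of the two-factor identity using only ForwardDiff, not the package internals:

```julia
using ForwardDiff, LinearAlgebra

h1(x, y) = exp(-sum(abs2, x - y))  # two example factors
h2(x, y) = (1 + dot(x, y))^2
k(x, y) = h1(x, y) * h2(x, y)

x, y = randn(3), randn(3)
# mixed second derivative ∂ₓ∂ᵧᵀ of the product, computed directly
H  = ForwardDiff.jacobian(yy -> ForwardDiff.gradient(xx -> k(xx, yy), x), y)

# product rule: factor-wise second-derivative terms plus a rank-2 cross term
H1 = ForwardDiff.jacobian(yy -> ForwardDiff.gradient(xx -> h1(xx, yy), x), y)
H2 = ForwardDiff.jacobian(yy -> ForwardDiff.gradient(xx -> h2(xx, yy), x), y)
g1x = ForwardDiff.gradient(xx -> h1(xx, y), x); g1y = ForwardDiff.gradient(yy -> h1(x, yy), y)
g2x = ForwardDiff.gradient(xx -> h2(xx, y), x); g2y = ForwardDiff.gradient(yy -> h2(x, yy), y)
H_rule = h2(x, y) * H1 + h1(x, y) * H2 + g1x * g2y' + g2x * g1y'

@assert H ≈ H_rule
```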

src/properties.jl (1 addition, 1 deletion)

```diff
@@ -51,7 +51,7 @@ function input_trait(S::ProductsAndSums)
     trait = input_trait(S.args[i]) # first non-constant kernel
     for j in i+1:length(S.args)
         k = S.args[j]
-        if k isa Constant
+        if k isa Constant # ignore constants, since they can function as any input type
             continue
         elseif input_trait(k) != trait # if the non-constant kernels don't have the same input type,
             return GenericInput() # we default back to GenericInput
```
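In words: constants are compatible with every trait, the first non-constant argument sets the candidate trait, and any disagreeing non-constant argument forces the `GenericInput` fallback. A condensed standalone restatement of that control flow (a sketch; the enclosing method and its all-constant edge case are not fully shown in this hunk):

```julia
# condensed sketch of the skip-constants logic above, decoupled from the package's
# types: `traits` holds input_trait.(args) and `isconst` flags Constant kernels
function combined_trait(traits, isconst, generic)
    i = findfirst(x -> !x, isconst)  # first non-constant argument
    i === nothing && return generic  # assumption: the all-constant case is handled upstream
    trait = traits[i]                # candidate trait from the first non-constant kernel
    for j in i+1:length(traits)
        isconst[j] && continue       # constants can function as any input type
        traits[j] == trait || return generic # disagreement forces the generic fallback
    end
    return trait
end

combined_trait([:iso, :iso], [false, true], :generic) # -> :iso, the constant is skipped
```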

src/stationary.jl (5 additions, 6 deletions)

```diff
@@ -12,24 +12,22 @@
 ############################# constant kernel ##################################
 # can be used to rescale existing kernels
 # IDEA: Allow Matrix-valued constant
-struct ConstantKernel{T} <: IsotropicKernel{T}
+struct Constant{T} <: IsotropicKernel{T}
     c::T
-    function ConstantKernel(c, check::Bool = true)
+    function Constant(c, check::Bool = true)
         if check && !ispsd(c)
             throw(DomainError("Constant is not positive semi-definite: $c"))
         end
         new{typeof(c)}(c)
     end
 end
-@functor ConstantKernel
-const Constant = ConstantKernel
-
 # isisotropic(::Constant) = true
 # ismercer(k::Constant) = ispsd(k.c)
 # Constant(c) = Constant{typeof(c)}(c)

 # should type of constant field and r agree? what promotion is necessary?
 # do we need the isotropic/ stationary evaluation, if we overwrite the mercer one?
+(k::Constant)() = k.c
 (k::Constant)(r²) = k.c # stationary / isotropic
 (k::Constant)(x, y) = k.c # mercer
@@ -196,8 +194,9 @@
 struct CosineKernel{T, V<:Union{T, AbstractVector{T}}} <: StationaryKernel{T}
     c::V
 end
+@functor CosineKernel
 const Cosine = CosineKernel
-@functor Cosine
+const Cos = Cosine

 # IDEA: trig-identity -> low-rank gramian
 # NOTE: this is the only stationary non-isotropic kernel so far
```
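With the new nullary method, `Constant` answers all three calling conventions with its value, which is what lets it stand in for any input trait in sums and products. Illustrative usage:

```julia
c = Constant(2.0)
c() == c(0.3) == c(randn(3), randn(3)) == 2.0 # nullary, stationary/isotropic, and Mercer forms
```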

test/gradient.jl (2 additions, 2 deletions)

```diff
@@ -7,7 +7,7 @@ using CovarianceFunctions
 using CovarianceFunctions: EQ, RQ, Dot, ExponentialDot, NN, Matern, MaternP,
     Lengthscale, input_trait, GradientKernel, ValueGradientKernel, GradientKernelElement,
     DerivativeKernel, ValueDerivativeKernel, DerivativeKernelElement, Cosine,
-    Woodbury, LazyMatrixProduct, ConstantKernel
+    Woodbury, LazyMatrixProduct, Constant

 const AbstractMatOrFac = Union{AbstractMatrix, Factorization}
@@ -78,7 +78,7 @@ const AbstractMatOrFac = Union{AbstractMatrix, Factorization}
     @test W*a ≈ G*a

     # testing constant kernel
-    c = ConstantKernel(1)
+    c = Constant(1)
     g = GradientKernel(c)
     @test g(x, y) ≈ zeros(d, d)
 end
```

test/properties.jl (14 additions, 3 deletions)

```diff
@@ -1,13 +1,13 @@
 module TestProperties
 using Test
 using CovarianceFunctions
-using CovarianceFunctions: input_trait, DotProductInput, IsotropicInput, GenericInput
-using CovarianceFunctions: EQ, RQ, Exp, Dot, Poly, Line
+using CovarianceFunctions: input_trait, DotProductInput, IsotropicInput, StationaryLinearFunctionalInput, GenericInput
+using CovarianceFunctions: EQ, RQ, Exp, Dot, ExponentialDot, Poly, Line, Cos

 using LinearAlgebra

 @testset "properties" begin
-    dot_kernels = [Dot(), Dot()^3] # , Line(1.), Poly(5, 1.)] # TODO: take care of constants
+    dot_kernels = [Dot(), Dot()^3, ExponentialDot(), Line(1.), Poly(5, 1.)]
     for k in dot_kernels
         @test input_trait(k) isa DotProductInput
     end
@@ -19,6 +19,17 @@ using LinearAlgebra

     k = CovarianceFunctions.NeuralNetwork()
     @test input_trait(k) isa GenericInput
+
+    # testing that constant kernels don't confuse the input_trait inference
+    @test input_trait(1*EQ() + 1) isa IsotropicInput
+    @test input_trait(1*EQ() + 2 + RQ(1.)*1) isa IsotropicInput
+
+    @test input_trait(1*Dot() + 1) isa DotProductInput
+    @test input_trait(1*Dot() + 2 + Dot()^2*1) isa DotProductInput
+
+    w = randn()
+    @test input_trait(1*Cos(w) + 1) isa StationaryLinearFunctionalInput
+    @test input_trait(1*Cos(w) + 2 + Cos(w)^2*1) isa StationaryLinearFunctionalInput
 end

 end
```
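These tests lean on the operators from src/algebra.jl: multiplying or adding a scalar wraps it in `Constant`, so every tested expression contains constant constituents that `input_trait` must skip. Roughly, `1*EQ() + 1` desugars via `Base.:*` and `Base.:+` above into something like `Sum((Product((Constant(1), EQ())), Constant(1)))` (a sketch of the construction, not verified against the package), whose only non-constant constituent is `EQ()`, hence `IsotropicInput`.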
