Add rules for specialized norm functions and normalize (#226)

sethaxen · willtebbutt · web-flow · commit fa4b93af9c8b · 2020-12-06T03:54:44.000-08:00
* Add generic normp

* Reorganize

* Add frule for normp

* Make more readable

* Use norm instead of abs

* Add rules for normMinusInf

* Also cover normInf

* Finish early when passed a Zero

* Add norm1 rules

* Add norm2 rules

* Separate pullbacks

* Remove whitespace

* Add comment

* Only compute log if necessary

* Simplify logic

* Make normInf cotangent one-hot

* Constrain rrules to arrays

* Add comment

* Split pullback functions

* Don't broadcast over shared denom

* Add frules for norm

* Generalize rrule for norm

* Reimplement rrule for number norm

* Use correct variable name

* Release type constraint

* Add more special cases

* Split forward passes into own functions

* don't ignore (co)tangents on p

* Add rules for normalize and normalize!

* Special-case normalize with p=2

* Add overloads for transpose and adjoint

* Generalize by going through norm

* Bump ChainRulesCore compat

* Don't assume has eltype

* Import rand_tangent

* Test normalize and normalize!

* Don't unnecessarily thunk

* Bump required version number

To ensure we get TestIterator

* Add tests for norm functions

* Restrict types for rrules

* Move norm functions to their own file

* Remove frules for norm

* Ensure real multiplied first

* Lower precision of test

* Revert accidental commit

* Remove signatures with default args

* Reuse variable

* Reorganize normalize tests

* Test scalar frule

* Test transpose/adjoint rules

* Add back in frules for norm2

* Add back rrule for norm no p

* Test norm without p

* Ensure normalize pulls back Zero

* Apply suggestions from code review

Co-authored-by: willtebbutt <wt0881@my.bristol.ac.uk>

* Increase tolerance for infinite norms

* Always define kwargs

* Make normp pullback more stable for p = +/- inf

* Test with higher power

* Use norm2 forward for empty x

* Combine checks

* Test norm2 frule

* Test norm for empty array

* Test structured matrices

* Update Project.toml

Co-authored-by: willtebbutt <wt0881@my.bristol.ac.uk>
diff --git a/Project.toml b/Project.toml
@@ -1,6 +1,6 @@
 name = "ChainRules"
 uuid = "082447d4-558c-5d27-93f4-14fc19e9eca2"
-version = "0.7.34"
+version = "0.7.35"
 
 [deps]
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
@@ -14,7 +14,7 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 
 [compat]
 ChainRulesCore = "0.9.21"
-ChainRulesTestUtils = "0.5"
+ChainRulesTestUtils = "0.5.1"
 Compat = "3"
 FiniteDifferences = "0.11.4"
 Reexport = "0.2"
diff --git a/src/ChainRules.jl b/src/ChainRules.jl
@@ -43,6 +43,7 @@ include("rulesets/Statistics/statistics.jl")
 include("rulesets/LinearAlgebra/utils.jl")
 include("rulesets/LinearAlgebra/blas.jl")
 include("rulesets/LinearAlgebra/dense.jl")
+include("rulesets/LinearAlgebra/norm.jl")
 include("rulesets/LinearAlgebra/structured.jl")
 include("rulesets/LinearAlgebra/factorization.jl")
 
diff --git a/src/rulesets/LinearAlgebra/dense.jl b/src/rulesets/LinearAlgebra/dense.jl
@@ -239,27 +239,3 @@ function rrule(::typeof(pinv), A::AbstractMatrix{T}; kwargs...) where {T}
     end
     return Y, pinv_pullback
 end
-
-#####
-##### `norm`
-#####
-
-function rrule(::typeof(norm), A::AbstractArray{<:Real}, p::Real=2)
-    y = norm(A, p)
-    function norm_pullback(ȳ)
-        u = y^(1-p)
-        ∂A = @thunk ȳ .* u .* abs.(A).^p ./ A
-        ∂p = @thunk ȳ * (u * sum(a->abs(a)^p * log(abs(a)), A) - y * log(y)) / p
-        (NO_FIELDS, ∂A, ∂p)
-    end
-    return y, norm_pullback
-end
-
-function rrule(::typeof(norm), x::Real, p::Real=2)
-    function norm_pullback(ȳ)
-        ∂x = @thunk ȳ * sign(x)
-        ∂p = @thunk zero(x)  # TODO: should this be Zero()?
-        (NO_FIELDS, ∂x, ∂p)
-    end
-    return norm(x, p), norm_pullback
-end
diff --git a/src/rulesets/LinearAlgebra/norm.jl b/src/rulesets/LinearAlgebra/norm.jl
@@ -0,0 +1,241 @@
+#####
+##### `norm`
+#####
+
+# Forward rule for `norm(x)` called without `p`: the primal is `norm(x)` and the
+# tangent is delegated to `_norm2_forward`.
+function frule((_, Δx), ::typeof(norm), x)
+    y = norm(x)
+    # Reuse `y` rather than evaluating `norm(x)` a second time.
+    return y, _norm2_forward(x, Δx, y)
+end
+# Forward rule for the norm of a scalar `x` with an explicit `p`.
+# The tangent is a typed zero when `Δx` or `p` is zero; otherwise it is
+# `_realconjtimes(signx, Δx)`, where `signx` is `sign(x)` for real `x` and
+# `x * pinv(y)` for any other number.
+function frule((_, Δx), ::typeof(norm), x::Number, p::Real)
+    y = norm(x, p)
+    if iszero(Δx) || iszero(p)
+        return y, zero(real(x)) * zero(real(Δx))
+    end
+    signx = x isa Real ? sign(x) : x * pinv(y)
+    return y, _realconjtimes(signx, Δx)
+end
+
+# Reverse rule for `norm(x, p)` on strided, triangular and diagonal arrays.
+# The cotangent of `x` is built lazily and picks a specialised helper based on `p`;
+# the cotangent of `p` always goes through `_normp_back_p`.
+function rrule(
+    ::typeof(norm),
+    x::Union{StridedArray, LinearAlgebra.AbstractTriangular, Diagonal},
+    p::Real,
+)
+    y = LinearAlgebra.norm(x, p)
+    function norm_pullback(Δy)
+        ∂x = Thunk() do
+            # Empty input or `p == 0`: an all-zero cotangent of the promoted type.
+            if isempty(x) || p == 0
+                return zero.(x) .* (zero(y) * zero(real(Δy)))
+            end
+            p == 2 && return _norm2_back(x, y, Δy)
+            p == 1 && return _norm1_back(x, y, Δy)
+            # `p == Inf` and `p == -Inf` share one helper, which locates the
+            # element whose norm equals `y`.
+            isinf(p) && return _normInf_back(x, y, Δy)
+            return _normp_back_x(x, p, y, Δy)
+        end
+        ∂p = @thunk _normp_back_p(x, p, y, Δy)
+        return (NO_FIELDS, ∂x, ∂p)
+    end
+    # A `Zero` cotangent pulls back to `Zero` for both arguments.
+    norm_pullback(::Zero) = (NO_FIELDS, Zero(), Zero())
+    return y, norm_pullback
+end
+# Reverse rule for `norm(x)` without `p` on strided, triangular and diagonal arrays.
+# Non-empty input is pulled back with `_norm2_back`; empty input yields an all-zero
+# cotangent of the promoted type.
+function rrule(
+    ::typeof(norm),
+    x::Union{StridedArray, LinearAlgebra.AbstractTriangular, Diagonal},
+)
+    y = LinearAlgebra.norm(x)
+    function norm_pullback(Δy)
+        isempty(x) && return (NO_FIELDS, zero.(x) .* (zero(y) * zero(real(Δy))))
+        return (NO_FIELDS, _norm2_back(x, y, Δy))
+    end
+    norm_pullback(::Zero) = (NO_FIELDS, Zero())
+    return y, norm_pullback
+end
+# Reverse rule for `norm(x, p)` when `x` is a transposed or adjoint vector.
+# The rule for the parent array does the work; its cotangent for `x` is then wrapped
+# with the same operation (`transpose` or `adjoint`) that `x` carries.
+function rrule(
+    ::typeof(norm),
+    x::Union{LinearAlgebra.TransposeAbsVec, LinearAlgebra.AdjointAbsVec},
+    p::Real,
+)
+    y, parent_pullback = rrule(norm, parent(x), p)
+    rewrap = x isa Transpose ? transpose : adjoint
+    function norm_pullback(Δy)
+        ∂self, ∂parent, ∂p = parent_pullback(Δy)
+        ∂x = @thunk rewrap(unthunk(∂parent))
+        return (∂self, ∂x, ∂p)
+    end
+    return y, norm_pullback
+end
+# Reverse rule for the norm of a scalar `x` with an explicit `p`.
+# The cotangent of `x` is a typed zero when `Δy` or `p` is zero, otherwise
+# `signx * real(Δy)`; the cotangent of `p` is always `Zero()`.
+function rrule(::typeof(norm), x::Number, p::Real)
+    y = norm(x, p)
+    function norm_pullback(Δy)
+        if iszero(Δy) || iszero(p)
+            ∂x = zero(x) * zero(real(Δy))
+        else
+            # `sign(x)` for real input, `x * pinv(y)` for any other number.
+            signx = x isa Real ? sign(x) : x * pinv(y)
+            ∂x = signx * real(Δy)
+        end
+        return (NO_FIELDS, ∂x, Zero())
+    end
+    norm_pullback(::Zero) = (NO_FIELDS, Zero(), Zero())
+    return y, norm_pullback
+end
+
+#####
+##### `normp`
+#####
+
+# Reverse rule for `LinearAlgebra.normp(x, p)`.
+# Both cotangents are thunks: `_normp_back_x` for `x` and `_normp_back_p` for `p`.
+function rrule(
+    ::typeof(LinearAlgebra.normp),
+    x::Union{StridedArray, LinearAlgebra.AbstractTriangular, Diagonal},
+    p,
+)
+    y = LinearAlgebra.normp(x, p)
+    function normp_pullback(Δy)
+        return (
+            NO_FIELDS,
+            @thunk(_normp_back_x(x, p, y, Δy)),
+            @thunk(_normp_back_p(x, p, y, Δy)),
+        )
+    end
+    normp_pullback(::Zero) = (NO_FIELDS, Zero(), Zero())
+    return y, normp_pullback
+end
+
+# Cotangent of `x` for the p-norm `y`, given the output cotangent `Δy`.
+# Each element contributes `xi * (norm(xi) / y)^(p - 2) * real(Δy) / y`; any
+# contribution that is not finite is replaced by zero.
+function _normp_back_x(x, p, y, Δy)
+    scale = real(Δy) / y
+    function elementgrad(xi)
+        g = xi * ((norm(xi) / y)^(p - 2) * scale)
+        return isfinite(g) ? g : zero(g)
+    end
+    return elementgrad.(x)
+end
+
+# Cotangent of `p` for the p-norm `y`, given the output cotangent `Δy`.
+# Returns a typed zero unless `y` is positive and finite and `p` is non-zero.
+# Terms of the sum that are not finite are replaced by zero.
+function _normp_back_p(x, p, y, Δy)
+    Δu = real(Δy)
+    if !(y > 0 && isfinite(y)) || iszero(p)
+        return zero(Δu) * zero(y) / one(p)
+    end
+    weighted = sum(x) do xi
+        a = norm(xi)
+        term = (a / y)^(p - 1) * a * log(a)
+        return isfinite(term) ? term : zero(term)
+    end
+    return Δu * (weighted - y * log(y)) / p
+end
+
+#####
+##### `normMinusInf`/`normInf`
+#####
+
+# Reverse rule for `LinearAlgebra.normMinusInf(x)`; the cotangent of `x` comes
+# from `_normInf_back`.
+function rrule(
+    ::typeof(LinearAlgebra.normMinusInf),
+    x::Union{StridedArray, LinearAlgebra.AbstractTriangular, Diagonal},
+)
+    y = LinearAlgebra.normMinusInf(x)
+    function normMinusInf_pullback(Δy)
+        return (NO_FIELDS, _normInf_back(x, y, Δy))
+    end
+    normMinusInf_pullback(::Zero) = (NO_FIELDS, Zero())
+    return y, normMinusInf_pullback
+end
+
+# Reverse rule for `LinearAlgebra.normInf(x)`; the cotangent of `x` comes from
+# `_normInf_back`.
+function rrule(
+    ::typeof(LinearAlgebra.normInf),
+    x::Union{StridedArray,LinearAlgebra.AbstractTriangular,Diagonal},
+)
+    y = LinearAlgebra.normInf(x)
+    function normInf_pullback(Δy)
+        return (NO_FIELDS, _normInf_back(x, y, Δy))
+    end
+    normInf_pullback(::Zero) = (NO_FIELDS, Zero())
+    return y, normInf_pullback
+end
+
+# One-hot cotangent of `x` for an infinity-type norm `y`.
+# The result is zero everywhere except at the last element whose norm equals `y`,
+# which receives `sign(xi) * real(Δy)`. Throws `ArgumentError` when no element has
+# norm `y`.
+function _normInf_back(x, y, Δy)
+    Δu = real(Δy)
+    T = typeof(zero(float(eltype(x))) * zero(Δu))
+    ∂x = fill!(similar(x, T), 0)
+    # When several elements share exactly the same norm they were presumably produced
+    # identically (e.g. with `fill`), so only one of them is made non-zero. The last
+    # index is used; the original note says this matches the `frule`.
+    hit = findlast(xi -> norm(xi) == y, x)
+    if hit === nothing
+        throw(ArgumentError("y is not the correct norm of x"))
+    end
+    @inbounds ∂x[hit] = sign(x[hit]) * Δu
+    return ∂x
+end
+
+#####
+##### `norm1`
+#####
+
+# Reverse rule for `LinearAlgebra.norm1(x)`; the cotangent of `x` comes from
+# `_norm1_back`.
+function rrule(
+    ::typeof(LinearAlgebra.norm1),
+    x::Union{StridedArray,LinearAlgebra.AbstractTriangular,Diagonal},
+)
+    y = LinearAlgebra.norm1(x)
+    function norm1_pullback(Δy)
+        return (NO_FIELDS, _norm1_back(x, y, Δy))
+    end
+    norm1_pullback(::Zero) = (NO_FIELDS, Zero())
+    return y, norm1_pullback
+end
+
+# Cotangent of `x` for the 1-norm: the elementwise sign of `x` scaled by the real
+# part of `Δy`. `y` is accepted for a uniform helper signature but is not used.
+function _norm1_back(x, y, Δy)
+    Δu = real(Δy)
+    return sign.(x) .* Δu
+end
+
+#####
+##### `norm2`
+#####
+
+# Forward rule for `LinearAlgebra.norm2(x)`: the tangent is computed by
+# `_norm2_forward` from the primal result.
+function frule((_, Δx), ::typeof(LinearAlgebra.norm2), x)
+    nrm = LinearAlgebra.norm2(x)
+    ∂nrm = _norm2_forward(x, Δx, nrm)
+    return nrm, ∂nrm
+end
+
+# Reverse rule for `LinearAlgebra.norm2(x)`; the cotangent of `x` comes from
+# `_norm2_back`.
+function rrule(
+    ::typeof(LinearAlgebra.norm2),
+    x::Union{StridedArray,LinearAlgebra.AbstractTriangular,Diagonal},
+)
+    y = LinearAlgebra.norm2(x)
+    function norm2_pullback(Δy)
+        return (NO_FIELDS, _norm2_back(x, y, Δy))
+    end
+    norm2_pullback(::Zero) = (NO_FIELDS, Zero())
+    return y, norm2_pullback
+end
+
+# Tangent of the 2-norm `y` of `x` along `Δx`: `real(dot(x, Δx))` times `pinv(y)`.
+_norm2_forward(x, Δx, y) = real(dot(x, Δx)) * pinv(y)
+# Cotangent of `x` for the 2-norm `y`: `x` scaled by `real(Δy) * pinv(y)`.
+function _norm2_back(x, y, Δy)
+    scale = real(Δy) * pinv(y)
+    return x .* scale
+end
+
+#####
+##### `normalize`
+#####
+
+# Reverse rule for `normalize(x, p)`.
+# The primal goes through the `norm` rule so that its pullback can be reused: the
+# pullback combines the cotangent routed through the norm with `Δy` scaled by the
+# (pseudo-)inverse norm.
+function rrule(::typeof(normalize), x::AbstractVector, p::Real)
+    nrm, inner_pullback = rrule(norm, x, p)
+    # `first(x)` throws on an empty vector, so derive the output element type from
+    # `eltype(x)` in that case instead.
+    Ty = isempty(x) ? typeof(zero(eltype(x)) / nrm) : typeof(first(x) / nrm)
+    y = copyto!(similar(x, Ty), x)
+    LinearAlgebra.__normalize!(y, nrm)
+    function normalize_pullback(Δy)
+        invnrm = pinv(nrm)
+        # Cotangent flowing into the norm.
+        ∂nrm = -dot(y, Δy) * invnrm
+        (_, ∂xnorm, ∂p) = inner_pullback(∂nrm)
+        ∂x = @thunk unthunk(∂xnorm) .+ Δy .* invnrm
+        return (NO_FIELDS, ∂x, ∂p)
+    end
+    normalize_pullback(::Zero) = (NO_FIELDS, Zero(), Zero())
+    return y, normalize_pullback
+end
+# Reverse rule for `normalize(x)` without `p`, using `LinearAlgebra.norm2`.
+# The pullback removes the component of `Δy` along the normalized vector `y` and
+# scales the remainder by the (pseudo-)inverse norm.
+function rrule(::typeof(normalize), x::AbstractVector)
+    nrm = LinearAlgebra.norm2(x)
+    # `first(x)` throws on an empty vector, so derive the output element type from
+    # `eltype(x)` in that case instead.
+    Ty = isempty(x) ? typeof(zero(eltype(x)) / nrm) : typeof(first(x) / nrm)
+    y = copyto!(similar(x, Ty), x)
+    LinearAlgebra.__normalize!(y, nrm)
+    function normalize_pullback(Δy)
+        ∂x = (Δy .- real(dot(y, Δy)) .* y) .* pinv(nrm)
+        return (NO_FIELDS, ∂x)
+    end
+    normalize_pullback(::Zero) = (NO_FIELDS, Zero())
+    return y, normalize_pullback
+end
diff --git a/test/rulesets/LinearAlgebra/dense.jl b/test/rulesets/LinearAlgebra/dense.jl
@@ -130,12 +130,4 @@
         frule_test(tr, (randn(N, N), randn(N, N)))
         rrule_test(tr, randn(), (randn(N, N), randn(N, N)))
     end
-    @testset "norm" begin
-        for dims in [(), (5,), (3, 2), (7, 3, 2)]
-            A = randn(dims...)
-            p = randn()
-            ȳ = randn()
-            rrule_test(norm, ȳ, (A, randn(dims...)), (p, randn()))
-        end
-    end
 end
diff --git a/test/rulesets/LinearAlgebra/norm.jl b/test/rulesets/LinearAlgebra/norm.jl
diff --git a/test/runtests.jl b/test/runtests.jl