go back to division

mcabbott · mcabbott · commit 504e8dc944d6 · 2021-09-02T22:07:11.000-04:00
diff --git a/src/rulesets/Base/base.jl b/src/rulesets/Base/base.jl
@@ -183,44 +183,23 @@ end
 # Note that rules for `^` are defined in the fastmath_able.jl
 
 function frule((_, _, Δx, _), ::typeof(Base.literal_pow), ::typeof(^), x::Real, ::Val{p}) where p
+    y = Base.literal_pow(^, x, Val(p))
     yox = Base.literal_pow(^, x, Val(p-1))
-    if p < 0 && iseven(p)
-        # When p<0 and x==0, using yox * x for the primal gives NaN instead of +-Inf
-        y = ifelse(iszero(x), oftype(yox, Inf), yox * x)
-    elseif p < 0
-        y = ifelse(iszero(x), copysign(oftype(yox, Inf), x), yox * x)
-    else
-        y = yox * x
-    end
     return y, p * yox * Δx
 end
-frule((_, _, Δx, _), ::typeof(Base.literal_pow), ::typeof(^), x::Real, ::Val{1}) = x^1, Δx
 frule((_, _, Δx, _), ::typeof(Base.literal_pow), ::typeof(^), x::Real, ::Val{0}) = x^0, zero(Δx)
 
 function rrule(::typeof(Base.literal_pow), ::typeof(^), x::Real, ::Val{p}) where p
-    yox = Base.literal_pow(^, x, Val(p-1))
-    project = ProjectTo(x)
+    y = Base.literal_pow(^, x, Val(p))
     @inline function literal_pow_pullback(dy)
-        return NoTangent(), NoTangent(), project(p * yox * dy), NoTangent()
-    end
-    if p < 0 && iseven(p)
-        # When p<0 and x==0, using yox * x for the primal gives NaN instead of +-Inf
-        y = ifelse(iszero(x), oftype(yox, Inf), yox * x)
-    elseif p < 0
-        y = ifelse(iszero(x), copysign(oftype(yox, Inf), x), yox * x)
-    else
-        y = yox * x
+        # Calling literal_pow a 2nd time is the easy way to get all the edge cases right.
+        # It should be cheap up to p=4, which is the main use of literal powers, right?
+        yox = Base.literal_pow(^, x, Val(p-1))
+        return (NoTangent(), NoTangent(), ProjectTo(x)(p * yox * dy), NoTangent())
     end
     return y, literal_pow_pullback
 end
-function rrule(::typeof(Base.literal_pow), ::typeof(^), x::Real, ::Val{1})
-    project = ProjectTo(x)
-    literal_pow_one_pullback(dy) = NoTangent(), NoTangent(), project(dy), NoTangent()
-    return x^1, literal_pow_one_pullback
-end
 function rrule(::typeof(Base.literal_pow), ::typeof(^), x::Real, ::Val{0})
-    # Since 0^0 == 1 == 0.001^0, this gradient should not be NaN at x==0
-    project = ProjectTo(x)
-    literal_pow_zero_pullback(dy) = NoTangent(), NoTangent(), project(zero(dy)), NoTangent()
+    literal_pow_zero_pullback(dy) = (NoTangent(), NoTangent(), ProjectTo(x)(zero(dy)), NoTangent())
     return x^0, literal_pow_zero_pullback
 end
diff --git a/src/rulesets/Base/fastmath_able.jl b/src/rulesets/Base/fastmath_able.jl
@@ -166,14 +166,23 @@ let
         ## power
         # literal_pow is in base.jl
         function frule((_, Δx, Δp), ::typeof(^), x::Number, p::Number)
-            yox = x ^ (p-1)
-            y = yox * x
+            y = x ^ p
+            thegrad = (p * y / x)
+            thelog = Δp isa AbstractZero ? Δp : log(oftype(y, x))
+            return y, muladd(y * thelog, Δp, thegrad * Δx)
+        end
+        function frule((_, Δx, Δp), ::typeof(^), x::Real, p::Real)
+            y = x ^ p
+            thegrad = ifelse(!iszero(x) | (p<0), (p * y / x),
+                        ifelse(isone(p), one(y),
+                          ifelse(0<p<1,  oftype(y, Inf), zero(y) )))
             thelog = if Δp isa AbstractZero
                 # Then don't waste time computing log
-                NoTangent()
-            elseif x isa Real && p isa Real
+                Δp
+            else# if x isa Real && p isa Real
                 # For positive x we'd like a real answer, including any Δp.
                 # For negative x, this is a DomainError unless isinteger(p)...
+
                 # could decide that implues that p is non-differentiable:
                 # log(ifelse(x<0, one(x), x))
 
@@ -187,25 +196,26 @@ julia> frule((0,0,1), ^, -4, 3.0), unthunk.(rrule(^, -4, 3.0)[2](1))
 julia> frule((0,0,1), ^, 4, 3.0), unthunk.(rrule(^, 4, 3.0)[2](1))
 ((64.0, 88.722839111673), (NoTangent(), 48.0, 88.722839111673))
 =#
-            else
-                # This promotion handles e.g. real x & complex p
-                log(oftype(y, x))
             end
-            return y, muladd(y * thelog, Δp, p * yox * Δx)
+            return y, muladd(y * thelog, Δp, thegrad * Δx)
         end
+
         function rrule(::typeof(^), x::Number, p::Number)
-            yox = x ^ (p-1)
+            y = x^p
             project_x, project_p = ProjectTo(x), ProjectTo(p)
             @inline function power_pullback(dy)
-                dx = project_x(conj(p * yox) * dy)
-                dp = @thunk if x isa Real && p isa Real
-                    project_p(conj(yox * x * log(complex(x))) * dy)
+                if x isa Real && p isa Real
+                    thegrad = ifelse(!iszero(x) | (p<0), (p * y / x),
+                                ifelse(isone(p), one(y),
+                                  ifelse(0<p<1,  oftype(y, Inf), zero(y) )))
                 else
-                    project_p(conj(yox * x * log(oftype(yox, x))) * dy)
+                    thegrad = (p * y / x)
                 end
+                dx = project_x(conj(thegrad) * dy)
+                dp = @thunk project_p(conj(y * log(complex(x))) * dy)
                 return (NoTangent(), dx, dp)
             end
-            return yox * x, power_pullback
+            return y, power_pullback
         end
 
         @scalar_rule(
diff --git a/test/rulesets/Base/fastmath_able.jl b/test/rulesets/Base/fastmath_able.jl
@@ -177,50 +177,51 @@ const FASTABLE_AST = quote
         #     test_rrule(^, randn(T) + 3, p ⊢ NoTangent())
         # end
 
-        @testset "^(x::Float64, p::$S) near x=0, p=1,0,-1,-2" for S in (Int, Float64)
-            p = S(+2)
-            @test frule((1,1,1), ^, 0.0, p)[1] == 0
-            @test_broken frule((1,1,1), ^, 0.0, p)[2] == 0
-            @test rrule(^, 0.0, p)[1] == 0
-            @test unthunk(rrule(^, 0.0, p)[2](1.0)[2]) == 0
-
-            # Identity function x^1, at zero
-            p = S(+1)
-            @test frule((1,1,1), ^, 0.0, p)[1] == 0
-            @test_broken frule((1,1,1), ^, 0.0, p)[2] == 1
-            @test rrule(^, 0.0, p)[1] == 0
-            @test unthunk(rrule(^, 0.0, p)[2](1.0)[2]) == 1
-
-            # Trivial singularity: 0^0 == 1 in Julia
-            p = S(0)
-            @test_skip frule((1,1,1), ^, 0.0, p)[1] == (0.0)^0
-            @test_broken frule((1,1,1), ^, 0.0, p)[2] == 0
-            @test_broken unthunk(rrule(^, 0.0, p)[2](1.0)[3]) == 0.0
+        # @testset "^(x::Float64, p::$S) near x=0, p=1,0,-1,-2" for S in (Int, Float64)
+        #     # x^2. Easy to get NaN here by mistake.
+        #     p = S(+2)
+        #     @test frule((1,1,1), ^, 0.0, p)[1] == 0         # value
+        #     @test_broken frule((1,1,1), ^, 0.0, p)[2] == 0  # gradient, forwards
+        #     @test rrule(^, 0.0, p)[1] == 0                  # value
+        #     @test unthunk(rrule(^, 0.0, p)[2](1.0)[2]) == 0 # gradient, reverse
+
+        #     # Identity function x^1, at zero
+        #     p = S(+1)
+        #     @test frule((1,1,1), ^, 0.0, p)[1] == 0
+        #     @test_broken frule((1,1,1), ^, 0.0, p)[2] == 1
+        #     @test rrule(^, 0.0, p)[1] == 0
+        #     @test unthunk(rrule(^, 0.0, p)[2](1.0)[2]) == 1
+
+        #     # Trivial singularity: 0^0 == 1 in Julia
+        #     p = S(0)
+        #     @test_skip frule((1,1,1), ^, 0.0, p)[1] == (0.0)^0
+        #     @test_broken frule((1,1,1), ^, 0.0, p)[2] == 0
+        #     @test_broken unthunk(rrule(^, 0.0, p)[2](1.0)[3]) == 0.0
             
-            # Odd power, 1/x
-            p = S(-1)
-            @test_skip frule((1,1,1), ^, 0.0, p)[1] == (0.0)^-1
-            @test_broken frule((1,1,1), ^, 0.0, p)[2] == -Inf
-            @test_skip rrule(^, 0.0, p)[1] == (0.0)^-1 == Inf
-            @test unthunk(rrule(^, 0.0, p)[2](1.0)[2]) == -Inf
-
-            @test_skip frule((1,1,1), ^, -0.0, p)[1] == (-0.0)^-1
-            @test_broken frule((1,1,1), ^, -0.0, p)[2] == -Inf
-            @test_skip rrule(^, -0.0, p)[1] == (-0.0)^-1 == -Inf
-            @test unthunk(rrule(^, -0.0, p)[2](1.0)[2]) == -Inf
-
-            # Even power, 1/x^2
-            p = S(-2)
-            @test_skip frule((1,1,1), ^, 0.0, p)[1] == (0.0)^-2
-            @test_broken frule((1,1,1), ^, 0.0, p)[2] == -Inf
-            @test_skip rrule(^, 0.0, p)[1] == (0.0)^-2 == Inf
-            @test unthunk(rrule(^, 0.0, p)[2](1.0)[2]) == -Inf
-
-            @test_skip frule((1,1,1), ^, -0.0, p)[1] == (-0.0)^-2
-            @test_broken frule((1,1,1), ^, -0.0, p)[2] == +Inf
-            @test_skip rrule(^, -0.0, p)[1] == (-0.0)^-2 == Inf
-            @test unthunk(rrule(^, -0.0, p)[2](1.0)[2]) == +Inf
-        end
+        #     # Odd power, 1/x
+        #     p = S(-1)
+        #     @test_skip frule((1,1,1), ^, 0.0, p)[1] == (0.0)^-1
+        #     @test_broken frule((1,1,1), ^, 0.0, p)[2] == -Inf
+        #     @test_skip rrule(^, 0.0, p)[1] == (0.0)^-1 == Inf
+        #     @test unthunk(rrule(^, 0.0, p)[2](1.0)[2]) == -Inf
+
+        #     @test_skip frule((1,1,1), ^, -0.0, p)[1] == (-0.0)^-1
+        #     @test_broken frule((1,1,1), ^, -0.0, p)[2] == -Inf
+        #     @test_skip rrule(^, -0.0, p)[1] == (-0.0)^-1 == -Inf
+        #     @test unthunk(rrule(^, -0.0, p)[2](1.0)[2]) == -Inf
+
+        #     # Even power, 1/x^2
+        #     p = S(-2)
+        #     @test_skip frule((1,1,1), ^, 0.0, p)[1] == (0.0)^-2
+        #     @test_broken frule((1,1,1), ^, 0.0, p)[2] == -Inf
+        #     @test_skip rrule(^, 0.0, p)[1] == (0.0)^-2 == Inf
+        #     @test unthunk(rrule(^, 0.0, p)[2](1.0)[2]) == -Inf
+
+        #     @test_skip frule((1,1,1), ^, -0.0, p)[1] == (-0.0)^-2
+        #     @test_broken frule((1,1,1), ^, -0.0, p)[2] == +Inf
+        #     @test_skip rrule(^, -0.0, p)[1] == (-0.0)^-2 == Inf
+        #     @test unthunk(rrule(^, -0.0, p)[2](1.0)[2]) == +Inf
+        # end
 
         #     T <: Real && @testset "discontinuity for ^(x::Real, n::Int) when x ≤ 0" begin
         #         # finite differences doesn't work for x < 0, so we check manually
@@ -242,6 +243,90 @@ const FASTABLE_AST = quote
         # end
     end
 
+POWERGRADS = [ # (x,p) => (dx,dp)
+# some regular points, sanity checks
+  (1.0, 2)   => (2.0, 0.0),
+  (2.0, 2)   => (4.0, 2.772588722239781),
+# at x=0, gradients for x seem clear, 
+# for p I've just written here what it gives 
+  (0.0, 2)   => (0.0, NaN),
+  (-0.0, 2)  => (-0.0, NaN),
+  (0.0, 1)   => (1.0, NaN), # or zero?
+  (-0.0, 1)  => (1.0, NaN),
+  (0.0, 0)   => (0.0, -Inf),
+  (-0.0, 0)  => (0.0, -Inf),
+  (0.0, -1)  => (-Inf, -Inf),
+  (-0.0, -1) => (-Inf, Inf),
+  (0.0, -2)  => (-Inf, -Inf),
+  (-0.0, -2) => (Inf, -Inf),
+# non-integer powers
+  (0.0, 0.5)   => (Inf, NaN),
+  (0.0, 3.5)   => (0.0, NaN),
+
+]
+for ((x,p), (gx, gp)) in POWERGRADS
+    y = x^p
+
+    y_f = frule((1,1,1), ^, x, p)[1]
+    isequal(y, y_f) || println("^ forward value for $x^$p: got $y_f, expected $y")
+
+    y_r = rrule(^, x, p)[1]
+    isequal(y, y_r) || println("^ reverse value for $x^$p: got $y_r, expected $y")
+
+    gx_f = frule((0,1,0), ^, x, p)[1]
+    gp_f = frule((0,0,1), ^, x, p)[2]
+    # isequal(gx, gx_f) || println("^ forward `x` gradient for $x^$p: got $gx_f, expected $gx, maybe")
+    # isequal(gp, gp_f) || println("^ forward `p` gradient for $x^$p: got $gp_f, expected $gp, maybe")
+
+    gx_r, gp_r = unthunk.(rrule(^, x, p)[2](1))[2:3]
+    isequal(gx, gx_r) || println("^ reverse `x` gradient for $x^$p: got $gx_r, expected $gx")
+    isequal(gp, gp_r) || println("^ reverse `p` gradient for $x^$p: got $gp_r, expected $gp")
+
+end
+for ((x,p), (gx, gp)) in POWERGRADS
+    p isa Int || continue
+    x isa Real || continue
+
+    y = x^p
+
+    y_f = frule((1,1,1,1), Base.literal_pow, ^, x, Val(p))[1]
+    isequal(y, y_f) || println("literal_pow forward value for $x^$p: got $y_f, expected $y")
+
+    y_r = rrule(Base.literal_pow, ^, x, Val(p))[1]
+    isequal(y, y_r) || println("literal_pow reverse value for $x^$p: got $y_r, expected $y")
+
+    gx_r = unthunk(rrule(Base.literal_pow, ^, x, Val(p))[2](1))[3]
+    isequal(gx, gx_r) || println("literal_pow `x` gradient for $x^$p: got $gx_r, expected $gx")
+
+    gx_f = frule((0,0,1,0), Base.literal_pow, ^, x, Val(p))[1]
+    # isequal(gx, gx_f) || println("literal_pow forward `x` gradient for $x^$p: got $gx_f, expected $gx, maybe")
+end
+
+
+for x in Any[0.0, -0.0, 0.0+0im], p in Any[2, 1.5, 1, 0.5, 0, -0.5, -1, -1.5, -2]
+
+    y = x^p
+    yr = rrule(^, x, p)[1]
+    # isequal(y, yr) || printstyled("runtime $x^$p = $y, but rrule gives $yr \n", color=:red)
+
+    gx, gp = unthunk.(rrule(^, x, p)[2](1)[2:3])
+    println("runtime $x^$p gradient from rrule: $gx, $gp")
+
+    p isa Int || continue  # e.g. Meta.@lower x^5.0
+    x isa Real || continue # limitation of methods here?
+    y = Base.literal_pow(^, x, Val(p))
+
+    # yr = rrule(Base.literal_pow, ^, x, Val(p))[1]
+    # isequal(y, yr) || printstyled("literal $x^$p = $y, but rrule gives $yr\n", color=:red)
+
+    # gx = unthunk(rrule(Base.literal_pow, ^, x, Val(p))[2](1))[3]
+    # println("literal $x^$p gradient from rrule: $gx")
+
+    # gg[(x,p)] = (gx, nothing)
+end
+
+
+
     @testset "sign" begin
         @testset "real" begin
             @testset "at $x" for x in (-1.1, -1.1, 0.5, 100.0)