[ChainRulesTestUtils.jl](https://github.com/JuliaDiff/ChainRulesTestUtils.jl) provides tools for writing tests based on [FiniteDifferences.jl](https://github.com/JuliaDiff/FiniteDifferences.jl).
Take a look at the documentation or the existing [ChainRules.jl](https://github.com/JuliaDiff/ChainRules.jl) tests to see how to write the tests.
!!! warning
    Use finite differencing to test derivatives.
    Don't use analytical derivations for derivatives in the tests.
    Those are what you use to define the rules, and so cannot be confidently used in the test.
    If you misread/misunderstood them, then your tests/implementation will have the same mistake.
    Use finite differencing methods instead, as they are based on the primal computation.
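
For instance, here is a minimal sketch of such a check against [FiniteDifferences.jl](https://github.com/JuliaDiff/FiniteDifferences.jl) (the function and its claimed derivative are made up for illustration):

```julia
using FiniteDifferences

# Suppose a rule claims d/dx x^3 == 3x^2.
# Check the claim numerically instead of re-deriving it by hand.
claimed_derivative(x) = 3x^2

fdm = central_fdm(5, 1)  # 5-point central estimate of the first derivative

x = 0.7
isapprox(claimed_derivative(x), fdm(x -> x^3, x); atol=1e-8)  # true
```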
## CAS systems are your friends.
It is very easy to check gradients or derivatives with a computer algebra system (CAS) like [WolframAlpha](https://www.wolframalpha.com/input/?i=gradient+atan2%28x%2Cy%29).
## Which functions need rules?
In principle, a perfect AD system only needs rules for basic operations and can infer the rules for more complicated functions automatically.
In practice, performance needs to be considered as well.
Some functions use `ccall` internally, for example [`^`](https://github.com/JuliaLang/julia/blob/v1.5.3/base/math.jl#L886).
These functions cannot be differentiated through by AD systems, and need custom rules.
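
As a sketch of what such a rule looks like, suppose `myexp` below stands in for a `ccall`-based function that AD cannot look inside (a hypothetical example, using the same `NO_FIELDS` convention as the rules later in this page):

```julia
using ChainRulesCore

# Stand-in for a function implemented via `ccall`, opaque to AD.
myexp(x::Float64) = exp(x)

function ChainRulesCore.rrule(::typeof(myexp), x)
    y = myexp(x)
    myexp_pullback(ȳ) = (NO_FIELDS, y * ȳ)  # d/dx exp(x) = exp(x)
    return y, myexp_pullback
end
```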
Other functions can in principle be differentiated through by an AD system, but there exists a mathematical insight that can dramatically improve the computation of the derivative.
An example is numerical integration, where writing a rule removes the need to perform AD through numerical integration.
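
As a minimal sketch (the quadrature helper `trapz` and the integrand here are hypothetical), the fundamental theorem of calculus gives the derivative directly, so the pullback never has to differentiate through the quadrature loop:

```julia
using ChainRulesCore

# Hypothetical quadrature: approximate ∫₀ˣ f(t) dt with the trapezoid rule.
function trapz(f, x; n=1000)
    h = x / n
    return h * (f(0.0) / 2 + sum(f(i * h) for i in 1:n-1) + f(x) / 2)
end

intsin(x) = trapz(sin, x)

function ChainRulesCore.rrule(::typeof(intsin), x)
    y = intsin(x)
    # d/dx ∫₀ˣ sin(t) dt = sin(x), so no AD through the loop is needed.
    intsin_pullback(ȳ) = (NO_FIELDS, sin(x) * ȳ)
    return y, intsin_pullback
end
```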
Furthermore, AD systems make different trade-offs in performance due to their design.
This means that a certain rule may help one AD system, while neither improving nor harming another.
Below, we list some patterns relevant for the [Zygote.jl](https://github.com/FluxML/Zygote.jl) AD system.
### Patterns that need rules in [Zygote.jl](https://github.com/FluxML/Zygote.jl)
There are a few classes of functions that Zygote cannot differentiate through.
Custom rules will need to be written for these to make AD work.
Other patterns can be AD'ed through, but the backward pass performance can be greatly improved by writing a rule.
#### Functions which mutate arrays
For example,
```julia
function addone!(array)
    array .+= 1
    return sum(array)
end
```
complains that
```julia
julia> using Zygote
julia> a = rand(3);
julia> gradient(addone!, a)
ERROR: Mutating arrays is not supported
```
However, upon adding the `rrule` (restart the REPL after calling `gradient`)
```julia
using ChainRules

function ChainRules.rrule(::typeof(addone!), a)
    y = addone!(a)
    function addone!_pullback(ȳ)
        return NO_FIELDS, ȳ .* ones(length(a))
    end
    return y, addone!_pullback
end
```
the gradient can be evaluated:
```julia
julia> gradient(addone!, a)
([1.0, 1.0, 1.0],)
```
!!! note "Why restart the REPL after calling `gradient`?"
    When `gradient` is called in `Zygote` for a function with no `rrule` defined, a backward pass for the function call is generated and cached.
    When `gradient` is called for the second time on the same function signature, the backward pass is reused without checking whether an `rrule` has been defined between the two calls to `gradient`.

    If an `rrule` is defined before the first call to `gradient`, it should register the rule and use it, but that prevents comparing what happens before and after the `rrule` is defined.
    To compare both versions with and without an `rrule` in the REPL simultaneously, define a function `f(x) = <body>` (no `rrule`), another function `f_cr(x) = f(x)`, and an `rrule` for `f_cr`, as sketched below.
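
    A minimal sketch of that setup (`f` and `f_cr` are names chosen for illustration):
    ```julia
    using Zygote, ChainRulesCore

    f(x) = x^2      # no rrule: Zygote generates and caches its own pullback
    f_cr(x) = f(x)  # the same computation, but with a custom rrule below

    function ChainRulesCore.rrule(::typeof(f_cr), x)
        y = f_cr(x)
        f_cr_pullback(ȳ) = (NO_FIELDS, 2 * x * ȳ)
        return y, f_cr_pullback
    end

    gradient(f, 3.0)     # uses the Zygote-generated pullback
    gradient(f_cr, 3.0)  # uses the custom rrule
    ```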
#### Exception handling
Zygote does not support differentiating through `try`/`catch` statements.
For example, differentiating through
```julia
function exception(x)
    try
        return x^2
    catch e
        println("could not square input")
        throw(e)
    end
end
```
does not work
```julia
julia> gradient(exception, 3.0)
ERROR: Compiling Tuple{typeof(exception),Float64}: try/catch is not supported.
```
without an `rrule` defined (restart the REPL after calling `gradient`)
```julia
using ChainRulesCore

function ChainRulesCore.rrule(::typeof(exception), x)
    y = exception(x)
    function exception_pullback(ȳ)
        return NO_FIELDS, 2 * x * ȳ
    end
    return y, exception_pullback
end
```
```julia
julia> gradient(exception, 3.0)
(6.0,)
```
#### Loops
Julia runs loops fast.
Unfortunately, Zygote differentiates through loops slowly.
So, for example, computing the mean squared error by using a loop
```julia
function mse(y, ŷ)
    N = length(y)
    s = 0.0
    for i in 1:N
        s += (y[i] - ŷ[i])^2.0
    end
    return s / N
end
```
takes a lot longer to AD through
```julia
julia> using BenchmarkTools
julia> y = rand(30);
julia> ŷ = rand(30);
julia> @btime gradient(mse, $y, $ŷ);
38.180 μs (993 allocations: 65.00 KiB)
```
than if we supply an `rrule` (restart the REPL after calling `gradient`)
```julia
using ChainRules

function ChainRules.rrule(::typeof(mse), x, x̂)
    output = mse(x, x̂)
    function mse_pullback(ȳ)
        N = length(x)
        g = (2 / N) .* (x .- x̂) .* ȳ
        return NO_FIELDS, g, -g
    end
    return output, mse_pullback
end
```
which is much faster
```julia
julia> @btime gradient(mse, $y, $ŷ);
143.697 ns (2 allocations: 672 bytes)
```
#### In-place accumulation
In-place accumulation of gradients is slow in `Zygote`.
The issue, demonstrated in the following example, is that the gradient of `getindex` allocates an array of zeros with a single non-zero element.
```julia
function sum3(array)
    x = array[1]
    y = array[2]
    z = array[3]
    return x + y + z
end
```
```julia
julia> @btime gradient(sum3, rand(30));
424.510 ns (9 allocations: 2.06 KiB)
```
Computing the gradient with only a single array allocation using an `rrule` (restart the REPL after calling `gradient`) is much faster.
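
One possible sketch of such a rule, assuming `sum3` as defined above (so the gradient is non-zero only in the first three entries):

```julia
using ChainRules

function ChainRules.rrule(::typeof(sum3), a)
    y = sum3(a)
    function sum3_pullback(ȳ)
        # Allocate the full gradient once, instead of one zeros-array
        # per `getindex` call as in the AD-generated pullback.
        ā = zeros(length(a))
        ā[1:3] .= ȳ
        return NO_FIELDS, ā
    end
    return y, sum3_pullback
end
```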
0 commit comments