From a6d93cb839e668bb8cfe691e6819a6716c2a566d Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Mon, 29 Aug 2022 18:30:30 -0400 Subject: [PATCH 1/9] move docs around --- docs/Project.toml | 1 + docs/make.jl | 23 ++--- docs/src/community.md | 5 -- docs/src/datasets.md | 6 -- docs/src/ecosystem.md | 6 +- docs/src/index.md | 6 ++ docs/src/models/layers.md | 9 ++ docs/src/outputsize.md | 47 +++++++++++ docs/src/training/callbacks.md | 77 +++++++++++++++++ docs/src/utilities.md | 148 ++------------------------------- 10 files changed, 166 insertions(+), 162 deletions(-) delete mode 100644 docs/src/community.md delete mode 100644 docs/src/datasets.md create mode 100644 docs/src/outputsize.md create mode 100644 docs/src/training/callbacks.md diff --git a/docs/Project.toml b/docs/Project.toml index 65ec0c0aa1..a8057d2e68 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -1,5 +1,6 @@ [deps] BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0" +ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196" MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54" diff --git a/docs/make.jl b/docs/make.jl index 46b55aefc3..0e562d2815 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -14,28 +14,29 @@ makedocs( "Overview" => "models/overview.md", "Basics" => "models/basics.md", "Recurrence" => "models/recurrence.md", - "Model Reference" => "models/layers.md", + "Layer Reference" => "models/layers.md", "Loss Functions" => "models/losses.md", "Regularisation" => "models/regularisation.md", "Advanced Model Building" => "models/advanced.md", - "Neural Network primitives from NNlib.jl" => "models/nnlib.md", - "Recursive transformations from Functors.jl" => "models/functors.md" + "NNlib.jl" => "models/nnlib.md", + "Functors.jl" => "models/functors.md", ], "Handling Data" => [ - "One-Hot Encoding with OneHotArrays.jl" => "data/onehot.md", - "Working with data using MLUtils.jl" => "data/mlutils.md" + "MLUtils.jl" => "data/mlutils.md", + "OneHotArrays.jl" => "data/onehot.md", ], "Training Models" => [ "Optimisers" => "training/optimisers.md", - "Training" => "training/training.md" + "Training" => "training/training.md", ], "GPU Support" => "gpu.md", - "Saving & Loading" => "saving.md", - "The Julia Ecosystem" => "ecosystem.md", - "Utility Functions" => "utilities.md", + "Model Tools" => [ + "Saving & Loading" => "saving.md", + "Size Propagation" => "outputsize.md", + "Weight Initialisation" => "utilities.md", + ], "Performance Tips" => "performance.md", - "Datasets" => "datasets.md", - "Community" => "community.md" + "Flux's Ecosystem" => "ecosystem.md", ], format = Documenter.HTML( analytics = "UA-36890222-9", diff --git a/docs/src/community.md b/docs/src/community.md deleted file mode 100644 index e9ef999b2c..0000000000 --- a/docs/src/community.md +++ /dev/null @@ -1,5 +0,0 @@ -# Community - -All Flux users are welcome to join our community on the [Julia forum](https://discourse.julialang.org/), or the [slack](https://discourse.julialang.org/t/announcing-a-julia-slack/4866) (channel #machine-learning). If you have questions or issues we'll try to help you out. - -If you're interested in hacking on Flux, the [source code](https://github.com/FluxML/Flux.jl) is open and easy to understand -- it's all just the same Julia code you work with normally. 
You might be interested in our [intro issues](https://github.com/FluxML/Flux.jl/labels/good%20first%20issue) to get started or our [contributing guide](https://github.com/FluxML/Flux.jl/blob/master/CONTRIBUTING.md).
diff --git a/docs/src/datasets.md b/docs/src/datasets.md
deleted file mode 100644
index 9e4eaa913f..0000000000
--- a/docs/src/datasets.md
+++ /dev/null
@@ -1,6 +0,0 @@
-# Datasets
-
-Commonly used machine learning datasets are provided by the following packages in the julia ecosystem:
-
-- [MLDatasets.jl](https://github.com/JuliaML/MLDatasets.jl): utility package for accessing common machine learning datasets.
-- [GraphMLDatasets.jl](https://github.com/yuehhua/GraphMLDatasets.jl): a library for machine learning datasets on graph.
diff --git a/docs/src/ecosystem.md b/docs/src/ecosystem.md
index e96680aeec..9bcefc8d28 100644
--- a/docs/src/ecosystem.md
+++ b/docs/src/ecosystem.md
@@ -1,4 +1,4 @@
-# The Julia Ecosystem
+# The Julia Ecosystem around Flux
 
 One of the main strengths of Julia lies in an ecosystem of packages globally providing a rich and consistent user experience.
 
@@ -49,7 +49,10 @@ Utility tools you're unlikely to have met if you never used Flux!
 
 ### Datasets
 
+Commonly used machine learning datasets are provided by the following packages in the Julia ecosystem:
+
 - [MLDatasets.jl](https://github.com/JuliaML/MLDatasets.jl) focuses on downloading, unpacking, and accessing benchmark datasets.
+- [GraphMLDatasets.jl](https://github.com/yuehhua/GraphMLDatasets.jl) provides machine learning datasets on graphs.
 
 ### Plumbing
 
@@ -87,6 +90,7 @@ Packages based on differentiable programming but not necessarily related to Mach
 
 - [OnlineStats.jl](https://github.com/joshday/OnlineStats.jl) provides single-pass algorithms for statistics.
 
+
 ## Useful miscellaneous packages
 
 Some useful and random packages!
diff --git a/docs/src/index.md b/docs/src/index.md
index 28f99a80de..e56c0aefd1 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -18,3 +18,9 @@ NOTE: Flux used to have a CuArrays.jl dependency until v0.10.4, replaced by CUDA
 ## Learning Flux
 
 There are several different ways to learn Flux. If you just want to get started writing models, the [model zoo](https://github.com/FluxML/model-zoo/) gives good starting points for many common ones. This documentation provides a reference to all of Flux's APIs, as well as a from-scratch introduction to Flux's take on models and how they work. Once you understand these docs, congratulations, you also understand [Flux's source code](https://github.com/FluxML/Flux.jl), which is intended to be concise, legible and a good reference for more advanced concepts.
+
+## Community
+
+All Flux users are welcome to join our community on the [Julia forum](https://discourse.julialang.org/), or the [slack](https://discourse.julialang.org/t/announcing-a-julia-slack/4866) (channel #machine-learning). If you have questions or issues we'll try to help you out.
+
+If you're interested in hacking on Flux, the [source code](https://github.com/FluxML/Flux.jl) is open and easy to understand -- it's all just the same Julia code you work with normally. You might be interested in our [intro issues](https://github.com/FluxML/Flux.jl/labels/good%20first%20issue) to get started or our [contributing guide](https://github.com/FluxML/Flux.jl/blob/master/CONTRIBUTING.md).
diff --git a/docs/src/models/layers.md b/docs/src/models/layers.md
index 6230637744..a954ed1609 100644
--- a/docs/src/models/layers.md
+++ b/docs/src/models/layers.md
@@ -86,3 +86,12 @@ Many normalisation layers behave differently under training and inference (testi
 Flux.testmode!
 trainmode!
 ```
+
+
+## Listing All Layers
+
+The `modules` command uses Functors to extract a flat list of all layers:
+
+```@docs
+Flux.modules
+```
diff --git a/docs/src/outputsize.md b/docs/src/outputsize.md
new file mode 100644
index 0000000000..3cdffdcce3
--- /dev/null
+++ b/docs/src/outputsize.md
@@ -0,0 +1,47 @@
+## Model Building
+
+Flux provides some utility functions to help you generate models in an automated fashion.
+
+[`Flux.outputsize`](@ref) enables you to calculate the output sizes of layers like [`Conv`](@ref)
+when applied to input samples of a given size. This is achieved by passing a "dummy" array into
+the model that preserves size information without running any computation.
+`outputsize(f, inputsize)` works for all layers (including custom layers) out of the box.
+By default, `inputsize` expects the batch dimension,
+but you can exclude the batch size with `outputsize(f, inputsize; padbatch=true)` (assuming it to be one).
+
+Using this utility function lets you automate model building for various inputs like so:
+```julia
+"""
+    make_model(width, height, inchannels, nclasses;
+               layer_config = [16, 16, 32, 32, 64, 64])
+
+Create a CNN for a given set of configuration parameters.
+
+# Arguments
+- `width`: the input image width
+- `height`: the input image height
+- `inchannels`: the number of channels in the input image
+- `nclasses`: the number of output classes
+- `layer_config`: a vector of the number of filters per each conv layer
+"""
+function make_model(width, height, inchannels, nclasses;
+                    layer_config = [16, 16, 32, 32, 64, 64])
+  # construct a vector of conv layers programmatically
+  conv_layers = [Conv((3, 3), inchannels => layer_config[1])]
+  for (infilters, outfilters) in zip(layer_config, layer_config[2:end])
+    push!(conv_layers, Conv((3, 3), infilters => outfilters))
+  end
+
+  # compute the output dimensions for the conv layers
+  # use padbatch=true to set the batch dimension to 1
+  conv_outsize = Flux.outputsize(conv_layers, (width, height, inchannels); padbatch=true)
+
+  # the input dimension to Dense is programmatically calculated from
+  # width, height, and inchannels
+  return Chain(conv_layers..., Dense(prod(conv_outsize) => nclasses))
+end
+```
+
+```@docs
+Flux.outputsize
+```
\ No newline at end of file
diff --git a/docs/src/training/callbacks.md b/docs/src/training/callbacks.md
new file mode 100644
index 0000000000..3343a1664c
--- /dev/null
+++ b/docs/src/training/callbacks.md
@@ -0,0 +1,77 @@
+## Callback Helpers
+
+```@docs
+Flux.throttle
+Flux.stop
+Flux.skip
+```
+
+## Patience Helpers
+
+Flux provides utilities for controlling your training procedure according to some monitored condition and a maximum `patience`. For example, you can use `early_stopping` to stop training when the model is converging or deteriorating, or you can use `plateau` to check if the model is stagnating.
+
+For example, below we create a pseudo-loss function that decreases, bottoms out, and then increases. The early stopping trigger will break the loop before the loss increases too much.
+```julia
+# create a pseudo-loss that decreases for 4 calls, then starts increasing
+# we call this like loss()
+loss = let t = 0
+  () -> begin
+    t += 1
+    (t - 4) ^ 2
+  end
+end
+
+# create an early stopping trigger
+# returns true when the loss increases for two consecutive steps
+es = early_stopping(loss, 2; init_score = 9)
+
+# this will stop at the 6th (4 decreasing + 2 increasing calls) epoch
+@epochs 10 begin
+  es() && break
+end
+```
+
+The keyword argument `distance` of `early_stopping` is a function of the form `distance(best_score, score)`. By default `distance` is `-`, which implies that the monitored metric `f` is expected to be decreasing and minimized. If you use some increasing metric (e.g. accuracy), you can customize the `distance` function: `(best_score, score) -> score - best_score`.
+```julia
+# create a pseudo-accuracy that increases by 0.01 each time from 0 to 1
+# we call this like acc()
+acc = let v = 0
+  () -> v = min(1, v + 0.01)
+end
+
+# create an early stopping trigger for accuracy
+es = early_stopping(acc, 3; distance = (best_score, score) -> score - best_score)
+
+# this will iterate until the 10th epoch
+@epochs 10 begin
+  es() && break
+end
+```
+
+`early_stopping` and `plateau` are both built on top of `patience`. You can use `patience` to build your own triggers that use a patient counter. For example, if you want to trigger when the loss is below a threshold for several consecutive iterations:
+```julia
+threshold(f, thresh, delay) = patience(delay) do
+  f() < thresh
+end
+```
+
+Both `predicate` in `patience` and `f` in `early_stopping` / `plateau` can accept extra arguments. You can pass such extra arguments to `predicate` or `f` through the returned function:
+```julia
+trigger = patience((a; b) -> a > b, 3)
+
+# this will iterate until the 10th epoch
+@epochs 10 begin
+  trigger(1; b = 2) && break
+end
+
+# this will stop at the 3rd epoch
+@epochs 10 begin
+  trigger(3; b = 2) && break
+end
+```
+
+```@docs
+Flux.patience
+Flux.early_stopping
+Flux.plateau
+```
diff --git a/docs/src/utilities.md b/docs/src/utilities.md
index 28f6bc4a18..5f092c0216 100644
--- a/docs/src/utilities.md
+++ b/docs/src/utilities.md
@@ -1,9 +1,4 @@
-# Utility Functions
-
-Flux provides utility functions which can be used to initialize your layers
-or to regularly execute callback functions.
-
-## Layer Initialisation
+# Random Weight Initialisation
 
 Flux initialises convolutional layers and recurrent cells with `glorot_uniform` by default.
 Most layers accept a function as an `init` keyword, which replaces this default. For example:
@@ -32,6 +27,8 @@ julia> Dense(4 => 5, tanh; init=Flux.randn32(MersenneTwister(1)))
 Dense(4 => 5, tanh)  # 25 parameters
 ```
 
+## Initialisation Functions
+
 ```@docs
 Flux.glorot_uniform
 Flux.glorot_normal
@@ -45,8 +42,14 @@ Flux.ones32
 Flux.zeros32
 Flux.rand32
 Flux.randn32
+```
+
+These functions call:
+
+```@docs
 Flux.rng_from_array
 Flux.default_rng_value
+Flux.nfan
 ```
 
 ## Changing the type of model parameters
@@ -58,136 +61,3 @@ The `eltype` of model `m` can be changed to `Float64` by `f64(m)`:
 Flux.f64
 Flux.f32
 ```
-
-## Model Building
-
-Flux provides some utility functions to help you generate models in an automated fashion.
-
-[`Flux.outputsize`](@ref) enables you to calculate the output sizes of layers like [`Conv`](@ref)
-when applied to input samples of a given size. This is achieved by passing a "dummy" array into
-the model that preserves size information without running any computation.
-`outputsize(f, inputsize)` works for all layers (including custom layers) out of the box. -By default, `inputsize` expects the batch dimension, -but you can exclude the batch size with `outputsize(f, inputsize; padbatch=true)` (assuming it to be one). - -Using this utility function lets you automate model building for various inputs like so: -```julia -""" - make_model(width, height, inchannels, nclasses; - layer_config = [16, 16, 32, 32, 64, 64]) - -Create a CNN for a given set of configuration parameters. - -# Arguments -- `width`: the input image width -- `height`: the input image height -- `inchannels`: the number of channels in the input image -- `nclasses`: the number of output classes -- `layer_config`: a vector of the number of filters per each conv layer -""" -function make_model(width, height, inchannels, nclasses; - layer_config = [16, 16, 32, 32, 64, 64]) - # construct a vector of conv layers programmatically - conv_layers = [Conv((3, 3), inchannels => layer_config[1])] - for (infilters, outfilters) in zip(layer_config, layer_config[2:end]) - push!(conv_layers, Conv((3, 3), infilters => outfilters)) - end - - # compute the output dimensions for the conv layers - # use padbatch=true to set the batch dimension to 1 - conv_outsize = Flux.outputsize(conv_layers, (width, height, nchannels); padbatch=true) - - # the input dimension to Dense is programatically calculated from - # width, height, and nchannels - return Chain(conv_layers..., Dense(prod(conv_outsize) => nclasses)) -end -``` - -```@docs -Flux.outputsize -``` - -## Model Abstraction - -```@docs -Flux.modules -Flux.nfan -``` - -## Callback Helpers - -```@docs -Flux.throttle -Flux.stop -Flux.skip -``` - -## Patience Helpers - -Flux provides utilities for controlling your training procedure according to some monitored condition and a maximum `patience`. For example, you can use `early_stopping` to stop training when the model is converging or deteriorating, or you can use `plateau` to check if the model is stagnating. - -For example, below we create a pseudo-loss function that decreases, bottoms out, and then increases. The early stopping trigger will break the loop before the loss increases too much. -```julia -# create a pseudo-loss that decreases for 4 calls, then starts increasing -# we call this like loss() -loss = let t = 0 - () -> begin - t += 1 - (t - 4) ^ 2 - end -end - -# create an early stopping trigger -# returns true when the loss increases for two consecutive steps -es = early_stopping(loss, 2; init_score = 9) - -# this will stop at the 6th (4 decreasing + 2 increasing calls) epoch -@epochs 10 begin - es() && break -end -``` - -The keyword argument `distance` of `early_stopping` is a function of the form `distance(best_score, score)`. By default `distance` is `-`, which implies that the monitored metric `f` is expected to be decreasing and minimized. If you use some increasing metric (e.g. accuracy), you can customize the `distance` function: `(best_score, score) -> score - best_score`. -```julia -# create a pseudo-accuracy that increases by 0.01 each time from 0 to 1 -# we call this like acc() -acc = let v = 0 - () -> v = max(1, v + 0.01) -end - -# create an early stopping trigger for accuracy -es = early_stopping(acc, 3; delta = (best_score, score) -> score - best_score) - -# this will iterate until the 10th epoch -@epochs 10 begin - es() && break -end -``` - -`early_stopping` and `plateau` are both built on top of `patience`. You can use `patience` to build your own triggers that use a patient counter. 
For example, if you want to trigger when the loss is below a threshold for several consecutive iterations:
-```julia
-threshold(f, thresh, delay) = patience(delay) do
-  f() < thresh
-end
-```
-
-Both `predicate` in `patience` and `f` in `early_stopping` / `plateau` can accept extra arguments. You can pass such extra arguments to `predicate` or `f` through the returned function:
-```julia
-trigger = patience((a; b) -> a > b, 3)
-
-# this will iterate until the 10th epoch
-@epochs 10 begin
-  trigger(1; b = 2) && break
-end
-
-# this will stop at the 3rd epoch
-@epochs 10 begin
-  trigger(3; b = 2) && break
-end
-```
-
-```@docs
-Flux.patience
-Flux.early_stopping
-Flux.plateau
-```
From e5f67dd6caf6f531149708c25834866d97c60166 Mon Sep 17 00:00:00 2001
From: Michael Abbott <32575566+mcabbott@users.noreply.github.com>
Date: Mon, 29 Aug 2022 18:30:39 -0400
Subject: [PATCH 2/9] add a page for Zygote
---
 docs/make.jl                |  1 +
 docs/src/training/zygote.md | 22 ++++++++++++++++++++++
 2 files changed, 23 insertions(+)
 create mode 100644 docs/src/training/zygote.md
diff --git a/docs/make.jl b/docs/make.jl
index 0e562d2815..2091791beb 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -28,6 +28,7 @@ makedocs(
     "Training Models" => [
         "Optimisers" => "training/optimisers.md",
         "Training" => "training/training.md",
+        "Zygote.jl" => "training/zygote.md",
     ],
     "GPU Support" => "gpu.md",
     "Model Tools" => [
diff --git a/docs/src/training/zygote.md b/docs/src/training/zygote.md
new file mode 100644
index 0000000000..8687885912
--- /dev/null
+++ b/docs/src/training/zygote.md
@@ -0,0 +1,22 @@
+# Automatic Differentiation using Zygote.jl
+
+Flux re-exports the `gradient` from [Zygote](https://github.com/FluxML/Zygote.jl), and uses this function within [`train!`](@ref) to differentiate the model. Zygote has its own [documentation](https://fluxml.ai/Zygote.jl/dev/), in particular listing some [limitations](https://fluxml.ai/Zygote.jl/dev/limitations/).
+
+```@docs
+Zygote.gradient
+Zygote.jacobian
+Zygote.withgradient
+```
+
+Sometimes it is necessary to exclude some code, or a whole function, from automatic differentiation. This can be done using [ChainRules](https://github.com/JuliaDiff/ChainRules.jl):
+
+```@docs
+ChainRulesCore.ignore_derivatives
+ChainRulesCore.@non_differentiable
+```
+
+To manually supply the gradient for one function, you should define a method of `rrule`. ChainRules has [detailed documentation](https://juliadiff.org/ChainRulesCore.jl/stable/) on how this works.
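+
+As a very rough sketch of what such a definition can look like (using a hypothetical scalar function `mysquare`, which is not part of Flux or NNlib):
+
+```julia
+using ChainRulesCore
+
+mysquare(x::Real) = x^2   # Zygote could differentiate this by itself; it is only an illustration
+
+function ChainRulesCore.rrule(::typeof(mysquare), x::Real)
+    y = mysquare(x)
+    # the pullback maps a gradient with respect to y back to a gradient with respect to x
+    mysquare_pullback(ȳ) = (NoTangent(), 2x * ȳ)
+    return y, mysquare_pullback
+end
+```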
+ +```@docs +ChainRulesCore.rrule +``` \ No newline at end of file From d31694dd28770c7c69d478b9bd967adc5bfe15a7 Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Mon, 29 Aug 2022 19:20:10 -0400 Subject: [PATCH 3/9] fixup --- docs/make.jl | 6 +++--- docs/src/data/onehot.md | 2 ++ docs/src/models/losses.md | 4 ++-- docs/src/outputsize.md | 2 +- docs/src/utilities.md | 4 ++-- 5 files changed, 10 insertions(+), 8 deletions(-) diff --git a/docs/make.jl b/docs/make.jl index 2091791beb..c3d7d138fa 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -1,13 +1,13 @@ -using Documenter, Flux, NNlib, Functors, MLUtils, BSON, Optimisers, OneHotArrays +using Documenter, Flux, NNlib, Functors, MLUtils, BSON, Optimisers, OneHotArrays, Zygote, ChainRulesCore DocMeta.setdocmeta!(Flux, :DocTestSetup, :(using Flux); recursive = true) makedocs( - modules = [Flux, NNlib, Functors, MLUtils, BSON, Optimisers, OneHotArrays], + modules = [Flux, NNlib, Functors, MLUtils, BSON, Optimisers, OneHotArrays, Zygote, ChainRulesCore, Base], doctest = false, sitename = "Flux", - strict = [:cross_references,], + # strict = [:cross_references,], pages = [ "Home" => "index.md", "Building Models" => [ diff --git a/docs/src/data/onehot.md b/docs/src/data/onehot.md index 1df5cd27e8..d51c9963b9 100644 --- a/docs/src/data/onehot.md +++ b/docs/src/data/onehot.md @@ -51,6 +51,8 @@ julia> onecold(ans, [:a, :b, :c]) Note that these operations returned `OneHotVector` and `OneHotMatrix` rather than `Array`s. `OneHotVector`s behave like normal vectors but avoid any unnecessary cost compared to using an integer index directly. For example, multiplying a matrix with a one-hot vector simply slices out the relevant row of the matrix under the hood. +### Function listing + ```@docs OneHotArrays.onehot OneHotArrays.onecold diff --git a/docs/src/models/losses.md b/docs/src/models/losses.md index 39703ec365..3929411db7 100644 --- a/docs/src/models/losses.md +++ b/docs/src/models/losses.md @@ -3,7 +3,7 @@ Flux provides a large number of common loss functions used for training machine learning models. They are grouped together in the `Flux.Losses` module. -Loss functions for supervised learning typically expect as inputs a target `y`, and a prediction `ŷ`. +Loss functions for supervised learning typically expect as inputs a target `y`, and a prediction `ŷ` from your model. In Flux's convention, the order of the arguments is the following ```julia @@ -21,7 +21,7 @@ loss(ŷ, y, agg=x->mean(w .* x)) # weighted mean loss(ŷ, y, agg=identity) # no aggregation. ``` -## Losses Reference +### Function listing ```@docs Flux.Losses.mae diff --git a/docs/src/outputsize.md b/docs/src/outputsize.md index 3cdffdcce3..b15bf1a8b4 100644 --- a/docs/src/outputsize.md +++ b/docs/src/outputsize.md @@ -1,4 +1,4 @@ -## Model Building +# Size Propagation Flux provides some utility functions to help you generate models in an automated fashion. diff --git a/docs/src/utilities.md b/docs/src/utilities.md index 5f092c0216..a6f963fa58 100644 --- a/docs/src/utilities.md +++ b/docs/src/utilities.md @@ -27,7 +27,7 @@ julia> Dense(4 => 5, tanh; init=Flux.randn32(MersenneTwister(1))) Dense(4 => 5, tanh) # 25 parameters ``` -## Initialisation Functions +## Initialisation functions ```@docs Flux.glorot_uniform @@ -52,7 +52,7 @@ Flux.default_rng_value Flux.nfan ``` -## Changing the type of model parameters +## Changing the type of all parameters The default `eltype` for models is `Float32` since models are often trained/run on GPUs. 
 The `eltype` of model `m` can be changed to `Float64` by `f64(m)`:
From 67ac0b177baf078c32db760a18721467cc038586 Mon Sep 17 00:00:00 2001
From: Michael Abbott <32575566+mcabbott@users.noreply.github.com>
Date: Mon, 29 Aug 2022 19:20:27 -0400
Subject: [PATCH 4/9] move NNlib's activation functions to their own page
---
 docs/make.jl                  |  1 +
 docs/src/models/activation.md | 39 +++++++++++++++++++++++++++++++++++
 docs/src/models/nnlib.md      | 33 +----------------------------
 3 files changed, 41 insertions(+), 32 deletions(-)
 create mode 100644 docs/src/models/activation.md
diff --git a/docs/make.jl b/docs/make.jl
index c3d7d138fa..21b2102502 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -19,6 +19,7 @@ makedocs(
         "Regularisation" => "models/regularisation.md",
         "Advanced Model Building" => "models/advanced.md",
         "NNlib.jl" => "models/nnlib.md",
+        "Activation Functions" => "models/activation.md",
        "Functors.jl" => "models/functors.md",
     ],
     "Handling Data" => [
diff --git a/docs/src/models/activation.md b/docs/src/models/activation.md
new file mode 100644
index 0000000000..8754090a04
--- /dev/null
+++ b/docs/src/models/activation.md
@@ -0,0 +1,39 @@
+
+# Activation Functions from NNlib.jl
+
+These non-linearities, used between the layers of your model, are exported by the [NNlib](https://github.com/FluxML/NNlib.jl) package.
+
+Note that, unless otherwise stated, activation functions operate on scalars. To apply them to an array you can call `σ.(xs)`, `relu.(xs)` and so on. Alternatively, they can be passed to a layer like `Dense(784 => 1024, relu)` which will handle this broadcasting.
+
+```@docs
+celu
+elu
+gelu
+hardsigmoid
+sigmoid_fast
+hardtanh
+tanh_fast
+leakyrelu
+lisht
+logcosh
+logsigmoid
+mish
+relu
+relu6
+rrelu
+selu
+sigmoid
+softplus
+softshrink
+softsign
+swish
+hardswish
+tanhshrink
+trelu
+```
+
+Julia's `Base.Math` also provides `tanh`, which can be used as an activation function:
+
+```@docs
+tanh
+```
\ No newline at end of file
diff --git a/docs/src/models/nnlib.md b/docs/src/models/nnlib.md
index 4ca4c85104..cf42cc99bf 100644
--- a/docs/src/models/nnlib.md
+++ b/docs/src/models/nnlib.md
@@ -1,37 +1,6 @@
 # Neural Network primitives from NNlib.jl
 
-Flux re-exports all of the functions exported by the [NNlib](https://github.com/FluxML/NNlib.jl) package.
-
-## Activation Functions
-
-Non-linearities that go between layers of your model. Note that, unless otherwise stated, activation functions operate on scalars. To apply them to an array you can call `σ.(xs)`, `relu.(xs)` and so on.
-
-```@docs
-celu
-elu
-gelu
-hardsigmoid
-sigmoid_fast
-hardtanh
-tanh_fast
-leakyrelu
-lisht
-logcosh
-logsigmoid
-mish
-relu
-relu6
-rrelu
-selu
-sigmoid
-softplus
-softshrink
-softsign
-swish
-hardswish
-tanhshrink
-trelu
-```
+Flux re-exports all of the functions exported by the [NNlib](https://github.com/FluxML/NNlib.jl) package. This includes activation functions, described on the next page. Many of the functions on this page exist primarily as the internal implementation of Flux layers, but can also be used independently.
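+
+For instance, here is a rough sketch of calling `conv` directly on plain arrays, with no `Conv` layer involved (the sizes are only illustrative):
+
+```julia
+using NNlib  # re-exported by Flux
+
+x = rand(Float32, 28, 28, 1, 8)   # a WHCN batch of eight 28×28 single-channel images
+w = rand(Float32, 3, 3, 1, 4)     # a 3×3 filter taking 1 channel to 4
+
+y = conv(x, w)                    # a plain convolution, no layer object needed
+size(y)                           # (26, 26, 4, 8)
+```
+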
## Softmax From 9f16d82ef153793b488e887dfb7a052f2b2ad4a9 Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Mon, 29 Aug 2022 19:47:04 -0400 Subject: [PATCH 5/9] restore callback helpers --- docs/make.jl | 1 + docs/src/training/callbacks.md | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/make.jl b/docs/make.jl index 21b2102502..6c9dad2623 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -29,6 +29,7 @@ makedocs( "Training Models" => [ "Optimisers" => "training/optimisers.md", "Training" => "training/training.md", + "Callback Helpers" => "training/callbacks.md", "Zygote.jl" => "training/zygote.md", ], "GPU Support" => "gpu.md", diff --git a/docs/src/training/callbacks.md b/docs/src/training/callbacks.md index 3343a1664c..99c80986f1 100644 --- a/docs/src/training/callbacks.md +++ b/docs/src/training/callbacks.md @@ -1,4 +1,4 @@ -## Callback Helpers +# Callback Helpers ```@docs Flux.throttle From b9c2352327fd2c68f1297d80414c83dbf0a6b400 Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Mon, 29 Aug 2022 19:47:20 -0400 Subject: [PATCH 6/9] re-name advanced... page --- docs/make.jl | 2 +- docs/src/models/advanced.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/make.jl b/docs/make.jl index 6c9dad2623..60089a1aad 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -17,7 +17,7 @@ makedocs( "Layer Reference" => "models/layers.md", "Loss Functions" => "models/losses.md", "Regularisation" => "models/regularisation.md", - "Advanced Model Building" => "models/advanced.md", + "Custom Layers" => "models/advanced.md", "NNlib.jl" => "models/nnlib.md", "Activation Functions" => "models/activation.md", "Functors.jl" => "models/functors.md", diff --git a/docs/src/models/advanced.md b/docs/src/models/advanced.md index c9895492a3..dcb4edfa25 100644 --- a/docs/src/models/advanced.md +++ b/docs/src/models/advanced.md @@ -1,4 +1,4 @@ -# Advanced Model Building and Customisation +# Defining Customised Layers Here we will try and describe usage of some more advanced features that Flux provides to give more control over model building. From bf49b664eb7f211d7f85ed9938fb45782d303304 Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Mon, 29 Aug 2022 19:48:21 -0400 Subject: [PATCH 7/9] tweak Zygote page --- docs/Project.toml | 1 + docs/src/training/zygote.md | 23 ++++++++++++++++++++--- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/docs/Project.toml b/docs/Project.toml index a8057d2e68..0879636f3c 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -7,6 +7,7 @@ MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54" NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" OneHotArrays = "0b1bfda6-eb8a-41d2-88d8-f5af5cad476f" Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2" +Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [compat] Documenter = "0.27" diff --git a/docs/src/training/zygote.md b/docs/src/training/zygote.md index 8687885912..7e766754fb 100644 --- a/docs/src/training/zygote.md +++ b/docs/src/training/zygote.md @@ -2,12 +2,29 @@ Flux re-exports the `gradient` from [Zygote](https://github.com/FluxML/Zygote.jl), and uses this function within [`train!`](@ref) to differentiate the model. Zygote has its own [documentation](https://fluxml.ai/Zygote.jl/dev/), in particulat listing some [limitations](https://fluxml.ai/Zygote.jl/dev/limitations/). 
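+
+For example, a small sketch of calling `gradient` on an ordinary Julia function (this is the "explicit" style described below):
+
+```julia
+using Flux  # gradient here is re-exported from Zygote
+
+f(x) = 3x^2 + 2x + 1
+
+gradient(f, 2.0)        # (14.0,), one entry per argument of f
+gradient(+, 1.0, 2.0)   # (1.0, 1.0)
+```
+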
+### Implicit style
+
+Flux uses primarily what Zygote calls "implicit" gradients, [described here](https://fluxml.ai/Zygote.jl/dev/#Explicit-and-Implicit-Parameters-1) in its documentation.
+
 ```@docs
-Zygote.gradient
-Zygote.jacobian
-Zygote.withgradient
+Zygote.gradient(f, pars::Zygote.Params)
+Zygote.Params
+Zygote.Grads
 ```
 
+### Explicit style
+
+The other way of using Zygote, and using most other AD packages, is to explicitly provide a function and its arguments.
+
+```@docs
+Zygote.gradient(f, args...)
+Zygote.withgradient(f, args...)
+Zygote.jacobian(f, args...)
+```
+
+
+### ChainRules
+
 Sometimes it is necessary to exclude some code, or a whole function, from automatic differentiation. This can be done using [ChainRules](https://github.com/JuliaDiff/ChainRules.jl):
 
 ```@docs
From 92f3b10ddf49b60d04b5d6da154fe2a233e96297 Mon Sep 17 00:00:00 2001
From: Michael Abbott <32575566+mcabbott@users.noreply.github.com>
Date: Thu, 15 Sep 2022 12:46:34 -0400
Subject: [PATCH 8/9] shape inference sounds better
---
 docs/make.jl           |  2 +-
 docs/src/outputsize.md | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)
diff --git a/docs/make.jl b/docs/make.jl
index 60089a1aad..4cdcedce54 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -35,7 +35,7 @@ makedocs(
     "GPU Support" => "gpu.md",
     "Model Tools" => [
         "Saving & Loading" => "saving.md",
-        "Size Propagation" => "outputsize.md",
+        "Shape Inference" => "outputsize.md",
         "Weight Initialisation" => "utilities.md",
     ],
     "Performance Tips" => "performance.md",
diff --git a/docs/src/outputsize.md b/docs/src/outputsize.md
index b15bf1a8b4..d692816b46 100644
--- a/docs/src/outputsize.md
+++ b/docs/src/outputsize.md
@@ -1,10 +1,10 @@
-# Size Propagation
+# Shape Inference
 
-Flux provides some utility functions to help you generate models in an automated fashion.
+To help you generate models in an automated fashion, [`Flux.outputsize`](@ref) lets you
+calculate the size returned by layers for a given input size.
+This is especially useful for layers like [`Conv`](@ref).
 
-[`Flux.outputsize`](@ref) enables you to calculate the output sizes of layers like [`Conv`](@ref)
-when applied to input samples of a given size. This is achieved by passing a "dummy" array into
-the model that preserves size information without running any computation.
+It works by passing a "dummy" array into the model that preserves size information without running any computation.
 `outputsize(f, inputsize)` works for all layers (including custom layers) out of the box.
 By default, `inputsize` expects the batch dimension,
 but you can exclude the batch size with `outputsize(f, inputsize; padbatch=true)` (assuming it to be one).
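+
+For example, a quick sketch (the layer sizes here are made up for illustration):
+
+```julia
+julia> using Flux
+
+julia> m = Chain(Conv((3, 3), 3 => 16), Flux.flatten, Dense(14400 => 10));
+
+julia> Flux.outputsize(m, (32, 32, 3, 1))              # input size includes a batch of 1
+(10, 1)
+
+julia> Flux.outputsize(m, (32, 32, 3); padbatch=true)  # batch dimension padded for you
+(10, 1)
+```
+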
@@ -44,4 +44,4 @@ end ```@docs Flux.outputsize -``` \ No newline at end of file +``` From 37982aa343ac3d6da3ca5b2871d379dca42c5f8c Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Thu, 15 Sep 2022 12:47:01 -0400 Subject: [PATCH 9/9] move functors down --- docs/make.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/make.jl b/docs/make.jl index 4cdcedce54..84f20eb45c 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -20,7 +20,6 @@ makedocs( "Custom Layers" => "models/advanced.md", "NNlib.jl" => "models/nnlib.md", "Activation Functions" => "models/activation.md", - "Functors.jl" => "models/functors.md", ], "Handling Data" => [ "MLUtils.jl" => "data/mlutils.md", @@ -37,6 +36,7 @@ makedocs( "Saving & Loading" => "saving.md", "Shape Inference" => "outputsize.md", "Weight Initialisation" => "utilities.md", + "Functors.jl" => "models/functors.md", ], "Performance Tips" => "performance.md", "Flux's Ecosystem" => "ecosystem.md",