Commit a6cdfdd

Refactor loadmodel! to use a custom recursion instead of fmap. Add more tests.

1 parent b2a2664 commit a6cdfdd

File tree

3 files changed: +138 −83 lines


docs/src/saving.md
Lines changed: 0 additions & 14 deletions

@@ -85,23 +85,9 @@ This ensures that the model loaded from `"mymodel.bson"` matches the structure of
 
 ```@docs
 Flux.loadmodel!
-Flux.loadto!
-Flux.isloadleaf
 Flux.loadleaf!
 ```
 
-### Customizing `loadmodel!` for a custom layer
-
-By default, [`loadmodel!`](@ref) will recursively walk a nested model (like a `Chain`) using [`Functors.fmap`](@ref) until it encounters a loading *leaf node*. A leaf node is defined as any node for which [`Flux.isloadleaf`](@ref) returns `true`. For example, consider the model
-
-```julia
-model = Chain(Dense(10 => 5), Parallel(+, Dense(5 => 2), Dense(5 => 2)))
-```
-
-Here, the `Chain` and `Parallel` layers are not leaf nodes, but all the `Dense` layers are leaf nodes. This makes sense, because `Dense` layers are the ones with parameters that we need to copy. The default behavior for [`Flux.isloadleaf`](@ref) should work for most custom layers, but you can override this function for your type.
-
-Once a pair of leaf nodes is encountered, `loadmodel!` will call [`Flux.loadto!`](@ref) on them. By default, this just copies the parameters from one leaf node to the other, but you can customize the behavior by overriding `loadto!` for your pair of types.
-
 ## Checkpointing
 
 In longer training runs it's a good idea to periodically save your model, so that you can resume if training is interrupted (for example, if there's a power cut). You can do this by saving the model in the [callback provided to `train!`](training/training.md).
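
With the `isloadleaf`/`loadto!` customization hooks removed, a custom layer now takes part in loading purely through the generic `Functors` recursion: registering the type with `@functor` is enough for `loadmodel!` to reach its arrays. A minimal sketch of this, using a hypothetical `MyLayer` that is not part of Flux or of this commit:

```julia
using Flux
using Flux: loadmodel!

# Hypothetical custom layer; @functor exposes its fields as children,
# which is all the new loadmodel! recursion needs in order to walk into it.
struct MyLayer{W, B}
  weight::W
  bias::B
end
Flux.@functor MyLayer

(l::MyLayer)(x) = l.weight * x .+ l.bias

dst = Chain(MyLayer(zeros(Float32, 2, 5), zeros(Float32, 2)), Dense(2 => 1))
src = Chain(MyLayer(ones(Float32, 2, 5), ones(Float32, 2)), Dense(2 => 1))

loadmodel!(dst, src)
dst[1].weight == src[1].weight   # true, with no loading code specific to MyLayer
```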

src/loading.jl
Lines changed: 41 additions & 66 deletions

@@ -1,104 +1,68 @@
 """
-    isloadleaf(x)
-
-Return `true` whenever `x` should be treated as a "leaf node"
-for the purposes of loading parameters.
-By default, `isloadleaf` returns `true` if [`Functors.isleaf`](@ref)
-is `true` for all [`Functors.children(x)`](@ref `Functors.children`).
-
-You can override this function for a specific type if needed.
-"""
-isloadleaf(x) = all(Functors.isleaf, Functors.children(x))
+    loadleaf!(dst, src, err)
 
+Copy `src` to `dst` or throw `err` when their sizes are mismatched.
+By default, use `copyto!` when `dst` and `src` are arrays.
+When only `dst` is an array, set every element to `src`.
+Otherwise, just return `dst`.
 """
-    loadleaf!(x, x̄, err)
-
-Copy `x̄` to `x` or throw `err` when their sizes are mismatched.
-By default, use `copyto!` when `x` and `x̄` are arrays.
-Otherwise, just return `x`.
-"""
-loadleaf!(x, x̄, err) = x
-function loadleaf!(x::AbstractArray, x̄, err)
-  x .= x̄
-  return x
-end
-function loadleaf!(x::AbstractArray, x̄::AbstractArray, err)
-  (size(x) == size(x̄)) || throw(err)
-  copyto!(x, x̄)
+loadleaf!(dst, src, err) = dst
+function loadleaf!(dst::AbstractArray, src, err)
+  dst .= src
+  return dst
 end
-
-"""
-    loadto!(m, m̄)
-
-Load a leaf node `m̄` into `m`.
-
-By default, call [`Flux.loadleaf!`](@ref) on each pair of children
-in `zip(Functors.children(m), Functors.children(m̄))`.
-"""
-function loadto!(m::T, m̄::S) where {T, S}
-  (nameof(T) == nameof(S)) || throw(ArgumentError("Tried to load $m̄ into $m."))
-
-  ls, _ = functor(m)
-  l̄s, _ = functor(m̄)
-  (keys(ls) == keys(l̄s)) ||
-    throw(ArgumentError("Tried to load $m̄ into $m but the structures do not match."))
-
-  err = DimensionMismatch("Tried to load $m̄ into $m but the parameter sizes do not match.")
-  foreach((l, l̄) -> loadleaf!(l, l̄, err), ls, l̄s)
-
-  return m
+function loadleaf!(dst::AbstractArray, src::AbstractArray, err)
+  (size(dst) == size(src)) || throw(err)
+  copyto!(dst, src)
 end
 
 """
-    loadmodel!(m, m̄)
+    loadmodel!(dst, src)
 
-Copy all the parameters (trainable and non-trainable) from `m̄` to `m`.
+Copy all the parameters (trainable and non-trainable) from `src` to `dst`.
 
-`loadmodel!` recursively walks `m` and `m̄` until it encounters
-a subfield, `x`, (i.e. layer) where `isloadleaf(x)` is true.
-The parameters of the matching subfield, `x̄`, are copied to `x`,
-throwing an error whenever:
-- `x` and `x̄` are not the same type (e.g. loading a `Conv` to a `Dense`)
-- `x` and `x̄` do not share the same fields
-- the parameter sizes are mismatched between `x` and `x̄`
+`loadmodel!` recursively walks the [`Functors.children`](@ref) of `dst` and `src`,
+calling `loadleaf!` on any pair of children where [`Functors.isleaf`](@ref) is true.
+It throws an error whenever:
+- `dst` and `src` do not share the same fields (at any level)
+- the sizes of leaf nodes are mismatched between `dst` and `src`
 
 ```julia
 julia> using Flux: loadmodel!
 
-julia> m = Chain(Dense(Flux.ones32(2, 5)), Dense(2 => 1))
+julia> dst = Chain(Dense(Flux.ones32(2, 5)), Dense(2 => 1))
 Chain(
   Dense(5 => 2),                        # 12 parameters
   Dense(2 => 1),                        # 3 parameters
 )                   # Total: 4 arrays, 15 parameters, 316 bytes.
 
-julia> m̄ = Chain(Dense(5 => 2), Dense(2 => 1));
+julia> src = Chain(Dense(5 => 2), Dense(2 => 1));
 
-julia> all(isone, m[1].weight)
+julia> all(isone, dst[1].weight)
 true
 
-julia> m = loadmodel!(m, m̄)
+julia> dst = loadmodel!(dst, src)
 Chain(
   Dense(5 => 2),                        # 12 parameters
   Dense(2 => 1),                        # 3 parameters
 )                   # Total: 4 arrays, 15 parameters, 316 bytes.
 
-julia> all(isone, m[1].weight)
+julia> all(isone, dst[1].weight)
 false
 
-julia> m[1].weight == m̄[1].weight
+julia> dst[1].weight == src[1].weight
 true
 
-julia> m[2].bias == m̄[2].bias
+julia> dst[2].bias == src[2].bias
 true
 ```
 
 See [`Flux.loadleaf!`](@ref) for more details on the copy behavior.
-See [`Flux.isloadleaf`](@ref) for more details on which layers are considered leaves.
 
 !!! warning
-    This function allows `m̄` to be a vector or `Params` for backwards-compatibility.
+    This function allows `src` to be a `Params` for backwards-compatibility.
     You should avoid using `loadmodel!` this way, because it skips most of the structural
-    checking used when `m̄` is also a struct. Silent errors may occur.
+    checking used when `src` is also a nested structure. Silent errors may occur.
 """
 function loadmodel!(m, xs::Params)
   for (p, x) in zip(params(m), xs)
@@ -107,5 +71,16 @@ function loadmodel!(m, xs::Params)
     copyto!(p, x)
   end
 end
-loadmodel!(m, xs::AbstractVector) = loadmodel!(m, params(xs))
-loadmodel!(m, m̄) = fmap(loadto!, m, m̄; exclude = isloadleaf)
+function loadmodel!(dst, src)
+  ldsts, _ = functor(dst)
+  lsrcs, _ = functor(src)
+  (keys(ldsts) == keys(lsrcs)) ||
+    throw(ArgumentError("Tried to load $src into $dst but the structures do not match."))
+
+  err = DimensionMismatch("Tried to load $src into $dst but the parameter sizes do not match.")
+  foreach(ldsts, lsrcs) do ldst, lsrc
+    Functors.isleaf(ldst) ? loadleaf!(ldst, lsrc, err) : loadmodel!(ldst, lsrc)
+  end
+
+  return dst
+end
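
Taken together, the three `loadleaf!` methods give the copy semantics the docstring describes. A small illustrative sketch (the values are arbitrary, and `loadleaf!` is internal, hence the module prefix):

```julia
using Flux

err = DimensionMismatch("parameter sizes do not match")

Flux.loadleaf!([0.0, 0.0], [1.0, 2.0], err)  # array ← array: in-place copyto!
Flux.loadleaf!([0.0, 0.0], 3.0, err)         # array ← non-array: broadcast, giving [3.0, 3.0]
Flux.loadleaf!(relu, [1.0, 2.0], err)        # non-array destination: returned unchanged
Flux.loadleaf!([0.0], [1.0, 2.0], err)       # mismatched sizes: throws err
```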

test/utils.jl
Lines changed: 97 additions & 3 deletions

@@ -378,7 +378,7 @@ end
   end
 end
 
-@testset "loadmodel!(m, m̄)" begin
+@testset "loadmodel!(dst, src)" begin
   import Flux: loadmodel!, Zeros
 
   m1 = Chain(Dense(10, 5), Dense(5, 2, relu))
@@ -389,17 +389,111 @@ end
   m6 = Chain(Dense(10, 5), Parallel(+, Dense(5, 2), Dense(5, 2)))
 
   loadmodel!(m1, m2)
+  # trainable parameters copy over
   @test m1[1].weight == m2[1].weight
   @test m1[1].bias == m2[1].bias
+  # non-array leaves are untouched
   @test m1[2].σ == relu
+
   loadmodel!(m5, m6)
+  # more complex nested structures also work
   @test m5[1].weight == m6[1].weight
   @test m5[2][1].weight == m6[2][1].weight
-  @test m5[2][1].bias == Zeros()
+  # false bias is not overwritten
+  @test m5[2][1].bias == false
 
+  # mismatched nodes throw an error
   @test_throws ArgumentError loadmodel!(m1, m3)
-  @test_throws DimensionMismatch loadmodel!(m1, m4)
   @test_throws ArgumentError loadmodel!(m1, m5)
+  # size mismatches throw an error
+  @test_throws DimensionMismatch loadmodel!(m1, m4)
+
+  m1 = Chain(Conv((3, 3), 3 => 16), BatchNorm(16), Flux.flatten, Dropout(0.2))
+  m2 = Chain(Conv((3, 3), 3 => 16), BatchNorm(16), x -> reshape(x, :, size(x)[end]), Dropout(0.1))
+  m2[2].μ .= rand(Float32, size(m2[2].μ)...)
+  loadmodel!(m1, m2)
+  # non-trainable parameters are copied as well
+  @test m1[2].μ == m2[2].μ
+  # functions are not copied
+  @test m1[3] == Flux.flatten
+  # dropout rate is not copied
+  @test m1[4].p == 0.2
+
+  # from LegolasFlux (https://github.com/beacon-biosignals/LegolasFlux.jl/blob/80569ab63a8248a8a063c76e0bbf701f4ada9bd4/examples/digits.jl#L33)
+  # tests Chain(...) vs Chain([...])
+  # tests MaxPool
+  # tests testmode!/trainmode! is not copied
+  # tests Dense, Conv, BatchNorm, Dropout (like above) but in a bigger model
+  chain1 = Chain(Dropout(0.2),
+                 Conv((3, 3), 1 => 32, relu),
+                 BatchNorm(32, relu),
+                 MaxPool((2, 2)),
+                 Dropout(0.2),
+                 Conv((3, 3), 32 => 16, relu),
+                 Dropout(0.2),
+                 MaxPool((2, 2)),
+                 Dropout(0.2),
+                 Conv((3, 3), 16 => 10, relu),
+                 Dropout(0.2),
+                 x -> reshape(x, :, size(x, 4)),
+                 Dropout(0.2),
+                 Dense(90, 10),
+                 softmax)
+  chain2 = Chain([Dropout(0.1),
+                  Conv((3, 3), 1 => 32, relu),
+                  BatchNorm(32, relu),
+                  MaxPool((3, 3)),
+                  Dropout(0.1),
+                  Conv((3, 3), 32 => 16, relu),
+                  Dropout(0.1),
+                  MaxPool((3, 3)),
+                  Dropout(0.1),
+                  Conv((3, 3), 16 => 10, relu),
+                  Dropout(0.1),
+                  x -> reshape(x, :, size(x, 4)),
+                  Dropout(0.1),
+                  Dense(90, 10),
+                  softmax])
+  chain2[3].μ .= 5f0
+  chain2[3].σ² .= 2f0
+  testmode!(chain2)
+  loadmodel!(chain1, chain2)
+  for (dst, src) in zip(chain1, chain2)
+    if dst isa Dropout
+      @test dst.p == 0.2
+    elseif dst isa Union{Conv, Dense}
+      @test dst.weight == src.weight
+      @test dst.bias == src.bias
+    elseif dst isa MaxPool
+      @test dst.k == (2, 2)
+    elseif dst isa BatchNorm
+      @test dst.μ == src.μ
+      @test dst.σ² == src.σ²
+      @test isnothing(dst.active)
+    end
+  end
+
+  # copy only a subset of the model
+  chain1[end - 1].weight .= 1f0
+  chain1[3].μ .= 3f0
+  chain1[2].bias .= 5f0
+  loadmodel!(chain2[end - 1], chain1[end - 1])
+  loadmodel!(chain2[3], chain1[3])
+  @test chain2[end - 1].weight == chain1[end - 1].weight
+  @test chain2[3].μ == chain1[3].μ
+  @test chain2[2].bias != chain1[2].bias
+
+  # test shared weights
+  m1 = Chain(Dense(10 => 5), Dense(5 => 2))
+  m2 = Chain(Dense(transpose(m1[2].weight)), Dense(permutedims(m1[1].weight)))
+  m3 = Chain(Dense(m1[1].weight), Dense(m1[2].weight))
+  m2[2].weight .= 1f0
+  loadmodel!(m1, m3)
+  @test m1[2].weight === parent(m2[1].weight)
+  @test m1[2].weight == transpose(m2[1].weight)
+  @test m1[1].weight === m3[1].weight
+  @test m2[2].weight != transpose(m1[1].weight)
+  @test m3[2].weight == transpose(m2[1].weight)
 end
 
 @testset "destructure" begin
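
One behavior the new shared-weights tests pin down: because `loadmodel!` copies with in-place `copyto!`, parameters that alias the same array before loading still alias it afterwards. A minimal sketch of the consequence (hypothetical models, not taken from the test suite):

```julia
using Flux
using Flux: loadmodel!

W = Flux.ones32(2, 5)
dst = Chain(Dense(W), Dense(transpose(W)))  # the second layer reuses the same array
src = Chain(Dense(5 => 2), Dense(2 => 5))

loadmodel!(dst, src)
dst[1].weight === parent(dst[2].weight)     # true: aliasing is preserved
dst[1].weight == transpose(src[2].weight)   # true: each alias is copied in turn,
                                            # so the last copy is the one that remains
```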
