
Commit 8162b8a

bors[bot] and mcabbott authored
Merge #1636
1636: Add warnings for mismatched sizes in losses r=mcabbott a=mcabbott

Closes #1599, I think, by making loss functions give a warning if the sizes don't match:

```julia
julia> mse([1,0], [1 0 0])
┌ Error: size mismatch in loss function! In future this will be an error; in Flux 0.12 broadcasting accepts some mismatches
│   summary(ŷ) = "2-element Vector{Int64}"
│   summary(y) = "1×3 Matrix{Int64}"
└ @ Flux.Losses ~/.julia/dev/Flux/src/losses/utils.jl:29
0.5

julia> @btime gradient(sum∘mse, $(rand(10,100)), $(rand(10,100)));
  19.709 μs (130 allocations: 51.25 KiB)
  19.625 μs (130 allocations: 51.25 KiB)
```

Appears to have no effect on speed, although Zygote is weird and maybe someone has a better test of that.

Edit -- closes #1522, too.

Co-authored-by: Michael Abbott <32575566+mcabbott@users.noreply.github.com>
2 parents 46b73a8 + 6e57ae8 commit 8162b8a

File tree: 3 files changed (+55, -15 lines)

src/losses/functions.jl

Lines changed: 38 additions & 11 deletions
````diff
@@ -18,7 +18,10 @@ julia> Flux.mae(y_model, 1:3)
 0.10000000000000009
 ```
 """
-mae(ŷ, y; agg = mean) = agg(abs.(ŷ .- y))
+function mae(ŷ, y; agg = mean)
+    _check_sizes(ŷ, y)
+    agg(abs.(ŷ .- y))
+end
 
 """
     mse(ŷ, y; agg = mean)
@@ -39,7 +42,10 @@ julia> Flux.mse(y_model, y_true)
 0.010000000000000018
 ```
 """
-mse(ŷ, y; agg = mean) = agg((ŷ .- y) .^ 2)
+function mse(ŷ, y; agg = mean)
+    _check_sizes(ŷ, y)
+    agg((ŷ .- y) .^ 2)
+end
 
 """
     msle(ŷ, y; agg = mean, ϵ = eps(ŷ))
@@ -60,8 +66,10 @@ julia> Flux.msle(Float32[0.9, 1.8, 2.7], 1:3)
 0.011100831f0
 ```
 """
-msle(ŷ, y; agg = mean, ϵ = epseltype(ŷ)) =
+function msle(ŷ, y; agg = mean, ϵ = epseltype(ŷ))
+    _check_sizes(ŷ, y)
     agg((log.((ŷ .+ ϵ) ./ (y .+ ϵ))) .^2 )
+end
 
 """
     huber_loss(ŷ, y; δ = 1, agg = mean)
@@ -74,6 +82,7 @@ given the prediction `ŷ` and true values `y`.
                  | δ * (|ŷ - y| - 0.5 * δ), otherwise
 """
 function huber_loss(ŷ, y; agg = mean, δ = ofeltype(ŷ, 1))
+    _check_sizes(ŷ, y)
     abs_error = abs.(ŷ .- y)
     #TODO: remove dropgrad when Zygote can handle this function with CuArrays
     temp = Zygote.dropgrad(abs_error .< δ)
@@ -203,7 +212,8 @@ julia> Flux.crossentropy(y_model, y_smooth)
 ```
 """
 function crossentropy(ŷ, y; dims = 1, agg = mean, ϵ = epseltype(ŷ))
-    agg(.-sum(xlogy.(y, ŷ .+ ϵ); dims = dims))
+    _check_sizes(ŷ, y)
+    agg(.-sum(xlogy.(y, ŷ .+ ϵ); dims = dims))
 end
 
 """
@@ -241,7 +251,8 @@ julia> Flux.crossentropy(softmax(y_model), y_label)
 ```
 """
 function logitcrossentropy(ŷ, y; dims = 1, agg = mean)
-    agg(.-sum(y .* logsoftmax(ŷ; dims = dims); dims = dims))
+    _check_sizes(ŷ, y)
+    agg(.-sum(y .* logsoftmax(ŷ; dims = dims); dims = dims))
 end
 
 """
@@ -289,7 +300,8 @@ julia> Flux.crossentropy(y_prob, y_hot)
 ```
 """
 function binarycrossentropy(ŷ, y; agg = mean, ϵ = epseltype(ŷ))
-    agg(@.(-xlogy(y, ŷ + ϵ) - xlogy(1 - y, 1 - ŷ + ϵ)))
+    _check_sizes(ŷ, y)
+    agg(@.(-xlogy(y, ŷ + ϵ) - xlogy(1 - y, 1 - ŷ + ϵ)))
 end
 
 """
@@ -318,7 +330,8 @@ julia> Flux.binarycrossentropy(sigmoid.(y_model), y_bin)
 ```
 """
 function logitbinarycrossentropy(ŷ, y; agg = mean)
-    agg(@.((1 - y) * ŷ - logσ(ŷ)))
+    _check_sizes(ŷ, y)
+    agg(@.((1 - y) * ŷ - logσ(ŷ)))
 end
 
 """
@@ -357,6 +370,7 @@ Inf
 ```
 """
 function kldivergence(ŷ, y; dims = 1, agg = mean, ϵ = epseltype(ŷ))
+    _check_sizes(ŷ, y)
     entropy = agg(sum(xlogx.(y), dims = dims))
     cross_entropy = crossentropy(ŷ, y; dims = dims, agg = agg, ϵ = ϵ)
     return entropy + cross_entropy
@@ -370,7 +384,10 @@ end
 
 [More information.](https://peltarion.com/knowledge-center/documentation/modeling-view/build-an-ai-model/loss-functions/poisson).
 """
-poisson_loss(ŷ, y; agg = mean) = agg(ŷ .- xlogy.(y, ŷ))
+function poisson_loss(ŷ, y; agg = mean)
+    _check_sizes(ŷ, y)
+    agg(ŷ .- xlogy.(y, ŷ))
+end
 
 """
     hinge_loss(ŷ, y; agg = mean)
@@ -381,8 +398,10 @@ prediction `ŷ` and true labels `y` (containing 1 or -1); calculated as
 
 See also: [`squared_hinge_loss`](@ref)
 """
-hinge_loss(ŷ, y; agg = mean) =
+function hinge_loss(ŷ, y; agg = mean)
+    _check_sizes(ŷ, y)
     agg(max.(0, 1 .- ŷ .* y))
+end
 
 """
     squared_hinge_loss(ŷ, y)
@@ -392,8 +411,10 @@ Return the squared hinge_loss loss given the prediction `ŷ` and true labels `y
 
 See also: [`hinge_loss`](@ref)
 """
-squared_hinge_loss(ŷ, y; agg = mean) =
+function squared_hinge_loss(ŷ, y; agg = mean)
+    _check_sizes(ŷ, y)
     agg((max.(0, 1 .- ŷ .* y)) .^ 2)
+end
 
 """
     dice_coeff_loss(ŷ, y; smooth = 1)
@@ -405,8 +426,10 @@ Similar to the F1_score. Calculated as:
 
     1 - 2*sum(|ŷ .* y| + smooth) / (sum(ŷ.^2) + sum(y.^2) + smooth)
 """
-dice_coeff_loss(ŷ, y; smooth = ofeltype(ŷ, 1.0)) =
+function dice_coeff_loss(ŷ, y; smooth = ofeltype(ŷ, 1.0))
+    _check_sizes(ŷ, y)
     1 - (2 * sum(y .* ŷ) + smooth) / (sum(y .^ 2) + sum(ŷ .^ 2) + smooth) #TODO agg
+end
 
 """
     tversky_loss(ŷ, y; β = 0.7)
@@ -418,6 +441,7 @@ Calculated as:
     1 - sum(|y .* ŷ| + 1) / (sum(y .* ŷ + β*(1 .- y) .* ŷ + (1 - β)*y .* (1 .- ŷ)) + 1)
 """
 function tversky_loss(ŷ, y; β = ofeltype(ŷ, 0.7))
+    _check_sizes(ŷ, y)
     #TODO add agg
     num = sum(y .* ŷ) + 1
     den = sum(y .* ŷ + β * (1 .- y) .* ŷ + (1 - β) * y .* (1 .- ŷ)) + 1
@@ -454,6 +478,7 @@ See also: [`Losses.focal_loss`](@ref) for multi-class setting
 
 """
 function binary_focal_loss(ŷ, y; agg=mean, γ=2, ϵ=epseltype(ŷ))
+    _check_sizes(ŷ, y)
     ŷ = ŷ .+ ϵ
     p_t = y .* ŷ + (1 .- y) .* (1 .- ŷ)
     ce = -log.(p_t)
@@ -497,9 +522,11 @@ See also: [`Losses.binary_focal_loss`](@ref) for binary (not one-hot) labels
 
 """
 function focal_loss(ŷ, y; dims=1, agg=mean, γ=2, ϵ=epseltype(ŷ))
+    _check_sizes(ŷ, y)
     ŷ = ŷ .+ ϵ
     agg(sum(@. -y * (1 - ŷ)^γ * log(ŷ); dims=dims))
 end
+
 ```@meta
 DocTestFilters = nothing
 ```
````
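Every hunk above applies the same mechanical change: a one-line loss definition becomes a `function` block whose first statement is `_check_sizes(ŷ, y)`. Since that check is declared `Zygote.@nograd` in `src/losses/utils.jl` below, it should leave gradients untouched, consistent with the unchanged `@btime` figures in the commit message. A quick sanity check along those lines (a sketch, not part of this commit):

```julia
using Flux

ŷ, y = rand(10, 100), rand(10, 100)

# Matching sizes: no warning, and the size check contributes nothing to the pullback.
# The analytic gradient of mean((ŷ .- y).^2) with respect to ŷ is 2(ŷ - y)/length(ŷ).
g, = Flux.gradient(x -> Flux.mse(x, y), ŷ)
@assert g ≈ 2 .* (ŷ .- y) ./ length(ŷ)
```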

src/losses/utils.jl

Lines changed: 16 additions & 4 deletions
```diff
@@ -1,17 +1,17 @@
 """
-  xlogx(x)
+    xlogx(x)
 
-Return `x * log(x)` for `x ≥ 0`, handling `x = 0` by taking the downward limit.
+Return `x * log(x)` for `x ≥ 0`, handling `x == 0` by taking the limit from above, to get zero.
 """
 function xlogx(x)
   result = x * log(x)
   ifelse(iszero(x), zero(result), result)
 end
 
 """
-  xlogy(x, y)
+    xlogy(x, y)
 
-Return `x * log(y)` for `y > 0` with correct limit at `x = 0`.
+Return `x * log(y)` for `y > 0`, and zero when `x == 0`.
 """
 function xlogy(x, y)
   result = x * log(y)
@@ -22,3 +22,15 @@ end
   res = xlogy.(x, y)
   res, Δ -> (nothing, Zygote.unbroadcast(x, xlogy.(Δ, y)), Zygote.unbroadcast(y, Δ .* x ./ y))
 end
+
+# This can be made an error in Flux v0.13, for now just a warning
+function _check_sizes(ŷ::AbstractArray, y::AbstractArray)
+    for d in 1:max(ndims(ŷ), ndims(y))
+        if size(ŷ,d) != size(y,d)
+            @warn "Size mismatch in loss function! In future this will be an error. In Flux <= 0.12 broadcasting accepts this, but may not give sensible results" summary(ŷ) summary(y) maxlog=3 _id=hash(size(y))
+        end
+    end
+end
+_check_sizes(ŷ, y) = nothing  # pass-through, for constant label e.g. y = 1
+
+Zygote.@nograd _check_sizes
```
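To make the new behaviour concrete, a sketch of both methods in action (not part of this commit). The warning is rate-limited to three occurrences per distinct label size by `maxlog=3` together with `_id=hash(size(y))`:

```julia
using Flux

# 2-element Vector vs 1×3 Matrix: the sizes disagree in dimensions 1 and 2,
# so _check_sizes logs the warning, but broadcasting still yields 0.5.
Flux.mse([1, 0], [1 0 0])

# A plain number as the label dispatches to the fallback method
# _check_sizes(ŷ, y) = nothing, so no warning is issued.
Flux.mse([0.9, 1.1], 1.0)
```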

test/losses.jl

Lines changed: 1 addition & 0 deletions
```diff
@@ -104,6 +104,7 @@ yls = y.*(1-2sf).+sf
   @test binarycrossentropy.(σ.(logŷ), label_smoothing(y, 2sf; dims=0); ϵ=0) ≈ -yls.*log.(σ.(logŷ)) - (1 .- yls).*log.(1 .- σ.(logŷ))
   @test binarycrossentropy(σ.(logŷ), y; ϵ=0) ≈ mean(-y.*log.(σ.(logŷ)) - (1 .- y).*log.(1 .- σ.(logŷ)))
   @test binarycrossentropy(σ.(logŷ), y) ≈ mean(-y.*log.(σ.(logŷ) .+ eps.(σ.(logŷ))) - (1 .- y).*log.(1 .- σ.(logŷ) .+ eps.(σ.(logŷ))))
+  @test binarycrossentropy([0.1,0.2,0.9], 1) ≈ -mean(log, [0.1,0.2,0.9])  # constant label
 end
 
 @testset "logitbinarycrossentropy" begin
```
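The added test exercises both halves of the change: the scalar label `1` reaches the `_check_sizes(ŷ, y) = nothing` fallback, and the loss simplifies because the `(1 - y)` term vanishes. As a worked note (treating the default `ϵ` as negligible):

```julia
using Statistics: mean

ŷ = [0.1, 0.2, 0.9]
# With y = 1, each term -xlogy(1, ŷᵢ + ϵ) - xlogy(0, 1 - ŷᵢ + ϵ) reduces to -log(ŷᵢ + ϵ),
# so binarycrossentropy(ŷ, 1) is just the mean negative log, ≈ 1.339 for this ŷ.
-mean(log, ŷ)
```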
