Commit 7175c36

add embedding layer
Co-authored-by: Kyle Daruwalla <daruwalla.k.public@icloud.com>
1 parent 46b73a8 · commit 7175c36

5 files changed: 95 additions, 1 deletion

docs/src/models/layers.md

Lines changed: 1 addition & 0 deletions
@@ -58,6 +58,7 @@ SkipConnection
 Parallel
 Flux.Bilinear
 Flux.Diagonal
+Flux.Embedding
 ```

 ## Normalisation & Regularisation

src/Flux.jl

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@ using Zygote: Params, @adjoint, gradient, pullback, @nograd
 export gradient

 export Chain, Dense, Maxout, SkipConnection, Parallel, flatten,
-       RNN, LSTM, GRU,
+       RNN, LSTM, GRU, Embedding,
        SamePad, Conv, CrossCor, ConvTranspose, DepthwiseConv,
        AdaptiveMaxPool, AdaptiveMeanPool, GlobalMaxPool, GlobalMeanPool, MaxPool, MeanPool,
        Dropout, AlphaDropout, LayerNorm, BatchNorm, InstanceNorm, GroupNorm,

src/layers/basic.jl

Lines changed: 53 additions & 0 deletions
@@ -8,6 +8,7 @@ on a given input.
 `m[1:3](x)` will calculate the output of the first three layers.

 # Examples
+
 ```jldoctest
 julia> m = Chain(x -> x^2, x -> x+1);

@@ -428,3 +429,55 @@ function Base.show(io::IO, m::Parallel)
   join(io, m.layers, ", ")
   print(io, ")")
 end
+
+"""
+    Embedding(in, out; init=randn)
+
+A lookup table that stores embeddings of dimension `out`
+for a vocabulary of size `in`.
+
+This layer is often used to store word embeddings and retrieve them using indices.
+The input to the layer can be either a vector of indices
+or the corresponding [onehot encoding](@ref Flux.OneHotArray).
+
+# Examples
+
+```julia-repl
+julia> vocab_size, embed_size = 1000, 4;
+
+julia> model = Embedding(vocab_size, embed_size)
+Embedding(1000, 4)
+
+julia> vocab_idxs = [1, 722, 53, 220, 3];
+
+julia> x = OneHotMatrix(vocab_idxs, vocab_size);
+
+julia> model(x)
+4×5 Matrix{Float32}:
+  0.91139    0.670462   0.463217   0.670462   0.110932
+  0.247225  -0.0823874  0.698694  -0.0823874  0.945958
+ -0.393626  -0.590136  -0.545422  -0.590136   0.77743
+ -0.497621   0.87595   -0.870251   0.87595   -0.772696
+
+julia> model(vocab_idxs) == model(x)
+true
+```
+"""
+struct Embedding{W}
+  weight::W
+end
+
+@functor Embedding
+
+function Embedding(in::Integer, out::Integer;
+                   init = (i...) -> randn(Float32, i...))
+  return Embedding(init(out, in))
+end
+
+(m::Embedding)(x::Union{OneHotVector, OneHotMatrix}) = m.weight * x  # equivalent to m.weight[:, onecold(x)]
+(m::Embedding)(x::Union{Int, AbstractVector}) = m.weight[:, x]
+(m::Embedding)(x::AbstractArray) = reshape(m(vec(x)), :, size(x)...)
+
+function Base.show(io::IO, m::Embedding)
+  print(io, "Embedding($(size(m.weight, 2)), $(size(m.weight, 1)))")
+end
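A note on the forward pass above: it dispatches on input type, so one-hot inputs take a matrix-multiply path while integer inputs index columns of `weight` directly, and the two agree. A minimal sketch of that equivalence (the sizes and indices here are arbitrary; `OneHotMatrix` is the same constructor the docstring example uses):

```julia
using Flux
using Flux: OneHotMatrix

vocab_size, embed_size = 10, 4
m = Embedding(vocab_size, embed_size)

idxs = [1, 7, 3]
onehots = OneHotMatrix(idxs, vocab_size)  # 10×3 one-hot encoding of idxs

m(onehots) ≈ m.weight * onehots  # one-hot path: a matrix multiply
m(idxs) == m.weight[:, idxs]     # integer path: direct column indexing
m(onehots) ≈ m(idxs)             # both select the same columns of weight
```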

test/cuda/layers.jl

Lines changed: 15 additions & 0 deletions
@@ -259,3 +259,18 @@ end
     end
   end
 end
+
+@testset "Embedding" begin
+  vocab_size, embed_size = 10, 4
+  m = Embedding(vocab_size, embed_size)
+  x = rand(1:vocab_size, 3)
+  y = m(x)
+  m_g = m |> gpu
+  x_g = x |> gpu
+  y_g = m_g(x_g)
+  @test collect(y_g) == y
+  gs = gradient(() -> sum(tanh.(m(x))), params(m))
+  gs_g = gradient(() -> sum(tanh.(m_g(x_g))), params(m_g))
+  @test collect(gs_g[m_g.weight]) ≈ gs[m.weight]
+end
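For readers of this test: the gradient with respect to the embedding table is an ordinary dense array of the same shape as `weight`, with nonzero entries only in the columns that were actually looked up. A small CPU-only sketch under that assumption (sizes and indices made up):

```julia
using Flux

m = Embedding(10, 4)
x = [3, 7]  # look up columns 3 and 7

gs = gradient(() -> sum(m(x)), params(m))
g = gs[m.weight]  # 4×10 dense array, same shape as m.weight

g[:, 3]  # ones: each entry of column 3 contributes once to the sum
g[:, 1]  # zeros: index 1 was never looked up
```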

test/layers/basic.jl

Lines changed: 25 additions & 0 deletions
@@ -191,4 +191,29 @@ import Flux: activations
     @test size(Parallel(+, Dense(10, 2), Dense(5, 2), Dense(4, 2))(inputs)) == (2,)
   end
 end
+
+@testset "Embedding" begin
+  vocab_size, embed_size = 10, 4
+  m = Embedding(vocab_size, embed_size)
+  @test size(m.weight) == (embed_size, vocab_size)
+
+  x = rand(1:vocab_size, 3)
+  y = m(x)
+  @test y isa Matrix{Float32}
+  @test y ≈ m.weight[:, x]
+  x2 = OneHotMatrix(x, vocab_size)
+  y2 = m(x2)
+  @test y2 isa Matrix{Float32}
+  @test y2 ≈ y
+  @test_throws DimensionMismatch m(OneHotMatrix(x, 1000))
+
+  x = rand(1:vocab_size, 3, 4)
+  y = m(x)
+  @test y isa Array{Float32, 3}
+  @test size(y) == (embed_size, 3, 4)
+
+  @test m(2) ≈ m.weight[:, 2]
+  @test m(OneHotVector(3, vocab_size)) ≈ m.weight[:, 3]
+  @test_throws DimensionMismatch m(OneHotVector(3, 1000))
+end
 end
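Finally, a hypothetical end-to-end sketch of how the new layer composes with the rest of Flux, mirroring the shapes exercised in these tests: a vector of n token indices maps to an `embed_size`×n matrix, which downstream layers treat as a batch of feature columns (all sizes below are made up):

```julia
using Flux

vocab_size, embed_size, nclasses = 1000, 4, 2

# Per-token classifier: embed each token, then score it.
model = Chain(
  Embedding(vocab_size, embed_size),  # indices -> embed_size×n columns
  Dense(embed_size, nclasses),        # one score vector per token
  softmax,                            # column-wise probabilities
)

tokens = rand(1:vocab_size, 5)  # a 5-token input sequence
model(tokens)                   # 2×5 matrix of class probabilities
```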
