
Commit 0031340

cl/embed
1 parent 9410677 commit 0031340


4 files changed: +80 -3 lines changed


docs/src/models/layers.md

Lines changed: 1 addition & 0 deletions
````diff
@@ -58,6 +58,7 @@ SkipConnection
 Parallel
 Flux.Bilinear
 Flux.Diagonal
+Flux.Embedding
 ```

 ## Normalisation & Regularisation
````

src/Flux.jl

Lines changed: 1 addition & 1 deletion
````diff
@@ -11,7 +11,7 @@ using Zygote: Params, @adjoint, gradient, pullback, @nograd
 export gradient

 export Chain, Dense, Maxout, SkipConnection, Parallel, flatten,
-       RNN, LSTM, GRU,
+       RNN, LSTM, GRU, Embedding,
        SamePad, Conv, CrossCor, ConvTranspose, DepthwiseConv,
        AdaptiveMaxPool, AdaptiveMeanPool, GlobalMaxPool, GlobalMeanPool, MaxPool, MeanPool,
        Dropout, AlphaDropout, LayerNorm, BatchNorm, InstanceNorm, GroupNorm,
````
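With `Embedding` added to the export list, the layer is reachable without the `Flux.` prefix once the package is loaded. A minimal sketch, assuming a Flux build that includes this commit:

```julia
using Flux

# The new export makes the unqualified name available;
# previously this would have required writing Flux.Embedding.
emb = Embedding(1000, 4)   # a lookup table backed by a 4×1000 weight matrix
```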

src/layers/basic.jl

Lines changed: 57 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ on a given input.
88
`m[1:3](x)` will calculate the output of the first three layers.
99
1010
# Examples
11+
1112
```jldoctest
1213
julia> m = Chain(x -> x^2, x -> x+1);
1314
@@ -388,7 +389,8 @@ If called with multiple inputs, they are `zip`ped with the layers, thus `Paralle
388389
389390
```jldoctest
390391
julia> model = Chain(Dense(3, 5),
391-
Parallel(vcat, Dense(5, 4), Chain(Dense(5, 7), Dense(7, 4))),
392+
Parallel(vcat, De
393+
print(io, ")")nse(5, 4), Chain(Dense(5, 7), Dense(7, 4))),
392394
Dense(8, 17));
393395
394396
julia> size(model(rand(3)))
@@ -421,4 +423,57 @@ function Base.show(io::IO, m::Parallel)
421423
print(io, "Parallel(", m.connection, ", ")
422424
join(io, m.layers, ", ")
423425
print(io, ")")
424-
end
426+
end
427+
428+
"""
429+
Embedding(in, out; init=randn)
430+
431+
A lookup table that stores embeddings of dimension `out`
432+
for a vocabulary of size `in`.
433+
434+
This layers is often used to store word embeddings and retrieve them using indices.
435+
The input to the layer can be either a vector of indexes
436+
or the corresponding onehot encoding.
437+
438+
# Examples
439+
440+
```julia-repl
441+
julia> vocab_size, embed_size = 1000, 4;
442+
443+
julia> model = Embedding(vocab_size, embed_size)
444+
Embedding(1000, 4)
445+
446+
julia> vocab_idxs = [1, 722, 53, 220, 3]
447+
448+
julia> x = OneHotMatrix(vocab_idxs, vocab_size);
449+
450+
julia> model(x)
451+
4×5 Matrix{Float32}:
452+
0.91139 0.670462 0.463217 0.670462 0.110932
453+
0.247225 -0.0823874 0.698694 -0.0823874 0.945958
454+
-0.393626 -0.590136 -0.545422 -0.590136 0.77743
455+
-0.497621 0.87595 -0.870251 0.87595 -0.772696
456+
```
457+
458+
julia> model(vocab_idxs) # same as above
459+
"""
460+
struct Embedding{W}
461+
weight::W
462+
end
463+
464+
@functor Embedding
465+
466+
function Embedding(in::Integer, out::Integer;
467+
init = (i...) -> randn(Float32, i...))
468+
return Embedding(init(out, in))
469+
end
470+
471+
(m::Embedding)(x::OneHotMatrix) = m.weight * x # equivalent to m.weight[:, onecold(x)]
472+
(m::Embedding)(x::OneHotVector) = m.weight * x
473+
(m::Embedding)(x::AbstractVector) = m.weight[:, x]
474+
(m::Embedding)(x::Int) = m.weight[:, x]
475+
476+
function Base.show(io::IO, m::Embedding)
477+
print(io, "Embedding($(size(m.weight, 2)), $(size(m.weight, 1)))")
478+
end
479+
>>>>>>> b22cd2dc (cl/embed)
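The two call paths of the new layer are interchangeable: integer indices slice columns out of the weight matrix directly, while one-hot inputs reach the same columns through a matrix multiply, as the inline comment on the `OneHotMatrix` method notes. A short sketch of that equivalence, assuming a Flux build that includes this commit (`OneHotMatrix` is the same constructor the commit's tests use):

```julia
using Flux
using Flux: OneHotMatrix

m = Embedding(10, 4)                # weight is a 4×10 Matrix{Float32}
idxs = [3, 7, 7, 1]

y_idx = m(idxs)                     # index path: m.weight[:, idxs]
y_hot = m(OneHotMatrix(idxs, 10))   # one-hot path: m.weight * x

y_idx ≈ y_hot                       # true: both select the same columns
```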

test/layers/basic.jl

Lines changed: 21 additions & 0 deletions
````diff
@@ -191,4 +191,25 @@ import Flux: activations
       @test size(Parallel(+, Dense(10, 2), Dense(5, 2), Dense(4, 2))(inputs)) == (2,)
     end
   end
+
+  @testset "Embedding" begin
+    vocab_size, embed_size = 10, 4
+    m = Embedding(vocab_size, embed_size)
+    @test size(m.weight) == (embed_size, vocab_size)
+
+    x = rand(1:vocab_size, 3)
+    y = m(x)
+    @test y isa Matrix{Float32}
+    @test y ≈ m.weight[:,x]
+
+    x2 = OneHotMatrix(x, vocab_size)
+    y2 = m(x2)
+    @test y2 isa Matrix{Float32}
+    @test y2 ≈ y
+    @test_throws DimensionMismatch m(OneHotMatrix(x, 1000))
+
+    @test m(2) ≈ m.weight[:,2]
+    @test m(OneHotVector(3, vocab_size)) ≈ m.weight[:,3]
+    @test_throws DimensionMismatch m(OneHotVector(3, 1000))
+  end
 end
````
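Because the struct is registered with `@functor`, the lookup table is a trainable parameter like any other layer weight. A sketch of gradient flow into the table, again assuming this commit is applied; the surrounding model here is illustrative only:

```julia
using Flux

# Illustrative model: embed token ids, mean-pool over positions, project to logits.
model = Chain(Embedding(10, 4),
              x -> vec(sum(x, dims=2) ./ size(x, 2)),
              Dense(4, 2))

tokens = [3, 7, 1]
ps = Flux.params(model)    # includes the Embedding weight via @functor
gs = gradient(() -> sum(abs2, model(tokens)), ps)

size(gs[model[1].weight])  # (4, 10): the gradient array matches the table's size
```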
