
added simplified BERT support #791

Closed
wants to merge 3 commits into from
78 changes: 78 additions & 0 deletions neuralmonkey/dataset.py
@@ -95,6 +95,84 @@ def __init__(self,
# pylint: enable=too-few-public-methods


def _bucket_boundaries(max_length, min_length=8, length_bucket_step=1.1):
"""Create a default set of length-bucket boundaries."""
assert length_bucket_step > 1.0
x = min_length
boundaries = []
while x < max_length:
boundaries.append(x)
x = max(x + 1, int(x * length_bucket_step))
return boundaries


def get_batching_scheme(batch_size: int,
max_length: int = None,
min_length_bucket: int = 8,
length_bucket_step: float = 1.1,
shard_multiplier: int = 1,
length_multiplier: int = 1,
min_length: int = 0) -> BatchingScheme:
"""Create a batching scheme based on model hyperparameters.

Every batch contains a number of sequences divisible by `shard_multiplier`.

Args:
batch_size: int, total number of tokens in a batch.
max_length: int, sequences longer than this will be skipped. Defaults
to batch_size.
min_length_bucket: int
length_bucket_step: float greater than 1.0
shard_multiplier: an integer increasing the batch_size to suit
splitting across datashards.
length_multiplier: an integer multiplier that is used to increase the
batch sizes and sequence length tolerance.
min_length: int, sequences shorter than this will be skipped.
Returns:
A `BatchingScheme` with:
* bucket_boundaries: list of bucket boundaries
* bucket_batch_sizes: list of batch sizes for each length bucket
Raises:
ValueError: If min_length > max_length
"""
max_length = max_length or batch_size
if max_length < min_length:
raise ValueError("max_length must be greater or equal to min_length")

boundaries = _bucket_boundaries(max_length, min_length_bucket,
length_bucket_step)
boundaries = [boundary * length_multiplier for boundary in boundaries]
max_length *= length_multiplier

batch_sizes = [
max(1, batch_size // length) for length in boundaries + [max_length]
]
max_batch_size = max(batch_sizes)
# Since the Datasets API only allows a single constant for window_size,
# and it needs to divide all bucket_batch_sizes, we pick a highly-composite
# window size and then round down all batch sizes to divisors of that
# window size, so that a window can always be divided evenly into batches.
highly_composite_numbers = [
1, 2, 4, 6, 12, 24, 36, 48, 60, 120, 180, 240, 360, 720, 840, 1260,
1680, 2520, 5040, 7560, 10080, 15120, 20160, 25200, 27720, 45360,
50400, 55440, 83160, 110880, 166320, 221760, 277200, 332640, 498960,
554400, 665280, 720720, 1081080, 1441440, 2162160, 2882880, 3603600,
4324320, 6486480, 7207200, 8648640, 10810800, 14414400, 17297280,
21621600, 32432400, 36756720, 43243200, 61261200, 73513440, 110270160
]
window_size = max(
[i for i in highly_composite_numbers if i <= 3 * max_batch_size])
divisors = [i for i in range(1, window_size + 1) if window_size % i == 0]
batch_sizes = [max([d for d in divisors if d <= bs]) for bs in batch_sizes]
window_size *= shard_multiplier
batch_sizes = [bs * shard_multiplier for bs in batch_sizes]

ret = BatchingScheme(bucket_boundaries=boundaries,
bucket_batch_sizes=batch_sizes)
return ret


# The protected functions below are designed to convert the ambiguous spec
# structures to a normalized form.

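To make the bucketing concrete, here is a minimal standalone sketch that mirrors the logic of `_bucket_boundaries` and the token-budget division above. It does not import Neural Monkey, and the numbers are illustrative only:

# Standalone sketch of the length-bucketing logic above (illustrative only).

def bucket_boundaries(max_length, min_length=8, step=1.1):
    x, boundaries = min_length, []
    while x < max_length:
        boundaries.append(x)
        x = max(x + 1, int(x * step))
    return boundaries

batch_size = 256  # total token budget per batch
boundaries = bucket_boundaries(max_length=batch_size)
# Shorter sequences -> more sequences fit into the same token budget.
batch_sizes = [max(1, batch_size // length)
               for length in boundaries + [batch_size]]

print(boundaries[:6])   # [8, 9, 10, 11, 12, 13]
print(batch_sizes[:6])  # [32, 28, 25, 23, 21, 19]

The actual get_batching_scheme additionally rounds these per-bucket batch sizes down to divisors of a highly composite window size before wrapping them in a BatchingScheme.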
3 changes: 2 additions & 1 deletion neuralmonkey/decoders/autoregressive.py
@@ -173,7 +173,8 @@ def embedding_size(self) -> int:
if self.embeddings_source is not None:
if self._embedding_size is not None:
warn("Overriding the embedding_size parameter with the "
"size of the reused embeddings from the encoder.")
"size of the reused embeddings from the "
"`embeddings_source`.")

return self.embeddings_source.embedding_matrix.get_shape()[1].value

70 changes: 41 additions & 29 deletions neuralmonkey/decoders/sequence_labeler.py
@@ -1,12 +1,12 @@
from typing import Dict, Union
from typing import Dict
Contributor


OMG, I have had exactly these changes in a branch since October, but I was waiting for Jindra to finish tf.Dataset, and I have some models with it that I would still like to use. Do you already have this in a lot of models?

Member Author

@varisd varisd Feb 21, 2019


My BERT experiments are on hold for now, so no problem.

Feel free to close this PR and use your branch.


import tensorflow as tf
from typeguard import check_argument_types

from neuralmonkey.dataset import Dataset
from neuralmonkey.decorators import tensor
from neuralmonkey.encoders.recurrent import RecurrentEncoder
from neuralmonkey.encoders.facebook_conv import SentenceEncoder
from neuralmonkey.model.stateful import TemporalStateful
from neuralmonkey.model.sequence import EmbeddedSequence
from neuralmonkey.model.feedable import FeedDict
from neuralmonkey.model.parameterized import InitializerSpecs
from neuralmonkey.model.model_part import ModelPart
@@ -20,9 +20,10 @@ class SequenceLabeler(ModelPart):
# pylint: disable=too-many-arguments
def __init__(self,
name: str,
encoder: Union[RecurrentEncoder, SentenceEncoder],
vocabulary: Vocabulary,
encoder: TemporalStateful,
data_id: str,
vocabulary: Vocabulary = None,
embeddings_source: EmbeddedSequence = None,
dropout_keep_prob: float = 1.0,
reuse: ModelPart = None,
save_checkpoint: str = None,
@@ -32,10 +33,20 @@ def __init__(self,
ModelPart.__init__(self, name, reuse, save_checkpoint, load_checkpoint,
initializers)

self.embeddings_source = embeddings_source
self.encoder = encoder
self.vocabulary = vocabulary
self.data_id = data_id
self.dropout_keep_prob = dropout_keep_prob

# We provide only `embeddings_source` when we want to tie the input and
# output projections.
if (self.embeddings_source is None) == (vocabulary is None):
raise ValueError("You must specify either `vocabulary or` or "
"`embeddings_source`, not both")
elif self.embeddings_source is not None:
self.vocabulary = self.embeddings_source.vocabulary
elif vocabulary is not None:
self.vocabulary = vocabulary
# pylint: enable=too-many-arguments

@property
@@ -64,23 +75,33 @@ def rnn_size(self) -> int:

@tensor
def decoding_w(self) -> tf.Variable:
return get_variable(
name="state_to_word_W",
shape=[self.rnn_size, len(self.vocabulary)])
if (self.embeddings_source is not None
and self.embeddings_source.dimension != self.rnn_size):
raise ValueError(
"Dimension of the embeddings_source ({}) must be equal "
"to the encoder `rnn_size` ({}) when defined".format(
self.embeddings_source.dimension, self.rnn_size))

with tf.name_scope("output_projection"):
if self.embeddings_source is not None:
return tf.transpose(self.embeddings_source.embedding_matrix)

# NOTE: default glorot initializer - is this alright?
return get_variable(
name="state_to_word_W",
shape=[self.rnn_size, len(self.vocabulary)])

@tensor
def decoding_b(self) -> tf.Variable:
return get_variable(
name="state_to_word_b",
shape=[len(self.vocabulary)],
initializer=tf.zeros_initializer())
if self.embeddings_source:
return tf.zeros(
self.embeddings_source.embedding_matrix.get_shape()[0])

@tensor
def decoding_residual_w(self) -> tf.Variable:
input_dim = self.encoder.input_sequence.dimension
return get_variable(
name="emb_to_word_W",
shape=[input_dim, len(self.vocabulary)])
with tf.name_scope("output_projection"):
return get_variable(
name="state_to_word_b",
shape=[len(self.vocabulary)],
initializer=tf.zeros_initializer())

@tensor
def logits(self) -> tf.Tensor:
@@ -99,16 +120,7 @@ def logits(self) -> tf.Tensor:

biases_3d = tf.expand_dims(tf.expand_dims(self.decoding_b, 0), 0)

embedded_inputs = tf.expand_dims(
self.encoder.input_sequence.temporal_states, 2)
dweights_4d = tf.expand_dims(
tf.expand_dims(self.decoding_residual_w, 0), 0)

dmultiplication = tf.nn.conv2d(
embedded_inputs, dweights_4d, [1, 1, 1, 1], "SAME")
dmultiplication_3d = tf.squeeze(dmultiplication, axis=[2])

logits = multiplication_3d + dmultiplication_3d + biases_3d
logits = multiplication_3d + biases_3d
return logits

@tensor
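A note on the tied output projection introduced in decoding_w above: when embeddings_source is given, the labeler reuses the transposed input embedding matrix as its state-to-vocabulary weight instead of training a separate projection. A minimal NumPy sketch of the shapes involved (illustrative only, not Neural Monkey code):

import numpy as np

vocab_size, emb_dim, seq_len = 1000, 64, 20  # toy sizes

embedding_matrix = np.random.randn(vocab_size, emb_dim)  # input embeddings
states = np.random.randn(seq_len, emb_dim)               # encoder states; rnn_size must equal emb_dim

# Tied projection: reuse the transposed embedding matrix instead of a
# separate [emb_dim, vocab_size] output weight.
logits = states @ embedding_matrix.T
assert logits.shape == (seq_len, vocab_size)

This is why decoding_w raises a ValueError when the embedding dimension differs from the encoder's rnn_size.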
2 changes: 1 addition & 1 deletion neuralmonkey/readers/string_vector_reader.py
@@ -13,7 +13,7 @@ def process_line(line: str, lineno: int, path: str) -> np.ndarray:

return np.array(numbers, dtype=dtype)

def reader(files: List[str])-> Iterable[List[np.ndarray]]:
def reader(files: List[str]) -> Iterable[List[np.ndarray]]:
for path in files:
current_line = 0

2 changes: 1 addition & 1 deletion neuralmonkey/runners/label_runner.py
@@ -60,7 +60,7 @@ def __init__(self,
def fetches(self) -> Dict[str, tf.Tensor]:
return {
"label_logprobs": self.decoder.logprobs,
"input_mask": self.decoder.encoder.input_sequence.temporal_mask,
"input_mask": self.decoder.encoder.temporal_mask,
"loss": self.decoder.cost}

@property
79 changes: 79 additions & 0 deletions scripts/preprocess_bert.py
@@ -0,0 +1,79 @@
#!/usr/bin/env python3
# Creates training data for BERT network training
Contributor


This does not belong in a comment; you just start writing the text in a """ string, which then gets stored in the __doc__ variable and Python treats it as documentation.

# (noisified + masked gold predictions) using the input corpus
# TODO: add support for other NM vocabularies (aside from t2t)

import argparse
import os

import numpy as np

from neuralmonkey.logging import log as _log
from neuralmonkey.vocabulary import (
Vocabulary, PAD_TOKEN, UNK_TOKEN, from_wordlist)


def log(message: str, color: str = "blue") -> None:
_log(message, color)


def main() -> None:
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--input_file", type=str, default="/dev/stdin")
parser.add_argument("--vocabulary", type=str, required=True)
parser.add_argument("--output_prefix", type=str, default=None)
parser.add_argument("--mask_token", type=str, default=UNK_TOKEN)
parser.add_argument("--coverage", type=float, default=0.15)
parser.add_argument("--mask_prob", type=float, default=0.8)
parser.add_argument("--replace_prob", type=float, default=0.1)
parser.add_argument("--vocab_contains_header", type=bool, default=True)
parser.add_argument("--vocab_contains_frequencies",
type=bool, default=True)
args = parser.parse_args()

assert 0 <= args.coverage <= 1
assert 0 <= args.mask_prob <= 1
assert 0 <= args.replace_prob <= 1

log("Loading vocabulary.")
vocabulary = from_wordlist(
args.vocabulary,
contains_header=args.vocab_contains_header,
contains_frequencies=args.vocab_contains_frequencies)

# Sampling probabilities: (keep_prob, mask_prob, replace_prob)
mask_prob = args.mask_prob
replace_prob = args.replace_prob
keep_prob = 1 - mask_prob - replace_prob
sample_probs = (keep_prob, mask_prob, replace_prob)

output_prefix = args.output_prefix
if output_prefix is None:
output_prefix = args.input_file
out_f_noise = "{}.noisy".format(output_prefix)
out_f_mask = "{}.mask".format(output_prefix)

out_noise_h = open(out_f_noise, "w", encoding="utf-8")
out_mask_h = open(out_f_mask, "w", encoding="utf-8")
log("Processing data.")
with open(args.input_file, "r", encoding="utf-8") as input_h:
# TODO: performance optimizations
for line in input_h:
line = line.strip().split(" ")
num_samples = int(args.coverage * len(line))
sampled_indices = np.random.choice(len(line), num_samples, False)

output_noisy = list(line)
output_masked = [PAD_TOKEN] * len(line)
for i in sampled_indices:
random_token = np.random.choice(vocabulary.index_to_word[4:])
new_token = np.random.choice(
[line[i], args.mask_token, random_token], p=sample_probs)
output_noisy[i] = new_token
output_masked[i] = line[i]
out_noise_h.write(str(" ".join(output_noisy)) + "\n")
out_mask_h.write(str(" ".join(output_masked)) + "\n")


if __name__ == "__main__":
main()
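To illustrate what the script produces, here is a toy sketch of the noisification step for a single sentence. The tokens are hypothetical, the max(1, ...) is added here only so the short example gets at least one sampled position, and the actual choices are random:

import numpy as np

line = "the cat sat on the mat".split()
coverage = 0.15
sample_probs = (0.1, 0.8, 0.1)   # (keep, mask, replace), as in the script
PAD, MASK = "<pad>", "<unk>"

num_samples = max(1, int(coverage * len(line)))
picked = np.random.choice(len(line), num_samples, replace=False)

output_noisy, output_masked = list(line), [PAD] * len(line)
for i in picked:
    new_token = np.random.choice([line[i], MASK, "randomword"], p=sample_probs)
    output_noisy[i] = new_token      # goes to the *.noisy file
    output_masked[i] = line[i]       # gold token revealed only at sampled positions

print(" ".join(output_noisy))   # e.g. "the cat sat on the <unk>"
print(" ".join(output_masked))  # e.g. "<pad> <pad> <pad> <pad> <pad> mat"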
87 changes: 87 additions & 0 deletions tests/bert.ini
@@ -0,0 +1,87 @@
[main]
name="BERT LM"
output="tests/outputs/bert"
tf_manager=<tf_manager>

train_dataset=<train_data>
val_dataset=<val_data>
test_datasets=[<val_data>]

runners=[<runner>]
trainer=<trainer>
evaluation=[("source", "source_masked", evaluators.Accuracy)]

batch_size=10
epochs=2

validation_period="10s"
logging_period="2s"
overwrite_output_dir=True


[batching_scheme]
class=dataset.get_batching_scheme
batch_size=32

[tf_manager]
class=tf_manager.TensorFlowManager
num_sessions=1
num_threads=4


[train_data]
class=dataset.load
batching=<batching_scheme>
# source_masked masks all tokens that weren't ``noisified''
series=["source_noisy", "source_masked"]
data=["tests/data/bert/train.pcedt.forms.noisy", "tests/data/bert/train.pcedt.forms.mask"]

[val_data]
class=dataset.load
series=["source_noisy", "source_masked"]
data=["tests/data/bert/val.pcedt.forms", "tests/data/bert/val.pcedt.forms"]


[vocabulary]
class=vocabulary.from_wordlist
path="tests/data/factored_decoder_vocab.tsv"

[sequence]
class=model.sequence.EmbeddedSequence
vocabulary=<vocabulary>
data_id="source_noisy"
embedding_size=6
scale_embeddings_by_depth=True

[encoder]
class=encoders.transformer.TransformerEncoder
name="encoder_bert"
input_sequence=<sequence>
ff_hidden_size=10
depth=2
n_heads=3
dropout_keep_prob=0.9

[labeler]
class=decoders.sequence_labeler.SequenceLabeler
name="labeler_bert"
encoder=<encoder>
data_id="source_masked"
dropout_keep_prob=0.5
embeddings_source=<sequence>

[trainer]
class=trainers.delayed_update_trainer.DelayedUpdateTrainer
batches_per_update=5
l2_weight=1.0e-8
clip_norm=1.0
objectives=[<obj>]

[obj]
class=trainers.cross_entropy_trainer.CostObjective
decoder=<labeler>

[runner]
class=runners.LabelRunner
decoder=<labeler>
output_series="source"