
added simplified BERT support #791

Closed
wants to merge 3 commits into from
78 changes: 78 additions & 0 deletions neuralmonkey/dataset.py
@@ -95,6 +95,84 @@ def __init__(self,
# pylint: enable=too-few-public-methods


def _bucket_boundaries(max_length, min_length=8, length_bucket_step=1.1):
"""Create a default set of length-bucket boundaries."""
assert length_bucket_step > 1.0
x = min_length
boundaries = []
while x < max_length:
boundaries.append(x)
x = max(x + 1, int(x * length_bucket_step))
return boundaries


def get_batching_scheme(batch_size: int,
max_length: int = None,
min_length_bucket: int = 8,
length_bucket_step: float = 1.1,
shard_multiplier: int = 1,
length_multiplier: int = 1,
min_length: int = 0) -> BatchingScheme:
"""Create a batching scheme based on model hyperparameters.

Every batch contains a number of sequences divisible by `shard_multiplier`.

Args:
batch_size: int, total number of tokens in a batch.
max_length: int, sequences longer than this will be skipped. Defaults
to batch_size.
min_length_bucket: int
length_bucket_step: float greater than 1.0
shard_multiplier: an integer increasing the batch_size to suit
splitting across datashards.
length_multiplier: an integer multiplier that is used to increase the
batch sizes and sequence length tolerance.
min_length: int, sequences shorter than this will be skipped.
Returns:
A `BatchingScheme` with:
* bucket_boundaries: list of bucket boundaries
* bucket_batch_sizes: list of batch sizes for each length bucket
Raises:
ValueError: If min_length > max_length
"""
max_length = max_length or batch_size
if max_length < min_length:
raise ValueError("max_length must be greater or equal to min_length")

boundaries = _bucket_boundaries(max_length, min_length_bucket,
length_bucket_step)
boundaries = [boundary * length_multiplier for boundary in boundaries]
max_length *= length_multiplier

batch_sizes = [
max(1, batch_size // length) for length in boundaries + [max_length]
]
max_batch_size = max(batch_sizes)
# Since the Datasets API only allows a single constant for window_size,
# and it needs to divide all bucket_batch_sizes, we pick a highly-composite
# window size and then round down all batch sizes to divisors of that
# window size, so that a window can always be divided evenly into batches.
highly_composite_numbers = [
1, 2, 4, 6, 12, 24, 36, 48, 60, 120, 180, 240, 360, 720, 840, 1260,
1680, 2520, 5040, 7560, 10080, 15120, 20160, 25200, 27720, 45360,
50400, 55440, 83160, 110880, 166320, 221760, 277200, 332640, 498960,
554400, 665280, 720720, 1081080, 1441440, 2162160, 2882880, 3603600,
4324320, 6486480, 7207200, 8648640, 10810800, 14414400, 17297280,
21621600, 32432400, 36756720, 43243200, 61261200, 73513440, 110270160
]
window_size = max(
[i for i in highly_composite_numbers if i <= 3 * max_batch_size])
divisors = [i for i in range(1, window_size + 1) if window_size % i == 0]
batch_sizes = [max([d for d in divisors if d <= bs]) for bs in batch_sizes]
window_size *= shard_multiplier
batch_sizes = [bs * shard_multiplier for bs in batch_sizes]

ret = BatchingScheme(bucket_boundaries=boundaries,
bucket_batch_sizes=batch_sizes)
return ret


# The protected functions below are designed to convert the ambiguous spec
# structures to a normalized form.

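To make the bucketing concrete, here is a minimal standalone sketch that mirrors the logic of `_bucket_boundaries` and the token-budget division above. It does not import Neural Monkey, and the numbers are illustrative only:

# Standalone sketch of the length-bucketing logic above (illustrative only).

def bucket_boundaries(max_length, min_length=8, step=1.1):
    x, boundaries = min_length, []
    while x < max_length:
        boundaries.append(x)
        x = max(x + 1, int(x * step))
    return boundaries

batch_size = 256  # total token budget per batch
boundaries = bucket_boundaries(max_length=batch_size)
# Shorter sequences -> more sequences fit into the same token budget.
batch_sizes = [max(1, batch_size // length)
               for length in boundaries + [batch_size]]

print(boundaries[:6])   # [8, 9, 10, 11, 12, 13]
print(batch_sizes[:6])  # [32, 28, 25, 23, 21, 19]

The actual get_batching_scheme additionally rounds these per-bucket batch sizes down to divisors of a highly composite window size before wrapping them in a BatchingScheme.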
3 changes: 2 additions & 1 deletion neuralmonkey/decoders/autoregressive.py
@@ -173,7 +173,8 @@ def embedding_size(self) -> int:
if self.embeddings_source is not None:
if self._embedding_size is not None:
warn("Overriding the embedding_size parameter with the "
"size of the reused embeddings from the encoder.")
"size of the reused embeddings from the "
"`embeddings_source`.")

return self.embeddings_source.embedding_matrix.get_shape()[1].value

70 changes: 41 additions & 29 deletions neuralmonkey/decoders/sequence_labeler.py
@@ -1,12 +1,12 @@
from typing import Dict, Union
from typing import Dict
Contributor


OMG, I have had exactly these changes in a branch since October, but I was waiting for Jindra to finish tf.Dataset, and I have some models with it that I would still like to use. Do you already have this in a lot of models?

Member Author

@varisd varisd Feb 21, 2019


My BERT experiments are on hold for now, so no problem.

Feel free to close this PR and use your branch.


import tensorflow as tf
from typeguard import check_argument_types

from neuralmonkey.dataset import Dataset
from neuralmonkey.decorators import tensor
from neuralmonkey.encoders.recurrent import RecurrentEncoder
from neuralmonkey.encoders.facebook_conv import SentenceEncoder
from neuralmonkey.model.stateful import TemporalStateful
from neuralmonkey.model.sequence import EmbeddedSequence
from neuralmonkey.model.feedable import FeedDict
from neuralmonkey.model.parameterized import InitializerSpecs
from neuralmonkey.model.model_part import ModelPart
@@ -20,9 +20,10 @@ class SequenceLabeler(ModelPart):
# pylint: disable=too-many-arguments
def __init__(self,
name: str,
encoder: Union[RecurrentEncoder, SentenceEncoder],
vocabulary: Vocabulary,
encoder: TemporalStateful,
data_id: str,
vocabulary: Vocabulary = None,
embeddings_source: EmbeddedSequence = None,
dropout_keep_prob: float = 1.0,
reuse: ModelPart = None,
save_checkpoint: str = None,
@@ -32,10 +33,20 @@ def __init__(self,
ModelPart.__init__(self, name, reuse, save_checkpoint, load_checkpoint,
initializers)

self.embeddings_source = embeddings_source
self.encoder = encoder
self.vocabulary = vocabulary
self.data_id = data_id
self.dropout_keep_prob = dropout_keep_prob

# We provide only `embeddings_source` when we want to tie the input and
# output projections.
if (self.embeddings_source is None) == (vocabulary is None):
raise ValueError("You must specify either `vocabulary or` or "
"`embeddings_source`, not both")
elif self.embeddings_source is not None:
self.vocabulary = self.embeddings_source.vocabulary
elif vocabulary is not None:
self.vocabulary = vocabulary
# pylint: enable=too-many-arguments

@property
@@ -64,23 +75,33 @@ def rnn_size(self) -> int:

@tensor
def decoding_w(self) -> tf.Variable:
return get_variable(
name="state_to_word_W",
shape=[self.rnn_size, len(self.vocabulary)])
if (self.embeddings_source is not None
and self.embeddings_source.dimension != self.rnn_size):
raise ValueError(
"Dimension of the embeddings_source ({}) must be equal "
"to the encoder `rnn_size` ({}) when defined".format(
self.embeddings_source.dimension, self.rnn_size))

with tf.name_scope("output_projection"):
if self.embeddings_source is not None:
return tf.transpose(self.embeddings_source.embedding_matrix)

# NOTE: default glorot initializer - is this alright?
return get_variable(
name="state_to_word_W",
shape=[self.rnn_size, len(self.vocabulary)])

@tensor
def decoding_b(self) -> tf.Variable:
return get_variable(
name="state_to_word_b",
shape=[len(self.vocabulary)],
initializer=tf.zeros_initializer())
if self.embeddings_source:
return tf.zeros(
self.embeddings_source.embedding_matrix.get_shape()[0])

@tensor
def decoding_residual_w(self) -> tf.Variable:
input_dim = self.encoder.input_sequence.dimension
return get_variable(
name="emb_to_word_W",
shape=[input_dim, len(self.vocabulary)])
with tf.name_scope("output_projection"):
return get_variable(
name="state_to_word_b",
shape=[len(self.vocabulary)],
initializer=tf.zeros_initializer())

@tensor
def logits(self) -> tf.Tensor:
@@ -99,16 +120,7 @@ def logits(self) -> tf.Tensor:

biases_3d = tf.expand_dims(tf.expand_dims(self.decoding_b, 0), 0)

embedded_inputs = tf.expand_dims(
self.encoder.input_sequence.temporal_states, 2)
dweights_4d = tf.expand_dims(
tf.expand_dims(self.decoding_residual_w, 0), 0)

dmultiplication = tf.nn.conv2d(
embedded_inputs, dweights_4d, [1, 1, 1, 1], "SAME")
dmultiplication_3d = tf.squeeze(dmultiplication, axis=[2])

logits = multiplication_3d + dmultiplication_3d + biases_3d
logits = multiplication_3d + biases_3d
return logits

@tensor
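A note on the tied output projection introduced in decoding_w above: when embeddings_source is given, the labeler reuses the transposed input embedding matrix as its state-to-vocabulary weight instead of training a separate projection. A minimal NumPy sketch of the shapes involved (illustrative only, not Neural Monkey code):

import numpy as np

vocab_size, emb_dim, seq_len = 1000, 64, 20  # toy sizes

embedding_matrix = np.random.randn(vocab_size, emb_dim)  # input embeddings
states = np.random.randn(seq_len, emb_dim)               # encoder states; rnn_size must equal emb_dim

# Tied projection: reuse the transposed embedding matrix instead of a
# separate [emb_dim, vocab_size] output weight.
logits = states @ embedding_matrix.T
assert logits.shape == (seq_len, vocab_size)

This is why decoding_w raises a ValueError when the embedding dimension differs from the encoder's rnn_size.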
2 changes: 1 addition & 1 deletion neuralmonkey/readers/string_vector_reader.py
@@ -13,7 +13,7 @@ def process_line(line: str, lineno: int, path: str) -> np.ndarray:

return np.array(numbers, dtype=dtype)

def reader(files: List[str])-> Iterable[List[np.ndarray]]:
def reader(files: List[str]) -> Iterable[List[np.ndarray]]:
for path in files:
current_line = 0

2 changes: 1 addition & 1 deletion neuralmonkey/runners/label_runner.py
@@ -60,7 +60,7 @@ def __init__(self,
def fetches(self) -> Dict[str, tf.Tensor]:
return {
"label_logprobs": self.decoder.logprobs,
"input_mask": self.decoder.encoder.input_sequence.temporal_mask,
"input_mask": self.decoder.encoder.temporal_mask,
"loss": self.decoder.cost}

@property
79 changes: 79 additions & 0 deletions scripts/preprocess_bert.py
@@ -0,0 +1,79 @@
#!/usr/bin/env python3
# Creates training data for BERT network training
Contributor


This does not belong in a comment; you just start writing the text in a """ string, which then gets stored in the __doc__ variable and Python treats it as documentation.

# (noisified + masked gold predictions) using the input corpus
# TODO: add support for other NM vocabularies (aside from t2t)

import argparse
import os

import numpy as np

from neuralmonkey.logging import log as _log
from neuralmonkey.vocabulary import (
Vocabulary, PAD_TOKEN, UNK_TOKEN, from_wordlist)


def log(message: str, color: str = "blue") -> None:
_log(message, color)


def main() -> None:
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--input_file", type=str, default="/dev/stdin")
parser.add_argument("--vocabulary", type=str, required=True)
parser.add_argument("--output_prefix", type=str, default=None)
parser.add_argument("--mask_token", type=str, default=UNK_TOKEN)
parser.add_argument("--coverage", type=float, default=0.15)
parser.add_argument("--mask_prob", type=float, default=0.8)
parser.add_argument("--replace_prob", type=float, default=0.1)
parser.add_argument("--vocab_contains_header", type=bool, default=True)
parser.add_argument("--vocab_contains_frequencies",
type=bool, default=True)
args = parser.parse_args()

assert 0 <= args.coverage <= 1
assert 0 <= args.mask_prob <= 1
assert 0 <= args.replace_prob <= 1

log("Loading vocabulary.")
vocabulary = from_wordlist(
args.vocabulary,
contains_header=args.vocab_contains_header,
contains_frequencies=args.vocab_contains_frequencies)

# Sampling probabilities: (keep_prob, mask_prob, replace_prob)
mask_prob = args.mask_prob
replace_prob = args.replace_prob
keep_prob = 1 - mask_prob - replace_prob
sample_probs = (keep_prob, mask_prob, replace_prob)

output_prefix = args.output_prefix
if output_prefix is None:
output_prefix = args.input_file
out_f_noise = "{}.noisy".format(output_prefix)
out_f_mask = "{}.mask".format(output_prefix)

out_noise_h = open(out_f_noise, "w", encoding="utf-8")
out_mask_h = open(out_f_mask, "w", encoding="utf-8")
log("Processing data.")
with open(args.input_file, "r", encoding="utf-8") as input_h:
# TODO: performance optimizations
for line in input_h:
line = line.strip().split(" ")
num_samples = int(args.coverage * len(line))
sampled_indices = np.random.choice(len(line), num_samples, False)

output_noisy = list(line)
output_masked = [PAD_TOKEN] * len(line)
for i in sampled_indices:
random_token = np.random.choice(vocabulary.index_to_word[4:])
new_token = np.random.choice(
[line[i], args.mask_token, random_token], p=sample_probs)
output_noisy[i] = new_token
output_masked[i] = line[i]
out_noise_h.write(str(" ".join(output_noisy)) + "\n")
out_mask_h.write(str(" ".join(output_masked)) + "\n")


if __name__ == "__main__":
main()
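To illustrate what the script produces, here is a toy sketch of the noisification step for a single sentence. The tokens are hypothetical, the max(1, ...) is added here only so the short example gets at least one sampled position, and the actual choices are random:

import numpy as np

line = "the cat sat on the mat".split()
coverage = 0.15
sample_probs = (0.1, 0.8, 0.1)   # (keep, mask, replace), as in the script
PAD, MASK = "<pad>", "<unk>"

num_samples = max(1, int(coverage * len(line)))
picked = np.random.choice(len(line), num_samples, replace=False)

output_noisy, output_masked = list(line), [PAD] * len(line)
for i in picked:
    new_token = np.random.choice([line[i], MASK, "randomword"], p=sample_probs)
    output_noisy[i] = new_token      # goes to the *.noisy file
    output_masked[i] = line[i]       # gold token revealed only at sampled positions

print(" ".join(output_noisy))   # e.g. "the cat sat on the <unk>"
print(" ".join(output_masked))  # e.g. "<pad> <pad> <pad> <pad> <pad> mat"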
87 changes: 87 additions & 0 deletions tests/bert.ini
@@ -0,0 +1,87 @@
[main]
name="BERT LM"
output="tests/outputs/bert"
tf_manager=<tf_manager>

train_dataset=<train_data>
val_dataset=<val_data>
test_datasets=[<val_data>]

runners=[<runner>]
trainer=<trainer>
evaluation=[("source", "source_masked", evaluators.Accuracy)]

batch_size=10
epochs=2

validation_period="10s"
logging_period="2s"
overwrite_output_dir=True


[batching_scheme]
class=dataset.get_batching_scheme
batch_size=32

[tf_manager]
class=tf_manager.TensorFlowManager
num_sessions=1
num_threads=4


[train_data]
class=dataset.load
batching=<batching_scheme>
# source_masked masks all tokens that weren't ``noisified''
series=["source_noisy", "source_masked"]
data=["tests/data/bert/train.pcedt.forms.noisy", "tests/data/bert/train.pcedt.forms.mask"]

[val_data]
class=dataset.load
series=["source_noisy", "source_masked"]
data=["tests/data/bert/val.pcedt.forms", "tests/data/bert/val.pcedt.forms"]


[vocabulary]
class=vocabulary.from_wordlist
path="tests/data/factored_decoder_vocab.tsv"

[sequence]
class=model.sequence.EmbeddedSequence
vocabulary=<vocabulary>
data_id="source_noisy"
embedding_size=6
scale_embeddings_by_depth=True

[encoder]
class=encoders.transformer.TransformerEncoder
name="encoder_bert"
input_sequence=<sequence>
ff_hidden_size=10
depth=2
n_heads=3
dropout_keep_prob=0.9

[labeler]
class=decoders.sequence_labeler.SequenceLabeler
name="labeler_bert"
encoder=<encoder>
data_id="source_masked"
dropout_keep_prob=0.5
embeddings_source=<sequence>

[trainer]
class=trainers.delayed_update_trainer.DelayedUpdateTrainer
batches_per_update=5
l2_weight=1.0e-8
clip_norm=1.0
objectives=[<obj>]

[obj]
class=trainers.cross_entropy_trainer.CostObjective
decoder=<labeler>

[runner]
class=runners.LabelRunner
decoder=<labeler>
output_series="source"