# BERT

A Keras implementation of BERT, modified for compatibility with TensorFlow 2.0.

Used for extracting latent embeddings from medical question/answer data.

# Acknowledgement

Based on [CyberZHG's Keras BERT implementation](https://github.com/CyberZHG/keras-bert)

# Usage

### Tokenizer

Splits text and generates indices:

```python
from keras_bert import Tokenizer

token_dict = {
    '[CLS]': 0,
    '[SEP]': 1,
    'un': 2,
    '##aff': 3,
    '##able': 4,
    '[UNK]': 5,
}
tokenizer = Tokenizer(token_dict)
print(tokenizer.tokenize('unaffable'))  # The result should be `['[CLS]', 'un', '##aff', '##able', '[SEP]']`
indices, segments = tokenizer.encode('unaffable')
print(indices)   # Should be `[0, 2, 3, 4, 1]`
print(segments)  # Should be `[0, 0, 0, 0, 0]`

print(tokenizer.tokenize(first='unaffable', second='钢'))
# The result should be `['[CLS]', 'un', '##aff', '##able', '[SEP]', '钢', '[SEP]']`
indices, segments = tokenizer.encode(first='unaffable', second='钢', max_len=10)
print(indices)   # Should be `[0, 2, 3, 4, 1, 5, 1, 0, 0, 0]`
print(segments)  # Should be `[0, 0, 0, 0, 0, 1, 1, 1, 1, 1]`
```
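
When a whole batch needs to be fed to a model, the per-text `encode` results can be stacked into arrays. A minimal sketch using NumPy; the `texts`, `token_input`, and `segment_input` names are illustrative, not part of the library:

```python
import numpy as np

texts = ['unaffable', 'unaffable']  # hypothetical inputs
pairs = [tokenizer.encode(text, max_len=10) for text in texts]

token_input = np.array([indices for indices, _ in pairs])      # shape (batch, max_len)
segment_input = np.array([segments for _, segments in pairs])  # shape (batch, max_len)
```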

### Training

```python
from tensorflow import keras
from keras_bert import get_base_dict, get_model, compile_model, gen_batch_inputs


# A toy input example
sentence_pairs = [
    [['all', 'work', 'and', 'no', 'play'], ['makes', 'jack', 'a', 'dull', 'boy']],
    [['from', 'the', 'day', 'forth'], ['my', 'arm', 'changed']],
    [['and', 'a', 'voice', 'echoed'], ['power', 'give', 'me', 'more', 'power']],
]


# Build the token dictionary
token_dict = get_base_dict()  # A dict that contains some special tokens
for pairs in sentence_pairs:
    for token in pairs[0] + pairs[1]:
        if token not in token_dict:
            token_dict[token] = len(token_dict)
token_list = list(token_dict.keys())  # Used for selecting a random word


# Build & train the model
model = get_model(
    token_num=len(token_dict),
    head_num=5,
    transformer_num=12,
    embed_dim=25,
    feed_forward_dim=100,
    seq_len=20,
    pos_num=20,
    dropout_rate=0.05,
)
compile_model(model)  # Attach the default pre-training losses before fitting
model.summary()


def _generator():
    while True:
        yield gen_batch_inputs(
            sentence_pairs,
            token_dict,
            token_list,
            seq_len=20,
            mask_rate=0.3,
            swap_sentence_rate=1.0,
        )


model.fit_generator(
    generator=_generator(),
    steps_per_epoch=1000,
    epochs=100,
    validation_data=_generator(),
    validation_steps=100,
    callbacks=[
        keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
    ],
)


# Use the trained model
inputs, output_layer = get_model(  # `output_layer` is the last feature extraction layer (the last transformer)
    token_num=len(token_dict),
    head_num=5,
    transformer_num=12,
    embed_dim=25,
    feed_forward_dim=100,
    seq_len=20,
    pos_num=20,
    dropout_rate=0.05,
    training=False,   # The input layers and output layer will be returned if `training` is `False`
    trainable=False,  # Whether the model is trainable; the default value is the same as `training`
)
```
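
Since the point of this fork is pulling out latent embeddings, the `inputs` and `output_layer` returned above can be wrapped in an ordinary Keras model and run over encoded token/segment arrays. A minimal sketch, assuming the trained weights have been transferred to this extraction model and that index 0 is the padding token (as in `get_base_dict`); `feature_model` and the hand-built arrays are illustrative:

```python
import numpy as np

# Wrap the feature-extraction graph in a standalone model
feature_model = keras.models.Model(inputs=inputs, outputs=output_layer)

# Hand-encode one padded sequence of token indices (pad index assumed to be 0)
tokens = ['[CLS]', 'all', 'work', '[SEP]']
token_input = np.array([[token_dict[t] for t in tokens] + [0] * (20 - len(tokens))])
segment_input = np.zeros((1, 20), dtype='int32')

embeddings = feature_model.predict([token_input, segment_input])
print(embeddings.shape)  # (1, 20, 25): one 25-dim latent vector per position
```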

### Custom Feature Extraction

The default transformer feature-extraction layers can be swapped out by passing a callable as `custom_layers`:

```python
from tensorflow import keras
from keras_bert import get_model


def _custom_layers(x, trainable=True):
    return keras.layers.LSTM(
        units=768,
        trainable=trainable,
        return_sequences=True,
        name='LSTM',
    )(x)


model = get_model(
    token_num=200,
    embed_dim=768,
    custom_layers=_custom_layers,
)
```
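
To confirm the swap took effect, printing the summary is enough; the layer stack should list the custom `LSTM` layer in place of the usual transformer blocks:

```python
model.summary()
```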