import os
os.environ[
"TF_CPP_MIN_LOG_LEVEL"
] = "3" # Suppress UserWarning of TensorFlow while loading the model
import shutil
import zipfile
import tempfile
import pickle
import numpy as np
from datetime import datetime
from functools import wraps
from tensorflow.keras.layers import (
Input,
Concatenate,
Dense,
Flatten,
RepeatVector,
TimeDistributed,
Bidirectional,
GaussianNoise,
BatchNormalization,
)
from tensorflow.compat.v1.keras.layers import (
CuDNNLSTM as LSTM,
)  # Faster drop-in replacement for LSTM that uses cuDNN when running on a GPU
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, LearningRateScheduler
from tensorflow.keras.utils import multi_gpu_model, plot_model
from sklearn.preprocessing import StandardScaler # For the descriptors
from sklearn.decomposition import PCA # For the descriptors
# Custom dependencies
from .vectorizers import SmilesVectorizer
from .generators import CodeGenerator as DescriptorGenerator
from .generators import HetSmilesGenerator
from .custom_callbacks import ModelAndHistoryCheckpoint, LearningRateSchedule
def timed(func):
"""Timer decorator to benchmark functions."""
@wraps(func)
def wrapper(*args, **kwargs):
tstart = datetime.now()
result = func(*args, **kwargs)
        elapsed = (datetime.now() - tstart).total_seconds()  # total_seconds() also counts whole seconds, unlike .microseconds
print("Elapsed time: %.3f seconds." % elapsed)
return result
return wrapper
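# Illustrative usage of @timed (a sketch; `my_function` is a hypothetical example, not part of this module):
#
#     @timed
#     def my_function(n):
#         return sum(range(n))
#
#     my_function(10_000_000)  # prints e.g. "Elapsed time: 0.312 seconds." before returning the result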
class DDC:
def __init__(self, **kwargs):
"""Initialize a DDC object from scratch or from a trained configuration. All binary mols are converted to SMILES strings internally, using the vectorizers.
# Examples of __init__ usage
To *train* a blank model with encoder (autoencoder):
model = ddc.DDC(x = mols,
y = mols,
scaling = True,
pca = True,
dataset_info = info,
noise_std = 0.1,
lstm_dim = 256,
dec_layers = 3,
td_dense_dim = 0,
batch_size = 128,
codelayer_dim = 128)
To *train* a blank model without encoder:
model = ddc.DDC(x = descriptors,
y = mols,
scaling = True,
pca = True,
dataset_info = info,
noise_std = 0.1,
lstm_dim = 256,
dec_layers = 3,
td_dense_dim = 0,
batch_size = 128)
To *re-train* a saved model with encoder (autoencoder):
model = ddc.DDC(x = mols,
y = mols,
model_name = saved_model_name)
To *re-train* a saved model without encoder:
model = ddc.DDC(x = descriptors,
y = mols,
model_name = saved_model_name)
To *test* a saved model:
model = ddc.DDC(model_name = saved_model_name)
:param x: Encoder input
:type x: list or numpy.ndarray
        :param y: Decoder input for teacher forcing
:type y: list or numpy.ndarray
:param scaling: Flag to scale descriptor inputs, defaults to `False`
:type scaling: boolean
:param pca: Flag to apply PCA on descriptor inputs, defaults to `False`
:type pca: boolean
:param model_name: Filename of model to load
:type model_name: str
:param dataset_info: Metadata about dataset
:type dataset_info: dict
:param noise_std: Standard deviation of noise in the latent space, defaults to 0.01
:type noise_std: float
        :param lstm_dim: Number of LSTM units in the encoder/decoder layers, defaults to 256
        :type lstm_dim: int
:param dec_layers: Number of decoder layers, defaults to 2
:type dec_layers: int
:param td_dense_dim: Number of intermediate Dense units to squeeze LSTM outputs, defaults to 0
:type td_dense_dim: int
:param batch_size: Batch size to train with, defaults to 256
:type batch_size: int
        :param codelayer_dim: Dimensionality of the latent space, defaults to 128 (inferred from the descriptor dimensionality when the input is descriptors)
        :type codelayer_dim: int
        :param bn: Flag to enable batch normalization, defaults to `True`
:type bn: boolean
:param bn_momentum: Momentum value to be used in batch normalization, defaults to 0.9
:type bn_momentum: float
"""
# Identify the mode to start the model in
if "x" in kwargs and "y" in kwargs:
x = kwargs.get("x")
y = kwargs.get("y")
if "model_name" not in kwargs:
self.__mode = "train"
else:
self.__mode = "retrain"
elif "model_name" in kwargs:
self.__mode = "test"
else:
raise NameError("Cannot infer mode from arguments.")
print("Initializing model in %s mode." % self.__mode)
if self.mode == "train":
# Infer input type from type(x)
if type(x[0]) == np.bytes_:
print("Input type is 'binary mols'.")
self.__input_type = "mols" # binary RDKit mols
else:
print("Input type is 'molecular descriptors'.")
self.__input_type = "descriptors" # other molecular descriptors
# If scaling is required
if kwargs.get("scaling", False) is True:
# Normalize the input
print("Applying scaling on input.")
self.__scaler = StandardScaler()
x = self.__scaler.fit_transform(x)
else:
self.__scaler = None
# If PCA is required
if kwargs.get("pca", False) is True:
print("Applying PCA on input.")
self.__pca = PCA(
n_components=x.shape[1]
) # n_components=n_features for now
x = self.__pca.fit_transform(x)
else:
self.__pca = None
self.__maxlen = (
kwargs.get("dataset_info")["maxlen"] + 10
) # Extend maxlen to avoid breaks in training
self.__charset = kwargs.get("dataset_info")["charset"]
self.__dataset_name = kwargs.get("dataset_info")["name"]
self.__lstm_dim = kwargs.get("lstm_dim", 256)
self.__h_activation = kwargs.get("h_activation", "relu")
self.__bn = kwargs.get("bn", True)
self.__bn_momentum = kwargs.get("bn_momentum", 0.9)
self.__noise_std = kwargs.get("noise_std", 0.01)
self.__td_dense_dim = kwargs.get(
"td_dense_dim", 0
) # >0 squeezes RNN connections with Dense sandwiches
self.__batch_size = kwargs.get("batch_size", 256)
self.__dec_layers = kwargs.get("dec_layers", 2)
if self.input_type == "descriptors":
self.__codelayer_dim = x.shape[1] # features
if "codelayer_dim" in kwargs:
print(
"Ignoring requested codelayer_dim because it is inferred from the cardinality of the descriptors."
)
else:
self.__codelayer_dim = kwargs.get("codelayer_dim", 128)
# Create the left/right-padding vectorizers
self.__smilesvec1 = SmilesVectorizer(
canonical=False,
augment=True,
maxlength=self.maxlen,
charset=self.charset,
binary=True,
)
self.__smilesvec2 = SmilesVectorizer(
canonical=False,
augment=True,
maxlength=self.maxlen,
charset=self.charset,
binary=True,
leftpad=False,
)
# self.train_gen.next() #This line is needed to set train_gen.dims (to be fixed in HetSmilesGenerator)
self.__input_shape = self.smilesvec1.dims
self.__dec_dims = list(self.smilesvec1.dims)
self.__dec_dims[0] = self.dec_dims[0] - 1
self.__dec_input_shape = self.dec_dims
self.__output_len = self.smilesvec1.dims[0] - 1
self.__output_dims = self.smilesvec1.dims[-1]
# Build all sub-models as untrained models
if self.input_type == "mols":
self.__build_mol_to_latent_model()
else:
self.__mol_to_latent_model = None
self.__build_latent_to_states_model()
self.__build_batch_model()
# Build data generators
self.__build_generators(x, y)
# Retrain or Test mode
else:
self.__model_name = kwargs.get("model_name")
# Load the model
self.__load(self.model_name)
if self.mode == "retrain":
# If scaling is required
if self.scaler is not None:
print("Applying scaling on input.")
x = self.scaler.transform(x)
# If PCA is required
if self.pca is not None:
print("Applying PCA on input.")
x = self.pca.transform(x)
# Build data generators
self.__build_generators(x, y)
# Build full model out of the sub-models
self.__build_model()
# Show the resulting full model
print(self.model.summary())
"""
Architecture properties.
"""
@property
def lstm_dim(self):
return self.__lstm_dim
@property
def h_activation(self):
return self.__h_activation
@property
def bn(self):
return self.__bn
@property
def bn_momentum(self):
return self.__bn_momentum
@property
def noise_std(self):
return self.__noise_std
@property
def td_dense_dim(self):
return self.__td_dense_dim
@property
def batch_size(self):
return self.__batch_size
@property
def dec_layers(self):
return self.__dec_layers
@property
def codelayer_dim(self):
return self.__codelayer_dim
@property
def steps_per_epoch(self):
return self.__steps_per_epoch
@property
def validation_steps(self):
return self.__validation_steps
@property
def input_shape(self):
return self.__input_shape
@property
def dec_dims(self):
return self.__dec_dims
@property
def dec_input_shape(self):
return self.__dec_input_shape
@property
def output_len(self):
return self.__output_len
@property
def output_dims(self):
return self.__output_dims
@property
def batch_input_length(self):
return self.__batch_input_length
@batch_input_length.setter
def batch_input_length(self, value):
self.__batch_input_length = value
self.__build_sample_model(batch_input_length=value)
"""
Models.
"""
@property
def mol_to_latent_model(self):
return self.__mol_to_latent_model
@property
def latent_to_states_model(self):
return self.__latent_to_states_model
@property
def batch_model(self):
return self.__batch_model
@property
def sample_model(self):
return self.__sample_model
@property
def multi_sample_model(self):
return self.__multi_sample_model
@property
def model(self):
return self.__model
"""
Train properties.
"""
@property
def epochs(self):
return self.__epochs
@property
def clipvalue(self):
return self.__clipvalue
@property
def lr(self):
return self.__lr
@property
def h(self):
return self.__h
@h.setter
def h(self, value):
self.__h = value
"""
Other properties.
"""
@property
def mode(self):
return self.__mode
@property
def dataset_name(self):
return self.__dataset_name
@property
def model_name(self):
return self.__model_name
@property
def input_type(self):
return self.__input_type
@property
def maxlen(self):
return self.__maxlen
@property
def charset(self):
return self.__charset
@property
def smilesvec1(self):
return self.__smilesvec1
@property
def smilesvec2(self):
return self.__smilesvec2
@property
def train_gen(self):
return self.__train_gen
@property
def valid_gen(self):
return self.__valid_gen
@property
def scaler(self):
try:
return self.__scaler
        except AttributeError:
return None
@property
def pca(self):
try:
return self.__pca
        except AttributeError:
return None
"""
Private methods.
"""
def __build_generators(self, x, y, split=0.9):
"""Build data generators to be used for (re)training.
:param x: Encoder input
:type x: list
        :param y: Decoder input for teacher forcing
:type y: list
:param split: Fraction of samples to keep for training (rest for validation), defaults to 0.9
:type split: float, optional
"""
# Sanity check
assert len(x) == len(y)
# Split dataset into train and validation sets
cut = int(split * len(x))
x_train = x[:cut]
x_valid = x[cut:]
y_train = y[:cut]
y_valid = y[cut:]
if self.input_type == "mols":
self.__train_gen = HetSmilesGenerator(
x_train,
None,
self.smilesvec1,
self.smilesvec2,
batch_size=self.batch_size,
shuffle=True,
)
self.__valid_gen = HetSmilesGenerator(
x_valid,
None,
self.smilesvec1,
self.smilesvec2,
batch_size=self.batch_size,
shuffle=True,
)
else:
self.__train_gen = DescriptorGenerator(
x_train,
y_train,
self.smilesvec1,
self.smilesvec2,
batch_size=self.batch_size,
shuffle=True,
)
self.__valid_gen = DescriptorGenerator(
x_valid,
y_valid,
self.smilesvec1,
self.smilesvec2,
batch_size=self.batch_size,
shuffle=True,
)
# Calculate number of batches per training/validation epoch
train_samples = len(x_train)
valid_samples = len(x_valid)
self.__steps_per_epoch = train_samples // self.batch_size
self.__validation_steps = valid_samples // self.batch_size
print(
"Model received %d train samples and %d validation samples."
% (train_samples, valid_samples)
)
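    # For illustration with hypothetical numbers: given 100,000 samples, split=0.9 and
    # batch_size=256, the generators receive 90,000 training and 10,000 validation samples,
    # so steps_per_epoch = 90000 // 256 = 351 and validation_steps = 10000 // 256 = 39.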
def __build_mol_to_latent_model(self):
"""Model that transforms binary molecules to their latent representation.
Only used if input is mols.
"""
# Input tensor (MANDATORY)
encoder_inputs = Input(shape=self.input_shape, name="Encoder_Inputs")
x = encoder_inputs
# The two encoder layers, number of cells are halved as Bidirectional
encoder = Bidirectional(
LSTM(
self.lstm_dim // 2,
return_sequences=True,
                return_state=True,  # Return the states at the end of the batch
name="Encoder_LSTM_1",
)
)
x, state_h, state_c, state_h_reverse, state_c_reverse = encoder(x)
if self.bn:
x = BatchNormalization(momentum=self.bn_momentum, name="BN_1")(x)
encoder2 = Bidirectional(
LSTM(
self.lstm_dim // 2,
                return_state=True,  # Return the states at the end of the batch
name="Encoder_LSTM_2",
)
)
_, state_h2, state_c2, state_h2_reverse, state_c2_reverse = encoder2(x)
# Concatenate all states of the forward and the backward LSTM layers
states = Concatenate(axis=-1, name="Concatenate_1")(
[
state_h,
state_c,
state_h2,
state_c2,
state_h_reverse,
state_c_reverse,
state_h2_reverse,
state_c2_reverse,
]
)
if self.bn:
states = BatchNormalization(momentum=self.bn_momentum, name="BN_2")(states)
# A non-linear recombination
neck_relu = Dense(
self.codelayer_dim, activation=self.h_activation, name="Codelayer_Relu"
)
neck_outputs = neck_relu(states)
if self.bn:
neck_outputs = BatchNormalization(
momentum=self.bn_momentum, name="BN_Codelayer"
)(neck_outputs)
# Add Gaussian noise to "spread" the distribution of the latent variables during training
neck_outputs = GaussianNoise(self.noise_std, name="Gaussian_Noise")(
neck_outputs
)
# Define the model
self.__mol_to_latent_model = Model(encoder_inputs, neck_outputs)
# Name it!
self.mol_to_latent_model._name = "mol_to_latent_model"
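    # Shape sketch of the encoder built above (assuming the defaults lstm_dim=256, codelayer_dim=128):
    #   (batch, maxlen, charset) --Bi-LSTM--> sequences + 4 states of size lstm_dim // 2
    #   --Bi-LSTM--> 4 more states --Concatenate--> (batch, 4 * lstm_dim) = (batch, 1024)
    #   --Dense--> (batch, codelayer_dim) --GaussianNoise--> latent vector used by the decoder.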
def __build_latent_to_states_model(self):
"""Model that constructs the initial states of the decoder from a latent molecular representation.
"""
# Input tensor (MANDATORY)
latent_input = Input(shape=(self.codelayer_dim,), name="Latent_Input")
# Initialize list of state tensors for the decoder
decoder_state_list = []
for dec_layer in range(self.dec_layers):
# The tensors for the initial states of the decoder
name = "Dense_h_" + str(dec_layer)
h_decoder = Dense(self.lstm_dim, activation="relu", name=name)(latent_input)
name = "Dense_c_" + str(dec_layer)
c_decoder = Dense(self.lstm_dim, activation="relu", name=name)(latent_input)
if self.bn:
name = "BN_h_" + str(dec_layer)
h_decoder = BatchNormalization(momentum=self.bn_momentum, name=name)(
h_decoder
)
name = "BN_c_" + str(dec_layer)
c_decoder = BatchNormalization(momentum=self.bn_momentum, name=name)(
c_decoder
)
decoder_state_list.append(h_decoder)
decoder_state_list.append(c_decoder)
# Define the model
self.__latent_to_states_model = Model(latent_input, decoder_state_list)
# Name it!
self.latent_to_states_model._name = "latent_to_states_model"
def __build_batch_model(self):
"""Model that returns a vectorized SMILES string of OHE characters.
"""
# List of input tensors to batch_model
inputs = []
        # This is the start-character-padded OHE SMILES input used for teacher forcing
decoder_inputs = Input(shape=self.dec_input_shape, name="Decoder_Inputs")
inputs.append(decoder_inputs)
# I/O tensor of the LSTM layers
x = decoder_inputs
for dec_layer in range(self.dec_layers):
name = "Decoder_State_h_" + str(dec_layer)
state_h = Input(shape=[self.lstm_dim], name=name)
inputs.append(state_h)
name = "Decoder_State_c_" + str(dec_layer)
state_c = Input(shape=[self.lstm_dim], name=name)
inputs.append(state_c)
# RNN layer
decoder_lstm = LSTM(
self.lstm_dim,
return_sequences=True,
name="Decoder_LSTM_" + str(dec_layer),
)
x = decoder_lstm(x, initial_state=[state_h, state_c])
if self.bn:
x = BatchNormalization(
momentum=self.bn_momentum, name="BN_Decoder_" + str(dec_layer)
)(x)
# Squeeze LSTM interconnections using Dense layers
if self.td_dense_dim > 0:
x = TimeDistributed(
Dense(self.td_dense_dim), name="Time_Distributed_" + str(dec_layer)
)(x)
# Final Dense layer to return soft labels (probabilities)
outputs = Dense(self.output_dims, activation="softmax", name="Dense_Decoder")(x)
# Define the batch_model
self.__batch_model = Model(inputs=inputs, outputs=[outputs])
# Name it!
self.batch_model._name = "batch_model"
def __build_model(self):
"""Full model that constitutes the complete pipeline.
"""
# IFF input is not encoded, stack the encoder (mol_to_latent_model)
if self.input_type == "mols":
# Input tensor (MANDATORY) - Same as the mol_to_latent_model input!
encoder_inputs = Input(shape=self.input_shape, name="Encoder_Inputs")
            # Input tensor (MANDATORY) - Same as the batch_model input for teacher forcing!
decoder_inputs = Input(shape=self.dec_input_shape, name="Decoder_Inputs")
# Stack the three models
# Propagate tensors through 1st model
x = self.mol_to_latent_model(encoder_inputs)
# Propagate tensors through 2nd model
x = self.latent_to_states_model(x)
            # Prepend the teacher-forcing decoder input to the state tensors before feeding the third model
x = [decoder_inputs] + x
# Propagate tensors through 3rd model
x = self.batch_model(x)
# Define full model (SMILES -> SMILES)
self.__model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=[x])
# Input is pre-encoded, no need for encoder
else:
# Input tensor (MANDATORY)
latent_input = Input(shape=(self.codelayer_dim,), name="Latent_Input")
            # Input tensor (MANDATORY) - Same as the batch_model input for teacher forcing!
decoder_inputs = Input(shape=self.dec_input_shape, name="Decoder_Inputs")
# Stack the two models
# Propagate tensors through 1st model
x = self.latent_to_states_model(latent_input)
            # Prepend the teacher-forcing decoder input to the state tensors before feeding the 2nd model
x = [decoder_inputs] + x
# Propagate tensors through 2nd model
x = self.batch_model(x)
# Define full model (latent -> SMILES)
self.__model = Model(inputs=[latent_input, decoder_inputs], outputs=[x])
def __build_sample_model(self, batch_input_length) -> dict:
"""Model that predicts a single OHE character.
This model is generated from the modified config file of the batch_model.
:param batch_input_length: Size of generated batch
:type batch_input_length: int
:return: The dictionary of the configuration
:rtype: dict
"""
self.__batch_input_length = batch_input_length
# Get the configuration of the batch_model
config = self.batch_model.get_config()
# Keep only the "Decoder_Inputs" as single input to the sample_model
config["input_layers"] = [config["input_layers"][0]]
# Find decoder states that are used as inputs in batch_model and remove them
idx_list = []
for idx, layer in enumerate(config["layers"]):
if "Decoder_State_" in layer["name"]:
idx_list.append(idx)
# Pop the layer from the layer list
# Revert indices to avoid re-arranging after deleting elements
for idx in sorted(idx_list, reverse=True):
config["layers"].pop(idx)
# Remove inbound_nodes dependencies of remaining layers on deleted ones
for layer in config["layers"]:
idx_list = []
try:
for idx, inbound_node in enumerate(layer["inbound_nodes"][0]):
if "Decoder_State_" in inbound_node[0]:
idx_list.append(idx)
# Catch the exception for first layer (Decoder_Inputs) that has empty list of inbound_nodes[0]
except:
pass
# Pop the inbound_nodes from the list
# Revert indices to avoid re-arranging
for idx in sorted(idx_list, reverse=True):
layer["inbound_nodes"][0].pop(idx)
# Change the batch_shape of input layer
config["layers"][0]["config"]["batch_input_shape"] = (
batch_input_length,
1,
self.dec_input_shape[-1],
)
# Finally, change the statefulness of the RNN layers
for layer in config["layers"]:
if "Decoder_LSTM_" in layer["name"]:
layer["config"]["stateful"] = True
# layer["config"]["return_sequences"] = True
# Define the sample_model using the modified config file
sample_model = Model.from_config(config)
# Copy the trained weights from the trained batch_model to the untrained sample_model
for layer in sample_model.layers:
# Get weights from the batch_model
weights = self.batch_model.get_layer(layer.name).get_weights()
# Set the weights to the sample_model
sample_model.get_layer(layer.name).set_weights(weights)
if batch_input_length == 1:
self.__sample_model = sample_model
elif batch_input_length > 1:
self.__multi_sample_model = sample_model
return config
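    # Note on the model produced above: with stateful decoder LSTMs and a (batch_input_length, 1,
    # charset) input, generation proceeds one character per predict() call, carrying the LSTM cell
    # states across calls. A rough sketch of how predict()/predict_batch() drive it:
    #
    #     states = self.latent_to_states_model.predict(latent)   # initial h/c states per decoder layer
    #     # ...reset_states() on every "Decoder_LSTM_i" layer with those states...
    #     o = self.sample_model.predict(onehot_start_char)        # probabilities for the 1st character
    #     o = self.sample_model.predict(onehot_sampled_char)      # probabilities for the 2nd character, etc.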
def __load(self, model_name):
"""Load a DDC object from a zip file.
:param model_name: Path to model
:type model_name: string
"""
print("Loading model.")
tstart = datetime.now()
# Temporary directory to extract the zipped information
with tempfile.TemporaryDirectory() as dirpath:
# Unzip the directory that contains the saved model(s)
with zipfile.ZipFile(model_name + ".zip", "r") as zip_ref:
zip_ref.extractall(dirpath)
# Load metadata
metadata = pickle.load(open(dirpath + "/metadata.pickle", "rb"))
# Re-load metadata
self.__dict__.update(metadata)
# Load all sub-models
try:
self.__mol_to_latent_model = load_model(
dirpath + "/mol_to_latent_model.h5"
)
except:
print("'mol_to_latent_model' not found, setting to None.")
self.__mol_to_latent_model = None
self.__latent_to_states_model = load_model(
dirpath + "/latent_to_states_model.h5"
)
self.__batch_model = load_model(dirpath + "/batch_model.h5")
# Build sample_model out of the trained batch_model
self.__build_sample_model(batch_input_length=1) # Single-output model
self.__build_sample_model(
batch_input_length=256 # could also be self.batch_size
) # Multi-output model
print("Loading finished in %i seconds." % ((datetime.now() - tstart).seconds))
"""
Public methods.
"""
def fit(
self,
epochs,
lr,
mini_epochs,
patience,
model_name,
gpus=1,
workers=1,
use_multiprocessing=False,
verbose=2,
max_queue_size=10,
clipvalue=0,
save_period=5,
checkpoint_dir="/",
lr_decay=False,
sch_epoch_to_start=500,
sch_last_epoch=999,
sch_lr_init=1e-3,
sch_lr_final=1e-6,
):
"""Fit the full model to the training data.
Supports multi-gpu training if gpus set to >1.
:param epochs: Training iterations over complete training set.
:type epochs: int
:param lr: Initial learning rate
:type lr: float
        :param mini_epochs: Number of sub-epochs a single epoch is split into, so that Keras applies callbacks more frequently
        :type mini_epochs: int
        :param patience: Number of consecutive mini_epochs without val_loss improvement before ReduceLROnPlateau lowers the learning rate
:type patience: int
:param model_name: Base name of model checkpoints
:type model_name: str
:param gpus: Number of GPUs to be used for training, defaults to 1
:type gpus: int, optional
:param workers: Keras CPU workers, defaults to 1
:type workers: int, optional
:param use_multiprocessing: Multi-CPU processing, defaults to False
:type use_multiprocessing: bool, optional
:param verbose: Keras training verbosity, defaults to 2
:type verbose: int, optional
        :param max_queue_size: Maximum number of batches held in the Keras generator queue, defaults to 10
:type max_queue_size: int, optional
:param clipvalue: Gradient clipping value, defaults to 0
:type clipvalue: int, optional
:param save_period: Checkpoint period in miniepochs, defaults to 5
:type save_period: int, optional
:param checkpoint_dir: Directory to store checkpoints in, defaults to "/"
:type checkpoint_dir: str, optional
:param lr_decay: Flag to enable exponential learning rate decay, defaults to False
:type lr_decay: bool, optional
:param sch_epoch_to_start: Miniepoch to start exponential learning rate decay, defaults to 500
:type sch_epoch_to_start: int, optional
:param sch_last_epoch: Last miniepoch of exponential learning rate decay, defaults to 999
:type sch_last_epoch: int, optional
:param sch_lr_init: Initial learning rate to start exponential learning rate decay, defaults to 1e-3
:type sch_lr_init: float, optional
:param sch_lr_final: Target learning rate value to stop decaying, defaults to 1e-6
:type sch_lr_final: float, optional
"""
# Get parameter values if specified
self.__epochs = epochs
self.__lr = lr
self.__clipvalue = clipvalue
# Optimizer
if clipvalue > 0:
print("Using gradient clipping %.2f." % clipvalue)
opt = Adam(lr=self.lr, clipvalue=self.clipvalue)
else:
opt = Adam(lr=self.lr)
checkpoint_file = (
checkpoint_dir + "%s--{epoch:02d}--{val_loss:.4f}--{lr:.7f}" % model_name
)
        # If the model has been trained before, continue appending to its existing history
        try:
            history = self.h
        # Else (untrained model), start with a blank history
        except AttributeError:
            history = {}
# Callback for saving intermediate models during training
mhcp = ModelAndHistoryCheckpoint(
filepath=checkpoint_file,
model_dict=self.__dict__,
monitor="val_loss",
verbose=1,
mode="min",
period=save_period,
history=history,
)
# Training history
self.__h = mhcp.history
if lr_decay:
lr_schedule = LearningRateSchedule(
epoch_to_start=sch_epoch_to_start,
last_epoch=sch_last_epoch,
lr_init=sch_lr_init,
lr_final=sch_lr_final,
)
lr_scheduler = LearningRateScheduler(
schedule=lr_schedule.exp_decay, verbose=1
)
callbacks = [lr_scheduler, mhcp]
else:
rlr = ReduceLROnPlateau(
monitor="val_loss",
factor=0.5,
patience=patience,
min_lr=1e-6,
verbose=1,
min_delta=1e-4,
)
callbacks = [rlr, mhcp]
# Inspect training parameters at the start of the training
self.summary()
# Parallel training on multiple GPUs
if gpus > 1:
parallel_model = multi_gpu_model(self.model, gpus=gpus)
parallel_model.compile(loss="categorical_crossentropy", optimizer=opt)
# This `fit` call will be distributed on all GPUs.
# Each GPU will process (batch_size/gpus) samples per batch.
parallel_model.fit_generator(
self.train_gen,
steps_per_epoch=self.steps_per_epoch / mini_epochs,
epochs=mini_epochs * self.epochs,
validation_data=self.valid_gen,
validation_steps=self.validation_steps / mini_epochs,
callbacks=callbacks,
max_queue_size=max_queue_size,
workers=workers,
use_multiprocessing=use_multiprocessing,
verbose=verbose,
) # 1 to show progress bar
elif gpus == 1:
self.model.compile(loss="categorical_crossentropy", optimizer=opt)
self.model.fit_generator(
self.train_gen,
steps_per_epoch=self.steps_per_epoch / mini_epochs,
epochs=mini_epochs * self.epochs,
validation_data=self.valid_gen,
validation_steps=self.validation_steps / mini_epochs,
callbacks=callbacks,
                max_queue_size=max_queue_size,
workers=workers,
use_multiprocessing=use_multiprocessing,
verbose=verbose,
) # 1 to show progress bar
# Build sample_model out of the trained batch_model
self.__build_sample_model(batch_input_length=1) # Single-output model
self.__build_sample_model(
batch_input_length=self.batch_size
) # Multi-output model
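    # Example fit() call (a sketch; the hyperparameter values and checkpoint directory are illustrative only):
    #
    #     model.fit(epochs=100,
    #               lr=1e-3,
    #               mini_epochs=10,
    #               patience=25,
    #               model_name="my_model",
    #               gpus=1,
    #               checkpoint_dir="./checkpoints/",
    #               save_period=10)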
def vectorize(self, mols_test, leftpad=True):
"""Perform One-Hot Encoding (OHE) on a binary molecule.
:param mols_test: Molecules to vectorize
:type mols_test: list
        :param leftpad: Whether to left-pad (True) or right-pad (False) the encoded sequences, defaults to True
:type leftpad: bool, optional
:return: One-Hot Encoded molecules
:rtype: list
"""
if leftpad:
return self.smilesvec1.transform(mols_test)
else:
return self.smilesvec2.transform(mols_test)
def transform(self, mols_ohe):
"""Encode a batch of OHE molecules into their latent representations.
Must be called on the output of self.vectorize().
:param mols_ohe: List of One-Hot Encoded molecules
:type mols_ohe: list
:return: Latent representation of input molecules
:rtype: list
"""
latent = self.mol_to_latent_model.predict(mols_ohe)
return latent.reshape((latent.shape[0], 1, latent.shape[1]))
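    # Example round trip for a mol-to-mol (autoencoder) model, assuming `binary_mols` is a list of
    # binary RDKit molecules prepared by the caller (illustrative sketch only):
    #
    #     mols_ohe = model.vectorize(binary_mols)          # One-Hot Encode
    #     latent = model.transform(mols_ohe)               # encode into the latent space
    #     smiles, nll = model.predict(latent[0], temp=1)   # decode one latent vector back to a SMILES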
# @timed
def predict(self, latent, temp=1, rng_seed=None):
"""Generate a single SMILES string.
The states of the RNN are set based on the latent input.
Careful, "latent" must be: the output of self.transform()
or
an array of molecular descriptors.
If temp>0, multinomial sampling is used instead of selecting
the single most probable character at each step.
If temp==1, multinomial sampling without temperature scaling is used.
:param latent: 1D Latent vector to steer the generation
:type latent: numpy.ndarray
        :param temp: Temperature of multinomial sampling (argmax if 0), defaults to 1
:type temp: int, optional
:return: The predicted SMILES string and its NLL of being sampled
        :rtype: tuple
"""
# Pass rng_seed for repeatable sampling
if rng_seed is not None:
np.random.seed(rng_seed)
# Scale inputs if model is trained on scaled data
if self.scaler is not None:
latent = self.scaler.transform(
latent.reshape(1, -1)
) # Re-shape because scaler complains
# Apply PCA to input if model is trained accordingly
if self.pca is not None:
latent = self.pca.transform(latent)
states = self.latent_to_states_model.predict(latent)
# Decode states and reset the LSTM cells with them to bias the generation towards the desired properties
for dec_layer in range(self.dec_layers):
self.sample_model.get_layer("Decoder_LSTM_" + str(dec_layer)).reset_states(
states=[states[2 * dec_layer], states[2 * dec_layer + 1]]
)
# Prepare the input char
startidx = self.smilesvec1._char_to_int[self.smilesvec1.startchar]
samplevec = np.zeros((1, 1, self.smilesvec1.dims[-1]))
samplevec[0, 0, startidx] = 1
smiles = ""
# Initialize Negative Log-Likelihood (NLL)
NLL = 0
# Loop and predict next char
for i in range(1000):
o = self.sample_model.predict(samplevec)
# Multinomial sampling with temperature scaling
if temp > 0:
temp = abs(temp) # Handle negative values
nextCharProbs = np.log(o) / temp
nextCharProbs = np.exp(nextCharProbs)
                nextCharProbs = (
                    nextCharProbs / nextCharProbs.sum() - 1e-8
                )  # Normalize, then subtract a tiny epsilon so the float64 sum stays just below 1.0 for np.random.multinomial
sampleidx = np.random.multinomial(
1, nextCharProbs.squeeze(), 1
).argmax()
# Else, select the most probable character
else:
sampleidx = np.argmax(o)
samplechar = self.smilesvec1._int_to_char[sampleidx]
if samplechar != self.smilesvec1.endchar:
# Append the new character
smiles += samplechar
samplevec = np.zeros((1, 1, self.smilesvec1.dims[-1]))
samplevec[0, 0, sampleidx] = 1
# Calculate negative log likelihood for the selected character given the sequence so far
NLL -= np.log(o[0][0][sampleidx])
else:
return smiles, NLL
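    # Temperature scaling used in predict() above, written as a formula: given softmax outputs p_i and
    # temperature T, the sampling distribution is q_i = p_i**(1/T) / sum_j(p_j**(1/T)). T < 1 sharpens
    # the distribution towards the most probable characters, T > 1 flattens it, and temp == 0 falls
    # back to plain argmax selection.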
# @timed
def predict_batch(self, latent, temp=1, rng_seed=None):
"""Generate multiple biased SMILES strings.
Careful, "latent" must be: the output of self.transform()
or
an array of molecular descriptors.
If temp>0, multinomial sampling is used instead of selecting
the single most probable character at each step.
If temp==1, multinomial sampling without temperature scaling is used.
Low temp leads to elimination of characters with low probabilities.
        predict_batch() generates batch_input_length (default==batch_size) individual SMILES
strings per call. To change that, reset batch_input_length to a new value.
:param latent: List of latent vectors
:type latent: list
        :param temp: Temperature of multinomial sampling (argmax if 0), defaults to 1
:type temp: int, optional
:return: List of predicted SMILES strings and their NLL of being sampled
        :rtype: tuple
"""
# Pass rng_seed for repeatable sampling
if rng_seed is not None:
np.random.seed(rng_seed)
if latent.shape[0] == 1:
            # Make a batch prediction by repeating the same latent vector for every sample in the batch
latent = np.ones((self.batch_input_length, self.codelayer_dim)) * latent
else:
# Make sure it is squeezed
latent = latent.squeeze()
# Scale inputs if model is trained on scaled data
if self.scaler is not None:
latent = self.scaler.transform(latent)
# Apply PCA to input if model is trained accordingly
if self.pca is not None:
latent = self.pca.transform(latent)
# Decode states and reset the LSTM cells with them, to bias the generation towards the desired properties
states = self.latent_to_states_model.predict(latent)
for dec_layer in range(self.dec_layers):
self.multi_sample_model.get_layer(
"Decoder_LSTM_" + str(dec_layer)
).reset_states(states=[states[2 * dec_layer], states[2 * dec_layer + 1]])
# Index of input char "^"
startidx = self.smilesvec1._char_to_int[self.smilesvec1.startchar]
# Vectorize the input char for all SMILES
samplevec = np.zeros((self.batch_input_length, 1, self.smilesvec1.dims[-1]))
samplevec[:, 0, startidx] = 1
# Initialize arrays to store SMILES, their NLLs and their status
smiles = np.array([""] * self.batch_input_length, dtype=object)
NLL = np.zeros((self.batch_input_length,))
finished = np.array([False] * self.batch_input_length)
# Loop and predict next char
for i in range(1000):
o = self.multi_sample_model.predict(
samplevec, batch_size=self.batch_input_length
).squeeze()
# Multinomial sampling with temperature scaling
if temp > 0:
temp = abs(temp) # No negative values
nextCharProbs = np.log(o) / temp
nextCharProbs = np.exp(nextCharProbs) # .squeeze()
                # Normalize each row, then subtract a tiny epsilon so every row sums to just below 1.0 for np.random.multinomial
                nextCharProbs = (nextCharProbs.T / nextCharProbs.sum(axis=1) - 1e-8).T
sampleidc = np.asarray(
[
np.random.multinomial(1, nextCharProb, 1).argmax()
for nextCharProb in nextCharProbs
]
)
else:
sampleidc = np.argmax(o, axis=1)
samplechars = [self.smilesvec1._int_to_char[idx] for idx in sampleidc]
for idx, samplechar in enumerate(samplechars):
if not finished[idx]:
if samplechar != self.smilesvec1.endchar:
# Append the SMILES with the next character
smiles[idx] += self.smilesvec1._int_to_char[sampleidc[idx]]
samplevec = np.zeros(
(self.batch_input_length, 1, self.smilesvec1.dims[-1])
)
# One-Hot Encode the character
# samplevec[:,0,sampleidc] = 1
for count, sampleidx in enumerate(sampleidc):
samplevec[count, 0, sampleidx] = 1
# Calculate negative log likelihood for the selected character given the sequence so far
NLL[idx] -= np.log(o[idx][sampleidc[idx]])
else:
finished[idx] = True
# print("SMILES has finished at %i" %i)
# If all SMILES are finished, i.e. the endchar "$" has been generated, stop the generation
if finished.sum() == len(finished):
return smiles, NLL
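    # Example batched generation (a sketch; `latent_vector` is assumed to be a single latent vector or
    # descriptor array with shape (1, codelayer_dim) prepared by the caller):
    #
    #     model.batch_input_length = 256                              # rebuilds the multi_sample_model
    #     smiles_batch, nll_batch = model.predict_batch(latent_vector, temp=1)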
@timed
def get_smiles_nll(self, latent, smiles_ref) -> float:
"""Back-calculate the NLL of a given SMILES string if its descriptors are used as RNN states.
:param latent: Descriptors or latent representation of smiles_ref
:type latent: list
:param smiles_ref: Given SMILES to back-calculate its NLL
:type smiles_ref: str
:return: NLL of sampling smiles_ref given its latent representation (or descriptors)
:rtype: float
"""
# Scale inputs if model is trained on scaled data
if self.scaler is not None:
latent = self.scaler.transform(
latent.reshape(1, -1)
) # Re-shape because scaler complains
# Apply PCA to input if model is trained accordingly
if self.pca is not None:
latent = self.pca.transform(latent)
states = self.latent_to_states_model.predict(latent)
# Decode states and reset the LSTM cells with them to bias the generation towards the desired properties
for dec_layer in range(self.dec_layers):
self.sample_model.get_layer("Decoder_LSTM_" + str(dec_layer)).reset_states(
states=[states[2 * dec_layer], states[2 * dec_layer + 1]]
)
# Prepare the input char
startidx = self.smilesvec1._char_to_int[self.smilesvec1.startchar]
samplevec = np.zeros((1, 1, self.smilesvec1.dims[-1]))
samplevec[0, 0, startidx] = 1
# Initialize Negative Log-Likelihood (NLL)
NLL = 0
# Loop and predict next char
for i in range(1000):
o = self.sample_model.predict(samplevec)
samplechar = smiles_ref[i]
sampleidx = self.smilesvec1._char_to_int[samplechar]
if i != len(smiles_ref) - 1:
samplevec = np.zeros((1, 1, self.smilesvec1.dims[-1]))
samplevec[0, 0, sampleidx] = 1
# Calculate negative log likelihood for the selected character given the sequence so far
NLL -= np.log(o[0][0][sampleidx])
else:
return NLL
@timed
def get_smiles_nll_batch(self, latent, smiles_ref) -> list:
"""Back-calculate the individual NLL for a batch of known SMILES strings.
Batch size is equal to self.batch_input_length so reset it if needed.
:param latent: List of latent representations (or descriptors)
:type latent: list
:param smiles_ref: List of given SMILES to back-calculate their NLL
:type smiles_ref: list
:return: List of NLL of sampling smiles_ref given their latent representations (or descriptors)
:rtype: list
"""
assert (
len(latent) <= self.batch_input_length
), "Input length must be less than or equal to batch_input_length."
# Scale inputs if model is trained on scaled data
if self.scaler is not None:
latent = self.scaler.transform(latent)
# Apply PCA to input if model is trained accordingly
if self.pca is not None:
latent = self.pca.transform(latent)
# Decode states and reset the LSTM cells with them, to bias the generation towards the desired properties
states = self.latent_to_states_model.predict(latent)
for dec_layer in range(self.dec_layers):
self.multi_sample_model.get_layer(
"Decoder_LSTM_" + str(dec_layer)
).reset_states(states=[states[2 * dec_layer], states[2 * dec_layer + 1]])
# Index of input char "^"
startidx = self.smilesvec1._char_to_int[self.smilesvec1.startchar]
# Vectorize the input char for all SMILES
samplevec = np.zeros((self.batch_input_length, 1, self.smilesvec1.dims[-1]))
samplevec[:, 0, startidx] = 1
# Initialize arrays to store NLLs and flag if a SMILES is finished
NLL = np.zeros((self.batch_input_length,))
finished = np.array([False] * self.batch_input_length)
# Loop and predict next char
for i in range(1000):
o = self.multi_sample_model.predict(
samplevec, batch_size=self.batch_input_length
).squeeze()
samplechars = []
for smiles in smiles_ref:
try:
samplechars.append(smiles[i])
except:
# This is a finished SMILES, so "i" exceeds dimensions
samplechars.append("$")
sampleidc = np.asarray(
[self.smilesvec1._char_to_int[char] for char in samplechars]
)
for idx, samplechar in enumerate(samplechars):
if not finished[idx]:
if i != len(smiles_ref[idx]) - 1:
samplevec = np.zeros(
(self.batch_input_length, 1, self.smilesvec1.dims[-1])
)
# One-Hot Encode the character
for count, sampleidx in enumerate(sampleidc):
samplevec[count, 0, sampleidx] = 1
# Calculate negative log likelihood for the selected character given the sequence so far
NLL[idx] -= np.log(o[idx][sampleidc[idx]])
else:
finished[idx] = True
# If all SMILES are finished, i.e. the endchar "$" has been generated, stop the generation
if finished.sum() == len(finished):
return NLL
def summary(self):
"""Echo the training configuration for inspection.
"""
print(
"\nModel trained with dataset %s that has maxlen=%d and charset=%s for %d epochs."
% (self.dataset_name, self.maxlen, self.charset, self.epochs)
)
print(
"noise_std: %.6f, lstm_dim: %d, dec_layers: %d, td_dense_dim: %d, batch_size: %d, codelayer_dim: %d, lr: %.6f."
% (
self.noise_std,
self.lstm_dim,
self.dec_layers,
self.td_dense_dim,
self.batch_size,
self.codelayer_dim,
self.lr,
)
)
def get_graphs(self):
"""Export the graphs of the model and its submodels to png files.
Requires "pydot" and "graphviz" to be installed (pip install graphviz && pip install pydot).
"""
        try:
            from tensorflow.keras.utils import plot_model
# from IPython.display import SVG
plot_model(self.model, to_file="model.png")
plot_model(
self.latent_to_states_model, to_file="latent_to_states_model.png"
)
plot_model(self.batch_model, to_file="batch_model.png")
if self.mol_to_latent_model is not None:
plot_model(self.mol_to_latent_model, to_file="mol_to_latent_model.png")
print("Models exported to png files.")
except:
print("Check pydot and graphviz installation.")
@timed
def save(self, model_name):
"""Save model in a zip file.
:param model_name: Path to save model in
:type model_name: str
"""
with tempfile.TemporaryDirectory() as dirpath:
# Save the Keras models
if self.mol_to_latent_model is not None:
self.mol_to_latent_model.save(dirpath + "/mol_to_latent_model.h5")
self.latent_to_states_model.save(dirpath + "/latent_to_states_model.h5")
self.batch_model.save(dirpath + "/batch_model.h5")
# Exclude unpicklable and unwanted attributes
excl_attr = [
"_DDC__mode",
"_DDC__train_gen",
"_DDC__valid_gen",
"_DDC__mol_to_latent_model",
"_DDC__latent_to_states_model",
"_DDC__batch_model",
"_DDC__sample_model",
"_DDC__multi_sample_model",
"_DDC__model",
]
# Cannot deepcopy self.__dict__ because of Keras' thread lock so this is
# bypassed by popping and re-inserting the unpicklable attributes
to_add = {}
# Remove unpicklable attributes
for attr in excl_attr:
to_add[attr] = self.__dict__.pop(attr, None)
# Pickle metadata, i.e. almost everything but the Keras models and generators
pickle.dump(self.__dict__, open(dirpath + "/metadata.pickle", "wb"))
# Zip directory with its contents
shutil.make_archive(model_name, "zip", dirpath)
# Finally, re-load the popped elements for the model to be usable
for attr in excl_attr:
self.__dict__[attr] = to_add[attr]
print("Model saved.")