--- /dev/null
+++ b/ddc_pub/ddc_v3.py
@@ -0,0 +1,1461 @@
+import os
+os.environ[
+    "TF_CPP_MIN_LOG_LEVEL"
+] = "3"  # Suppress TensorFlow warnings while loading the model
+
+import shutil
+import zipfile
+import tempfile
+import pickle
+import numpy as np
+from datetime import datetime
+from functools import wraps
+
+from tensorflow.keras.layers import (
+    Input,
+    Concatenate,
+    Dense,
+    Flatten,
+    RepeatVector,
+    TimeDistributed,
+    Bidirectional,
+    GaussianNoise,
+    BatchNormalization,
+)
+from tensorflow.compat.v1.keras.layers import (
+    CuDNNLSTM as LSTM,
+)  # Faster drop-in replacement for LSTM using CuDNN on a GPU with the TF backend
+from tensorflow.keras.models import Model, load_model
+from tensorflow.keras.optimizers import Adam
+from tensorflow.keras.callbacks import ReduceLROnPlateau, LearningRateScheduler
+from tensorflow.keras.utils import multi_gpu_model, plot_model
+
+from sklearn.preprocessing import StandardScaler  # For the descriptors
+from sklearn.decomposition import PCA  # For the descriptors
+
+# Custom dependencies
+from .vectorizers import SmilesVectorizer
+from .generators import CodeGenerator as DescriptorGenerator
+from .generators import HetSmilesGenerator
+from .custom_callbacks import ModelAndHistoryCheckpoint, LearningRateSchedule
+
+
+def timed(func):
+    """Timer decorator to benchmark functions."""
+
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+        tstart = datetime.now()
+        result = func(*args, **kwargs)
+        # total_seconds() returns the full elapsed time; .microseconds would only
+        # return the sub-second component and under-report anything over 1 second
+        elapsed = (datetime.now() - tstart).total_seconds()
+        print("Elapsed time: %.3f seconds." % elapsed)
+        return result
+
+    return wrapper
+
+
+class DDC:
+    def __init__(self, **kwargs):
+        """Initialize a DDC object from scratch or from a trained configuration.
+        All binary mols are converted to SMILES strings internally, using the vectorizers.
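+        The mode (train, retrain or test) is inferred from the supplied arguments:
+        x and y without a model_name train a model from scratch, x and y plus a
+        model_name re-train a saved model, and a model_name alone loads a saved
+        model for testing.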
+
+        # Examples of __init__ usage
+        To *train* a blank model with encoder (autoencoder):
+            model = ddc.DDC(x = mols,
+                            y = mols,
+                            scaling = True,
+                            pca = True,
+                            dataset_info = info,
+                            noise_std = 0.1,
+                            lstm_dim = 256,
+                            dec_layers = 3,
+                            td_dense_dim = 0,
+                            batch_size = 128,
+                            codelayer_dim = 128)
+
+        To *train* a blank model without encoder:
+            model = ddc.DDC(x = descriptors,
+                            y = mols,
+                            scaling = True,
+                            pca = True,
+                            dataset_info = info,
+                            noise_std = 0.1,
+                            lstm_dim = 256,
+                            dec_layers = 3,
+                            td_dense_dim = 0,
+                            batch_size = 128)
+
+        To *re-train* a saved model with encoder (autoencoder):
+            model = ddc.DDC(x = mols,
+                            y = mols,
+                            model_name = saved_model_name)
+
+        To *re-train* a saved model without encoder:
+            model = ddc.DDC(x = descriptors,
+                            y = mols,
+                            model_name = saved_model_name)
+
+        To *test* a saved model:
+            model = ddc.DDC(model_name = saved_model_name)
+
+        :param x: Encoder input
+        :type x: list or numpy.ndarray
+        :param y: Decoder input for teacher's forcing
+        :type y: list or numpy.ndarray
+        :param scaling: Flag to scale descriptor inputs, defaults to `False`
+        :type scaling: boolean
+        :param pca: Flag to apply PCA on descriptor inputs, defaults to `False`
+        :type pca: boolean
+        :param model_name: Filename of model to load
+        :type model_name: str
+        :param dataset_info: Metadata about dataset
+        :type dataset_info: dict
+        :param noise_std: Standard deviation of noise in the latent space, defaults to 0.01
+        :type noise_std: float
+        :param lstm_dim: Number of LSTM units in the encoder/decoder layers, defaults to 256
+        :type lstm_dim: int
+        :param dec_layers: Number of decoder layers, defaults to 2
+        :type dec_layers: int
+        :param td_dense_dim: Number of intermediate Dense units to squeeze LSTM outputs, defaults to 0
+        :type td_dense_dim: int
+        :param batch_size: Batch size to train with, defaults to 256
+        :type batch_size: int
+        :param codelayer_dim: Dimensionality of latent space
+        :type codelayer_dim: int
+        :param bn: Flag to enable batch normalization, defaults to `True`
+        :type bn: boolean
+        :param bn_momentum: Momentum value to be used in batch normalization, defaults to 0.9
+        :type bn_momentum: float
+        """
+
+        # Identify the mode to start the model in
+        if "x" in kwargs and "y" in kwargs:
+            x = kwargs.get("x")
+            y = kwargs.get("y")
+            if "model_name" not in kwargs:
+                self.__mode = "train"
+            else:
+                self.__mode = "retrain"
+        elif "model_name" in kwargs:
+            self.__mode = "test"
+        else:
+            raise NameError("Cannot infer mode from arguments.")
+
+        print("Initializing model in %s mode." % self.__mode)
+
+        if self.mode == "train":
+            # Infer input type from the type of x
+            if isinstance(x[0], np.bytes_):
+                print("Input type is 'binary mols'.")
+                self.__input_type = "mols"  # binary RDKit mols
+            else:
+                print("Input type is 'molecular descriptors'.")
+                self.__input_type = "descriptors"  # other molecular descriptors
+
+            # If scaling is required
+            if kwargs.get("scaling", False) is True:
+                # Normalize the input
+                print("Applying scaling on input.")
+                self.__scaler = StandardScaler()
+                x = self.__scaler.fit_transform(x)
+            else:
+                self.__scaler = None
+
+            # If PCA is required
+            if kwargs.get("pca", False) is True:
+                print("Applying PCA on input.")
+                self.__pca = PCA(
+                    n_components=x.shape[1]
+                )  # n_components=n_features for now
+                x = self.__pca.fit_transform(x)
+            else:
+                self.__pca = None
+
+            self.__maxlen = (
+                kwargs.get("dataset_info")["maxlen"] + 10
+            )  # Extend maxlen to avoid breaks in training
+            self.__charset = kwargs.get("dataset_info")["charset"]
+            self.__dataset_name = kwargs.get("dataset_info")["name"]
+            self.__lstm_dim = kwargs.get("lstm_dim", 256)
+            self.__h_activation = kwargs.get("h_activation", "relu")
+            self.__bn = kwargs.get("bn", True)
+            self.__bn_momentum = kwargs.get("bn_momentum", 0.9)
+            self.__noise_std = kwargs.get("noise_std", 0.01)
+            self.__td_dense_dim = kwargs.get(
+                "td_dense_dim", 0
+            )  # >0 squeezes RNN connections with Dense sandwiches
+            self.__batch_size = kwargs.get("batch_size", 256)
+            self.__dec_layers = kwargs.get("dec_layers", 2)
+
+            if self.input_type == "descriptors":
+                self.__codelayer_dim = x.shape[1]  # features
+                if "codelayer_dim" in kwargs:
+                    print(
+                        "Ignoring requested codelayer_dim because it is inferred from the cardinality of the descriptors."
+                    )
+            else:
+                self.__codelayer_dim = kwargs.get("codelayer_dim", 128)
+
+            # Create the left/right-padding vectorizers
+            self.__smilesvec1 = SmilesVectorizer(
+                canonical=False,
+                augment=True,
+                maxlength=self.maxlen,
+                charset=self.charset,
+                binary=True,
+            )
+
+            self.__smilesvec2 = SmilesVectorizer(
+                canonical=False,
+                augment=True,
+                maxlength=self.maxlen,
+                charset=self.charset,
+                binary=True,
+                leftpad=False,
+            )
+
+            # self.train_gen.next() #This line is needed to set train_gen.dims (to be fixed in HetSmilesGenerator)
+            self.__input_shape = self.smilesvec1.dims
+            self.__dec_dims = list(self.smilesvec1.dims)
+            self.__dec_dims[0] = self.dec_dims[0] - 1
+            self.__dec_input_shape = self.dec_dims
+            self.__output_len = self.smilesvec1.dims[0] - 1
+            self.__output_dims = self.smilesvec1.dims[-1]
+
+            # Build all sub-models as untrained models
+            if self.input_type == "mols":
+                self.__build_mol_to_latent_model()
+            else:
+                self.__mol_to_latent_model = None
+
+            self.__build_latent_to_states_model()
+            self.__build_batch_model()
+
+            # Build data generators
+            self.__build_generators(x, y)
+
+        # Retrain or Test mode
+        else:
+            self.__model_name = kwargs.get("model_name")
+
+            # Load the model
+            self.__load(self.model_name)
+
+            if self.mode == "retrain":
+                # If scaling is required
+                if self.scaler is not None:
+                    print("Applying scaling on input.")
+                    x = self.scaler.transform(x)
+
+                # If PCA is required
+                if self.pca is not None:
+                    print("Applying PCA on input.")
+                    x = self.pca.transform(x)
+
+                # Build data generators
+                self.__build_generators(x, y)
+
+        # Build full model out of the sub-models
+        self.__build_model()
+
+        # Show the resulting full model; summary() prints by itself,
+        # so wrapping it in print() would also echo a spurious "None"
+        self.model.summary()
+
+    """
+    Architecture properties.
+ """ + + @property + def lstm_dim(self): + return self.__lstm_dim + + @property + def h_activation(self): + return self.__h_activation + + @property + def bn(self): + return self.__bn + + @property + def bn_momentum(self): + return self.__bn_momentum + + @property + def noise_std(self): + return self.__noise_std + + @property + def td_dense_dim(self): + return self.__td_dense_dim + + @property + def batch_size(self): + return self.__batch_size + + @property + def dec_layers(self): + return self.__dec_layers + + @property + def codelayer_dim(self): + return self.__codelayer_dim + + @property + def steps_per_epoch(self): + return self.__steps_per_epoch + + @property + def validation_steps(self): + return self.__validation_steps + + @property + def input_shape(self): + return self.__input_shape + + @property + def dec_dims(self): + return self.__dec_dims + + @property + def dec_input_shape(self): + return self.__dec_input_shape + + @property + def output_len(self): + return self.__output_len + + @property + def output_dims(self): + return self.__output_dims + + @property + def batch_input_length(self): + return self.__batch_input_length + + @batch_input_length.setter + def batch_input_length(self, value): + self.__batch_input_length = value + self.__build_sample_model(batch_input_length=value) + + """ + Models. + """ + + @property + def mol_to_latent_model(self): + return self.__mol_to_latent_model + + @property + def latent_to_states_model(self): + return self.__latent_to_states_model + + @property + def batch_model(self): + return self.__batch_model + + @property + def sample_model(self): + return self.__sample_model + + @property + def multi_sample_model(self): + return self.__multi_sample_model + + @property + def model(self): + return self.__model + + """ + Train properties. + """ + + @property + def epochs(self): + return self.__epochs + + @property + def clipvalue(self): + return self.__clipvalue + + @property + def lr(self): + return self.__lr + + @property + def h(self): + return self.__h + + @h.setter + def h(self, value): + self.__h = value + + """ + Other properties. + """ + + @property + def mode(self): + return self.__mode + + @property + def dataset_name(self): + return self.__dataset_name + + @property + def model_name(self): + return self.__model_name + + @property + def input_type(self): + return self.__input_type + + @property + def maxlen(self): + return self.__maxlen + + @property + def charset(self): + return self.__charset + + @property + def smilesvec1(self): + return self.__smilesvec1 + + @property + def smilesvec2(self): + return self.__smilesvec2 + + @property + def train_gen(self): + return self.__train_gen + + @property + def valid_gen(self): + return self.__valid_gen + + @property + def scaler(self): + try: + return self.__scaler + except: + return None + + @property + def pca(self): + try: + return self.__pca + except: + return None + + """ + Private methods. + """ + + def __build_generators(self, x, y, split=0.9): + """Build data generators to be used for (re)training. 
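+        The split is positional: the first `split` fraction of the samples is used
+        for training and the remainder for validation, so shuffle the dataset
+        beforehand if it is ordered.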
+
+        :param x: Encoder input
+        :type x: list
+        :param y: Decoder input for teacher's forcing
+        :type y: list
+        :param split: Fraction of samples to keep for training (rest for validation), defaults to 0.9
+        :type split: float, optional
+        """
+
+        # Sanity check
+        assert len(x) == len(y)
+
+        # Split dataset into train and validation sets
+        cut = int(split * len(x))
+        x_train = x[:cut]
+        x_valid = x[cut:]
+        y_train = y[:cut]
+        y_valid = y[cut:]
+
+        if self.input_type == "mols":
+            self.__train_gen = HetSmilesGenerator(
+                x_train,
+                None,
+                self.smilesvec1,
+                self.smilesvec2,
+                batch_size=self.batch_size,
+                shuffle=True,
+            )
+
+            self.__valid_gen = HetSmilesGenerator(
+                x_valid,
+                None,
+                self.smilesvec1,
+                self.smilesvec2,
+                batch_size=self.batch_size,
+                shuffle=True,
+            )
+
+        else:
+            self.__train_gen = DescriptorGenerator(
+                x_train,
+                y_train,
+                self.smilesvec1,
+                self.smilesvec2,
+                batch_size=self.batch_size,
+                shuffle=True,
+            )
+
+            self.__valid_gen = DescriptorGenerator(
+                x_valid,
+                y_valid,
+                self.smilesvec1,
+                self.smilesvec2,
+                batch_size=self.batch_size,
+                shuffle=True,
+            )
+
+        # Calculate number of batches per training/validation epoch
+        train_samples = len(x_train)
+        valid_samples = len(x_valid)
+        self.__steps_per_epoch = train_samples // self.batch_size
+        self.__validation_steps = valid_samples // self.batch_size
+
+        print(
+            "Model received %d train samples and %d validation samples."
+            % (train_samples, valid_samples)
+        )
+
+    def __build_mol_to_latent_model(self):
+        """Model that transforms binary molecules to their latent representation.
+        Only used if input is mols.
+        """
+
+        # Input tensor (MANDATORY)
+        encoder_inputs = Input(shape=self.input_shape, name="Encoder_Inputs")
+
+        x = encoder_inputs
+
+        # The two encoder layers; the unit count is halved because each layer is Bidirectional
+        encoder = Bidirectional(
+            LSTM(
+                self.lstm_dim // 2,
+                return_sequences=True,
+                return_state=True,  # Return the states at the end of the batch
+                name="Encoder_LSTM_1",
+            )
+        )
+
+        x, state_h, state_c, state_h_reverse, state_c_reverse = encoder(x)
+
+        if self.bn:
+            x = BatchNormalization(momentum=self.bn_momentum, name="BN_1")(x)
+
+        encoder2 = Bidirectional(
+            LSTM(
+                self.lstm_dim // 2,
+                return_state=True,  # Return the states at the end of the batch
+                name="Encoder_LSTM_2",
+            )
+        )
+
+        _, state_h2, state_c2, state_h2_reverse, state_c2_reverse = encoder2(x)
+
+        # Concatenate all states of the forward and the backward LSTM layers
+        states = Concatenate(axis=-1, name="Concatenate_1")(
+            [
+                state_h,
+                state_c,
+                state_h2,
+                state_c2,
+                state_h_reverse,
+                state_c_reverse,
+                state_h2_reverse,
+                state_c2_reverse,
+            ]
+        )
+
+        if self.bn:
+            states = BatchNormalization(momentum=self.bn_momentum, name="BN_2")(states)
+
+        # A non-linear recombination
+        neck_relu = Dense(
+            self.codelayer_dim, activation=self.h_activation, name="Codelayer_Relu"
+        )
+        neck_outputs = neck_relu(states)
+
+        if self.bn:
+            neck_outputs = BatchNormalization(
+                momentum=self.bn_momentum, name="BN_Codelayer"
+            )(neck_outputs)
+
+        # Add Gaussian noise to "spread" the distribution of the latent variables during training
+        neck_outputs = GaussianNoise(self.noise_std, name="Gaussian_Noise")(
+            neck_outputs
+        )
+
+        # Define the model
+        self.__mol_to_latent_model = Model(encoder_inputs, neck_outputs)
+
+        # Name it!
+        self.mol_to_latent_model._name = "mol_to_latent_model"
+
+    def __build_latent_to_states_model(self):
+        """Model that constructs the initial states of the decoder from a latent molecular representation.
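+        For each decoder layer, two Dense layers map the latent vector to the
+        initial hidden (h) and cell (c) states of the corresponding decoder LSTM.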
+ """ + + # Input tensor (MANDATORY) + latent_input = Input(shape=(self.codelayer_dim,), name="Latent_Input") + + # Initialize list of state tensors for the decoder + decoder_state_list = [] + + for dec_layer in range(self.dec_layers): + + # The tensors for the initial states of the decoder + name = "Dense_h_" + str(dec_layer) + h_decoder = Dense(self.lstm_dim, activation="relu", name=name)(latent_input) + + name = "Dense_c_" + str(dec_layer) + c_decoder = Dense(self.lstm_dim, activation="relu", name=name)(latent_input) + + if self.bn: + name = "BN_h_" + str(dec_layer) + h_decoder = BatchNormalization(momentum=self.bn_momentum, name=name)( + h_decoder + ) + + name = "BN_c_" + str(dec_layer) + c_decoder = BatchNormalization(momentum=self.bn_momentum, name=name)( + c_decoder + ) + + decoder_state_list.append(h_decoder) + decoder_state_list.append(c_decoder) + + # Define the model + self.__latent_to_states_model = Model(latent_input, decoder_state_list) + + # Name it! + self.latent_to_states_model._name = "latent_to_states_model" + + def __build_batch_model(self): + """Model that returns a vectorized SMILES string of OHE characters. + """ + + # List of input tensors to batch_model + inputs = [] + + # This is the start character padded OHE smiles for teacher forcing + decoder_inputs = Input(shape=self.dec_input_shape, name="Decoder_Inputs") + inputs.append(decoder_inputs) + + # I/O tensor of the LSTM layers + x = decoder_inputs + + for dec_layer in range(self.dec_layers): + name = "Decoder_State_h_" + str(dec_layer) + state_h = Input(shape=[self.lstm_dim], name=name) + inputs.append(state_h) + + name = "Decoder_State_c_" + str(dec_layer) + state_c = Input(shape=[self.lstm_dim], name=name) + inputs.append(state_c) + + # RNN layer + decoder_lstm = LSTM( + self.lstm_dim, + return_sequences=True, + name="Decoder_LSTM_" + str(dec_layer), + ) + + x = decoder_lstm(x, initial_state=[state_h, state_c]) + + if self.bn: + x = BatchNormalization( + momentum=self.bn_momentum, name="BN_Decoder_" + str(dec_layer) + )(x) + + # Squeeze LSTM interconnections using Dense layers + if self.td_dense_dim > 0: + x = TimeDistributed( + Dense(self.td_dense_dim), name="Time_Distributed_" + str(dec_layer) + )(x) + + # Final Dense layer to return soft labels (probabilities) + outputs = Dense(self.output_dims, activation="softmax", name="Dense_Decoder")(x) + + # Define the batch_model + self.__batch_model = Model(inputs=inputs, outputs=[outputs]) + + # Name it! + self.batch_model._name = "batch_model" + + def __build_model(self): + """Full model that constitutes the complete pipeline. + """ + + # IFF input is not encoded, stack the encoder (mol_to_latent_model) + if self.input_type == "mols": + # Input tensor (MANDATORY) - Same as the mol_to_latent_model input! + encoder_inputs = Input(shape=self.input_shape, name="Encoder_Inputs") + # Input tensor (MANDATORY) - Same as the batch_model input for teacher's forcing! 
+ decoder_inputs = Input(shape=self.dec_input_shape, name="Decoder_Inputs") + + # Stack the three models + # Propagate tensors through 1st model + x = self.mol_to_latent_model(encoder_inputs) + # Propagate tensors through 2nd model + x = self.latent_to_states_model(x) + # Append the first input of the third model to be the one for teacher's forcing + x = [decoder_inputs] + x + # Propagate tensors through 3rd model + x = self.batch_model(x) + + # Define full model (SMILES -> SMILES) + self.__model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=[x]) + + # Input is pre-encoded, no need for encoder + else: + # Input tensor (MANDATORY) + latent_input = Input(shape=(self.codelayer_dim,), name="Latent_Input") + # Input tensor (MANDATORY) - Same as the batch_model input for teacher's forcing! + decoder_inputs = Input(shape=self.dec_input_shape, name="Decoder_Inputs") + + # Stack the two models + # Propagate tensors through 1st model + x = self.latent_to_states_model(latent_input) + # Append the first input of the 2nd model to be the one for teacher's forcing + x = [decoder_inputs] + x + # Propagate tensors through 2nd model + x = self.batch_model(x) + + # Define full model (latent -> SMILES) + self.__model = Model(inputs=[latent_input, decoder_inputs], outputs=[x]) + + def __build_sample_model(self, batch_input_length) -> dict: + """Model that predicts a single OHE character. + This model is generated from the modified config file of the batch_model. + + :param batch_input_length: Size of generated batch + :type batch_input_length: int + :return: The dictionary of the configuration + :rtype: dict + """ + + self.__batch_input_length = batch_input_length + + # Get the configuration of the batch_model + config = self.batch_model.get_config() + + # Keep only the "Decoder_Inputs" as single input to the sample_model + config["input_layers"] = [config["input_layers"][0]] + + # Find decoder states that are used as inputs in batch_model and remove them + idx_list = [] + for idx, layer in enumerate(config["layers"]): + + if "Decoder_State_" in layer["name"]: + idx_list.append(idx) + + # Pop the layer from the layer list + # Revert indices to avoid re-arranging after deleting elements + for idx in sorted(idx_list, reverse=True): + config["layers"].pop(idx) + + # Remove inbound_nodes dependencies of remaining layers on deleted ones + for layer in config["layers"]: + idx_list = [] + + try: + for idx, inbound_node in enumerate(layer["inbound_nodes"][0]): + if "Decoder_State_" in inbound_node[0]: + idx_list.append(idx) + # Catch the exception for first layer (Decoder_Inputs) that has empty list of inbound_nodes[0] + except: + pass + + # Pop the inbound_nodes from the list + # Revert indices to avoid re-arranging + for idx in sorted(idx_list, reverse=True): + layer["inbound_nodes"][0].pop(idx) + + # Change the batch_shape of input layer + config["layers"][0]["config"]["batch_input_shape"] = ( + batch_input_length, + 1, + self.dec_input_shape[-1], + ) + + # Finally, change the statefulness of the RNN layers + for layer in config["layers"]: + if "Decoder_LSTM_" in layer["name"]: + layer["config"]["stateful"] = True + # layer["config"]["return_sequences"] = True + + # Define the sample_model using the modified config file + sample_model = Model.from_config(config) + + # Copy the trained weights from the trained batch_model to the untrained sample_model + for layer in sample_model.layers: + # Get weights from the batch_model + weights = self.batch_model.get_layer(layer.name).get_weights() + # Set the 
weights to the sample_model + sample_model.get_layer(layer.name).set_weights(weights) + + if batch_input_length == 1: + self.__sample_model = sample_model + + elif batch_input_length > 1: + self.__multi_sample_model = sample_model + + return config + + def __load(self, model_name): + """Load a DDC object from a zip file. + + :param model_name: Path to model + :type model_name: string + """ + + print("Loading model.") + tstart = datetime.now() + + # Temporary directory to extract the zipped information + with tempfile.TemporaryDirectory() as dirpath: + + # Unzip the directory that contains the saved model(s) + with zipfile.ZipFile(model_name + ".zip", "r") as zip_ref: + zip_ref.extractall(dirpath) + + # Load metadata + metadata = pickle.load(open(dirpath + "/metadata.pickle", "rb")) + + # Re-load metadata + self.__dict__.update(metadata) + + # Load all sub-models + try: + self.__mol_to_latent_model = load_model( + dirpath + "/mol_to_latent_model.h5" + ) + except: + print("'mol_to_latent_model' not found, setting to None.") + self.__mol_to_latent_model = None + + self.__latent_to_states_model = load_model( + dirpath + "/latent_to_states_model.h5" + ) + self.__batch_model = load_model(dirpath + "/batch_model.h5") + + # Build sample_model out of the trained batch_model + self.__build_sample_model(batch_input_length=1) # Single-output model + self.__build_sample_model( + batch_input_length=256 # could also be self.batch_size + ) # Multi-output model + + print("Loading finished in %i seconds." % ((datetime.now() - tstart).seconds)) + + """ + Public methods. + """ + + def fit( + self, + epochs, + lr, + mini_epochs, + patience, + model_name, + gpus=1, + workers=1, + use_multiprocessing=False, + verbose=2, + max_queue_size=10, + clipvalue=0, + save_period=5, + checkpoint_dir="/", + lr_decay=False, + sch_epoch_to_start=500, + sch_last_epoch=999, + sch_lr_init=1e-3, + sch_lr_final=1e-6, + ): + + """Fit the full model to the training data. + Supports multi-gpu training if gpus set to >1. + + :param epochs: Training iterations over complete training set. 
+        :type epochs: int
+        :param lr: Initial learning rate
+        :type lr: float
+        :param mini_epochs: Subdivisions of a single epoch so that Keras applies callbacks more frequently
+        :type mini_epochs: int
+        :param patience: Number of consecutive mini-epochs of stagnating val_loss to tolerate before ReduceLROnPlateau lowers the learning rate
+        :type patience: int
+        :param model_name: Base name of model checkpoints
+        :type model_name: str
+        :param gpus: Number of GPUs to be used for training, defaults to 1
+        :type gpus: int, optional
+        :param workers: Keras CPU workers, defaults to 1
+        :type workers: int, optional
+        :param use_multiprocessing: Multi-CPU processing, defaults to False
+        :type use_multiprocessing: bool, optional
+        :param verbose: Keras training verbosity, defaults to 2
+        :type verbose: int, optional
+        :param max_queue_size: Keras generator max number of fetched samples, defaults to 10
+        :type max_queue_size: int, optional
+        :param clipvalue: Gradient clipping value (0 disables clipping), defaults to 0
+        :type clipvalue: int, optional
+        :param save_period: Checkpoint period in mini-epochs, defaults to 5
+        :type save_period: int, optional
+        :param checkpoint_dir: Directory to store checkpoints in, defaults to "/"
+        :type checkpoint_dir: str, optional
+        :param lr_decay: Flag to enable exponential learning rate decay, defaults to False
+        :type lr_decay: bool, optional
+        :param sch_epoch_to_start: Mini-epoch to start exponential learning rate decay, defaults to 500
+        :type sch_epoch_to_start: int, optional
+        :param sch_last_epoch: Last mini-epoch of exponential learning rate decay, defaults to 999
+        :type sch_last_epoch: int, optional
+        :param sch_lr_init: Initial learning rate of exponential learning rate decay, defaults to 1e-3
+        :type sch_lr_init: float, optional
+        :param sch_lr_final: Target learning rate at which decaying stops, defaults to 1e-6
+        :type sch_lr_final: float, optional
+        """
+
+        # Store the training parameters
+        self.__epochs = epochs
+        self.__lr = lr
+        self.__clipvalue = clipvalue
+
+        # Optimizer
+        if clipvalue > 0:
+            print("Using gradient clipping %.2f." % clipvalue)
+            opt = Adam(lr=self.lr, clipvalue=self.clipvalue)
+
+        else:
+            opt = Adam(lr=self.lr)
+
+        checkpoint_file = (
+            checkpoint_dir + "%s--{epoch:02d}--{val_loss:.4f}--{lr:.7f}" % model_name
+        )
+
+        # Append to the existing history when re-training;
+        # an untrained model has no history yet, so start blank
+        try:
+            history = self.h
+        except AttributeError:
+            history = {}
+
+        # Callback for saving intermediate models during training
+        mhcp = ModelAndHistoryCheckpoint(
+            filepath=checkpoint_file,
+            model_dict=self.__dict__,
+            monitor="val_loss",
+            verbose=1,
+            mode="min",
+            period=save_period,
+            history=history,
+        )
+        # Training history
+        self.__h = mhcp.history
+
+        if lr_decay:
+            lr_schedule = LearningRateSchedule(
+                epoch_to_start=sch_epoch_to_start,
+                last_epoch=sch_last_epoch,
+                lr_init=sch_lr_init,
+                lr_final=sch_lr_final,
+            )
+
+            lr_scheduler = LearningRateScheduler(
+                schedule=lr_schedule.exp_decay, verbose=1
+            )
+
+            callbacks = [lr_scheduler, mhcp]
+
+        else:
+            rlr = ReduceLROnPlateau(
+                monitor="val_loss",
+                factor=0.5,
+                patience=patience,
+                min_lr=1e-6,
+                verbose=1,
+                min_delta=1e-4,
+            )
+
+            callbacks = [rlr, mhcp]
+
+        # Inspect training parameters at the start of the training
+        self.summary()
+
+        # Parallel training on multiple GPUs
+        if gpus > 1:
+            parallel_model = multi_gpu_model(self.model, gpus=gpus)
+            parallel_model.compile(loss="categorical_crossentropy", optimizer=opt)
+            # This `fit` call will be distributed on all GPUs.
+            # Each GPU will process (batch_size/gpus) samples per batch.
+            parallel_model.fit_generator(
+                self.train_gen,
+                steps_per_epoch=self.steps_per_epoch // mini_epochs,
+                epochs=mini_epochs * self.epochs,
+                validation_data=self.valid_gen,
+                validation_steps=self.validation_steps // mini_epochs,
+                callbacks=callbacks,
+                max_queue_size=max_queue_size,
+                workers=workers,
+                use_multiprocessing=use_multiprocessing,
+                verbose=verbose,
+            )  # verbose=1 shows a progress bar
+
+        elif gpus == 1:
+            self.model.compile(loss="categorical_crossentropy", optimizer=opt)
+            self.model.fit_generator(
+                self.train_gen,
+                steps_per_epoch=self.steps_per_epoch // mini_epochs,
+                epochs=mini_epochs * self.epochs,
+                validation_data=self.valid_gen,
+                validation_steps=self.validation_steps // mini_epochs,
+                callbacks=callbacks,
+                max_queue_size=max_queue_size,
+                workers=workers,
+                use_multiprocessing=use_multiprocessing,
+                verbose=verbose,
+            )  # verbose=1 shows a progress bar
+
+        # Build sample_model out of the trained batch_model
+        self.__build_sample_model(batch_input_length=1)  # Single-output model
+        self.__build_sample_model(
+            batch_input_length=self.batch_size
+        )  # Multi-output model
+
+    def vectorize(self, mols_test, leftpad=True):
+        """Perform One-Hot Encoding (OHE) on a list of binary molecules.
+
+        :param mols_test: Molecules to vectorize
+        :type mols_test: list
+        :param leftpad: Left zero-padding direction, defaults to True
+        :type leftpad: bool, optional
+        :return: One-Hot Encoded molecules
+        :rtype: list
+        """
+
+        if leftpad:
+            return self.smilesvec1.transform(mols_test)
+        else:
+            return self.smilesvec2.transform(mols_test)
+
+    def transform(self, mols_ohe):
+        """Encode a batch of OHE molecules into their latent representations.
+        Must be called on the output of self.vectorize().
+
+        :param mols_ohe: List of One-Hot Encoded molecules
+        :type mols_ohe: list
+        :return: Latent representation of input molecules
+        :rtype: list
+        """
+
+        latent = self.mol_to_latent_model.predict(mols_ohe)
+        return latent.reshape((latent.shape[0], 1, latent.shape[1]))
+
+    # @timed
+    def predict(self, latent, temp=1, rng_seed=None):
+        """Generate a single SMILES string.
+        The states of the RNN are set based on the latent input.
+        Careful: "latent" must be either the output of self.transform()
+        or an array of molecular descriptors.
+        If temp>0, multinomial sampling is used instead of selecting
+        the single most probable character at each step.
+        If temp==1, multinomial sampling without temperature scaling is used.
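+        Example (assuming a trained model `model` and a latent vector or
+        descriptor array `latent` for a single molecule):
+            smiles, nll = model.predict(latent, temp=1, rng_seed=42)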
+
+        :param latent: 1D Latent vector to steer the generation
+        :type latent: numpy.ndarray
+        :param temp: Temperature of multinomial sampling (argmax if 0), defaults to 1
+        :type temp: float, optional
+        :param rng_seed: Seed of the random number generator for repeatable sampling, defaults to None
+        :type rng_seed: int, optional
+        :return: The predicted SMILES string and its NLL of being sampled
+        :rtype: tuple
+        """
+
+        # Pass rng_seed for repeatable sampling
+        if rng_seed is not None:
+            np.random.seed(rng_seed)
+
+        # Scale inputs if model is trained on scaled data
+        if self.scaler is not None:
+            latent = self.scaler.transform(
+                latent.reshape(1, -1)
+            )  # Re-shape because scaler complains
+
+        # Apply PCA to input if model is trained accordingly
+        if self.pca is not None:
+            latent = self.pca.transform(latent)
+
+        states = self.latent_to_states_model.predict(latent)
+
+        # Decode states and reset the LSTM cells with them to bias the generation towards the desired properties
+        for dec_layer in range(self.dec_layers):
+            self.sample_model.get_layer("Decoder_LSTM_" + str(dec_layer)).reset_states(
+                states=[states[2 * dec_layer], states[2 * dec_layer + 1]]
+            )
+
+        # Prepare the input char
+        startidx = self.smilesvec1._char_to_int[self.smilesvec1.startchar]
+        samplevec = np.zeros((1, 1, self.smilesvec1.dims[-1]))
+        samplevec[0, 0, startidx] = 1
+        smiles = ""
+        # Initialize Negative Log-Likelihood (NLL)
+        NLL = 0
+        # Loop and predict next char
+        for i in range(1000):
+            o = self.sample_model.predict(samplevec)
+            # Multinomial sampling with temperature scaling
+            if temp > 0:
+                temp = abs(temp)  # Handle negative values
+                nextCharProbs = np.log(o) / temp
+                nextCharProbs = np.exp(nextCharProbs)
+                nextCharProbs = (
+                    nextCharProbs / nextCharProbs.sum() - 1e-8
+                )  # Re-normalize and subtract a tiny epsilon so the probabilities sum to just under 1.0, as np.random.multinomial requires
+                sampleidx = np.random.multinomial(
+                    1, nextCharProbs.squeeze(), 1
+                ).argmax()
+
+            # Else, select the most probable character
+            else:
+                sampleidx = np.argmax(o)
+
+            samplechar = self.smilesvec1._int_to_char[sampleidx]
+            if samplechar != self.smilesvec1.endchar:
+                # Append the new character
+                smiles += samplechar
+                samplevec = np.zeros((1, 1, self.smilesvec1.dims[-1]))
+                samplevec[0, 0, sampleidx] = 1
+                # Calculate negative log likelihood for the selected character given the sequence so far
+                NLL -= np.log(o[0][0][sampleidx])
+            else:
+                return smiles, NLL
+
+    # @timed
+    def predict_batch(self, latent, temp=1, rng_seed=None):
+        """Generate multiple biased SMILES strings.
+        Careful: "latent" must be either the output of self.transform()
+        or an array of molecular descriptors.
+        If temp>0, multinomial sampling is used instead of selecting
+        the single most probable character at each step.
+        If temp==1, multinomial sampling without temperature scaling is used.
+        Low temp leads to elimination of characters with low probabilities.
+        predict_batch() generates batch_input_length (default==batch_size) individual SMILES
+        strings per call. To change that, reset batch_input_length to a new value.
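+        Example (assuming a trained model `model` and a 2D array `latents` with
+        one latent vector or descriptor row per SMILES to generate):
+            model.batch_input_length = len(latents)  # rebuilds the multi-sample model
+            smiles_arr, nlls = model.predict_batch(latents, temp=1)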
+
+        :param latent: List of latent vectors
+        :type latent: list
+        :param temp: Temperature of multinomial sampling (argmax if 0), defaults to 1
+        :type temp: float, optional
+        :param rng_seed: Seed of the random number generator for repeatable sampling, defaults to None
+        :type rng_seed: int, optional
+        :return: List of predicted SMILES strings and their NLL of being sampled
+        :rtype: tuple
+        """
+
+        # Pass rng_seed for repeatable sampling
+        if rng_seed is not None:
+            np.random.seed(rng_seed)
+
+        if latent.shape[0] == 1:
+            # Make a batch prediction by repeating the same latent vector for every sample of the batch
+            latent = np.ones((self.batch_input_length, self.codelayer_dim)) * latent
+        else:
+            # Make sure it is squeezed
+            latent = latent.squeeze()
+
+        # Scale inputs if model is trained on scaled data
+        if self.scaler is not None:
+            latent = self.scaler.transform(latent)
+
+        # Apply PCA to input if model is trained accordingly
+        if self.pca is not None:
+            latent = self.pca.transform(latent)
+
+        # Decode states and reset the LSTM cells with them, to bias the generation towards the desired properties
+        states = self.latent_to_states_model.predict(latent)
+
+        for dec_layer in range(self.dec_layers):
+            self.multi_sample_model.get_layer(
+                "Decoder_LSTM_" + str(dec_layer)
+            ).reset_states(states=[states[2 * dec_layer], states[2 * dec_layer + 1]])
+
+        # Index of input char "^"
+        startidx = self.smilesvec1._char_to_int[self.smilesvec1.startchar]
+        # Vectorize the input char for all SMILES
+        samplevec = np.zeros((self.batch_input_length, 1, self.smilesvec1.dims[-1]))
+        samplevec[:, 0, startidx] = 1
+        # Initialize arrays to store SMILES, their NLLs and their status
+        smiles = np.array([""] * self.batch_input_length, dtype=object)
+        NLL = np.zeros((self.batch_input_length,))
+        finished = np.array([False] * self.batch_input_length)
+
+        # Loop and predict next char
+        for i in range(1000):
+            o = self.multi_sample_model.predict(
+                samplevec, batch_size=self.batch_input_length
+            ).squeeze()
+
+            # Multinomial sampling with temperature scaling
+            if temp > 0:
+                temp = abs(temp)  # No negative values
+                nextCharProbs = np.log(o) / temp
+                nextCharProbs = np.exp(nextCharProbs)
+
+                # Re-normalize and subtract a tiny epsilon so each row sums to just under 1.0, as np.random.multinomial requires
+                nextCharProbs = (nextCharProbs.T / nextCharProbs.sum(axis=1) - 1e-8).T
+                sampleidc = np.asarray(
+                    [
+                        np.random.multinomial(1, nextCharProb, 1).argmax()
+                        for nextCharProb in nextCharProbs
+                    ]
+                )
+
+            else:
+                sampleidc = np.argmax(o, axis=1)
+
+            samplechars = [self.smilesvec1._int_to_char[idx] for idx in sampleidc]
+
+            for idx, samplechar in enumerate(samplechars):
+                if not finished[idx]:
+                    if samplechar != self.smilesvec1.endchar:
+                        # Append the SMILES with the next character
+                        smiles[idx] += self.smilesvec1._int_to_char[sampleidc[idx]]
+                        samplevec = np.zeros(
+                            (self.batch_input_length, 1, self.smilesvec1.dims[-1])
+                        )
+                        # One-Hot Encode the character
+                        for count, sampleidx in enumerate(sampleidc):
+                            samplevec[count, 0, sampleidx] = 1
+                        # Calculate negative log likelihood for the selected character given the sequence so far
+                        NLL[idx] -= np.log(o[idx][sampleidc[idx]])
+                    else:
+                        finished[idx] = True
+
+            # If all SMILES are finished, i.e. the endchar "$" has been generated, stop the generation
+            if finished.sum() == len(finished):
+                return smiles, NLL
+
+    @timed
+    def get_smiles_nll(self, latent, smiles_ref) -> float:
+        """Back-calculate the NLL of a given SMILES string if its descriptors are used as RNN states.
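+        Example (assuming a trained model `model`, a latent representation `latent`
+        and a reference SMILES string `smi`):
+            nll = model.get_smiles_nll(latent, smi)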
+ + :param latent: Descriptors or latent representation of smiles_ref + :type latent: list + :param smiles_ref: Given SMILES to back-calculate its NLL + :type smiles_ref: str + :return: NLL of sampling smiles_ref given its latent representation (or descriptors) + :rtype: float + """ + + # Scale inputs if model is trained on scaled data + if self.scaler is not None: + latent = self.scaler.transform( + latent.reshape(1, -1) + ) # Re-shape because scaler complains + + # Apply PCA to input if model is trained accordingly + if self.pca is not None: + latent = self.pca.transform(latent) + + states = self.latent_to_states_model.predict(latent) + + # Decode states and reset the LSTM cells with them to bias the generation towards the desired properties + for dec_layer in range(self.dec_layers): + self.sample_model.get_layer("Decoder_LSTM_" + str(dec_layer)).reset_states( + states=[states[2 * dec_layer], states[2 * dec_layer + 1]] + ) + + # Prepare the input char + startidx = self.smilesvec1._char_to_int[self.smilesvec1.startchar] + samplevec = np.zeros((1, 1, self.smilesvec1.dims[-1])) + samplevec[0, 0, startidx] = 1 + + # Initialize Negative Log-Likelihood (NLL) + NLL = 0 + # Loop and predict next char + for i in range(1000): + o = self.sample_model.predict(samplevec) + + samplechar = smiles_ref[i] + sampleidx = self.smilesvec1._char_to_int[samplechar] + + if i != len(smiles_ref) - 1: + samplevec = np.zeros((1, 1, self.smilesvec1.dims[-1])) + samplevec[0, 0, sampleidx] = 1 + # Calculate negative log likelihood for the selected character given the sequence so far + NLL -= np.log(o[0][0][sampleidx]) + else: + return NLL + + @timed + def get_smiles_nll_batch(self, latent, smiles_ref) -> list: + """Back-calculate the individual NLL for a batch of known SMILES strings. + Batch size is equal to self.batch_input_length so reset it if needed. + + :param latent: List of latent representations (or descriptors) + :type latent: list + :param smiles_ref: List of given SMILES to back-calculate their NLL + :type smiles_ref: list + :return: List of NLL of sampling smiles_ref given their latent representations (or descriptors) + :rtype: list + """ + + assert ( + len(latent) <= self.batch_input_length + ), "Input length must be less than or equal to batch_input_length." 
+
+        # Scale inputs if model is trained on scaled data
+        if self.scaler is not None:
+            latent = self.scaler.transform(latent)
+
+        # Apply PCA to input if model is trained accordingly
+        if self.pca is not None:
+            latent = self.pca.transform(latent)
+
+        # Decode states and reset the LSTM cells with them, to bias the generation towards the desired properties
+        states = self.latent_to_states_model.predict(latent)
+
+        for dec_layer in range(self.dec_layers):
+            self.multi_sample_model.get_layer(
+                "Decoder_LSTM_" + str(dec_layer)
+            ).reset_states(states=[states[2 * dec_layer], states[2 * dec_layer + 1]])
+
+        # Index of input char "^"
+        startidx = self.smilesvec1._char_to_int[self.smilesvec1.startchar]
+        # Vectorize the input char for all SMILES
+        samplevec = np.zeros((self.batch_input_length, 1, self.smilesvec1.dims[-1]))
+        samplevec[:, 0, startidx] = 1
+        # Initialize arrays to store NLLs and flag if a SMILES is finished
+        NLL = np.zeros((self.batch_input_length,))
+        finished = np.array([False] * self.batch_input_length)
+
+        # Loop and predict next char
+        for i in range(1000):
+            o = self.multi_sample_model.predict(
+                samplevec, batch_size=self.batch_input_length
+            ).squeeze()
+            samplechars = []
+
+            for smiles in smiles_ref:
+                try:
+                    samplechars.append(smiles[i])
+                except IndexError:
+                    # This is a finished SMILES, so "i" exceeds its length
+                    samplechars.append("$")
+
+            sampleidc = np.asarray(
+                [self.smilesvec1._char_to_int[char] for char in samplechars]
+            )
+
+            for idx, samplechar in enumerate(samplechars):
+                if not finished[idx]:
+                    if i != len(smiles_ref[idx]) - 1:
+                        samplevec = np.zeros(
+                            (self.batch_input_length, 1, self.smilesvec1.dims[-1])
+                        )
+                        # One-Hot Encode the character
+                        for count, sampleidx in enumerate(sampleidc):
+                            samplevec[count, 0, sampleidx] = 1
+                        # Calculate negative log likelihood for the selected character given the sequence so far
+                        NLL[idx] -= np.log(o[idx][sampleidc[idx]])
+                    else:
+                        finished[idx] = True
+
+            # If all SMILES are finished, i.e. the endchar "$" has been generated, stop the generation
+            if finished.sum() == len(finished):
+                return NLL
+
+    def summary(self):
+        """Echo the training configuration for inspection."""
+
+        print(
+            "\nModel trained with dataset %s that has maxlen=%d and charset=%s for %d epochs."
+            % (self.dataset_name, self.maxlen, self.charset, self.epochs)
+        )
+
+        print(
+            "noise_std: %.6f, lstm_dim: %d, dec_layers: %d, td_dense_dim: %d, batch_size: %d, codelayer_dim: %d, lr: %.6f."
+            % (
+                self.noise_std,
+                self.lstm_dim,
+                self.dec_layers,
+                self.td_dense_dim,
+                self.batch_size,
+                self.codelayer_dim,
+                self.lr,
+            )
+        )
+
+    def get_graphs(self):
+        """Export the graphs of the model and its submodels to png files.
+        Requires "pydot" and "graphviz" to be installed (pip install pydot graphviz).
+        """
+
+        try:
+            from tensorflow.keras.utils import plot_model
+
+            plot_model(self.model, to_file="model.png")
+            plot_model(
+                self.latent_to_states_model, to_file="latent_to_states_model.png"
+            )
+            plot_model(self.batch_model, to_file="batch_model.png")
+            if self.mol_to_latent_model is not None:
+                plot_model(self.mol_to_latent_model, to_file="mol_to_latent_model.png")
+
+            print("Models exported to png files.")
+
+        except Exception:
+            print("Model export failed; check the pydot and graphviz installation.")
+
+    @timed
+    def save(self, model_name):
+        """Save model in a zip file.
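+        For example, model.save("my_model") writes my_model.zip, which can later
+        be re-loaded with DDC(model_name="my_model").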
+ + :param model_name: Path to save model in + :type model_name: str + """ + + with tempfile.TemporaryDirectory() as dirpath: + + # Save the Keras models + if self.mol_to_latent_model is not None: + self.mol_to_latent_model.save(dirpath + "/mol_to_latent_model.h5") + + self.latent_to_states_model.save(dirpath + "/latent_to_states_model.h5") + self.batch_model.save(dirpath + "/batch_model.h5") + + # Exclude unpicklable and unwanted attributes + excl_attr = [ + "_DDC__mode", + "_DDC__train_gen", + "_DDC__valid_gen", + "_DDC__mol_to_latent_model", + "_DDC__latent_to_states_model", + "_DDC__batch_model", + "_DDC__sample_model", + "_DDC__multi_sample_model", + "_DDC__model", + ] + + # Cannot deepcopy self.__dict__ because of Keras' thread lock so this is + # bypassed by popping and re-inserting the unpicklable attributes + to_add = {} + # Remove unpicklable attributes + for attr in excl_attr: + to_add[attr] = self.__dict__.pop(attr, None) + + # Pickle metadata, i.e. almost everything but the Keras models and generators + pickle.dump(self.__dict__, open(dirpath + "/metadata.pickle", "wb")) + + # Zip directory with its contents + shutil.make_archive(model_name, "zip", dirpath) + + # Finally, re-load the popped elements for the model to be usable + for attr in excl_attr: + self.__dict__[attr] = to_add[attr] + + print("Model saved.")
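+
+
+if __name__ == "__main__":
+    # Minimal usage sketch of the train/save cycle. The dataset path and the
+    # structure of `data` below are hypothetical placeholders; adapt them to
+    # however your binary RDKit mols and dataset_info dict are stored.
+    with open("dataset.pickle", "rb") as f:
+        data = pickle.load(f)
+    mols, info = data["mols"], data["info"]
+
+    # Train an autoencoder (mols -> latent -> SMILES), as in the __init__ docstring
+    model = DDC(
+        x=mols,
+        y=mols,
+        dataset_info=info,
+        noise_std=0.1,
+        lstm_dim=256,
+        dec_layers=3,
+        td_dense_dim=0,
+        batch_size=128,
+        codelayer_dim=128,
+    )
+    model.fit(
+        epochs=100,
+        lr=1e-3,
+        mini_epochs=10,
+        patience=25,
+        model_name="ddc_test",
+        checkpoint_dir="./",
+    )
+    model.save("ddc_test")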