"""
Encode SMILES and SELFIES strings into one-hot encodings.
"""

import numpy as np

try:
    import selfies as sf
except ImportError:  # the SMILES helpers below only need numpy
    sf = None


def _index_map(alphabet):
    """Map every alphabet symbol to its position (last occurrence wins)."""
    return {symbol: index for index, symbol in enumerate(alphabet)}


def _one_hot(integer_encoded, alphabet_size):
    """Build a (len(integer_encoded) x alphabet_size) one-hot integer array."""
    indices = np.asarray(integer_encoded, dtype=int)
    onehot = np.zeros((len(indices), alphabet_size), dtype=int)
    # Row r gets a single 1 in column indices[r].
    onehot[np.arange(len(indices)), indices] = 1
    return onehot


def _encode(symbols, symbol_to_int, alphabet_size):
    """Integer- and one-hot-encode an iterable of symbols."""
    integer_encoded = [symbol_to_int[symbol] for symbol in symbols]
    return integer_encoded, _one_hot(integer_encoded, alphabet_size)


def _pad_smile(smile, largest_smile_len):
    """Right-pad a SMILES string with ' ' up to largest_smile_len."""
    # A non-positive repeat count yields '', so longer strings stay as-is.
    return smile + ' ' * (largest_smile_len - len(smile))


def _padded_selfies_symbols(selfie, largest_selfie_len):
    """Right-pad a SELFIES string with '[nop]' and split it into symbols."""
    if sf is None:
        raise ImportError(
            "the 'selfies' package is required to encode SELFIES strings"
        )
    selfie += '[nop]' * (largest_selfie_len - sf.len_selfies(selfie))
    return sf.split_selfies(selfie)


def smile_to_hot(smile, largest_smile_len, alphabet):
    """Go from a single smile string to a one-hot encoding.

    The string is right-padded with ' ' up to ``largest_smile_len``
    characters, so ' ' must be part of ``alphabet`` whenever padding is
    needed. A string longer than ``largest_smile_len`` is not truncated.

    Args:
        smile: SMILES string to encode.
        largest_smile_len: Length to pad the string to.
        alphabet: Sized sequence of characters; a character's position is
            its integer code.

    Returns:
        A tuple ``(integer_encoded, onehot_encoded)``: the list of integer
        codes, and an integer array of shape
        (padded_length x len(alphabet)).

    Raises:
        KeyError: If a character (including the ' ' padding) is not in
            ``alphabet``.
    """
    return _encode(
        _pad_smile(smile, largest_smile_len),
        _index_map(alphabet),
        len(alphabet),
    )


def multiple_smile_to_hot(smiles_list, largest_molecule_len, alphabet):
    """Convert a list of smile strings to a one-hot encoding

    Returned shape (num_smiles x len_of_largest_smile x len_smile_encoding)

    Args:
        smiles_list: Iterable of SMILES strings.
        largest_molecule_len: Length every string is padded to.
        alphabet: Sized sequence of characters; a character's position is
            its integer code.

    Raises:
        KeyError: If a character (including the ' ' padding) is not in
            ``alphabet``.
    """
    # The lookup table does not depend on the molecule: build it once.
    char_to_int = _index_map(alphabet)
    alphabet_size = len(alphabet)
    hot_list = [
        _encode(_pad_smile(s, largest_molecule_len),
                char_to_int, alphabet_size)[1]
        for s in smiles_list
    ]
    return np.array(hot_list)


def selfies_to_hot(selfie, largest_selfie_len, alphabet):
    """Go from a single selfies string to a one-hot encoding.

    The string is right-padded with '[nop]' symbols up to
    ``largest_selfie_len`` symbols, so '[nop]' must be part of ``alphabet``
    whenever padding is needed. A longer string is not truncated.

    Args:
        selfie: SELFIES string to encode.
        largest_selfie_len: Number of symbols to pad the string to.
        alphabet: Sized sequence of SELFIES symbols; a symbol's position is
            its integer code.

    Returns:
        A tuple ``(integer_encoded, onehot_encoded)``: the list of integer
        codes, and an integer array of shape
        (padded_length x len(alphabet)).

    Raises:
        ImportError: If the ``selfies`` package is not installed.
        KeyError: If a symbol (including the '[nop]' padding) is not in
            ``alphabet``.
    """
    return _encode(
        _padded_selfies_symbols(selfie, largest_selfie_len),
        _index_map(alphabet),
        len(alphabet),
    )


def multiple_selfies_to_hot(selfies_list, largest_molecule_len, alphabet):
    """Convert a list of selfies strings to a one-hot encoding

    Returned shape
    (num_selfies x largest_molecule_len x len(alphabet)) when every string
    has at most ``largest_molecule_len`` symbols.

    Args:
        selfies_list: Iterable of SELFIES strings.
        largest_molecule_len: Number of symbols every string is padded to.
        alphabet: Sized sequence of SELFIES symbols; a symbol's position is
            its integer code.

    Raises:
        ImportError: If the ``selfies`` package is not installed.
        KeyError: If a symbol (including the '[nop]' padding) is not in
            ``alphabet``.
    """
    # The lookup table does not depend on the molecule: build it once.
    symbol_to_int = _index_map(alphabet)
    alphabet_size = len(alphabet)
    hot_list = [
        _encode(_padded_selfies_symbols(s, largest_molecule_len),
                symbol_to_int, alphabet_size)[1]
        for s in selfies_list
    ]
    return np.array(hot_list)