"""
Utilities for encoding SMILES and SELFIES strings as one-hot arrays.
"""
import numpy as np
import selfies as sf
def smile_to_hot(smile, largest_smile_len, alphabet):
    """Encode a single SMILES string as integer indices and a one-hot matrix.

    The string is right-padded with ' ' up to ``largest_smile_len`` characters
    and every character is then looked up in ``alphabet``.

    Args:
        smile: SMILES string to encode.
        largest_smile_len: Length to pad ``smile`` to. A string that is
            already longer is NOT truncated; it is encoded at its own length.
        alphabet: Sized, ordered collection of characters. A character's
            position is its integer code; it must contain ' ' whenever
            padding is actually needed.

    Returns:
        A tuple ``(integer_encoded, onehot_encoded)`` where
        ``integer_encoded`` is a list of alphabet indices and
        ``onehot_encoded`` is an integer array of shape
        ``(len(integer_encoded), len(alphabet))``.

    Raises:
        KeyError: If a character of the padded string is not in ``alphabet``.
    """
    char_to_int = {char: index for index, char in enumerate(alphabet)}

    # A non-positive repeat count yields '', so over-long input stays as is.
    padded = smile + ' ' * (largest_smile_len - len(smile))
    integer_encoded = [char_to_int[char] for char in padded]

    # Allocate the matrix once so the result is 2-D even for an empty
    # encoding, instead of building one Python list per character.
    onehot_encoded = np.zeros((len(integer_encoded), len(alphabet)), dtype=int)
    if integer_encoded:
        rows = np.arange(len(integer_encoded))
        onehot_encoded[rows, integer_encoded] = 1

    return integer_encoded, onehot_encoded
def multiple_smile_to_hot(smiles_list, largest_molecule_len, alphabet):
    """One-hot encode every SMILES string in ``smiles_list``.

    Each string is encoded with ``smile_to_hot`` using the same padding
    length and alphabet, and the per-molecule matrices are stacked.

    Returned shape (num_smiles x len_of_largest_smile x len_smile_encoding),
    provided every string pads to the same length.
    """
    encoded_molecules = [
        # Only the one-hot matrix is kept; the integer codes are discarded.
        smile_to_hot(smile, largest_molecule_len, alphabet)[1]
        for smile in smiles_list
    ]
    return np.array(encoded_molecules)
def selfies_to_hot(selfie, largest_selfie_len, alphabet):
    """Encode a single SELFIES string as integer indices and a one-hot matrix.

    The string is right-padded with '[nop]' symbols until it holds
    ``largest_selfie_len`` symbols (counted with ``sf.len_selfies``), split
    into symbols with ``sf.split_selfies``, and each symbol is looked up in
    ``alphabet``.

    Args:
        selfie: SELFIES string to encode.
        largest_selfie_len: Number of symbols to pad ``selfie`` to. A string
            with more symbols is NOT truncated.
        alphabet: Sized, ordered collection of SELFIES symbols. A symbol's
            position is its integer code; it must contain '[nop]' whenever
            padding is actually needed.

    Returns:
        A tuple ``(integer_encoded, onehot_encoded)`` where
        ``integer_encoded`` is a list of alphabet indices and
        ``onehot_encoded`` is an integer array of shape
        ``(len(integer_encoded), len(alphabet))``.

    Raises:
        KeyError: If a symbol of the padded string is not in ``alphabet``.
    """
    symbol_to_int = {symbol: index for index, symbol in enumerate(alphabet)}

    # A non-positive repeat count yields '', so over-long input stays as is.
    padding_needed = largest_selfie_len - sf.len_selfies(selfie)
    padded = selfie + '[nop]' * padding_needed

    integer_encoded = [
        symbol_to_int[symbol] for symbol in sf.split_selfies(padded)
    ]

    # Allocate the matrix once so the result is 2-D even for an empty
    # encoding, instead of building one Python list per symbol.
    onehot_encoded = np.zeros((len(integer_encoded), len(alphabet)), dtype=int)
    if integer_encoded:
        rows = np.arange(len(integer_encoded))
        onehot_encoded[rows, integer_encoded] = 1

    return integer_encoded, onehot_encoded
def multiple_selfies_to_hot(selfies_list, largest_molecule_len, alphabet):
    """One-hot encode every SELFIES string in ``selfies_list``.

    Each string is encoded with ``selfies_to_hot`` using the same padding
    length and alphabet, and the per-molecule matrices are combined into a
    single array with ``np.array``.
    """
    encoded_molecules = [
        # Only the one-hot matrix is kept; the integer codes are discarded.
        selfies_to_hot(selfie, largest_molecule_len, alphabet)[1]
        for selfie in selfies_list
    ]
    return np.array(encoded_molecules)