[6eeb30]: / examples / vae_example / data_loader.py

Download this file

76 lines (53 with data), 2.2 kB

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
"""
Helpers for converting SMILES and SELFIES strings into one-hot encodings.
"""
import numpy as np
import selfies as sf
def smile_to_hot(smile, largest_smile_len, alphabet):
    """Go from a single smile string to a one-hot encoding.

    The string is right-padded with ' ' up to ``largest_smile_len`` characters
    and every character is looked up in ``alphabet``. A string that is already
    longer than ``largest_smile_len`` is encoded as-is (it is not truncated).

    Args:
        smile: The SMILES string to encode.
        largest_smile_len: Length to pad the string to.
        alphabet: Sequence of characters; a character's position is its
            integer code. It must contain ' ' whenever padding is needed.

    Returns:
        A tuple ``(integer_encoded, onehot_encoded)``: the list of integer
        codes, and an integer array of shape
        ``(len(integer_encoded), len(alphabet))`` with a single 1 per row.

    Raises:
        KeyError: If the padded string contains a character that is not in
            ``alphabet``.
    """
    char_to_int = {char: index for index, char in enumerate(alphabet)}

    # Pad with ' ' (str.ljust leaves strings that are already long enough
    # untouched, matching the previous "multiply by a negative count" no-op).
    smile = smile.ljust(largest_smile_len)

    # Integer-encode the padded string.
    integer_encoded = [char_to_int[char] for char in smile]

    # One-hot encode: start from zeros and set one column per row. The explicit
    # integer dtype on the index array keeps empty input valid, so the result
    # is always 2-D, (0, len(alphabet)) in the empty case.
    indices = np.array(integer_encoded, dtype=int)
    onehot_encoded = np.zeros((indices.size, len(alphabet)), dtype=int)
    onehot_encoded[np.arange(indices.size), indices] = 1

    return integer_encoded, onehot_encoded
def multiple_smile_to_hot(smiles_list, largest_molecule_len, alphabet):
    """One-hot encode every smile string in ``smiles_list``.

    Each string is passed to ``smile_to_hot`` with the shared
    ``largest_molecule_len`` and ``alphabet``; the integer encoding is
    discarded and the one-hot arrays are combined with ``np.array``.

    Returned shape (num_smiles x len_of_largest_smile x len_smile_encoding),
    provided no string is longer than ``largest_molecule_len``.
    """
    encodings = [
        smile_to_hot(smiles, largest_molecule_len, alphabet)[1]
        for smiles in smiles_list
    ]
    return np.array(encodings)
def selfies_to_hot(selfie, largest_selfie_len, alphabet):
    """Go from a single selfies string to a one-hot encoding.

    The string is right-padded with '[nop]' symbols until
    ``sf.len_selfies`` reports ``largest_selfie_len``, then split into
    symbols with ``sf.split_selfies`` and each symbol is looked up in
    ``alphabet``. A string that is already longer than
    ``largest_selfie_len`` is encoded as-is (it is not truncated).

    Args:
        selfie: The SELFIES string to encode.
        largest_selfie_len: Number of symbols to pad the string to.
        alphabet: Sequence of SELFIES symbols; a symbol's position is its
            integer code. It must contain '[nop]' whenever padding is needed.

    Returns:
        A tuple ``(integer_encoded, onehot_encoded)``: the list of integer
        codes, and an integer array of shape
        ``(len(integer_encoded), len(alphabet))`` with a single 1 per row.

    Raises:
        KeyError: If the padded string contains a symbol that is not in
            ``alphabet``.
    """
    symbol_to_int = {symbol: index for index, symbol in enumerate(alphabet)}

    # Pad with [nop]; a non-positive repeat count adds nothing.
    selfie += '[nop]' * (largest_selfie_len - sf.len_selfies(selfie))

    # Integer-encode the symbols of the padded string.
    integer_encoded = [
        symbol_to_int[symbol] for symbol in sf.split_selfies(selfie)
    ]

    # One-hot encode: start from zeros and set one column per row. The explicit
    # integer dtype on the index array keeps empty input valid, so the result
    # is always 2-D, (0, len(alphabet)) in the empty case.
    indices = np.array(integer_encoded, dtype=int)
    onehot_encoded = np.zeros((indices.size, len(alphabet)), dtype=int)
    onehot_encoded[np.arange(indices.size), indices] = 1

    return integer_encoded, onehot_encoded
def multiple_selfies_to_hot(selfies_list, largest_molecule_len, alphabet):
    """One-hot encode every selfies string in ``selfies_list``.

    Each string is passed to ``selfies_to_hot`` with the shared
    ``largest_molecule_len`` and ``alphabet``; the integer encoding is
    discarded and the one-hot arrays are combined with ``np.array``.
    """
    encodings = [
        selfies_to_hot(selfies, largest_molecule_len, alphabet)[1]
        for selfies in selfies_list
    ]
    return np.array(encodings)