[58db57]: / ddc_pub / vectorizers.py

Download this file

272 lines (218 with data), 10.6 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
# Experimental Class for Smiles Enumeration, Iterator and SmilesIterator adapted from Keras 1.2.2
# Source: https://github.com/EBjerrum/molvecgen
from rdkit import Chem
import numpy as np
class SmilesVectorizer(object):
    """SMILES vectorizer and devectorizer, with support for SMILES enumeration
    (atom order randomization) as data augmentation.

    Molecules are one-hot encoded into a frame of ``maxlength + pad`` positions
    by ``len(charset)`` characters. Every encoded SMILES is preceded by one
    start token and followed by end tokens up to the end of the frame.

    :parameter charset: string containing the characters for the vectorization,
        can also be generated via the .fit() method
    :parameter pad: number of extra positions added to maxlength in the frame
    :parameter maxlength: maximum SMILES length, can also be set via .fit()
    :parameter leftpad: place the SMILES at the right end of the frame, leaving
        the unused positions to the left of the start token empty
    :parameter isomericSmiles: Generate SMILES containing information about stereogenic centers
    :parameter augment: Enumerate the SMILES during transform
    :parameter canonical: use canonical SMILES during transform (overrides enum)
    :parameter startchar: token marking the start of a SMILES
    :parameter endchar: token marking the end of a SMILES
    :parameter unknownchar: token used for characters missing from the charset
    :parameter binary: Use RDKit binary strings instead of molecule objects
    """

    def __init__(self, charset = '@C)(=cOn1S2/H[N]\\', pad=5, maxlength=120,
                 leftpad=True, isomericSmiles=True, augment=True,
                 canonical=False, startchar = '^', endchar = '$',
                 unknownchar = '?', binary=False):
        # Special characters
        self.startchar = startchar
        self.endchar = endchar
        self.unknownchar = unknownchar
        # Vectorization and SMILES options
        self.binary = binary
        self.leftpad = leftpad
        self.isomericSmiles = isomericSmiles
        self.augment = augment
        self.canonical = canonical
        # Frame size; must be set before the charset because the charset
        # setter recomputes the dimensions.
        self._pad = pad
        self._maxlength = maxlength
        # The character set (the setter also builds the lookup tables)
        self._charset = None
        self.charset = charset
        # Calculate the dimensions
        self.setdims()

    @property
    def charset(self):
        """The characters used for the encoding, including the special tokens."""
        return self._charset

    @charset.setter
    def charset(self, charset):
        # Ensure start, end and unknown tokens are part of the charset
        for char in [self.startchar, self.endchar, self.unknownchar]:
            if char not in charset:
                charset = charset + char
        # Set the hidden properties
        self._charset = charset
        self._charlen = len(charset)
        self._char_to_int = dict((c, i) for i, c in enumerate(charset))
        self._int_to_char = dict((i, c) for i, c in enumerate(charset))
        self.setdims()

    @property
    def maxlength(self):
        """Maximum SMILES length; setting it recomputes ``dims``."""
        return self._maxlength

    @maxlength.setter
    def maxlength(self, maxlength):
        self._maxlength = maxlength
        self.setdims()

    @property
    def pad(self):
        """Extra frame positions added to maxlength; setting it recomputes ``dims``."""
        return self._pad

    @pad.setter
    def pad(self, pad):
        self._pad = pad
        self.setdims()

    def setdims(self):
        """Calculates and sets the output dimensions of the vectorized molecules from the current settings"""
        self.dims = (self.maxlength + self.pad, self._charlen)

    def _as_mol(self, mol):
        """Return an RDKit molecule, decoding RDKit binary strings when ``self.binary`` is set."""
        if self.binary:
            return Chem.Mol(mol)
        return mol

    def fit(self, mols, extra_chars=()):
        """Extracts the charset and the maximum SMILES length from a set of
        molecules and sets self.charset and self.maxlength accordingly.

        :parameter mols: iterable of RDKit molecules (or RDKit binary strings
            when ``self.binary`` is set)
        :parameter extra_chars: iterable of extra chars to add to the charset
            (e.g. the backslash bond symbol when "/" is present)
        :raises ValueError: if ``mols`` is empty
        """
        smiles = [Chem.MolToSmiles(self._as_mol(mol)) for mol in mols]
        if not smiles:
            raise ValueError("Cannot fit the vectorizer on an empty collection of molecules")
        charset = set("".join(smiles)).union(extra_chars)
        # Sorted so that the character -> index mapping is reproducible between
        # runs; the iteration order of a set of strings is not.
        self.charset = "".join(sorted(charset))
        self.maxlength = max(len(smile) for smile in smiles)

    def randomize_smiles(self, smiles):
        """Perform a randomization of a SMILES string, which must be RDKit sanitizable.

        :parameter smiles: the SMILES string to randomize
        :raises ValueError: if RDKit cannot parse ``smiles``
        """
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            raise ValueError("RDKit could not parse SMILES: {!r}".format(smiles))
        nmol = self.randomize_mol(mol)
        return Chem.MolToSmiles(nmol, canonical=self.canonical, isomericSmiles=self.isomericSmiles)

    def randomize_mol(self, mol):
        """Performs a randomization of the atom order of an RDKit molecule"""
        ans = list(range(mol.GetNumAtoms()))
        np.random.shuffle(ans)
        return Chem.RenumberAtoms(mol, ans)

    def _encode_into(self, ss, row):
        """One-hot encode the SMILES string ``ss`` into ``row`` in place.

        :parameter ss: the SMILES string to encode
        :parameter row: zero-initialised 2D array (positions x charset) that
            receives the encoding
        :raises ValueError: if ``ss`` does not fit into the frame
        """
        frame = row.shape[0]
        length = len(ss)
        if self.leftpad:
            # Right-aligned: start token, SMILES and exactly one end token.
            offset = frame - length - 1
            longest = frame - 2
        else:
            # Left-aligned: start token at position 0, SMILES from position 1.
            offset = 1
            longest = frame - 1
        if length > longest:
            # A negative offset would silently wrap around the frame and a too
            # large index would fail with an unhelpful IndexError.
            raise ValueError(
                "SMILES of length {} does not fit into a frame of {} positions "
                "(maxlength + pad); increase maxlength or pad".format(length, frame))
        unknown = self._char_to_int[self.unknownchar]
        lookup = self._char_to_int.get
        for j, c in enumerate(ss, offset):
            row[j, lookup(c, unknown)] = 1
        # Start token directly in front of the SMILES
        row[offset - 1, self._char_to_int[self.startchar]] = 1
        # End tokens on every position after the SMILES
        row[offset + length:, self._char_to_int[self.endchar]] = 1

    def transform(self, mols, augment=None, canonical=None):
        """Perform an enumeration (atom order randomization) and vectorization of RDKit molecules

        :parameter mols: The RDKit molecules to transform in a list or array
        :parameter augment: Override the objects .augment setting
        :parameter canonical: Override the objects .canonical setting
        :output: Numpy array with the vectorized molecules with shape [batch, maxlength+pad, charset]
        :raises ValueError: if a generated SMILES is too long for the frame
        """
        #TODO make it possible to use both SMILES, RDKit mols and RDKit binary strings in input
        #TODO, Improvement make a "jitter", with random offset within the possible frame
        #TODO, Improvement make it report to many "?"'s
        one_hot = np.zeros([len(mols)] + list(self.dims), dtype=np.int8)
        # Possibly override object settings
        if augment is None:
            augment = self.augment
        if canonical is None:
            canonical = self.canonical
        for i, mol in enumerate(mols):
            # Fast convert from RDKit binary
            mol = self._as_mol(mol)
            if augment:
                mol = self.randomize_mol(mol)
            ss = Chem.MolToSmiles(mol, canonical=canonical, isomericSmiles=self.isomericSmiles)
            # one_hot[i] is a view, so the helper writes straight into the batch
            self._encode_into(ss, one_hot[i])
        return one_hot

    def reverse_transform(self, vect, strip=True):
        """Performs a conversion of vectorized SMILES to SMILES strings,
        charset must be the same as used for vectorization.

        :parameter vect: Numpy array of vectorized SMILES.
        :parameter strip: Strip start and end tokens from the SMILES string
        :output: Numpy array with the SMILES strings
        """
        #TODO make it possible to take a single vectorized molecule, not a list
        special = self.startchar + self.endchar
        smiles = []
        for v in vect:
            # Keep only the positions whose entries sum to exactly one
            v = v[v.sum(axis=1) == 1]
            # Find one hot encoded index with argmax, translate to char and join to string
            smile = "".join(self._int_to_char[i] for i in v.argmax(axis=1))
            if strip:
                smile = smile.strip(special)
            smiles.append(smile)
        return np.array(smiles)
from rdkit import DataStructs
from rdkit.Chem import AllChem
class HashedMorganVectorizer(object):
    """Vectorizer turning RDKit molecules into fixed-length hashed Morgan bit arrays.

    :parameter radius: radius passed to the Morgan fingerprint
    :parameter bits: number of bits of the fingerprint and length of the output arrays
    :parameter augment: accepted but not used
    """

    def __init__(self, radius=2, bits=2048, augment=None):
        self.radius = radius
        self.bits = bits
        self.dims = (bits,)
        self.keys = None
        self.augment = augment  # Not used

    def transform_mol(self, mol):
        """Transforms one molecule into a numpy array holding its Morgan bits

        :parameter mol: the RDKit molecule to be transformed
        :output: Numpy array of length ``bits``
        """
        fingerprint = AllChem.GetMorganFingerprintAsBitVect(
            mol, self.radius, nBits=self.bits)
        bit_array = np.zeros((self.bits,))
        # RDKit fills the numpy array in place from the bit vector
        DataStructs.ConvertToNumpyArray(fingerprint, bit_array)
        return bit_array

    def transform(self, mols):
        """Transforms a list or array of RDKit molecules into an array with the Morgan bits

        :parameter mols: list or array of RDKit molecules
        :output: Numpy array with one row of ``bits`` entries per molecule
        """
        n_mols = len(mols)
        fingerprints = np.zeros((n_mols, self.bits))
        for idx, molecule in enumerate(mols):
            fingerprints[idx, :] = self.transform_mol(molecule)
        return fingerprints
class MorganDictVectorizer(object):
    """Vectorizer turning RDKit molecules into dense arrays of Morgan
    fingerprint values, indexed by the fingerprint keys seen during .fit().

    :parameter radius: radius passed to the Morgan fingerprint
    :parameter augment: accepted but not used
    """

    def __init__(self, radius=2, augment=None):
        self.radius = radius
        self.augment = augment  # Not used
        self.dims = None
        # Defined up front so that using the vectorizer before .fit() reports
        # the "not fitted" error instead of an AttributeError.
        self.keys = None

    def _check_fitted(self):
        """Raise if the key index has not been created yet.

        :raises AssertionError: if ``self.keys`` is not a numpy array
        """
        # Raised explicitly (rather than with an assert statement) so the check
        # is kept when Python runs with -O; the exception type is unchanged.
        if not isinstance(getattr(self, "keys", None), np.ndarray):
            raise AssertionError("keys are not defined or is not an np.array, has the .fit(mols) function been used?")

    def fit(self, mols):
        """Analyses the molecules and creates the key index for the creation of the dense array

        :parameter mols: iterable of RDKit molecules
        """
        keys = set()
        for mol in mols:
            fp = AllChem.GetMorganFingerprint(mol, self.radius)
            keys.update(fp.GetNonzeroElements().keys())
        self.keys = np.array(sorted(keys))
        self.dims = len(self.keys)

    def _densify(self, fp_d):
        """Spread a {key: value} fingerprint dictionary onto the fitted keys.

        :parameter fp_d: dictionary mapping fingerprint keys to values
        :output: tuple of the dense array and the number of keys of ``fp_d``
            that are not among the fitted keys
        """
        #TODO is there a way to vectorize and speed up this?
        arr = np.zeros((self.dims,))
        n_missed = 0
        for key, value in fp_d.items():
            # One comparison serves both the membership test and the assignment
            hit = self.keys == key
            if hit.any():
                arr[hit] = value
            else:
                n_missed += 1
        return arr, n_missed

    def transform_mol(self, mol, misses=False):
        """Transforms the mol into a dense array using the fitted keys as index

        :parameter mol: the RDKit molecule to be transformed
        :parameter misses: whether to return the number of key misses for the molecule
        :output: the dense array, or a tuple (array, misses) when ``misses`` is set
        :raises AssertionError: if .fit() has not been used
        """
        self._check_fitted()
        # Get fingerprint as a dictionary
        fp = AllChem.GetMorganFingerprint(mol, self.radius)
        arr, n_missed = self._densify(fp.GetNonzeroElements())
        if misses:
            return arr, n_missed
        return arr

    def transform(self, mols, misses=False):
        """Transforms a list or array of RDKit molecules into a dense array using the key dictionary (see .fit())

        :parameter mols: list or array of RDKit molecules
        :parameter misses: whether to return the number of key misses for each molecule
        :output: the dense array with one row per molecule, or a tuple
            (array, misses) when ``misses`` is set, where misses has shape [len(mols), 1]
        :raises AssertionError: if .fit() has not been used
        """
        self._check_fitted()
        arr = np.zeros((len(mols), self.dims))
        if misses:
            _misses = np.zeros((len(mols), 1))
            for i, mol in enumerate(mols):
                arr[i, :], _misses[i] = self.transform_mol(mol, misses=True)
            return arr, _misses
        for i, mol in enumerate(mols):
            arr[i, :] = self.transform_mol(mol, misses=False)
        return arr