# Experimental Class for Smiles Enumeration, Iterator and SmilesIterator adapted from Keras 1.2.2
# Source: https://github.com/EBjerrum/molvecgen
"""Vectorizers turning RDKit molecules into NumPy arrays (one-hot SMILES, Morgan fingerprints)."""

import numpy as np
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem


class SmilesVectorizer(object):
    """SMILES vectorizer and devectorizer, with support for SMILES enumeration
    (atom order randomization) as data augmentation.

    :parameter charset: string containing the characters for the vectorization,
        can also be generated via the .fit() method. The start, end and unknown
        characters are appended automatically when missing.
    :parameter pad: Extra positions added to maxlength to form the frame length
    :parameter maxlength: Length of the longest SMILES the frame is sized for
    :parameter leftpad: Right-align the SMILES in the frame, so that the unused
        positions are to the left of the start character
    :parameter isomericSmiles: Generate SMILES containing information about stereogenic centers
    :parameter augment: Enumerate the SMILES during transform
    :parameter canonical: use canonical SMILES during transform
    :parameter startchar: Character marking the start of a SMILES
    :parameter endchar: Character marking (and padding) the end of a SMILES
    :parameter unknownchar: Character used for characters that are not in the charset
    :parameter binary: Use RDKit binary strings instead of molecule objects
    """

    def __init__(self, charset='@C)(=cOn1S2/H[N]\\', pad=5, maxlength=120, leftpad=True,
                 isomericSmiles=True, augment=True, canonical=False, startchar='^',
                 endchar='$', unknownchar='?', binary=False):
        # Special characters
        self.startchar = startchar
        self.endchar = endchar
        self.unknownchar = unknownchar

        # Vectorization and SMILES options
        self.binary = binary
        self.leftpad = leftpad
        self.isomericSmiles = isomericSmiles
        self.augment = augment
        self.canonical = canonical
        self._pad = pad
        self._maxlength = maxlength

        # The character set; the property setter builds the lookup tables and
        # needs _pad and _maxlength to be in place because it calls setdims().
        self._charset = None
        self.charset = charset

        # Calculate the dimensions
        self.setdims()

    @property
    def charset(self):
        """The characters used for the one-hot encoding, as a string."""
        return self._charset

    @charset.setter
    def charset(self, charset):
        # Ensure start, end and unknown chars are in the charset
        for char in [self.startchar, self.endchar, self.unknownchar]:
            if char not in charset:
                charset = charset + char
        # Set the hidden properties
        self._charset = charset
        self._charlen = len(charset)
        self._char_to_int = dict((c, i) for i, c in enumerate(charset))
        self._int_to_char = dict((i, c) for i, c in enumerate(charset))
        self.setdims()

    @property
    def maxlength(self):
        """Length of the longest SMILES the frame is sized for."""
        return self._maxlength

    @maxlength.setter
    def maxlength(self, maxlength):
        self._maxlength = maxlength
        self.setdims()

    @property
    def pad(self):
        """Number of extra positions added to maxlength."""
        return self._pad

    @pad.setter
    def pad(self, pad):
        self._pad = pad
        self.setdims()

    def setdims(self):
        """Calculates and sets the output dimensions of the vectorized molecules from the current settings"""
        self.dims = (self.maxlength + self.pad, self._charlen)

    def fit(self, mols, extra_chars=()):
        """Performs extraction of the charset and length of a dataset of molecules
        and sets self.maxlength and self.charset

        :parameter mols: List or array of RDKit molecules (or RDKit binary strings
            when the vectorizer was created with binary=True)
        :parameter extra_chars: Iterable of extra chars to add to the charset (e.g. "\\\\" when "/" is present)
        :raises ValueError: if mols is empty
        """
        # Decode binary molecules the same way transform() does.
        smiles = [Chem.MolToSmiles(Chem.Mol(mol) if self.binary else mol) for mol in mols]
        if not smiles:
            raise ValueError("Cannot fit a SmilesVectorizer on an empty collection of molecules")

        charset = set("".join(smiles)).union(extra_chars)
        # Sort so the character -> index mapping does not depend on set
        # iteration order, which changes between interpreter runs.
        self.charset = "".join(sorted(charset))
        self.maxlength = max(len(smile) for smile in smiles)

    def randomize_smiles(self, smiles):
        """Perform a randomization of a SMILES string
        must be RDKit sanitizable"""
        mol = Chem.MolFromSmiles(smiles)
        nmol = self.randomize_mol(mol)
        return Chem.MolToSmiles(nmol, canonical=self.canonical, isomericSmiles=self.isomericSmiles)

    def randomize_mol(self, mol):
        """Performs a randomization of the atom order of an RDKit molecule"""
        ans = list(range(mol.GetNumAtoms()))
        np.random.shuffle(ans)
        return Chem.RenumberAtoms(mol, ans)

    def transform(self, mols, augment=None, canonical=None):
        """Perform an enumeration (atom order randomization) and vectorization of
        a list or array of RDKit molecules

        :parameter mols: The RDKit molecules to transform in a list or array
        :parameter augment: Override the objects .augment setting
        :parameter canonical: Override the objects .canonical setting
        :raises IndexError: if a generated SMILES does not fit in the frame

        :output: Numpy array with the vectorized molecules with shape [batch, maxlength+pad, charset]
        """
        # TODO make it possible to use both SMILES, RDKit mols and RDKit binary strings in input
        one_hot = np.zeros([len(mols)] + list(self.dims), dtype=np.int8)

        # Possibly override object settings
        if augment is None:
            augment = self.augment
        if canonical is None:
            canonical = self.canonical

        frame_len = self.dims[0]
        start_idx = self._char_to_int[self.startchar]
        end_idx = self._char_to_int[self.endchar]
        unknown_idx = self._char_to_int[self.unknownchar]

        for i, mol in enumerate(mols):
            # Fast convert from RDKit binary
            if self.binary:
                mol = Chem.Mol(mol)

            if augment:
                mol = self.randomize_mol(mol)
            ss = Chem.MolToSmiles(mol, canonical=canonical, isomericSmiles=self.isomericSmiles)

            # TODO, Improvement make a "jitter", with random offset within the possible frame
            # TODO, Improvement make it report too many "?"'s

            length = len(ss)
            if self.leftpad:
                # Right-align: the last position is reserved for the end character.
                offset = frame_len - length - 1
            else:
                offset = 1

            # The start character goes to offset - 1 and the SMILES occupies
            # [offset, offset + length). A non-positive offset would wrap
            # around to the end of the frame and corrupt the encoding.
            if offset < 1 or offset + length > frame_len:
                raise IndexError(
                    "SMILES of length %d does not fit in a frame of length %d "
                    "(maxlength + pad); increase maxlength or pad" % (length, frame_len)
                )

            for j, c in enumerate(ss):
                one_hot[i, j + offset, self._char_to_int.get(c, unknown_idx)] = 1

            # Pad the start
            one_hot[i, offset - 1, start_idx] = 1
            # Pad the end
            one_hot[i, offset + length:, end_idx] = 1
            # The positions in front of the start character are left all-zero.

        return one_hot

    def reverse_transform(self, vect, strip=True):
        """ Performs a conversion of a vectorized SMILES to a SMILES strings
        charset must be the same as used for vectorization.

        :parameter vect: Numpy array of vectorized SMILES.
        :parameter strip: Strip start and end tokens from the SMILES string

        :output: Numpy array of SMILES strings
        """
        # TODO make it possible to take a single vectorized molecule, not a list
        smiles = []
        for v in vect:
            # Keep only the positions where exactly one character is set;
            # this drops the all-zero positions in front of the start character.
            v = v[v.sum(axis=1) == 1]
            # Find one hot encoded index with argmax, translate to char and join to string
            smile = "".join(self._int_to_char[i] for i in v.argmax(axis=1))
            if strip:
                smile = smile.strip(self.startchar + self.endchar)
            smiles.append(smile)
        return np.array(smiles)


class HashedMorganVectorizer(object):
    """Vectorizer producing hashed Morgan fingerprint bit arrays.

    :parameter radius: Radius of the Morgan fingerprint
    :parameter bits: Number of bits of the hashed fingerprint
    :parameter augment: Not used
    """

    def __init__(self, radius=2, bits=2048, augment=None):
        self.bits = bits
        self.radius = radius
        self.augment = augment  # Not used
        self.dims = (bits,)
        self.keys = None

    def transform_mol(self, mol):
        """ transforms the molecule into a numpy bit array with the morgan bits

        :parameter mol: the RDKit molecule to be transformed
        """
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, self.radius, nBits=self.bits)
        arr = np.zeros((self.bits,))
        DataStructs.ConvertToNumpyArray(fp, arr)
        return arr

    def transform(self, mols):
        """Transforms a list or array of RDKit molecules into an array with the Morgan bits

        :parameter mols: list or array of RDKit molecules
        """
        arr = np.zeros((len(mols), self.bits))
        for i, mol in enumerate(mols):
            arr[i, :] = self.transform_mol(mol)
        return arr


class MorganDictVectorizer(object):
    """Vectorizer producing dense arrays of unhashed Morgan fingerprint values,
    using the keys collected by .fit() as columns.

    :parameter radius: Radius of the Morgan fingerprint
    :parameter augment: Not used
    """

    def __init__(self, radius=2, augment=None):
        self.radius = radius
        self.augment = augment  # Not used
        self.dims = None

    def fit(self, mols):
        """Analyses the molecules and creates the key index for the creation of the dense array"""
        keys = set()
        for mol in mols:
            fp = AllChem.GetMorganFingerprint(mol, self.radius)
            keys.update(fp.GetNonzeroElements().keys())
        self.keys = np.array(sorted(keys))
        self.dims = len(self.keys)

    def transform_mol(self, mol, misses=False):
        """ transforms the mol into a dense array using the fitted keys as index

        :parameter mol: the RDKit molecule to be transformed
        :parameter misses: whether to return the number of key misses for the molecule
        :raises AssertionError: if the keys have not been set (see .fit())
        """
        # Raised explicitly (instead of an assert statement) so the check
        # is still performed when Python runs with -O.
        if not isinstance(getattr(self, "keys", None), np.ndarray):
            raise AssertionError("keys are not defined or is not an np.array, has the .fit(mols) function been used?")

        # Get fingerprint as a dictionary
        fp = AllChem.GetMorganFingerprint(mol, self.radius)
        fp_d = fp.GetNonzeroElements()

        # Prepare the array, and set the values
        # TODO is there a way to vectorize and speed up this?
        arr = np.zeros((self.dims,))
        _misses = 0
        for key, value in fp_d.items():
            # One scan of the keys serves both the membership test and the assignment.
            hit = self.keys == key
            if hit.any():
                arr[hit] = value
            else:
                _misses += 1

        if misses:
            return arr, _misses
        return arr

    def transform(self, mols, misses=False):
        """Transforms a list or array of RDKit molecules into a dense array using the key dictionary (see .fit())

        :parameter mols: list or array of RDKit molecules
        :parameter misses: Whether to return the number of key misses for each molecule
        """
        arr = np.zeros((len(mols), self.dims))
        if misses:
            _misses = np.zeros((len(mols), 1))
            for i, mol in enumerate(mols):
                arr[i, :], _misses[i] = self.transform_mol(mol, misses=misses)
            return arr, _misses

        for i, mol in enumerate(mols):
            arr[i, :] = self.transform_mol(mol, misses=False)
        return arr