Diff of /ddc_pub/vectorizers.py [000000] .. [58db57]

Switch to side-by-side view

--- a
+++ b/ddc_pub/vectorizers.py
@@ -0,0 +1,272 @@
+# Experimental Class for Smiles Enumeration, Iterator and SmilesIterator adapted from Keras 1.2.2
+# Source: https://github.com/EBjerrum/molvecgen
+
+from rdkit import Chem
+import numpy as np
+
+class SmilesVectorizer(object):
+    """SMILES vectorizer and devectorizer, with support for SMILES enumeration (atom order randomization)
+    as data augmentation
+    
+    :parameter charset: string containing the characters for the vectorization
+          can also be generated via the .fit() method
+    :parameter pad: Length of the vectorization
+    :parameter leftpad: Add spaces to the left of the SMILES
+    :parameter isomericSmiles: Generate SMILES containing information about stereogenic centers
+    :parameter augment: Enumerate the SMILES during transform
+    :parameter canonical: use canonical SMILES during transform (overrides enum)
+    :parameter binary: Use RDKit binary strings instead of molecule objects
+    """
+    def __init__(self, charset = '@C)(=cOn1S2/H[N]\\', pad=5, maxlength=120, leftpad=True, isomericSmiles=True, augment=True, canonical=False, startchar = '^', endchar = '$', unknownchar = '?', binary=False):
+        #Special Characters
+        self.startchar = startchar
+        self.endchar = endchar
+        self.unknownchar = unknownchar
+        
+        #Vectorization and SMILES options
+        self.binary = binary
+        self.leftpad = leftpad
+        self.isomericSmiles = isomericSmiles
+        self.augment = augment
+        self.canonical = canonical
+        self._pad = pad
+        self._maxlength = maxlength
+        
+        #The characterset
+        self._charset = None
+        self.charset = charset
+        
+        #Calculate the dimensions
+        self.setdims()
+
+    @property
+    def charset(self):
+        return self._charset
+        
+    @charset.setter
+    def charset(self, charset):
+        #Ensure start and endchars are in the charset
+        for char in [self.startchar, self.endchar, self.unknownchar]:
+            if char not in charset:
+                charset = charset + char
+        #Set the hidden properties        
+        self._charset = charset
+        self._charlen = len(charset)
+        self._char_to_int = dict((c,i) for i,c in enumerate(charset))
+        self._int_to_char = dict((i,c) for i,c in enumerate(charset))
+        self.setdims()
+        
+    @property
+    def maxlength(self):
+        return self._maxlength
+    
+    @maxlength.setter
+    def maxlength(self, maxlength):
+        self._maxlength = maxlength
+        self.setdims()
+        
+    @property
+    def pad(self):
+        return self._pad
+    
+    @pad.setter
+    def pad(self, pad):
+        self._pad = pad
+        self.setdims()
+        
+    def setdims(self):
+        """Calculates and sets the output dimensions of the vectorized molecules from the current settings"""
+        self.dims = (self.maxlength + self.pad, self._charlen)
+    
+        
+    def fit(self, mols, extra_chars=[]):
+        """Performs extraction of the charset and length of a SMILES datasets and sets self.maxlength and self.charset
+        
+        :parameter smiles: Numpy array or Pandas series containing smiles as strings
+        :parameter extra_chars: List of extra chars to add to the charset (e.g. "\\\\" when "/" is present)
+        """
+        smiles = [Chem.MolToSmiles(mol) for mol in mols]
+        charset = set("".join(list(smiles))) #Is there a smarter way when the list of SMILES is HUGE!
+        self.charset = "".join(charset.union(set(extra_chars)))
+        self.maxlength = max([len(smile) for smile in smiles])
+        
+    def randomize_smiles(self, smiles):
+        """Perform a randomization of a SMILES string
+        must be RDKit sanitizable"""
+        mol = Chem.MolFromSmiles(smiles)
+        nmol = self.randomize_mol(mol)
+        return Chem.MolToSmiles(nmol, canonical=self.canonical, isomericSmiles=self.isomericSmiles)
+    
+    def randomize_mol(self, mol):
+        """Performs a randomization of the atom order of an RDKit molecule"""
+        ans = list(range(mol.GetNumAtoms()))
+        np.random.shuffle(ans)
+        return Chem.RenumberAtoms(mol,ans)
+
+    def transform(self, mols, augment=None, canonical=None):
+        """Perform an enumeration (atom order randomization) and vectorization of a Numpy array of RDkit molecules
+        
+            :parameter mols: The RDKit molecules to transform in a list or array
+            :parameter augment: Override the objects .augment setting
+            :parameter canonical: Override the objects .canonical setting
+            
+            :output: Numpy array with the vectorized molecules with shape [batch, maxlength+pad, charset]
+        """
+        #TODO make it possible to use both SMILES, RDKit mols and RDKit binary strings in input
+        one_hot =  np.zeros([len(mols)] + list(self.dims), dtype=np.int8)
+
+        #Possibl override object settings
+        if augment is None:
+            augment = self.augment
+        if canonical is None:    
+            canonical = self.canonical
+
+        for i,mol in enumerate(mols):
+            
+            #Fast convert from RDKit binary
+            if self.binary: mol = Chem.Mol(mol)
+            
+            if augment:
+                mol = self.randomize_mol(mol)
+            ss = Chem.MolToSmiles(mol, canonical=canonical, isomericSmiles=self.isomericSmiles)
+
+            #TODO, Improvement make it robust to too long SMILES strings
+            #TODO, Improvement make a "jitter", with random offset within the possible frame
+            #TODO, Improvement make it report to many "?"'s
+
+            l = len(ss)
+            if self.leftpad:
+                offset = self.dims[0]-l-1
+            else:
+                offset = 1
+
+            for j,c in enumerate(ss):
+                charidx = self._char_to_int.get(c, self._char_to_int[self.unknownchar])
+                one_hot[i,j+offset,charidx] = 1
+
+            #Pad the start
+            one_hot[i,offset-1,self._char_to_int[self.startchar]] = 1
+            #Pad the end
+            one_hot[i,offset+l:,self._char_to_int[self.endchar]] = 1
+            #Pad the space in front of start (Could this lead to funky effects during sampling?)
+            #one_hot[i,:offset-1,self._char_to_int[self.endchar]] = 1     
+                
+        return one_hot
+
+      
+    def reverse_transform(self, vect, strip=True):
+        """ Performs a conversion of a vectorized SMILES to a SMILES strings
+        charset must be the same as used for vectorization.
+        
+        :parameter vect: Numpy array of vectorized SMILES.
+        :parameter strip: Strip start and end tokens from the SMILES string
+        """
+        #TODO make it possible to take a single vectorized molecule, not a list
+        
+        smiles = []
+        for v in vect:
+            #mask v 
+            v=v[v.sum(axis=1)==1]
+            #Find one hot encoded index with argmax, translate to char and join to string
+            smile = "".join(self._int_to_char[i] for i in v.argmax(axis=1))
+            if strip:
+                smile = smile.strip(self.startchar + self.endchar)
+            smiles.append(smile)
+        return np.array(smiles)
+    
+from rdkit import DataStructs
+from rdkit.Chem import AllChem
+
+    
+class HashedMorganVectorizer(object):
+    def __init__(self, radius=2, bits=2048, augment=None):
+        self.bits = bits
+        self.radius = radius
+        self.augment = augment #Not used
+        self.dims = (bits,)
+        self.keys = None
+        
+    def transform_mol(self, mol):
+        """ transforms the molecule into a numpy bit array with the morgan bits
+
+            :parameter mol: the RDKit molecule to be transformed
+        """
+        fp = AllChem.GetMorganFingerprintAsBitVect(mol,self.radius,nBits=self.bits)
+        arr = np.zeros((self.bits,))
+        DataStructs.ConvertToNumpyArray(fp, arr)
+        return arr
+    
+    def transform(self, mols):
+        """Transforms a list or array of RDKit molecules into an array with the Morgan bits
+      
+        :parameter mols: list or array of RDKit molecules
+        """
+        
+        arr = np.zeros((len(mols), self.bits))
+        for i, mol in enumerate(mols):
+            arr[i,:] = self.transform_mol(mol)
+        return arr
+    
+
+class MorganDictVectorizer(object):
+    def __init__(self, radius=2, augment=None):
+        self.radius = radius
+        self.augment = augment #Not used
+        self.dims = None
+        
+    def fit(self, mols):
+        """Analyses the molecules and creates the key index for the creation of the dense array"""
+        keys=set()
+        for mol in mols:
+            fp = AllChem.GetMorganFingerprint(mol,self.radius)
+            keys.update(fp.GetNonzeroElements().keys())
+        keys = list(keys)
+        keys.sort()
+        self.keys= np.array(keys)
+        self.dims = len(self.keys)
+        
+    def transform_mol(self, mol, misses=False):
+        """ transforms the mol into a dense array using the fitted keys as index
+        
+            :parameter mol: the RDKit molecule to be transformed
+            :parameter misses: wheter to return the number of key misses for the molecule
+         """
+        assert type(self.keys) is np.ndarray, "keys are not defined or is not an np.array, has the .fit(mols) function been used?"
+        #Get fingerprint as a dictionary
+        fp = AllChem.GetMorganFingerprint(mol,self.radius)
+        fp_d = fp.GetNonzeroElements()
+        
+        #Prepare the array, and set the values
+        #TODO is there a way to vectorize and speed up this?
+        arr = np.zeros((self.dims,))
+        _misses = 0
+        for key, value in fp_d.items():
+            if key in self.keys:
+                arr[self.keys == key] = value
+            else:
+                _misses = _misses + 1
+        
+        if misses:
+            return arr, _misses
+        else:
+            return arr
+    
+    def transform(self, mols, misses=False):
+        """Transforms a list or array of RDKit molecules into a dense array using the key dictionary (see .fit())
+        
+        :parameter mols: list or array of RDKit molecules
+        :parameter misses: Wheter to return the number of key misses for each molecule
+        """
+        arr = np.zeros((len(mols), self.dims))
+        if misses:
+            _misses = np.zeros((len(mols),1))
+            for i, mol in enumerate(mols):
+                arr[i,:], _misses[i] = self.transform_mol(mol, misses=misses)
+            return arr, _misses
+        else:
+            for i, mol in enumerate(mols):
+                arr[i,:] = self.transform_mol(mol, misses=False)
+            return arr
+            
+
+    
\ No newline at end of file