<a href="https://colab.research.google.com/github/francescopatane96/Computer_aided_drug_discovery_kit/blob/main/ML_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Unwanted substructures:
Substructures can be reactive or toxic, or they can interfere with certain assays. Filtering out unwanted substructures helps assemble more efficient screening libraries, which saves time and resources.

Examples of such unwanted features are nitro groups (mutagenic), sulfates and phosphates (likely resulting in unfavorable pharmacokinetic properties), 2-halopyridines and thiols (reactive). 

Pan Assay Interference Compounds (PAINS):
PAINS are compounds that often occur as hits in high-throughput screening (HTS) even though they are actually false positives. PAINS tend to show activity against numerous targets rather than one specific target.

In [None]:
!pip install rdkit

In [None]:
from pathlib import Path

import pandas as pd
from tqdm.auto import tqdm
from rdkit import Chem
from rdkit.Chem import PandasTools
from rdkit.Chem.FilterCatalog import FilterCatalog, FilterCatalogParams

In [None]:
# Load the compound table produced in Talktorial T2 (first CSV column is the index)
TNFB_data = pd.read_csv("TNFB_compounds_lipinski.csv", index_col=0)
print("Dataframe shape:", TNFB_data.shape)

# These descriptor columns are not used in this notebook, so drop them
descriptor_columns = ["molecular_weight", "n_hbd", "n_hba", "logp"]
TNFB_data = TNFB_data.drop(columns=descriptor_columns)
TNFB_data.head()

In [None]:
# Build RDKit molecule objects from the "smiles" column; they are stored
# in-place on the frame (read back below via the `ROMol` column)
PandasTools.AddMoleculeColumnToFrame(TNFB_data, smilesCol="smiles")

# Preview the first three compounds, labelled with their ChEMBL IDs
first_three = TNFB_data.head(3)
Chem.Draw.MolsToGridImage(
    first_three.ROMol.tolist(),
    legends=first_three.molecule_chembl_id.tolist(),
)

Filter for PAINS

In [None]:
# Build a filter catalog that contains only the PAINS definitions
params = FilterCatalogParams(FilterCatalogParams.FilterCatalogs.PAINS)
catalog = FilterCatalog(params)

In [None]:
# Screen every compound against the PAINS catalog.
#   matches -> one record per compound that hits a PAINS filter
#   clean   -> index labels of compounds without any PAINS hit
matches = []
clean = []
for index, row in tqdm(TNFB_data.iterrows(), total=TNFB_data.shape[0]):
    molecule = Chem.MolFromSmiles(row.smiles)
    if molecule is None:
        # MolFromSmiles returns None for SMILES it cannot parse; passing None
        # to the catalog would raise. Such a compound cannot be screened, so
        # it is neither reported as a match nor kept as clean.
        continue
    entry = catalog.GetFirstMatch(molecule)  # first matching PAINS entry, or None
    if entry is None:
        # no PAINS hit: remember the index so the row is kept
        clean.append(index)
        continue
    # store PAINS information
    matches.append(
        {
            "chembl_id": row.molecule_chembl_id,
            "rdkit_molecule": molecule,
            "pains": entry.GetDescription().capitalize(),
        }
    )

# Explicit columns keep the frame usable downstream even when nothing matched
matches = pd.DataFrame(matches, columns=["chembl_id", "rdkit_molecule", "pains"])
TNFB_data = TNFB_data.loc[clean]  # keep molecules without PAINS

In [None]:
# Report how many compounds were flagged and how many remain after filtering
n_pains_hits = len(matches)
n_pains_free = len(TNFB_data)
print(f"Number of compounds with PAINS: {n_pains_hits}")
print(f"Number of compounds without PAINS: {n_pains_free}")

In [None]:
# Draw the first five flagged compounds, labelled with the matched PAINS description
pains_preview = matches.head(5)
Chem.Draw.MolsToGridImage(
    pains_preview.rdkit_molecule.tolist(),
    legends=pains_preview["pains"].tolist(),
)

Filter and highlight unwanted substructures

In [None]:
# Load the whitespace-separated table of unwanted substructures.
# The separator is a raw string: "\s" is not a valid escape sequence in a
# normal string literal and triggers a warning on recent Python versions.
substructures = pd.read_csv("unwantedSubstructures.csv", sep=r"\s+")
# Parse each SMARTS pattern (column "smart") into an RDKit query molecule
substructures["rdkit_molecule"] = substructures.smart.apply(Chem.MolFromSmarts)
print("Number of unwanted substructures in collection:", len(substructures))

In [None]:
# Show three example patterns from the collection (rows at positions 2, 3 and 4)
substructure_preview = substructures.rdkit_molecule.iloc[2:5].tolist()
Chem.Draw.MolsToGridImage(mols=substructure_preview)

In [None]:
# Search every remaining compound for unwanted substructures.
#   matches -> one record per (compound, matched substructure) pair
#   clean   -> index labels of compounds without any unwanted substructure
matches = []
clean = []

# The pattern list does not change between compounds, so build it once instead
# of iterating over the substructure frame for every molecule. Patterns that
# failed to parse (None) are left out because they cannot be matched.
patterns = [
    pattern for pattern in substructures.rdkit_molecule if pattern is not None
]

for index, row in tqdm(TNFB_data.iterrows(), total=TNFB_data.shape[0]):
    molecule = Chem.MolFromSmiles(row.smiles)
    if molecule is None:
        # MolFromSmiles returns None for SMILES it cannot parse; such a
        # compound cannot be screened, so it is neither a match nor clean.
        continue
    has_unwanted = False
    for pattern in patterns:
        if molecule.HasSubstructMatch(pattern):
            matches.append(
                {
                    "chembl_id": row.molecule_chembl_id,
                    "rdkit_molecule": molecule,
                    "substructure": pattern,
                }
            )
            has_unwanted = True
    if not has_unwanted:
        clean.append(index)

# Explicit columns keep the frame usable downstream even when nothing matched
matches = pd.DataFrame(
    matches, columns=["chembl_id", "rdkit_molecule", "substructure"]
)
TNFB_data = TNFB_data.loc[clean]

In [None]:
# Report the number of substructure hits and the number of compounds left
n_substructure_hits = len(matches)
n_remaining_compounds = len(TNFB_data)
print(f"Number of found unwanted substructure: {n_substructure_hits}")
print(f"Number of compounds without unwanted substructure: {n_remaining_compounds}")

In [None]:
# For the first three hits, find the atom indices that match the unwanted
# substructure so they can be highlighted in the drawing
highlight_preview = matches.head(3)
to_highlight = [
    molecule.GetSubstructMatch(pattern)
    for molecule, pattern in zip(
        highlight_preview.rdkit_molecule, highlight_preview.substructure
    )
]
Chem.Draw.MolsToGridImage(
    highlight_preview.rdkit_molecule.tolist(),
    highlightAtomLists=to_highlight,
)

Substructure statistics

In [None]:

# `groups` has to be defined here, otherwise this cell raises NameError on a
# fresh kernel. The matched pattern objects are converted to SMARTS strings
# and the hits are grouped by that string.
# NOTE(review): if the substructure table also has a human-readable name
# column, grouping by that would give friendlier labels; confirm against the CSV.
substructure_smarts = matches["substructure"].map(Chem.MolToSmarts)
groups = matches.groupby(substructure_smarts)

# Count hits per substructure, most frequent first, and show the top ten
group_frequencies = groups.size().sort_values(ascending=False)
group_frequencies.head(10)