{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"
"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "J_8OxbTXRGUH",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "42e37ffc-3f3d-41a3-c579-de28e29ab26e"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"processor\t: 0\n",
"vendor_id\t: GenuineIntel\n",
"cpu family\t: 6\n",
"model\t\t: 79\n",
"model name\t: Intel(R) Xeon(R) CPU @ 2.20GHz\n",
"stepping\t: 0\n",
"microcode\t: 0x1\n",
"cpu MHz\t\t: 2199.998\n",
"cache size\t: 56320 KB\n",
"physical id\t: 0\n",
"siblings\t: 2\n",
"core id\t\t: 0\n",
"cpu cores\t: 1\n",
"apicid\t\t: 0\n",
"initial apicid\t: 0\n",
"fpu\t\t: yes\n",
"fpu_exception\t: yes\n",
"cpuid level\t: 13\n",
"wp\t\t: yes\n",
"flags\t\t: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single ssbd ibrs ibpb stibp fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm rdseed adx smap xsaveopt arat md_clear arch_capabilities\n",
"bugs\t\t: cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass l1tf mds swapgs taa\n",
"bogomips\t: 4399.99\n",
"clflush size\t: 64\n",
"cache_alignment\t: 64\n",
"address sizes\t: 46 bits physical, 48 bits virtual\n",
"power management:\n",
"\n",
"processor\t: 1\n",
"vendor_id\t: GenuineIntel\n",
"cpu family\t: 6\n",
"model\t\t: 79\n",
"model name\t: Intel(R) Xeon(R) CPU @ 2.20GHz\n",
"stepping\t: 0\n",
"microcode\t: 0x1\n",
"cpu MHz\t\t: 2199.998\n",
"cache size\t: 56320 KB\n",
"physical id\t: 0\n",
"siblings\t: 2\n",
"core id\t\t: 0\n",
"cpu cores\t: 1\n",
"apicid\t\t: 1\n",
"initial apicid\t: 1\n",
"fpu\t\t: yes\n",
"fpu_exception\t: yes\n",
"cpuid level\t: 13\n",
"wp\t\t: yes\n",
"flags\t\t: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single ssbd ibrs ibpb stibp fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm rdseed adx smap xsaveopt arat md_clear arch_capabilities\n",
"bugs\t\t: cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass l1tf mds swapgs taa\n",
"bogomips\t: 4399.99\n",
"clflush size\t: 64\n",
"cache_alignment\t: 64\n",
"address sizes\t: 46 bits physical, 48 bits virtual\n",
"power management:\n",
"\n"
]
}
],
"source": [
"!cat /proc/cpuinfo"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "m9S78vQIV0sZ"
},
"outputs": [],
"source": [
"!sudo apt-get -qq install dvipng texlive-latex-extra texlive-fonts-recommended cm-super"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "dckDYheNzuOs"
},
"outputs": [],
"source": [
"!pip install -qq selfies==2.1.1 pandas rdkit matplotlib SciencePlots"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "ITdwe6ePmvGE"
},
"outputs": [],
"source": [
"%config InlineBackend.figure_formats = ['svg']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "b218cY-3aMVc"
},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd \n",
"import random\n",
"import selfies as sf \n",
"import timeit\n",
"\n",
"from rdkit import Chem\n",
"\n",
"plt.style.use(\"science\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "_Afa-aGYXkDN"
},
"source": [
"# SELFIES Translation"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Kjc0q7cYXmWY"
},
"outputs": [],
"source": [
"def time_roundtrip_translation(smiles):\n",
" selfies = [sf.encoder(s) for s in smiles]\n",
"\n",
" def batch_encode():\n",
" for s in smiles: \n",
" sf.encoder(s)\n",
"\n",
" def batch_decode():\n",
" for s in selfies:\n",
" sf.decoder(s)\n",
"\n",
" n_trials = 3\n",
" encode_time = timeit.timeit(stmt=batch_encode, number=n_trials) / n_trials\n",
" decode_time = timeit.timeit(stmt=batch_decode, number=n_trials) / n_trials \n",
" return encode_time, decode_time"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "c5yGYrHkkeIc"
},
"outputs": [],
"source": [
"def time_individual_roundtrip_translation(smiles):\n",
" sizes = []\n",
" times = []\n",
" for s in smiles:\n",
" n_trials = 3\n",
" time = timeit.timeit(stmt=lambda: sf.decoder(sf.encoder(s)), number=n_trials) / n_trials\n",
" mol = Chem.MolFromSmiles(s)\n",
" if mol is not None:\n",
" sizes.append(mol.GetNumAtoms())\n",
" times.append(time)\n",
" return sizes, times\n",
"\n",
"def plot_translation_sizes_vs_time(sizes, times):\n",
" times = np.array(times) * 1000\n",
" plt.scatter(sizes, times, s=2)\n",
" plt.xlabel(\"Number of Atoms\")\n",
" plt.ylabel(\"Roundtrip Time (ms)\")\n",
" plt.xlim((0, 60))\n",
" plt.tight_layout()\n",
"\n",
" plt.savefig(\"nci_open_compound_translation.pdf\")\n",
" plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "taID525mghWs"
},
"source": [
"## NCI Open Compound"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "wzOubi6Vf63j",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "6167cf54-920a-4574-f3c6-6c5e8eeaba03"
},
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py:2882: DtypeWarning: Columns (23) have mixed types.Specify dtype option on import or set low_memory=False.\n",
" exec(code_obj, self.user_global_ns, self.user_ns)\n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"301607"
]
},
"metadata": {},
"execution_count": 8
}
],
"source": [
"nci_open_compound = pd.read_csv(\"PubChem_compound_text_DTP_NCI.csv\")\n",
"nci_open_compound = nci_open_compound[\"isosmiles\"].tolist()\n",
"\n",
"# csv file saves backslashes as \\\\, so we replace\n",
"nci_open_compound = [s.replace(\"\\\\\\\\\", \"\\\\\") for s in nci_open_compound]\n",
"\n",
"len(nci_open_compound)"
]
},
{
"cell_type": "code",
"source": [
"constraints = sf.get_preset_constraints(name=\"hypervalent\")\n",
"constraints[\"P-1\"] = 6\n",
"sf.set_semantic_constraints(constraints)"
],
"metadata": {
"id": "4eN92ylFuoxf"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "o5FQN2epgjY3",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "93c6232b-25f4-41b7-b18a-da7862db6b88"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Encode time: 136.29431584966673\n",
"Decode time: 116.39895675733351\n",
"Total time: 252.69327260700024\n"
]
}
],
"source": [
"encode_time, decode_time = time_roundtrip_translation(nci_open_compound)\n",
"\n",
"print(\"Encode time:\", encode_time)\n",
"print(\"Decode time:\", decode_time)\n",
"print(\"Total time: \", encode_time + decode_time)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Po-HDF2YkdOI",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 254
},
"outputId": "c0d0cb12-974c-4c83-f07b-76a577814709"
},
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": [
""
],
"image/svg+xml": "\n\n\n