--- a +++ b/ipynb/embedding 3d NCBI 2019-04-07.ipynb @@ -0,0 +1,1528 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exploration of 100d space of genome vectors\n", + "\n", + "Genome vectors created by the Dna2VecDataBunch exhibit peculiar patterns. This notebook is dedicated to exploration \n", + "of the bacterial genome space using dimensionality reduction techniques." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append(\"../mylib/\")\n", + "\n", + "from genomic import sequence\n", + "from genomic.sequence import regex_filter, count_filter\n", + "from functools import partial\n", + "import pandas as pd\n", + "import numpy as np\n", + "from sklearn.decomposition import PCA\n", + "from sklearn import manifold,neighbors\n", + "from scipy.cluster.hierarchy import dendrogram, linkage \n", + "from matplotlib import pyplot as plt\n", + "import seaborn as sns; sns.set(color_codes=True)\n", + "import plotly.plotly as py\n", + "import plotly.graph_objs as go" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load Data" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# DB=\"/data/genomes/GenSeq_fastas/train\"\n", + "DB='/home/serge/development/genomes/ncbi-genomes-2019-04-07/bacterial genomes'" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "filters=[partial(regex_filter, rx=\"Escherichia|Klebsiella|Bacillus\"),partial(regex_filter, rx=\"plasmid?\s\", 
keep=False),\n", + " partial(count_filter,num_fastas=(1,1), keep=1)]\n", + "data = sequence.Dna2VecList.from_folder(DB,filters=filters,agg=partial(np.mean, axis=0),n_cpus=7)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1686" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(data.items)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 42min 11s, sys: 8min 41s, total: 50min 52s\n", + "Wall time: 1h 25min 43s\n" + ] + } + ], + "source": [ + "processors = [\n", + " sequence.GSFileProcessor(),\n", + " sequence.GSTokenizeProcessor(tokenizer=sequence.GSTokenizer(ngram=8, skip=0, n_cpus=4)),\n", + " sequence.Dna2VecProcessor()]\n", + "%time for p in processors: p.process(data)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1686" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(data.items)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Genome vectors" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "def log_scale(X):\n", + " x=np.asarray(X);e=1e-6\n", + " return np.log10(x+np.abs(x.min())+e) \n", + "\n", + "\n", + "x=np.asarray(data.items)\n", + "bad_fastas = np.where(np.mean(x,axis=1) == 0.)[0]\n", + "X = np.delete(x, bad_fastas,0)\n", + "labelList=[\" \".join(i.split()[1:3]) for i in data.descriptions]\n", + "labelList=np.delete(np.asarray(labelList), bad_fastas)\n", + "vocab=list(np.unique(labelList))\n", + "y=[vocab.index(x) for x in labelList]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Correlation Distance in 
log-scaled space" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### tSNE" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 31.1 s, sys: 313 ms, total: 31.4 s\n", + "Wall time: 30.9 s\n" + ] + } + ], + "source": [ + "tsne = manifold.TSNE(n_components=3, init='pca', perplexity=10, metric=\"correlation\",random_state=0)\n", + "%time X3 = tsne.fit_transform(log_scale(X))\n", + "\n", + "genus = [i.split()[0] for i in labelList]\n", + "genus_vocab=list(np.unique(genus))\n", + "y=[genus_vocab.index(x) for x in genus]\n", + "genus_vocab\n", + "\n", + "X3_df = pd.DataFrame(data=X3, columns=[\"pc1\",'pc2','pc3'], index=labelList)\n", + "\n", + "X3_df[\"genus\"]=genus\n", + "X3_df[\"y\"]=y\n", + "\n", + "genus_df=X3_df.groupby(\"genus\").agg({\"pc1\": list, \"pc2\":list,\"pc3\":list,\"y\":np.mean})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Correlation Distance visualisation" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/serge/anaconda3/envs/bio/lib/python3.6/site-packages/IPython/core/display.py:689: UserWarning:\n", + "\n", + "Consider using IPython.display.IFrame instead\n", + "\n" + ] + }, + { + "data": { + "text/html": [ + "<iframe id=\"igraph\" scrolling=\"no\" style=\"border:none;\" seamless=\"seamless\" src=\"https://plot.ly/~sergeman/12.embed\" height=\"1000px\" width=\"1000px\"></iframe>" + ], + "text/plain": [ + "<plotly.tools.PlotlyDisplay object>" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data=[]\n", + "for g in genus_df.index:\n", + " trace = go.Scatter3d(\n", + " name = str(g),\n", + " x=genus_df.loc[g,\"pc1\"],\n", + " y=genus_df.loc[g,\"pc2\"],\n", + " 
z=genus_df.loc[g,\"pc3\"],\n", + " mode='markers',\n", + " marker=dict(\n", + " size=8,\n", + " color=genus_df.loc[g,\"y\"], # set color to an array/list of desired values\n", + " colorscale='Jet', # choose a colorscale\n", + " opacity=0.5)\n", + " )\n", + "\n", + " data.append(trace)\n", + " \n", + "\n", + "layout = go.Layout(\n", + " width=1000,\n", + " height=1000,\n", + " margin=dict(\n", + " l=0,\n", + " r=0,\n", + " b=0,\n", + " t=0\n", + " )\n", + ")\n", + "fig = go.Figure(data=data, layout=layout)\n", + "py.iplot(fig, filename='correlation distance ncbi-genomes-2019-04-07 Escherichia,Klebsiella,Bacillus')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Euclidean Distance in log-scaled space" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### tSNE" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 25.8 s, sys: 334 ms, total: 26.1 s\n", + "Wall time: 25.5 s\n" + ] + } + ], + "source": [ + "tsne = manifold.TSNE(n_components=3, init='pca', perplexity=30,random_state=0)\n", + "%time X3 = tsne.fit_transform(log_scale(X))" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "genus = [i.split()[0] for i in labelList]\n", + "genus_vocab=list(np.unique(genus))\n", + "y=[genus_vocab.index(x) for x in genus]\n", + "genus_vocab\n", + "\n", + "X3_df = pd.DataFrame(data=X3, columns=[\"pc1\",'pc2','pc3'], index=labelList)\n", + "\n", + "X3_df[\"genus\"]=genus\n", + "X3_df[\"y\"]=y\n", + "\n", + "genus_df=X3_df.groupby(\"genus\").agg({\"pc1\": list, \"pc2\":list,\"pc3\":list,\"y\":np.mean})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Euclidean Distance Visualisation" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + 
"output_type": "stream", + "text": [ + "/home/serge/anaconda3/envs/bio/lib/python3.6/site-packages/IPython/core/display.py:689: UserWarning:\n", + "\n", + "Consider using IPython.display.IFrame instead\n", + "\n" + ] + }, + { + "data": { + "text/html": [ + "<iframe id=\"igraph\" scrolling=\"no\" style=\"border:none;\" seamless=\"seamless\" src=\"https://plot.ly/~sergeman/14.embed\" height=\"1000px\" width=\"1000px\"></iframe>" + ], + "text/plain": [ + "<plotly.tools.PlotlyDisplay object>" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data=[]\n", + "for g in genus_df.index:\n", + " trace = go.Scatter3d(\n", + " name = str(g),\n", + " x=genus_df.loc[g,\"pc1\"],\n", + " y=genus_df.loc[g,\"pc2\"],\n", + " z=genus_df.loc[g,\"pc3\"],\n", + " mode='markers',\n", + " marker=dict(\n", + " size=8,\n", + " color=genus_df.loc[g,\"y\"]+1, # set color to an array/list of desired values\n", + " colorscale='YlGnBu', # choose a colorscale\n", + " opacity=0.5)\n", + " )\n", + "\n", + " data.append(trace)\n", + " \n", + "\n", + "layout = go.Layout(\n", + " width=1000,\n", + " height=1000,\n", + " margin=dict(\n", + " l=0,\n", + " r=0,\n", + " b=0,\n", + " t=0\n", + " )\n", + ")\n", + "fig = go.Figure(data=data, layout=layout)\n", + "py.iplot(fig, filename='eucledian distance metric by genus Escherichia|Klebsiella|Bacillus')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Euclidean Distance in unmodified space" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### tSNE" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 41.5 s, sys: 313 ms, total: 41.8 s\n", + "Wall time: 41.2 s\n" + ] + } + ], + "source": [ + "tsne = manifold.TSNE(n_components=3, init='pca', perplexity=30,random_state=0)\n", + "%time X3 = tsne.fit_transform(X)" + ] + }, + { 
+ "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "genus = [i.split()[0] for i in labelList]\n", + "genus_vocab=list(np.unique(genus))\n", + "y=[genus_vocab.index(x) for x in genus]\n", + "genus_vocab\n", + "\n", + "X3_df = pd.DataFrame(data=X3, columns=[\"pc1\",'pc2','pc3'], index=labelList)\n", + "\n", + "X3_df[\"genus\"]=genus\n", + "X3_df[\"y\"]=y\n", + "\n", + "genus_df=X3_df.groupby(\"genus\").agg({\"pc1\": list, \"pc2\":list,\"pc3\":list,\"y\":np.mean})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Euclidean Distance Visualisation" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/serge/anaconda3/envs/bio/lib/python3.6/site-packages/IPython/core/display.py:689: UserWarning:\n", + "\n", + "Consider using IPython.display.IFrame instead\n", + "\n" + ] + }, + { + "data": { + "text/html": [ + "<iframe id=\"igraph\" scrolling=\"no\" style=\"border:none;\" seamless=\"seamless\" src=\"https://plot.ly/~sergeman/16.embed\" height=\"1000px\" width=\"1000px\"></iframe>" + ], + "text/plain": [ + "<plotly.tools.PlotlyDisplay object>" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data=[]\n", + "for g in genus_df.index:\n", + " trace = go.Scatter3d(\n", + " name = str(g),\n", + " x=genus_df.loc[g,\"pc1\"],\n", + " y=genus_df.loc[g,\"pc2\"],\n", + " z=genus_df.loc[g,\"pc3\"],\n", + " mode='markers',\n", + " marker=dict(\n", + " size=8,\n", + " color=genus_df.loc[g,\"y\"], # set color to an array/list of desired values\n", + " colorscale='Jet', # choose a colorscale\n", + " opacity=0.5)\n", + " )\n", + "\n", + " data.append(trace)\n", + " \n", + "\n", + "layout = go.Layout(\n", + " width=1000,\n", + " height=1000,\n", + " margin=dict(\n", + " l=0,\n", + " r=0,\n", + " b=0,\n", + " t=0\n", + " )\n", + ")\n", + "fig 
= go.Figure(data=data, layout=layout)\n", + "py.iplot(fig, filename='eucledian distance in native space Escherichia|Klebsiella|Bacillus')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Genome Inventory" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "inventory = pd.DataFrame(data=[l.split()[1:3] for l in all_fastas], columns=[\"genus\",\"species\" ])" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>species</th>\n", + " </tr>\n", + " <tr>\n", + " <th>genus</th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>Escherichia</th>\n", + " <td>2239</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Klebsiella</th>\n", + " <td>1718</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Salmonella</th>\n", + " <td>1183</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Bacillus</th>\n", + " <td>1172</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Lactobacillus</th>\n", + " <td>953</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Staphylococcus</th>\n", + " <td>889</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Burkholderia</th>\n", + " <td>650</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Enterococcus</th>\n", + " <td>626</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Pseudomonas</th>\n", + " <td>613</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Streptococcus</th>\n", + " <td>564</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Acinetobacter</th>\n", + " 
<td>531</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Bordetella</th>\n", + " <td>504</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Vibrio</th>\n", + " <td>474</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Xanthomonas</th>\n", + " <td>395</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Mycobacterium</th>\n", + " <td>368</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Borrelia</th>\n", + " <td>347</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Campylobacter</th>\n", + " <td>339</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Rhizobium</th>\n", + " <td>307</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Enterobacter</th>\n", + " <td>298</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Mycoplasma</th>\n", + " <td>290</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Corynebacterium</th>\n", + " <td>258</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Phaeobacter</th>\n", + " <td>256</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Yersinia</th>\n", + " <td>243</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Brucella</th>\n", + " <td>233</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Clostridium</th>\n", + " <td>228</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Listeria</th>\n", + " <td>227</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Streptomyces</th>\n", + " <td>225</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Chlamydia</th>\n", + " <td>202</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Helicobacter</th>\n", + " <td>187</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Candidatus</th>\n", + " <td>187</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>'Deinococcus</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Mariniflexile</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Marinithermus</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Lactobacillales</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Marinobacterium</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Megamonas</th>\n", + " <td>1</td>\n", + 
" </tr>\n", + " <tr>\n", + " <th>Melioribacter</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Melittangium</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Methylobacillus</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Methylocaldum</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Magnetococcus</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Mageeibacillus</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Lysinimonas</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Luteitalea</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Lacunisphaera</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Lautropia</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Leadbetterella</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Leminorella</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Lentibacillus</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Lentzea</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Leptothrix</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Levyella</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Limnobaculum</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Limnochorda</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Litorilituus</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Lonsdalea</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Luteibacter</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Luteipulveratus</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>secondary</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>plasmid1</th>\n", + " <td>0</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>1120 rows × 1 columns</p>\n", + "</div>" + ], + "text/plain": [ + " species\n", + "genus \n", + "Escherichia 2239\n", + "Klebsiella 
1718\n", + "Salmonella 1183\n", + "Bacillus 1172\n", + "Lactobacillus 953\n", + "Staphylococcus 889\n", + "Burkholderia 650\n", + "Enterococcus 626\n", + "Pseudomonas 613\n", + "Streptococcus 564\n", + "Acinetobacter 531\n", + "Bordetella 504\n", + "Vibrio 474\n", + "Xanthomonas 395\n", + "Mycobacterium 368\n", + "Borrelia 347\n", + "Campylobacter 339\n", + "Rhizobium 307\n", + "Enterobacter 298\n", + "Mycoplasma 290\n", + "Corynebacterium 258\n", + "Phaeobacter 256\n", + "Yersinia 243\n", + "Brucella 233\n", + "Clostridium 228\n", + "Listeria 227\n", + "Streptomyces 225\n", + "Chlamydia 202\n", + "Helicobacter 187\n", + "Candidatus 187\n", + "... ...\n", + "'Deinococcus 1\n", + "Mariniflexile 1\n", + "Marinithermus 1\n", + "Lactobacillales 1\n", + "Marinobacterium 1\n", + "Megamonas 1\n", + "Melioribacter 1\n", + "Melittangium 1\n", + "Methylobacillus 1\n", + "Methylocaldum 1\n", + "Magnetococcus 1\n", + "Mageeibacillus 1\n", + "Lysinimonas 1\n", + "Luteitalea 1\n", + "Lacunisphaera 1\n", + "Lautropia 1\n", + "Leadbetterella 1\n", + "Leminorella 1\n", + "Lentibacillus 1\n", + "Lentzea 1\n", + "Leptothrix 1\n", + "Levyella 1\n", + "Limnobaculum 1\n", + "Limnochorda 1\n", + "Litorilituus 1\n", + "Lonsdalea 1\n", + "Luteibacter 1\n", + "Luteipulveratus 1\n", + "secondary 1\n", + "plasmid1 0\n", + "\n", + "[1120 rows x 1 columns]" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inventory.groupby(\"genus\").agg({\"species\":\"count\"}).sort_values(\"species\",ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + 
"</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th></th>\n", + " <th>count</th>\n", + " </tr>\n", + " <tr>\n", + " <th>genus</th>\n", + " <th>species</th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>'Catharanthus</th>\n", + " <th>roseus'</th>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>'Deinococcus</th>\n", + " <th>soli'</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>'Nostoc</th>\n", + " <th>azollae'</th>\n", + " <td>3</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18,711,729</th>\n", + " <th>reads</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Acaryochloris</th>\n", + " <th>marina</th>\n", + " <td>10</td>\n", + " </tr>\n", + " <tr>\n", + " <th rowspan=\"10\" valign=\"top\">Acetobacter</th>\n", + " <th>aceti</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>ascendens</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>orientalis</th>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>oryzifermentans</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>pasteurianus</th>\n", + " <td>91</td>\n", + " </tr>\n", + " <tr>\n", + " <th>persici</th>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>pomorum</th>\n", + " <td>7</td>\n", + " </tr>\n", + " <tr>\n", + " <th>senegalensis</th>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>sp.</th>\n", + " <td>7</td>\n", + " </tr>\n", + " <tr>\n", + " <th>tropicalis</th>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Acetobacterium</th>\n", + " <th>woodii</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Acetohalobium</th>\n", + " <th>arabaticum</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Acetomicrobium</th>\n", + " <th>mobile</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th rowspan=\"4\" valign=\"top\">Acholeplasma</th>\n", + " 
<th>axanthum</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>hippikon</th>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>laidlawii</th>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>oculi</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th rowspan=\"5\" valign=\"top\">Achromobacter</th>\n", + " <th>denitrificans</th>\n", + " <td>3</td>\n", + " </tr>\n", + " <tr>\n", + " <th>insolitus</th>\n", + " <td>4</td>\n", + " </tr>\n", + " <tr>\n", + " <th>sp.</th>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>spanius</th>\n", + " <td>4</td>\n", + " </tr>\n", + " <tr>\n", + " <th>xylosoxidans</th>\n", + " <td>11</td>\n", + " </tr>\n", + " <tr>\n", + " <th rowspan=\"2\" valign=\"top\">Acidaminococcus</th>\n", + " <th>fermentans</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>intestini</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Acidiferrobacter</th>\n", + " <th>sp.</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Zymobacter</th>\n", + " <th>palmae</th>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Zymomonas</th>\n", + " <th>mobilis</th>\n", + " <td>49</td>\n", + " </tr>\n", + " <tr>\n", + " <th>[Arcobacter]</th>\n", + " <th>porcinus</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>[Bacillus</th>\n", + " <th>thuringiensis]</th>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <th rowspan=\"2\" valign=\"top\">[Bacillus]</th>\n", + " <th>caldolyticus</th>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>selenitireducens</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th rowspan=\"2\" valign=\"top\">[Brevibacterium]</th>\n", + " <th>flavum</th>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>frigoritolerans</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th rowspan=\"8\" valign=\"top\">[Clostridium]</th>\n", + " 
<th>bolteae</th>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>cellulolyticum</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>cellulosi</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>clariflavum</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>propionicum</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>saccharolyticum</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>scindens</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>stercorarium</th>\n", + " <td>4</td>\n", + " </tr>\n", + " <tr>\n", + " <th>[Enterobacter]</th>\n", + " <th>lignolyticus</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th rowspan=\"3\" valign=\"top\">[Eubacterium]</th>\n", + " <th>eligens</th>\n", + " <td>3</td>\n", + " </tr>\n", + " <tr>\n", + " <th>hallii</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>rectale</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th rowspan=\"2\" valign=\"top\">[Haemophilus]</th>\n", + " <th>ducreyi</th>\n", + " <td>14</td>\n", + " </tr>\n", + " <tr>\n", + " <th>parasuis</th>\n", + " <td>4</td>\n", + " </tr>\n", + " <tr>\n", + " <th>[Mycobacterium]</th>\n", + " <th>chelonae</th>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>[Pasteurella]</th>\n", + " <th>aerogenes</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>[Polyangium]</th>\n", + " <th>brachysporum</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>[Pseudomonas</th>\n", + " <th>syringae]</th>\n", + " <td>3</td>\n", + " </tr>\n", + " <tr>\n", + " <th>[Pseudomonas]</th>\n", + " <th>mesoacidophila</th>\n", + " <td>4</td>\n", + " </tr>\n", + " <tr>\n", + " <th>complete</th>\n", + " <th>chromosome</th>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>gamma</th>\n", + " <th>proteobacterium</th>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>secondary</th>\n", + " <th>endosymbiont</th>\n", + " <td>1</td>\n", + " </tr>\n", + 
" </tbody>\n", + "</table>\n", + "<p>3186 rows × 1 columns</p>\n", + "</div>" + ], + "text/plain": [ + " count\n", + "genus species \n", + "'Catharanthus roseus' 2\n", + "'Deinococcus soli' 1\n", + "'Nostoc azollae' 3\n", + "18,711,729 reads 1\n", + "Acaryochloris marina 10\n", + "Acetobacter aceti 1\n", + " ascendens 1\n", + " orientalis 2\n", + " oryzifermentans 1\n", + " pasteurianus 91\n", + " persici 2\n", + " pomorum 7\n", + " senegalensis 2\n", + " sp. 7\n", + " tropicalis 2\n", + "Acetobacterium woodii 1\n", + "Acetohalobium arabaticum 1\n", + "Acetomicrobium mobile 1\n", + "Acholeplasma axanthum 1\n", + " hippikon 2\n", + " laidlawii 2\n", + " oculi 1\n", + "Achromobacter denitrificans 3\n", + " insolitus 4\n", + " sp. 2\n", + " spanius 4\n", + " xylosoxidans 11\n", + "Acidaminococcus fermentans 1\n", + " intestini 1\n", + "Acidiferrobacter sp. 1\n", + "... ...\n", + "Zymobacter palmae 2\n", + "Zymomonas mobilis 49\n", + "[Arcobacter] porcinus 1\n", + "[Bacillus thuringiensis] 2\n", + "[Bacillus] caldolyticus 2\n", + " selenitireducens 1\n", + "[Brevibacterium] flavum 2\n", + " frigoritolerans 1\n", + "[Clostridium] bolteae 2\n", + " cellulolyticum 1\n", + " cellulosi 1\n", + " clariflavum 1\n", + " propionicum 1\n", + " saccharolyticum 1\n", + " scindens 1\n", + " stercorarium 4\n", + "[Enterobacter] lignolyticus 1\n", + "[Eubacterium] eligens 3\n", + " hallii 1\n", + " rectale 1\n", + "[Haemophilus] ducreyi 14\n", + " parasuis 4\n", + "[Mycobacterium] chelonae 2\n", + "[Pasteurella] aerogenes 1\n", + "[Polyangium] brachysporum 1\n", + "[Pseudomonas syringae] 3\n", + "[Pseudomonas] mesoacidophila 4\n", + "complete chromosome 2\n", + "gamma proteobacterium 1\n", + "secondary endosymbiont 1\n", + "\n", + "[3186 rows x 1 columns]" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inventory.groupby([\"genus\", \"species\"]).agg({\"species\": \"count\"})\n", + "inventory.columns=[\"count\"]\n", + 
"inventory" + ] + }, + { + "cell_type": "code", + "execution_count": 117, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>n_sequences</th>\n", + " <th>species</th>\n", + " </tr>\n", + " <tr>\n", + " <th>genus</th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>Bacillus</th>\n", + " <td>1132</td>\n", + " <td>11</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Streptomyces</th>\n", + " <td>743</td>\n", + " <td>5</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Vibrio</th>\n", + " <td>468</td>\n", + " <td>5</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Rhizobium</th>\n", + " <td>325</td>\n", + " <td>6</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Pseudomonas</th>\n", + " <td>304</td>\n", + " <td>8</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Staphylococcus</th>\n", + " <td>301</td>\n", + " <td>6</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Clostridium</th>\n", + " <td>259</td>\n", + " <td>5</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Streptococcus</th>\n", + " <td>222</td>\n", + " <td>6</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Planktothrix</th>\n", + " <td>179</td>\n", + " <td>5</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Stenotrophomonas</th>\n", + " <td>176</td>\n", + " <td>5</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Escherichia</th>\n", + " <td>133</td>\n", + " <td>3</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Paenibacillus</th>\n", + " <td>127</td>\n", + " <td>3</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Listeria</th>\n", + " <td>104</td>\n", + " <td>4</td>\n", + " </tr>\n", + " 
<tr>\n", + " <th>Corynebacterium</th>\n", + " <td>103</td>\n", + " <td>7</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Klebsiella</th>\n", + " <td>16</td>\n", + " <td>3</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Shigella</th>\n", + " <td>14</td>\n", + " <td>3</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Salmonella</th>\n", + " <td>4</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Enterobacter</th>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " n_sequences species\n", + "genus \n", + "Bacillus 1132 11\n", + "Streptomyces 743 5\n", + "Vibrio 468 5\n", + "Rhizobium 325 6\n", + "Pseudomonas 304 8\n", + "Staphylococcus 301 6\n", + "Clostridium 259 5\n", + "Streptococcus 222 6\n", + "Planktothrix 179 5\n", + "Stenotrophomonas 176 5\n", + "Escherichia 133 3\n", + "Paenibacillus 127 3\n", + "Listeria 104 4\n", + "Corynebacterium 103 7\n", + "Klebsiella 16 3\n", + "Shigella 14 3\n", + "Salmonella 4 2\n", + "Enterobacter 1 1" + ] + }, + "execution_count": 117, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "counts = inventory.reset_index().groupby(\"genus\").agg({\"count\", sum}).drop((\"species\"), axis=1)\n", + "counts.columns=[\"n_sequences\",\"species\"]\n", + "counts.sort_values(\"n_sequences\", ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + }, + "varInspector": { + "cols": { + "lenName": 16, + "lenType": 16, + "lenVar": 40 + }, + "kernels_config": { + "python": { + "delete_cmd_postfix": "", + 
"delete_cmd_prefix": "del ", + "library": "var_list.py", + "varRefreshCmd": "print(var_dic_list())" + }, + "r": { + "delete_cmd_postfix": ") ", + "delete_cmd_prefix": "rm(", + "library": "var_list.r", + "varRefreshCmd": "cat(var_dic_list()) " + } + }, + "types_to_exclude": [ + "module", + "function", + "builtin_function_or_method", + "instance", + "_Feature" + ], + "window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}