1529 lines (1528 with data), 43.4 kB
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Exploration of 100d space of genome vectors\n",
"\n",
"Genome vectors created by the Dna2VecDataBunch exhibit piculiar patterns. This notebook is dedicated to exploratoin \n",
"of the bacterial genome space using dimensionality reduction techniques"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The autoreload extension is already loaded. To reload it, use:\n",
" %reload_ext autoreload\n"
]
}
],
"source": [
"%load_ext autoreload\n",
"%autoreload 2"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"sys.path.append(\"../mylib/\")\n",
"\n",
"from genomic import sequence\n",
"from genomic.sequence import regex_filter, count_filter\n",
"from functools import partial\n",
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn.decomposition import PCA\n",
"from sklearn import manifold,neighbors\n",
"from scipy.cluster.hierarchy import dendrogram, linkage \n",
"from matplotlib import pyplot as plt\n",
"import seaborn as sns; sns.set(color_codes=True)\n",
"import plotly.plotly as py\n",
"import plotly.graph_objs as go"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Load Data"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# DB=\"/data/genomes/GenSeq_fastas/train\"\n",
"DB='/home/serge/development/genomes/ncbi-genomes-2019-04-07/bacterial genomes'"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"filters=[partial(regex_filter, rx=\"Escherichia|Klebsiella|Bacillus\"),partial(regex_filter, rx=\"plasmid?\\s\", keep=False),\n",
" partial(count_filter,num_fastas=(1,1), keep=1)]\n",
"data = sequence.Dna2VecList.from_folder(DB,filters=filters,agg=partial(np.mean, axis=0),n_cpus=7)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1686"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(data.items)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 42min 11s, sys: 8min 41s, total: 50min 52s\n",
"Wall time: 1h 25min 43s\n"
]
}
],
"source": [
"processors = [\n",
" sequence.GSFileProcessor(),\n",
" sequence.GSTokenizeProcessor(tokenizer=sequence.GSTokenizer(ngram=8, skip=0, n_cpus=4)),\n",
" sequence.Dna2VecProcessor()]\n",
"%time for p in processors: p.process(data)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1686"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(data.items)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Genome vectors"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"def log_scale(X):\n",
" x=np.asarray(X);e=1e-6\n",
" return np.log10(x+np.abs(x.min())+e) \n",
"\n",
"\n",
"x=np.asarray(data.items)\n",
"bad_fastas = np.where(np.mean(x,axis=1) == 0.)[0]\n",
"X = np.delete(x, bad_fastas,0)\n",
"labelList=[\" \".join(i.split()[1:3]) for i in data.descriptions]\n",
"labelList=np.delete(np.asarray(labelList), bad_fastas)\n",
"vocab=list(np.unique(labelList))\n",
"y=[vocab.index(x) for x in labelList]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Correlation Distance in log-scaled space"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### tSNE"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 31.1 s, sys: 313 ms, total: 31.4 s\n",
"Wall time: 30.9 s\n"
]
}
],
"source": [
"tsne = manifold.TSNE(n_components=3, init='pca', perplexity=10, metric=\"correlation\",random_state=0)\n",
"%time X3 = tsne.fit_transform(log_scale(X))\n",
"\n",
"genus = [i.split()[0] for i in labelList]\n",
"genus_vocab=list(np.unique(genus))\n",
"y=[genus_vocab.index(x) for x in genus]\n",
"genus_vocab\n",
"\n",
"X3_df = pd.DataFrame(data=X3, columns=[\"pc1\",'pc2','pc3'], index=labelList)\n",
"\n",
"X3_df[\"genus\"]=genus\n",
"X3_df[\"y\"]=y\n",
"\n",
"genus_df=X3_df.groupby(\"genus\").agg({\"pc1\": list, \"pc2\":list,\"pc3\":list,\"y\":np.mean})"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Correlation Distance visualisation"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/serge/anaconda3/envs/bio/lib/python3.6/site-packages/IPython/core/display.py:689: UserWarning:\n",
"\n",
"Consider using IPython.display.IFrame instead\n",
"\n"
]
},
{
"data": {
"text/html": [
"<iframe id=\"igraph\" scrolling=\"no\" style=\"border:none;\" seamless=\"seamless\" src=\"https://plot.ly/~sergeman/12.embed\" height=\"1000px\" width=\"1000px\"></iframe>"
],
"text/plain": [
"<plotly.tools.PlotlyDisplay object>"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data=[]\n",
"for g in genus_df.index:\n",
" trace = go.Scatter3d(\n",
" name = str(g),\n",
" x=genus_df.loc[g,\"pc1\"],\n",
" y=genus_df.loc[g,\"pc2\"],\n",
" z=genus_df.loc[g,\"pc3\"],\n",
" mode='markers',\n",
" marker=dict(\n",
" size=8,\n",
" color=genus_df.loc[g,\"y\"], # set color to an array/list of desired values\n",
" colorscale='Jet', # choose a colorscale\n",
" opacity=0.5)\n",
" )\n",
"\n",
" data.append(trace)\n",
" \n",
"\n",
"layout = go.Layout(\n",
" width=1000,\n",
" height=1000,\n",
" margin=dict(\n",
" l=0,\n",
" r=0,\n",
" b=0,\n",
" t=0\n",
" )\n",
")\n",
"fig = go.Figure(data=data, layout=layout)\n",
"py.iplot(fig, filename='correlation distance ncbi-genomes-2019-04-07 Escherichia,Klebsiella,Bacillus')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Eucleadian Distance in log-scaled space"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### tSNE"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 25.8 s, sys: 334 ms, total: 26.1 s\n",
"Wall time: 25.5 s\n"
]
}
],
"source": [
"tsne = manifold.TSNE(n_components=3, init='pca', perplexity=30,random_state=0)\n",
"%time X3 = tsne.fit_transform(log_scale(X))"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"genus = [i.split()[0] for i in labelList]\n",
"genus_vocab=list(np.unique(genus))\n",
"y=[genus_vocab.index(x) for x in genus]\n",
"genus_vocab\n",
"\n",
"X3_df = pd.DataFrame(data=X3, columns=[\"pc1\",'pc2','pc3'], index=labelList)\n",
"\n",
"X3_df[\"genus\"]=genus\n",
"X3_df[\"y\"]=y\n",
"\n",
"genus_df=X3_df.groupby(\"genus\").agg({\"pc1\": list, \"pc2\":list,\"pc3\":list,\"y\":np.mean})"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Eucleadian Distance Visualisation"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/serge/anaconda3/envs/bio/lib/python3.6/site-packages/IPython/core/display.py:689: UserWarning:\n",
"\n",
"Consider using IPython.display.IFrame instead\n",
"\n"
]
},
{
"data": {
"text/html": [
"<iframe id=\"igraph\" scrolling=\"no\" style=\"border:none;\" seamless=\"seamless\" src=\"https://plot.ly/~sergeman/14.embed\" height=\"1000px\" width=\"1000px\"></iframe>"
],
"text/plain": [
"<plotly.tools.PlotlyDisplay object>"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data=[]\n",
"for g in genus_df.index:\n",
" trace = go.Scatter3d(\n",
" name = str(g),\n",
" x=genus_df.loc[g,\"pc1\"],\n",
" y=genus_df.loc[g,\"pc2\"],\n",
" z=genus_df.loc[g,\"pc3\"],\n",
" mode='markers',\n",
" marker=dict(\n",
" size=8,\n",
" color=genus_df.loc[g,\"y\"]+1, # set color to an array/list of desired values\n",
" colorscale='YlGnBu', # choose a colorscale\n",
" opacity=0.5)\n",
" )\n",
"\n",
" data.append(trace)\n",
" \n",
"\n",
"layout = go.Layout(\n",
" width=1000,\n",
" height=1000,\n",
" margin=dict(\n",
" l=0,\n",
" r=0,\n",
" b=0,\n",
" t=0\n",
" )\n",
")\n",
"fig = go.Figure(data=data, layout=layout)\n",
"py.iplot(fig, filename='eucledian distance metric by genus Escherichia|Klebsiella|Bacillus')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Eucleadian Distance in unmodified space"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### tSNE"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 41.5 s, sys: 313 ms, total: 41.8 s\n",
"Wall time: 41.2 s\n"
]
}
],
"source": [
"tsne = manifold.TSNE(n_components=3, init='pca', perplexity=30,random_state=0)\n",
"%time X3 = tsne.fit_transform(X)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"genus = [i.split()[0] for i in labelList]\n",
"genus_vocab=list(np.unique(genus))\n",
"y=[genus_vocab.index(x) for x in genus]\n",
"genus_vocab\n",
"\n",
"X3_df = pd.DataFrame(data=X3, columns=[\"pc1\",'pc2','pc3'], index=labelList)\n",
"\n",
"X3_df[\"genus\"]=genus\n",
"X3_df[\"y\"]=y\n",
"\n",
"genus_df=X3_df.groupby(\"genus\").agg({\"pc1\": list, \"pc2\":list,\"pc3\":list,\"y\":np.mean})"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Eucleadian Distance Visualisation"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/serge/anaconda3/envs/bio/lib/python3.6/site-packages/IPython/core/display.py:689: UserWarning:\n",
"\n",
"Consider using IPython.display.IFrame instead\n",
"\n"
]
},
{
"data": {
"text/html": [
"<iframe id=\"igraph\" scrolling=\"no\" style=\"border:none;\" seamless=\"seamless\" src=\"https://plot.ly/~sergeman/16.embed\" height=\"1000px\" width=\"1000px\"></iframe>"
],
"text/plain": [
"<plotly.tools.PlotlyDisplay object>"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data=[]\n",
"for g in genus_df.index:\n",
" trace = go.Scatter3d(\n",
" name = str(g),\n",
" x=genus_df.loc[g,\"pc1\"],\n",
" y=genus_df.loc[g,\"pc2\"],\n",
" z=genus_df.loc[g,\"pc3\"],\n",
" mode='markers',\n",
" marker=dict(\n",
" size=8,\n",
" color=genus_df.loc[g,\"y\"], # set color to an array/list of desired values\n",
" colorscale='Jet', # choose a colorscale\n",
" opacity=0.5)\n",
" )\n",
"\n",
" data.append(trace)\n",
" \n",
"\n",
"layout = go.Layout(\n",
" width=1000,\n",
" height=1000,\n",
" margin=dict(\n",
" l=0,\n",
" r=0,\n",
" b=0,\n",
" t=0\n",
" )\n",
")\n",
"fig = go.Figure(data=data, layout=layout)\n",
"py.iplot(fig, filename='eucledian distance in native space Escherichia|Klebsiella|Bacillus')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Genome Inventory"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"inventory = pd.DataFrame(data=[l.split()[1:3] for l in all_fastas], columns=[\"genus\",\"species\" ])"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>species</th>\n",
" </tr>\n",
" <tr>\n",
" <th>genus</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Escherichia</th>\n",
" <td>2239</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Klebsiella</th>\n",
" <td>1718</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Salmonella</th>\n",
" <td>1183</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Bacillus</th>\n",
" <td>1172</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Lactobacillus</th>\n",
" <td>953</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Staphylococcus</th>\n",
" <td>889</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Burkholderia</th>\n",
" <td>650</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Enterococcus</th>\n",
" <td>626</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Pseudomonas</th>\n",
" <td>613</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Streptococcus</th>\n",
" <td>564</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Acinetobacter</th>\n",
" <td>531</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Bordetella</th>\n",
" <td>504</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Vibrio</th>\n",
" <td>474</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Xanthomonas</th>\n",
" <td>395</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Mycobacterium</th>\n",
" <td>368</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Borrelia</th>\n",
" <td>347</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Campylobacter</th>\n",
" <td>339</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Rhizobium</th>\n",
" <td>307</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Enterobacter</th>\n",
" <td>298</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Mycoplasma</th>\n",
" <td>290</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Corynebacterium</th>\n",
" <td>258</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Phaeobacter</th>\n",
" <td>256</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Yersinia</th>\n",
" <td>243</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Brucella</th>\n",
" <td>233</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Clostridium</th>\n",
" <td>228</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Listeria</th>\n",
" <td>227</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Streptomyces</th>\n",
" <td>225</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Chlamydia</th>\n",
" <td>202</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Helicobacter</th>\n",
" <td>187</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Candidatus</th>\n",
" <td>187</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>'Deinococcus</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Mariniflexile</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Marinithermus</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Lactobacillales</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Marinobacterium</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Megamonas</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Melioribacter</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Melittangium</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Methylobacillus</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Methylocaldum</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Magnetococcus</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Mageeibacillus</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Lysinimonas</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Luteitalea</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Lacunisphaera</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Lautropia</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Leadbetterella</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Leminorella</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Lentibacillus</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Lentzea</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Leptothrix</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Levyella</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Limnobaculum</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Limnochorda</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Litorilituus</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Lonsdalea</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Luteibacter</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Luteipulveratus</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>secondary</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>plasmid1</th>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1120 rows × 1 columns</p>\n",
"</div>"
],
"text/plain": [
" species\n",
"genus \n",
"Escherichia 2239\n",
"Klebsiella 1718\n",
"Salmonella 1183\n",
"Bacillus 1172\n",
"Lactobacillus 953\n",
"Staphylococcus 889\n",
"Burkholderia 650\n",
"Enterococcus 626\n",
"Pseudomonas 613\n",
"Streptococcus 564\n",
"Acinetobacter 531\n",
"Bordetella 504\n",
"Vibrio 474\n",
"Xanthomonas 395\n",
"Mycobacterium 368\n",
"Borrelia 347\n",
"Campylobacter 339\n",
"Rhizobium 307\n",
"Enterobacter 298\n",
"Mycoplasma 290\n",
"Corynebacterium 258\n",
"Phaeobacter 256\n",
"Yersinia 243\n",
"Brucella 233\n",
"Clostridium 228\n",
"Listeria 227\n",
"Streptomyces 225\n",
"Chlamydia 202\n",
"Helicobacter 187\n",
"Candidatus 187\n",
"... ...\n",
"'Deinococcus 1\n",
"Mariniflexile 1\n",
"Marinithermus 1\n",
"Lactobacillales 1\n",
"Marinobacterium 1\n",
"Megamonas 1\n",
"Melioribacter 1\n",
"Melittangium 1\n",
"Methylobacillus 1\n",
"Methylocaldum 1\n",
"Magnetococcus 1\n",
"Mageeibacillus 1\n",
"Lysinimonas 1\n",
"Luteitalea 1\n",
"Lacunisphaera 1\n",
"Lautropia 1\n",
"Leadbetterella 1\n",
"Leminorella 1\n",
"Lentibacillus 1\n",
"Lentzea 1\n",
"Leptothrix 1\n",
"Levyella 1\n",
"Limnobaculum 1\n",
"Limnochorda 1\n",
"Litorilituus 1\n",
"Lonsdalea 1\n",
"Luteibacter 1\n",
"Luteipulveratus 1\n",
"secondary 1\n",
"plasmid1 0\n",
"\n",
"[1120 rows x 1 columns]"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"inventory.groupby(\"genus\").agg({\"species\":\"count\"}).sort_values(\"species\",ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>count</th>\n",
" </tr>\n",
" <tr>\n",
" <th>genus</th>\n",
" <th>species</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>'Catharanthus</th>\n",
" <th>roseus'</th>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>'Deinococcus</th>\n",
" <th>soli'</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>'Nostoc</th>\n",
" <th>azollae'</th>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18,711,729</th>\n",
" <th>reads</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Acaryochloris</th>\n",
" <th>marina</th>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"10\" valign=\"top\">Acetobacter</th>\n",
" <th>aceti</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ascendens</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>orientalis</th>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>oryzifermentans</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>pasteurianus</th>\n",
" <td>91</td>\n",
" </tr>\n",
" <tr>\n",
" <th>persici</th>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>pomorum</th>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>senegalensis</th>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>sp.</th>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>tropicalis</th>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Acetobacterium</th>\n",
" <th>woodii</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Acetohalobium</th>\n",
" <th>arabaticum</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Acetomicrobium</th>\n",
" <th>mobile</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"4\" valign=\"top\">Acholeplasma</th>\n",
" <th>axanthum</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>hippikon</th>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>laidlawii</th>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>oculi</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"5\" valign=\"top\">Achromobacter</th>\n",
" <th>denitrificans</th>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>insolitus</th>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>sp.</th>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>spanius</th>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>xylosoxidans</th>\n",
" <td>11</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"2\" valign=\"top\">Acidaminococcus</th>\n",
" <th>fermentans</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>intestini</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Acidiferrobacter</th>\n",
" <th>sp.</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Zymobacter</th>\n",
" <th>palmae</th>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Zymomonas</th>\n",
" <th>mobilis</th>\n",
" <td>49</td>\n",
" </tr>\n",
" <tr>\n",
" <th>[Arcobacter]</th>\n",
" <th>porcinus</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>[Bacillus</th>\n",
" <th>thuringiensis]</th>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"2\" valign=\"top\">[Bacillus]</th>\n",
" <th>caldolyticus</th>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>selenitireducens</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"2\" valign=\"top\">[Brevibacterium]</th>\n",
" <th>flavum</th>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>frigoritolerans</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"8\" valign=\"top\">[Clostridium]</th>\n",
" <th>bolteae</th>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>cellulolyticum</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>cellulosi</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>clariflavum</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>propionicum</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>saccharolyticum</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>scindens</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>stercorarium</th>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>[Enterobacter]</th>\n",
" <th>lignolyticus</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"3\" valign=\"top\">[Eubacterium]</th>\n",
" <th>eligens</th>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>hallii</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>rectale</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"2\" valign=\"top\">[Haemophilus]</th>\n",
" <th>ducreyi</th>\n",
" <td>14</td>\n",
" </tr>\n",
" <tr>\n",
" <th>parasuis</th>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>[Mycobacterium]</th>\n",
" <th>chelonae</th>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>[Pasteurella]</th>\n",
" <th>aerogenes</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>[Polyangium]</th>\n",
" <th>brachysporum</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>[Pseudomonas</th>\n",
" <th>syringae]</th>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>[Pseudomonas]</th>\n",
" <th>mesoacidophila</th>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>complete</th>\n",
" <th>chromosome</th>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>gamma</th>\n",
" <th>proteobacterium</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>secondary</th>\n",
" <th>endosymbiont</th>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3186 rows × 1 columns</p>\n",
"</div>"
],
"text/plain": [
" count\n",
"genus species \n",
"'Catharanthus roseus' 2\n",
"'Deinococcus soli' 1\n",
"'Nostoc azollae' 3\n",
"18,711,729 reads 1\n",
"Acaryochloris marina 10\n",
"Acetobacter aceti 1\n",
" ascendens 1\n",
" orientalis 2\n",
" oryzifermentans 1\n",
" pasteurianus 91\n",
" persici 2\n",
" pomorum 7\n",
" senegalensis 2\n",
" sp. 7\n",
" tropicalis 2\n",
"Acetobacterium woodii 1\n",
"Acetohalobium arabaticum 1\n",
"Acetomicrobium mobile 1\n",
"Acholeplasma axanthum 1\n",
" hippikon 2\n",
" laidlawii 2\n",
" oculi 1\n",
"Achromobacter denitrificans 3\n",
" insolitus 4\n",
" sp. 2\n",
" spanius 4\n",
" xylosoxidans 11\n",
"Acidaminococcus fermentans 1\n",
" intestini 1\n",
"Acidiferrobacter sp. 1\n",
"... ...\n",
"Zymobacter palmae 2\n",
"Zymomonas mobilis 49\n",
"[Arcobacter] porcinus 1\n",
"[Bacillus thuringiensis] 2\n",
"[Bacillus] caldolyticus 2\n",
" selenitireducens 1\n",
"[Brevibacterium] flavum 2\n",
" frigoritolerans 1\n",
"[Clostridium] bolteae 2\n",
" cellulolyticum 1\n",
" cellulosi 1\n",
" clariflavum 1\n",
" propionicum 1\n",
" saccharolyticum 1\n",
" scindens 1\n",
" stercorarium 4\n",
"[Enterobacter] lignolyticus 1\n",
"[Eubacterium] eligens 3\n",
" hallii 1\n",
" rectale 1\n",
"[Haemophilus] ducreyi 14\n",
" parasuis 4\n",
"[Mycobacterium] chelonae 2\n",
"[Pasteurella] aerogenes 1\n",
"[Polyangium] brachysporum 1\n",
"[Pseudomonas syringae] 3\n",
"[Pseudomonas] mesoacidophila 4\n",
"complete chromosome 2\n",
"gamma proteobacterium 1\n",
"secondary endosymbiont 1\n",
"\n",
"[3186 rows x 1 columns]"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"inventory.groupby([\"genus\", \"species\"]).agg({\"species\": \"count\"})\n",
"inventory.columns=[\"count\"]\n",
"inventory"
]
},
{
"cell_type": "code",
"execution_count": 117,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>n_sequences</th>\n",
" <th>species</th>\n",
" </tr>\n",
" <tr>\n",
" <th>genus</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Bacillus</th>\n",
" <td>1132</td>\n",
" <td>11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Streptomyces</th>\n",
" <td>743</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Vibrio</th>\n",
" <td>468</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Rhizobium</th>\n",
" <td>325</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Pseudomonas</th>\n",
" <td>304</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Staphylococcus</th>\n",
" <td>301</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Clostridium</th>\n",
" <td>259</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Streptococcus</th>\n",
" <td>222</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Planktothrix</th>\n",
" <td>179</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Stenotrophomonas</th>\n",
" <td>176</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Escherichia</th>\n",
" <td>133</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Paenibacillus</th>\n",
" <td>127</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Listeria</th>\n",
" <td>104</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Corynebacterium</th>\n",
" <td>103</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Klebsiella</th>\n",
" <td>16</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Shigella</th>\n",
" <td>14</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Salmonella</th>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Enterobacter</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" n_sequences species\n",
"genus \n",
"Bacillus 1132 11\n",
"Streptomyces 743 5\n",
"Vibrio 468 5\n",
"Rhizobium 325 6\n",
"Pseudomonas 304 8\n",
"Staphylococcus 301 6\n",
"Clostridium 259 5\n",
"Streptococcus 222 6\n",
"Planktothrix 179 5\n",
"Stenotrophomonas 176 5\n",
"Escherichia 133 3\n",
"Paenibacillus 127 3\n",
"Listeria 104 4\n",
"Corynebacterium 103 7\n",
"Klebsiella 16 3\n",
"Shigella 14 3\n",
"Salmonella 4 2\n",
"Enterobacter 1 1"
]
},
"execution_count": 117,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"counts = inventory.reset_index().groupby(\"genus\").agg({\"count\", sum}).drop((\"species\"), axis=1)\n",
"counts.columns=[\"n_sequences\",\"species\"]\n",
"counts.sort_values(\"n_sequences\", ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
},
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}