[98867e]: / ipynb / embedding 3d NCBI 2019-04-07.ipynb

Download this file

1529 lines (1528 with data), 43.4 kB

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Exploration of 100d space of genome vectors\n",
    "\n",
    "Genome vectors created by the Dna2VecDataBunch exhibit peculiar patterns. This notebook is dedicated to the exploration \n",
    "of the bacterial genome space using dimensionality reduction techniques."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The autoreload extension is already loaded. To reload it, use:\n",
      "  %reload_ext autoreload\n"
     ]
    }
   ],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "sys.path.append(\"../mylib/\")\n",
    "\n",
    "from genomic import sequence\n",
    "from genomic.sequence import regex_filter, count_filter\n",
    "from functools import partial\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.decomposition import PCA\n",
    "from sklearn import manifold,neighbors\n",
    "from scipy.cluster.hierarchy import dendrogram, linkage  \n",
    "from matplotlib import pyplot as plt\n",
    "import seaborn as sns; sns.set(color_codes=True)\n",
    "import plotly.plotly as py\n",
    "import plotly.graph_objs as go"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# DB=\"/data/genomes/GenSeq_fastas/train\"\n",
    "DB='/home/serge/development/genomes/ncbi-genomes-2019-04-07/bacterial genomes'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Filters passed to Dna2VecList.from_folder, one per line.\n",
    "filters = [\n",
    "    partial(regex_filter, rx=\"Escherichia|Klebsiella|Bacillus\"),\n",
    "    # Raw string: \"\\s\" in a plain literal is an invalid escape sequence.\n",
    "    # NOTE(review): \"plasmid?\" only makes the final \"d\" optional;\n",
    "    # \"plasmids?\" may have been intended -- confirm.\n",
    "    partial(regex_filter, rx=r\"plasmid?\\s\", keep=False),\n",
    "    partial(count_filter, num_fastas=(1, 1), keep=1),\n",
    "]\n",
    "data = sequence.Dna2VecList.from_folder(\n",
    "    DB, filters=filters, agg=partial(np.mean, axis=0), n_cpus=7\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1686"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(data.items)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 42min 11s, sys: 8min 41s, total: 50min 52s\n",
      "Wall time: 1h 25min 43s\n"
     ]
    }
   ],
   "source": [
    "processors = [\n",
    "    sequence.GSFileProcessor(),\n",
    "    sequence.GSTokenizeProcessor(tokenizer=sequence.GSTokenizer(ngram=8, skip=0, n_cpus=4)),\n",
    "    sequence.Dna2VecProcessor()]\n",
    "%time for p in processors: p.process(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1686"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(data.items)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###  Genome vectors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "def log_scale(X, eps=1e-6):\n",
    "    \"\"\"Return log10 of X after shifting it by |min(X)| + eps.\n",
    "\n",
    "    The shift keeps every argument of the logarithm strictly positive\n",
    "    when X contains zeros or negative values.\n",
    "    \"\"\"\n",
    "    x = np.asarray(X)\n",
    "    return np.log10(x + np.abs(x.min()) + eps)\n",
    "\n",
    "\n",
    "x = np.asarray(data.items)\n",
    "# Detect all-zero rows directly instead of comparing a float mean to 0.\n",
    "bad_fastas = np.where(~x.any(axis=1))[0]\n",
    "X = np.delete(x, bad_fastas, 0)\n",
    "\n",
    "# Label = 2nd and 3rd whitespace-separated tokens of each description.\n",
    "labelList = np.asarray([\" \".join(d.split()[1:3]) for d in data.descriptions])\n",
    "labelList = np.delete(labelList, bad_fastas)\n",
    "assert len(labelList) == len(X), \"labels and vectors are out of sync\"\n",
    "\n",
    "# vocab: sorted unique labels; y: position of each label in vocab.\n",
    "vocab, y = np.unique(labelList, return_inverse=True)\n",
    "vocab, y = list(vocab), y.tolist()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Correlation Distance in log-scaled space"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tSNE"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 31.1 s, sys: 313 ms, total: 31.4 s\n",
      "Wall time: 30.9 s\n"
     ]
    }
   ],
   "source": [
    "tsne = manifold.TSNE(n_components=3, init='pca', perplexity=10, metric=\"correlation\",random_state=0)\n",
    "%time X3 = tsne.fit_transform(log_scale(X))\n",
    "\n",
    "# Genus = first token of each label; y = index of the genus in genus_vocab.\n",
    "genus = [label.split()[0] for label in labelList]\n",
    "genus_vocab, y = np.unique(genus, return_inverse=True)\n",
    "genus_vocab, y = list(genus_vocab), y.tolist()\n",
    "\n",
    "X3_df = pd.DataFrame(data=X3, columns=[\"pc1\", \"pc2\", \"pc3\"], index=labelList)\n",
    "X3_df[\"genus\"] = genus\n",
    "X3_df[\"y\"] = y\n",
    "\n",
    "# One row per genus: lists of coordinates plus the mean of y.\n",
    "# \"mean\" is passed by name; newer pandas warns on the np.mean callable.\n",
    "genus_df = X3_df.groupby(\"genus\").agg({\"pc1\": list, \"pc2\": list, \"pc3\": list, \"y\": \"mean\"})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Correlation Distance visualisation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/serge/anaconda3/envs/bio/lib/python3.6/site-packages/IPython/core/display.py:689: UserWarning:\n",
      "\n",
      "Consider using IPython.display.IFrame instead\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<iframe id=\"igraph\" scrolling=\"no\" style=\"border:none;\" seamless=\"seamless\" src=\"https://plot.ly/~sergeman/12.embed\" height=\"1000px\" width=\"1000px\"></iframe>"
      ],
      "text/plain": [
       "<plotly.tools.PlotlyDisplay object>"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Named `traces` so the genome list held in `data` is not overwritten.\n",
    "traces = [\n",
    "    go.Scatter3d(\n",
    "        name=str(g),\n",
    "        x=genus_df.loc[g, \"pc1\"],\n",
    "        y=genus_df.loc[g, \"pc2\"],\n",
    "        z=genus_df.loc[g, \"pc3\"],\n",
    "        mode='markers',\n",
    "        marker=dict(\n",
    "            size=8,\n",
    "            color=genus_df.loc[g, \"y\"],  # one value per genus trace\n",
    "            colorscale='Jet',\n",
    "            opacity=0.5,\n",
    "        ),\n",
    "    )\n",
    "    for g in genus_df.index\n",
    "]\n",
    "\n",
    "layout = go.Layout(\n",
    "    width=1000,\n",
    "    height=1000,\n",
    "    margin=dict(l=0, r=0, b=0, t=0),\n",
    ")\n",
    "fig = go.Figure(data=traces, layout=layout)\n",
    "py.iplot(fig, filename='correlation distance ncbi-genomes-2019-04-07 Escherichia,Klebsiella,Bacillus')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Euclidean Distance in log-scaled space"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tSNE"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 25.8 s, sys: 334 ms, total: 26.1 s\n",
      "Wall time: 25.5 s\n"
     ]
    }
   ],
   "source": [
    "# No metric argument is given, so TSNE falls back to its default distance.\n",
    "tsne = manifold.TSNE(\n",
    "    n_components=3,\n",
    "    init='pca',\n",
    "    perplexity=30,\n",
    "    random_state=0,\n",
    ")\n",
    "%time X3 = tsne.fit_transform(log_scale(X))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Genus = first token of each label; y = index of the genus in genus_vocab.\n",
    "genus = [label.split()[0] for label in labelList]\n",
    "genus_vocab, y = np.unique(genus, return_inverse=True)\n",
    "genus_vocab, y = list(genus_vocab), y.tolist()\n",
    "\n",
    "X3_df = pd.DataFrame(data=X3, columns=[\"pc1\", \"pc2\", \"pc3\"], index=labelList)\n",
    "X3_df[\"genus\"] = genus\n",
    "X3_df[\"y\"] = y\n",
    "\n",
    "# One row per genus: lists of coordinates plus the mean of y.\n",
    "# \"mean\" is passed by name; newer pandas warns on the np.mean callable.\n",
    "genus_df = X3_df.groupby(\"genus\").agg({\"pc1\": list, \"pc2\": list, \"pc3\": list, \"y\": \"mean\"})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Euclidean Distance Visualisation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/serge/anaconda3/envs/bio/lib/python3.6/site-packages/IPython/core/display.py:689: UserWarning:\n",
      "\n",
      "Consider using IPython.display.IFrame instead\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<iframe id=\"igraph\" scrolling=\"no\" style=\"border:none;\" seamless=\"seamless\" src=\"https://plot.ly/~sergeman/14.embed\" height=\"1000px\" width=\"1000px\"></iframe>"
      ],
      "text/plain": [
       "<plotly.tools.PlotlyDisplay object>"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Named `traces` so the genome list held in `data` is not overwritten.\n",
    "traces = [\n",
    "    go.Scatter3d(\n",
    "        name=str(g),\n",
    "        x=genus_df.loc[g, \"pc1\"],\n",
    "        y=genus_df.loc[g, \"pc2\"],\n",
    "        z=genus_df.loc[g, \"pc3\"],\n",
    "        mode='markers',\n",
    "        marker=dict(\n",
    "            size=8,\n",
    "            color=genus_df.loc[g, \"y\"] + 1,  # one value per genus trace\n",
    "            colorscale='YlGnBu',\n",
    "            opacity=0.5,\n",
    "        ),\n",
    "    )\n",
    "    for g in genus_df.index\n",
    "]\n",
    "\n",
    "layout = go.Layout(\n",
    "    width=1000,\n",
    "    height=1000,\n",
    "    margin=dict(l=0, r=0, b=0, t=0),\n",
    ")\n",
    "fig = go.Figure(data=traces, layout=layout)\n",
    "py.iplot(fig, filename='eucledian distance metric by genus Escherichia|Klebsiella|Bacillus')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Euclidean Distance in unmodified space"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tSNE"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 41.5 s, sys: 313 ms, total: 41.8 s\n",
      "Wall time: 41.2 s\n"
     ]
    }
   ],
   "source": [
    "# Same settings as the log-scaled run, but fitted on X as-is (no log_scale).\n",
    "tsne = manifold.TSNE(\n",
    "    n_components=3,\n",
    "    init='pca',\n",
    "    perplexity=30,\n",
    "    random_state=0,\n",
    ")\n",
    "%time X3 = tsne.fit_transform(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Genus = first token of each label; y = index of the genus in genus_vocab.\n",
    "genus = [label.split()[0] for label in labelList]\n",
    "genus_vocab, y = np.unique(genus, return_inverse=True)\n",
    "genus_vocab, y = list(genus_vocab), y.tolist()\n",
    "\n",
    "X3_df = pd.DataFrame(data=X3, columns=[\"pc1\", \"pc2\", \"pc3\"], index=labelList)\n",
    "X3_df[\"genus\"] = genus\n",
    "X3_df[\"y\"] = y\n",
    "\n",
    "# One row per genus: lists of coordinates plus the mean of y.\n",
    "# \"mean\" is passed by name; newer pandas warns on the np.mean callable.\n",
    "genus_df = X3_df.groupby(\"genus\").agg({\"pc1\": list, \"pc2\": list, \"pc3\": list, \"y\": \"mean\"})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Euclidean Distance Visualisation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/serge/anaconda3/envs/bio/lib/python3.6/site-packages/IPython/core/display.py:689: UserWarning:\n",
      "\n",
      "Consider using IPython.display.IFrame instead\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<iframe id=\"igraph\" scrolling=\"no\" style=\"border:none;\" seamless=\"seamless\" src=\"https://plot.ly/~sergeman/16.embed\" height=\"1000px\" width=\"1000px\"></iframe>"
      ],
      "text/plain": [
       "<plotly.tools.PlotlyDisplay object>"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Named `traces` so the genome list held in `data` is not overwritten.\n",
    "traces = [\n",
    "    go.Scatter3d(\n",
    "        name=str(g),\n",
    "        x=genus_df.loc[g, \"pc1\"],\n",
    "        y=genus_df.loc[g, \"pc2\"],\n",
    "        z=genus_df.loc[g, \"pc3\"],\n",
    "        mode='markers',\n",
    "        marker=dict(\n",
    "            size=8,\n",
    "            color=genus_df.loc[g, \"y\"],  # one value per genus trace\n",
    "            colorscale='Jet',\n",
    "            opacity=0.5,\n",
    "        ),\n",
    "    )\n",
    "    for g in genus_df.index\n",
    "]\n",
    "\n",
    "layout = go.Layout(\n",
    "    width=1000,\n",
    "    height=1000,\n",
    "    margin=dict(l=0, r=0, b=0, t=0),\n",
    ")\n",
    "fig = go.Figure(data=traces, layout=layout)\n",
    "py.iplot(fig, filename='eucledian distance in native space Escherichia|Klebsiella|Bacillus')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Genome Inventory"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "inventory = pd.DataFrame(data=[l.split()[1:3] for l in all_fastas], columns=[\"genus\",\"species\" ])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>species</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>genus</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Escherichia</th>\n",
       "      <td>2239</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Klebsiella</th>\n",
       "      <td>1718</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Salmonella</th>\n",
       "      <td>1183</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Bacillus</th>\n",
       "      <td>1172</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Lactobacillus</th>\n",
       "      <td>953</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Staphylococcus</th>\n",
       "      <td>889</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Burkholderia</th>\n",
       "      <td>650</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Enterococcus</th>\n",
       "      <td>626</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Pseudomonas</th>\n",
       "      <td>613</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Streptococcus</th>\n",
       "      <td>564</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Acinetobacter</th>\n",
       "      <td>531</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Bordetella</th>\n",
       "      <td>504</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Vibrio</th>\n",
       "      <td>474</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Xanthomonas</th>\n",
       "      <td>395</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Mycobacterium</th>\n",
       "      <td>368</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Borrelia</th>\n",
       "      <td>347</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Campylobacter</th>\n",
       "      <td>339</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Rhizobium</th>\n",
       "      <td>307</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Enterobacter</th>\n",
       "      <td>298</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Mycoplasma</th>\n",
       "      <td>290</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Corynebacterium</th>\n",
       "      <td>258</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Phaeobacter</th>\n",
       "      <td>256</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Yersinia</th>\n",
       "      <td>243</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Brucella</th>\n",
       "      <td>233</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Clostridium</th>\n",
       "      <td>228</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Listeria</th>\n",
       "      <td>227</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Streptomyces</th>\n",
       "      <td>225</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Chlamydia</th>\n",
       "      <td>202</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Helicobacter</th>\n",
       "      <td>187</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Candidatus</th>\n",
       "      <td>187</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>'Deinococcus</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Mariniflexile</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Marinithermus</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Lactobacillales</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Marinobacterium</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Megamonas</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Melioribacter</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Melittangium</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Methylobacillus</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Methylocaldum</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Magnetococcus</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Mageeibacillus</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Lysinimonas</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Luteitalea</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Lacunisphaera</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Lautropia</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Leadbetterella</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Leminorella</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Lentibacillus</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Lentzea</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Leptothrix</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Levyella</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Limnobaculum</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Limnochorda</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Litorilituus</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Lonsdalea</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Luteibacter</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Luteipulveratus</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>secondary</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>plasmid1</th>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1120 rows × 1 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                 species\n",
       "genus                   \n",
       "Escherichia         2239\n",
       "Klebsiella          1718\n",
       "Salmonella          1183\n",
       "Bacillus            1172\n",
       "Lactobacillus        953\n",
       "Staphylococcus       889\n",
       "Burkholderia         650\n",
       "Enterococcus         626\n",
       "Pseudomonas          613\n",
       "Streptococcus        564\n",
       "Acinetobacter        531\n",
       "Bordetella           504\n",
       "Vibrio               474\n",
       "Xanthomonas          395\n",
       "Mycobacterium        368\n",
       "Borrelia             347\n",
       "Campylobacter        339\n",
       "Rhizobium            307\n",
       "Enterobacter         298\n",
       "Mycoplasma           290\n",
       "Corynebacterium      258\n",
       "Phaeobacter          256\n",
       "Yersinia             243\n",
       "Brucella             233\n",
       "Clostridium          228\n",
       "Listeria             227\n",
       "Streptomyces         225\n",
       "Chlamydia            202\n",
       "Helicobacter         187\n",
       "Candidatus           187\n",
       "...                  ...\n",
       "'Deinococcus           1\n",
       "Mariniflexile          1\n",
       "Marinithermus          1\n",
       "Lactobacillales        1\n",
       "Marinobacterium        1\n",
       "Megamonas              1\n",
       "Melioribacter          1\n",
       "Melittangium           1\n",
       "Methylobacillus        1\n",
       "Methylocaldum          1\n",
       "Magnetococcus          1\n",
       "Mageeibacillus         1\n",
       "Lysinimonas            1\n",
       "Luteitalea             1\n",
       "Lacunisphaera          1\n",
       "Lautropia              1\n",
       "Leadbetterella         1\n",
       "Leminorella            1\n",
       "Lentibacillus          1\n",
       "Lentzea                1\n",
       "Leptothrix             1\n",
       "Levyella               1\n",
       "Limnobaculum           1\n",
       "Limnochorda            1\n",
       "Litorilituus           1\n",
       "Lonsdalea              1\n",
       "Luteibacter            1\n",
       "Luteipulveratus        1\n",
       "secondary              1\n",
       "plasmid1               0\n",
       "\n",
       "[1120 rows x 1 columns]"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "inventory.groupby(\"genus\").agg({\"species\":\"count\"}).sort_values(\"species\",ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>count</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>genus</th>\n",
       "      <th>species</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>'Catharanthus</th>\n",
       "      <th>roseus'</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>'Deinococcus</th>\n",
       "      <th>soli'</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>'Nostoc</th>\n",
       "      <th>azollae'</th>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18,711,729</th>\n",
       "      <th>reads</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Acaryochloris</th>\n",
       "      <th>marina</th>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"10\" valign=\"top\">Acetobacter</th>\n",
       "      <th>aceti</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ascendens</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>orientalis</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>oryzifermentans</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>pasteurianus</th>\n",
       "      <td>91</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>persici</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>pomorum</th>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>senegalensis</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sp.</th>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>tropicalis</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Acetobacterium</th>\n",
       "      <th>woodii</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Acetohalobium</th>\n",
       "      <th>arabaticum</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Acetomicrobium</th>\n",
       "      <th>mobile</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"4\" valign=\"top\">Acholeplasma</th>\n",
       "      <th>axanthum</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>hippikon</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>laidlawii</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>oculi</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"5\" valign=\"top\">Achromobacter</th>\n",
       "      <th>denitrificans</th>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>insolitus</th>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sp.</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>spanius</th>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>xylosoxidans</th>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"2\" valign=\"top\">Acidaminococcus</th>\n",
       "      <th>fermentans</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>intestini</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Acidiferrobacter</th>\n",
       "      <th>sp.</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Zymobacter</th>\n",
       "      <th>palmae</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Zymomonas</th>\n",
       "      <th>mobilis</th>\n",
       "      <td>49</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>[Arcobacter]</th>\n",
       "      <th>porcinus</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>[Bacillus</th>\n",
       "      <th>thuringiensis]</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"2\" valign=\"top\">[Bacillus]</th>\n",
       "      <th>caldolyticus</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>selenitireducens</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"2\" valign=\"top\">[Brevibacterium]</th>\n",
       "      <th>flavum</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>frigoritolerans</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"8\" valign=\"top\">[Clostridium]</th>\n",
       "      <th>bolteae</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>cellulolyticum</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>cellulosi</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>clariflavum</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>propionicum</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>saccharolyticum</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>scindens</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>stercorarium</th>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>[Enterobacter]</th>\n",
       "      <th>lignolyticus</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">[Eubacterium]</th>\n",
       "      <th>eligens</th>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>hallii</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>rectale</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"2\" valign=\"top\">[Haemophilus]</th>\n",
       "      <th>ducreyi</th>\n",
       "      <td>14</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>parasuis</th>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>[Mycobacterium]</th>\n",
       "      <th>chelonae</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>[Pasteurella]</th>\n",
       "      <th>aerogenes</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>[Polyangium]</th>\n",
       "      <th>brachysporum</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>[Pseudomonas</th>\n",
       "      <th>syringae]</th>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>[Pseudomonas]</th>\n",
       "      <th>mesoacidophila</th>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>complete</th>\n",
       "      <th>chromosome</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>gamma</th>\n",
       "      <th>proteobacterium</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>secondary</th>\n",
       "      <th>endosymbiont</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>3186 rows × 1 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                   count\n",
       "genus            species                \n",
       "'Catharanthus    roseus'               2\n",
       "'Deinococcus     soli'                 1\n",
       "'Nostoc          azollae'              3\n",
       "18,711,729       reads                 1\n",
       "Acaryochloris    marina               10\n",
       "Acetobacter      aceti                 1\n",
       "                 ascendens             1\n",
       "                 orientalis            2\n",
       "                 oryzifermentans       1\n",
       "                 pasteurianus         91\n",
       "                 persici               2\n",
       "                 pomorum               7\n",
       "                 senegalensis          2\n",
       "                 sp.                   7\n",
       "                 tropicalis            2\n",
       "Acetobacterium   woodii                1\n",
       "Acetohalobium    arabaticum            1\n",
       "Acetomicrobium   mobile                1\n",
       "Acholeplasma     axanthum              1\n",
       "                 hippikon              2\n",
       "                 laidlawii             2\n",
       "                 oculi                 1\n",
       "Achromobacter    denitrificans         3\n",
       "                 insolitus             4\n",
       "                 sp.                   2\n",
       "                 spanius               4\n",
       "                 xylosoxidans         11\n",
       "Acidaminococcus  fermentans            1\n",
       "                 intestini             1\n",
       "Acidiferrobacter sp.                   1\n",
       "...                                  ...\n",
       "Zymobacter       palmae                2\n",
       "Zymomonas        mobilis              49\n",
       "[Arcobacter]     porcinus              1\n",
       "[Bacillus        thuringiensis]        2\n",
       "[Bacillus]       caldolyticus          2\n",
       "                 selenitireducens      1\n",
       "[Brevibacterium] flavum                2\n",
       "                 frigoritolerans       1\n",
       "[Clostridium]    bolteae               2\n",
       "                 cellulolyticum        1\n",
       "                 cellulosi             1\n",
       "                 clariflavum           1\n",
       "                 propionicum           1\n",
       "                 saccharolyticum       1\n",
       "                 scindens              1\n",
       "                 stercorarium          4\n",
       "[Enterobacter]   lignolyticus          1\n",
       "[Eubacterium]    eligens               3\n",
       "                 hallii                1\n",
       "                 rectale               1\n",
       "[Haemophilus]    ducreyi              14\n",
       "                 parasuis              4\n",
       "[Mycobacterium]  chelonae              2\n",
       "[Pasteurella]    aerogenes             1\n",
       "[Polyangium]     brachysporum          1\n",
       "[Pseudomonas     syringae]             3\n",
       "[Pseudomonas]    mesoacidophila        4\n",
       "complete         chromosome            2\n",
       "gamma            proteobacterium       1\n",
       "secondary        endosymbiont          1\n",
       "\n",
       "[3186 rows x 1 columns]"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Collapse the inventory to one row per (genus, species) with a single `count` column.\n",
    "# The grouped result must be assigned: a bare groupby expression is discarded.\n",
    "# The index check keeps the cell idempotent when it is re-run on the grouped frame.\n",
    "if list(inventory.index.names) != [\"genus\", \"species\"]:\n",
    "    inventory = inventory.groupby([\"genus\", \"species\"]).size().to_frame(\"count\")\n",
    "inventory.columns = [\"count\"]\n",
    "inventory"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>n_sequences</th>\n",
       "      <th>species</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>genus</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Bacillus</th>\n",
       "      <td>1132</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Streptomyces</th>\n",
       "      <td>743</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Vibrio</th>\n",
       "      <td>468</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Rhizobium</th>\n",
       "      <td>325</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Pseudomonas</th>\n",
       "      <td>304</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Staphylococcus</th>\n",
       "      <td>301</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Clostridium</th>\n",
       "      <td>259</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Streptococcus</th>\n",
       "      <td>222</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Planktothrix</th>\n",
       "      <td>179</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Stenotrophomonas</th>\n",
       "      <td>176</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Escherichia</th>\n",
       "      <td>133</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Paenibacillus</th>\n",
       "      <td>127</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Listeria</th>\n",
       "      <td>104</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Corynebacterium</th>\n",
       "      <td>103</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Klebsiella</th>\n",
       "      <td>16</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Shigella</th>\n",
       "      <td>14</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Salmonella</th>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Enterobacter</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                  n_sequences  species\n",
       "genus                                 \n",
       "Bacillus                 1132       11\n",
       "Streptomyces              743        5\n",
       "Vibrio                    468        5\n",
       "Rhizobium                 325        6\n",
       "Pseudomonas               304        8\n",
       "Staphylococcus            301        6\n",
       "Clostridium               259        5\n",
       "Streptococcus             222        6\n",
       "Planktothrix              179        5\n",
       "Stenotrophomonas          176        5\n",
       "Escherichia               133        3\n",
       "Paenibacillus             127        3\n",
       "Listeria                  104        4\n",
       "Corynebacterium           103        7\n",
       "Klebsiella                 16        3\n",
       "Shigella                   14        3\n",
       "Salmonella                  4        2\n",
       "Enterobacter                1        1"
      ]
     },
     "execution_count": 117,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Per-genus summary: total sequences (sum of `count`) and number of species rows.\n",
    "# An ordered list of aggregations plus a label-based rename keeps the column\n",
    "# meaning stable; a set of aggregations has no guaranteed order.\n",
    "counts = (\n",
    "    inventory.reset_index()\n",
    "    .groupby(\"genus\")[\"count\"]\n",
    "    .agg([\"sum\", \"count\"])\n",
    "    .rename(columns={\"sum\": \"n_sequences\", \"count\": \"species\"})\n",
    ")\n",
    "counts.sort_values(\"n_sequences\", ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}