a b/ipynb/embedding 3d NCBI 2019-04-07.ipynb
1
{
2
 "cells": [
3
  {
4
   "cell_type": "markdown",
5
   "metadata": {},
6
   "source": [
7
    "## Exploration of 100d space of genome vectors\n",
8
    "\n",
9
    "Genome vectors created by the Dna2VecDataBunch exhibit peculiar patterns. This notebook is dedicated to exploration \n",
10
    "of the bacterial genome space using dimensionality reduction techniques"
11
   ]
12
  },
13
  {
14
   "cell_type": "code",
15
   "execution_count": 4,
16
   "metadata": {},
17
   "outputs": [
18
    {
19
     "name": "stdout",
20
     "output_type": "stream",
21
     "text": [
22
      "The autoreload extension is already loaded. To reload it, use:\n",
23
      "  %reload_ext autoreload\n"
24
     ]
25
    }
26
   ],
27
   "source": [
28
    "%load_ext autoreload\n",
29
    "%autoreload 2"
30
   ]
31
  },
32
  {
33
   "cell_type": "code",
34
   "execution_count": 5,
35
   "metadata": {},
36
   "outputs": [],
37
   "source": [
38
    "import sys\n",
39
    "sys.path.append(\"../mylib/\")\n",
40
    "\n",
41
    "from genomic import sequence\n",
42
    "from genomic.sequence import regex_filter, count_filter\n",
43
    "from functools import partial\n",
44
    "import pandas as pd\n",
45
    "import numpy as np\n",
46
    "from sklearn.decomposition import PCA\n",
47
    "from sklearn import manifold,neighbors\n",
48
    "from scipy.cluster.hierarchy import dendrogram, linkage  \n",
49
    "from matplotlib import pyplot as plt\n",
50
    "import seaborn as sns; sns.set(color_codes=True)\n",
51
    "import plotly.plotly as py\n",
52
    "import plotly.graph_objs as go"
53
   ]
54
  },
55
  {
56
   "cell_type": "markdown",
57
   "metadata": {},
58
   "source": [
59
    "### Load Data"
60
   ]
61
  },
62
  {
63
   "cell_type": "code",
64
   "execution_count": 6,
65
   "metadata": {},
66
   "outputs": [],
67
   "source": [
68
    "# DB=\"/data/genomes/GenSeq_fastas/train\"\n",
69
    "DB='/home/serge/development/genomes/ncbi-genomes-2019-04-07/bacterial genomes'"
70
   ]
71
  },
72
  {
73
   "cell_type": "code",
74
   "execution_count": 7,
75
   "metadata": {},
76
   "outputs": [],
77
   "source": [
78
    "filters=[partial(regex_filter, rx=\"Escherichia|Klebsiella|Bacillus\"),partial(regex_filter, rx=\"plasmid?\\s\", keep=False),\n",
79
    "         partial(count_filter,num_fastas=(1,1), keep=1)]\n",
80
    "data = sequence.Dna2VecList.from_folder(DB,filters=filters,agg=partial(np.mean, axis=0),n_cpus=7)"
81
   ]
82
  },
83
  {
84
   "cell_type": "code",
85
   "execution_count": 8,
86
   "metadata": {},
87
   "outputs": [
88
    {
89
     "data": {
90
      "text/plain": [
91
       "1686"
92
      ]
93
     },
94
     "execution_count": 8,
95
     "metadata": {},
96
     "output_type": "execute_result"
97
    }
98
   ],
99
   "source": [
100
    "len(data.items)"
101
   ]
102
  },
103
  {
104
   "cell_type": "code",
105
   "execution_count": 9,
106
   "metadata": {},
107
   "outputs": [
108
    {
109
     "name": "stdout",
110
     "output_type": "stream",
111
     "text": [
112
      "CPU times: user 42min 11s, sys: 8min 41s, total: 50min 52s\n",
113
      "Wall time: 1h 25min 43s\n"
114
     ]
115
    }
116
   ],
117
   "source": [
118
    "processors = [\n",
119
    "    sequence.GSFileProcessor(),\n",
120
    "    sequence.GSTokenizeProcessor(tokenizer=sequence.GSTokenizer(ngram=8, skip=0, n_cpus=4)),\n",
121
    "    sequence.Dna2VecProcessor()]\n",
122
    "%time for p in processors: p.process(data)"
123
   ]
124
  },
125
  {
126
   "cell_type": "code",
127
   "execution_count": 10,
128
   "metadata": {},
129
   "outputs": [
130
    {
131
     "data": {
132
      "text/plain": [
133
       "1686"
134
      ]
135
     },
136
     "execution_count": 10,
137
     "metadata": {},
138
     "output_type": "execute_result"
139
    }
140
   ],
141
   "source": [
142
    "len(data.items)"
143
   ]
144
  },
145
  {
146
   "cell_type": "markdown",
147
   "metadata": {},
148
   "source": [
149
    "###  Genome vectors"
150
   ]
151
  },
152
  {
153
   "cell_type": "code",
154
   "execution_count": 14,
155
   "metadata": {},
156
   "outputs": [],
157
   "source": [
158
    "def log_scale(X):\n",
159
    "    x=np.asarray(X);e=1e-6\n",
160
    "    return np.log10(x+np.abs(x.min())+e) \n",
161
    "\n",
162
    "\n",
163
    "x=np.asarray(data.items)\n",
164
    "bad_fastas = np.where(np.mean(x,axis=1) == 0.)[0]\n",
165
    "X = np.delete(x, bad_fastas,0)\n",
166
    "labelList=[\" \".join(i.split()[1:3]) for i in data.descriptions]\n",
167
    "labelList=np.delete(np.asarray(labelList), bad_fastas)\n",
168
    "vocab=list(np.unique(labelList))\n",
169
    "y=[vocab.index(x) for x in labelList]"
170
   ]
171
  },
172
  {
173
   "cell_type": "markdown",
174
   "metadata": {},
175
   "source": [
176
    "## Correlation Distance in log-scaled space"
177
   ]
178
  },
179
  {
180
   "cell_type": "markdown",
181
   "metadata": {},
182
   "source": [
183
    "### tSNE"
184
   ]
185
  },
186
  {
187
   "cell_type": "code",
188
   "execution_count": 18,
189
   "metadata": {
190
    "scrolled": true
191
   },
192
   "outputs": [
193
    {
194
     "name": "stdout",
195
     "output_type": "stream",
196
     "text": [
197
      "CPU times: user 31.1 s, sys: 313 ms, total: 31.4 s\n",
198
      "Wall time: 30.9 s\n"
199
     ]
200
    }
201
   ],
202
   "source": [
203
    "tsne = manifold.TSNE(n_components=3, init='pca', perplexity=10, metric=\"correlation\",random_state=0)\n",
204
    "%time X3 = tsne.fit_transform(log_scale(X))\n",
205
    "\n",
206
    "genus = [i.split()[0] for i in labelList]\n",
207
    "genus_vocab=list(np.unique(genus))\n",
208
    "y=[genus_vocab.index(x) for x in genus]\n",
209
    "genus_vocab\n",
210
    "\n",
211
    "X3_df = pd.DataFrame(data=X3, columns=[\"pc1\",'pc2','pc3'], index=labelList)\n",
212
    "\n",
213
    "X3_df[\"genus\"]=genus\n",
214
    "X3_df[\"y\"]=y\n",
215
    "\n",
216
    "genus_df=X3_df.groupby(\"genus\").agg({\"pc1\": list, \"pc2\":list,\"pc3\":list,\"y\":np.mean})"
217
   ]
218
  },
219
  {
220
   "cell_type": "markdown",
221
   "metadata": {},
222
   "source": [
223
    "### Correlation Distance visualisation"
224
   ]
225
  },
226
  {
227
   "cell_type": "code",
228
   "execution_count": 20,
229
   "metadata": {},
230
   "outputs": [
231
    {
232
     "name": "stderr",
233
     "output_type": "stream",
234
     "text": [
235
      "/home/serge/anaconda3/envs/bio/lib/python3.6/site-packages/IPython/core/display.py:689: UserWarning:\n",
236
      "\n",
237
      "Consider using IPython.display.IFrame instead\n",
238
      "\n"
239
     ]
240
    },
241
    {
242
     "data": {
243
      "text/html": [
244
       "<iframe id=\"igraph\" scrolling=\"no\" style=\"border:none;\" seamless=\"seamless\" src=\"https://plot.ly/~sergeman/12.embed\" height=\"1000px\" width=\"1000px\"></iframe>"
245
      ],
246
      "text/plain": [
247
       "<plotly.tools.PlotlyDisplay object>"
248
      ]
249
     },
250
     "execution_count": 20,
251
     "metadata": {},
252
     "output_type": "execute_result"
253
    }
254
   ],
255
   "source": [
256
    "data=[]\n",
257
    "for g in genus_df.index:\n",
258
    "    trace  = go.Scatter3d(\n",
259
    "        name = str(g),\n",
260
    "        x=genus_df.loc[g,\"pc1\"],\n",
261
    "        y=genus_df.loc[g,\"pc2\"],\n",
262
    "        z=genus_df.loc[g,\"pc3\"],\n",
263
    "        mode='markers',\n",
264
    "        marker=dict(\n",
265
    "            size=8,\n",
266
    "            color=genus_df.loc[g,\"y\"],                # set color to an array/list of desired values\n",
267
    "            colorscale='Jet',           # choose a colorscale\n",
268
    "            opacity=0.5)\n",
269
    "    )\n",
270
    "\n",
271
    "    data.append(trace)\n",
272
    "    \n",
273
    "\n",
274
    "layout = go.Layout(\n",
275
    "    width=1000,\n",
276
    "    height=1000,\n",
277
    "    margin=dict(\n",
278
    "        l=0,\n",
279
    "        r=0,\n",
280
    "        b=0,\n",
281
    "        t=0\n",
282
    "    )\n",
283
    ")\n",
284
    "fig = go.Figure(data=data, layout=layout)\n",
285
    "py.iplot(fig, filename='correlation distance ncbi-genomes-2019-04-07 Escherichia,Klebsiella,Bacillus')"
286
   ]
287
  },
288
  {
289
   "cell_type": "markdown",
290
   "metadata": {},
291
   "source": [
292
    "## Euclidean Distance in log-scaled space"
293
   ]
294
  },
295
  {
296
   "cell_type": "markdown",
297
   "metadata": {},
298
   "source": [
299
    "### tSNE"
300
   ]
301
  },
302
  {
303
   "cell_type": "code",
304
   "execution_count": 21,
305
   "metadata": {},
306
   "outputs": [
307
    {
308
     "name": "stdout",
309
     "output_type": "stream",
310
     "text": [
311
      "CPU times: user 25.8 s, sys: 334 ms, total: 26.1 s\n",
312
      "Wall time: 25.5 s\n"
313
     ]
314
    }
315
   ],
316
   "source": [
317
    "tsne = manifold.TSNE(n_components=3, init='pca', perplexity=30,random_state=0)\n",
318
    "%time X3 = tsne.fit_transform(log_scale(X))"
319
   ]
320
  },
321
  {
322
   "cell_type": "code",
323
   "execution_count": 22,
324
   "metadata": {},
325
   "outputs": [],
326
   "source": [
327
    "genus = [i.split()[0] for i in labelList]\n",
328
    "genus_vocab=list(np.unique(genus))\n",
329
    "y=[genus_vocab.index(x) for x in genus]\n",
330
    "genus_vocab\n",
331
    "\n",
332
    "X3_df = pd.DataFrame(data=X3, columns=[\"pc1\",'pc2','pc3'], index=labelList)\n",
333
    "\n",
334
    "X3_df[\"genus\"]=genus\n",
335
    "X3_df[\"y\"]=y\n",
336
    "\n",
337
    "genus_df=X3_df.groupby(\"genus\").agg({\"pc1\": list, \"pc2\":list,\"pc3\":list,\"y\":np.mean})"
338
   ]
339
  },
340
  {
341
   "cell_type": "markdown",
342
   "metadata": {},
343
   "source": [
344
    "### Euclidean Distance Visualisation"
345
   ]
346
  },
347
  {
348
   "cell_type": "code",
349
   "execution_count": 32,
350
   "metadata": {},
351
   "outputs": [
352
    {
353
     "name": "stderr",
354
     "output_type": "stream",
355
     "text": [
356
      "/home/serge/anaconda3/envs/bio/lib/python3.6/site-packages/IPython/core/display.py:689: UserWarning:\n",
357
      "\n",
358
      "Consider using IPython.display.IFrame instead\n",
359
      "\n"
360
     ]
361
    },
362
    {
363
     "data": {
364
      "text/html": [
365
       "<iframe id=\"igraph\" scrolling=\"no\" style=\"border:none;\" seamless=\"seamless\" src=\"https://plot.ly/~sergeman/14.embed\" height=\"1000px\" width=\"1000px\"></iframe>"
366
      ],
367
      "text/plain": [
368
       "<plotly.tools.PlotlyDisplay object>"
369
      ]
370
     },
371
     "execution_count": 32,
372
     "metadata": {},
373
     "output_type": "execute_result"
374
    }
375
   ],
376
   "source": [
377
    "data=[]\n",
378
    "for g in genus_df.index:\n",
379
    "    trace  = go.Scatter3d(\n",
380
    "        name = str(g),\n",
381
    "        x=genus_df.loc[g,\"pc1\"],\n",
382
    "        y=genus_df.loc[g,\"pc2\"],\n",
383
    "        z=genus_df.loc[g,\"pc3\"],\n",
384
    "        mode='markers',\n",
385
    "        marker=dict(\n",
386
    "            size=8,\n",
387
    "            color=genus_df.loc[g,\"y\"]+1,                # set color to an array/list of desired values\n",
388
    "            colorscale='YlGnBu',           # choose a colorscale\n",
389
    "            opacity=0.5)\n",
390
    "    )\n",
391
    "\n",
392
    "    data.append(trace)\n",
393
    "    \n",
394
    "\n",
395
    "layout = go.Layout(\n",
396
    "    width=1000,\n",
397
    "    height=1000,\n",
398
    "    margin=dict(\n",
399
    "        l=0,\n",
400
    "        r=0,\n",
401
    "        b=0,\n",
402
    "        t=0\n",
403
    "    )\n",
404
    ")\n",
405
    "fig = go.Figure(data=data, layout=layout)\n",
406
    "py.iplot(fig, filename='eucledian distance metric by genus Escherichia|Klebsiella|Bacillus')"
407
   ]
408
  },
409
  {
410
   "cell_type": "markdown",
411
   "metadata": {},
412
   "source": [
413
    "## Euclidean Distance in unmodified space"
414
   ]
415
  },
416
  {
417
   "cell_type": "markdown",
418
   "metadata": {},
419
   "source": [
420
    "### tSNE"
421
   ]
422
  },
423
  {
424
   "cell_type": "code",
425
   "execution_count": 27,
426
   "metadata": {},
427
   "outputs": [
428
    {
429
     "name": "stdout",
430
     "output_type": "stream",
431
     "text": [
432
      "CPU times: user 41.5 s, sys: 313 ms, total: 41.8 s\n",
433
      "Wall time: 41.2 s\n"
434
     ]
435
    }
436
   ],
437
   "source": [
438
    "tsne = manifold.TSNE(n_components=3, init='pca', perplexity=30,random_state=0)\n",
439
    "%time X3 = tsne.fit_transform(X)"
440
   ]
441
  },
442
  {
443
   "cell_type": "code",
444
   "execution_count": 28,
445
   "metadata": {},
446
   "outputs": [],
447
   "source": [
448
    "genus = [i.split()[0] for i in labelList]\n",
449
    "genus_vocab=list(np.unique(genus))\n",
450
    "y=[genus_vocab.index(x) for x in genus]\n",
451
    "genus_vocab\n",
452
    "\n",
453
    "X3_df = pd.DataFrame(data=X3, columns=[\"pc1\",'pc2','pc3'], index=labelList)\n",
454
    "\n",
455
    "X3_df[\"genus\"]=genus\n",
456
    "X3_df[\"y\"]=y\n",
457
    "\n",
458
    "genus_df=X3_df.groupby(\"genus\").agg({\"pc1\": list, \"pc2\":list,\"pc3\":list,\"y\":np.mean})"
459
   ]
460
  },
461
  {
462
   "cell_type": "markdown",
463
   "metadata": {},
464
   "source": [
465
    "### Euclidean Distance Visualisation"
466
   ]
467
  },
468
  {
469
   "cell_type": "code",
470
   "execution_count": 29,
471
   "metadata": {},
472
   "outputs": [
473
    {
474
     "name": "stderr",
475
     "output_type": "stream",
476
     "text": [
477
      "/home/serge/anaconda3/envs/bio/lib/python3.6/site-packages/IPython/core/display.py:689: UserWarning:\n",
478
      "\n",
479
      "Consider using IPython.display.IFrame instead\n",
480
      "\n"
481
     ]
482
    },
483
    {
484
     "data": {
485
      "text/html": [
486
       "<iframe id=\"igraph\" scrolling=\"no\" style=\"border:none;\" seamless=\"seamless\" src=\"https://plot.ly/~sergeman/16.embed\" height=\"1000px\" width=\"1000px\"></iframe>"
487
      ],
488
      "text/plain": [
489
       "<plotly.tools.PlotlyDisplay object>"
490
      ]
491
     },
492
     "execution_count": 29,
493
     "metadata": {},
494
     "output_type": "execute_result"
495
    }
496
   ],
497
   "source": [
498
    "data=[]\n",
499
    "for g in genus_df.index:\n",
500
    "    trace  = go.Scatter3d(\n",
501
    "        name = str(g),\n",
502
    "        x=genus_df.loc[g,\"pc1\"],\n",
503
    "        y=genus_df.loc[g,\"pc2\"],\n",
504
    "        z=genus_df.loc[g,\"pc3\"],\n",
505
    "        mode='markers',\n",
506
    "        marker=dict(\n",
507
    "            size=8,\n",
508
    "            color=genus_df.loc[g,\"y\"],                # set color to an array/list of desired values\n",
509
    "            colorscale='Jet',           # choose a colorscale\n",
510
    "            opacity=0.5)\n",
511
    "    )\n",
512
    "\n",
513
    "    data.append(trace)\n",
514
    "    \n",
515
    "\n",
516
    "layout = go.Layout(\n",
517
    "    width=1000,\n",
518
    "    height=1000,\n",
519
    "    margin=dict(\n",
520
    "        l=0,\n",
521
    "        r=0,\n",
522
    "        b=0,\n",
523
    "        t=0\n",
524
    "    )\n",
525
    ")\n",
526
    "fig = go.Figure(data=data, layout=layout)\n",
527
    "py.iplot(fig, filename='eucledian distance in native space Escherichia|Klebsiella|Bacillus')"
528
   ]
529
  },
530
  {
531
   "cell_type": "markdown",
532
   "metadata": {},
533
   "source": [
534
    "## Genome Inventory"
535
   ]
536
  },
537
  {
538
   "cell_type": "code",
539
   "execution_count": 26,
540
   "metadata": {},
541
   "outputs": [],
542
   "source": [
543
    "inventory = pd.DataFrame(data=[l.split()[1:3] for l in all_fastas], columns=[\"genus\",\"species\" ])"
544
   ]
545
  },
546
  {
547
   "cell_type": "code",
548
   "execution_count": 30,
549
   "metadata": {
550
    "scrolled": true
551
   },
552
   "outputs": [
553
    {
554
     "data": {
555
      "text/html": [
556
       "<div>\n",
557
       "<style scoped>\n",
558
       "    .dataframe tbody tr th:only-of-type {\n",
559
       "        vertical-align: middle;\n",
560
       "    }\n",
561
       "\n",
562
       "    .dataframe tbody tr th {\n",
563
       "        vertical-align: top;\n",
564
       "    }\n",
565
       "\n",
566
       "    .dataframe thead th {\n",
567
       "        text-align: right;\n",
568
       "    }\n",
569
       "</style>\n",
570
       "<table border=\"1\" class=\"dataframe\">\n",
571
       "  <thead>\n",
572
       "    <tr style=\"text-align: right;\">\n",
573
       "      <th></th>\n",
574
       "      <th>species</th>\n",
575
       "    </tr>\n",
576
       "    <tr>\n",
577
       "      <th>genus</th>\n",
578
       "      <th></th>\n",
579
       "    </tr>\n",
580
       "  </thead>\n",
581
       "  <tbody>\n",
582
       "    <tr>\n",
583
       "      <th>Escherichia</th>\n",
584
       "      <td>2239</td>\n",
585
       "    </tr>\n",
586
       "    <tr>\n",
587
       "      <th>Klebsiella</th>\n",
588
       "      <td>1718</td>\n",
589
       "    </tr>\n",
590
       "    <tr>\n",
591
       "      <th>Salmonella</th>\n",
592
       "      <td>1183</td>\n",
593
       "    </tr>\n",
594
       "    <tr>\n",
595
       "      <th>Bacillus</th>\n",
596
       "      <td>1172</td>\n",
597
       "    </tr>\n",
598
       "    <tr>\n",
599
       "      <th>Lactobacillus</th>\n",
600
       "      <td>953</td>\n",
601
       "    </tr>\n",
602
       "    <tr>\n",
603
       "      <th>Staphylococcus</th>\n",
604
       "      <td>889</td>\n",
605
       "    </tr>\n",
606
       "    <tr>\n",
607
       "      <th>Burkholderia</th>\n",
608
       "      <td>650</td>\n",
609
       "    </tr>\n",
610
       "    <tr>\n",
611
       "      <th>Enterococcus</th>\n",
612
       "      <td>626</td>\n",
613
       "    </tr>\n",
614
       "    <tr>\n",
615
       "      <th>Pseudomonas</th>\n",
616
       "      <td>613</td>\n",
617
       "    </tr>\n",
618
       "    <tr>\n",
619
       "      <th>Streptococcus</th>\n",
620
       "      <td>564</td>\n",
621
       "    </tr>\n",
622
       "    <tr>\n",
623
       "      <th>Acinetobacter</th>\n",
624
       "      <td>531</td>\n",
625
       "    </tr>\n",
626
       "    <tr>\n",
627
       "      <th>Bordetella</th>\n",
628
       "      <td>504</td>\n",
629
       "    </tr>\n",
630
       "    <tr>\n",
631
       "      <th>Vibrio</th>\n",
632
       "      <td>474</td>\n",
633
       "    </tr>\n",
634
       "    <tr>\n",
635
       "      <th>Xanthomonas</th>\n",
636
       "      <td>395</td>\n",
637
       "    </tr>\n",
638
       "    <tr>\n",
639
       "      <th>Mycobacterium</th>\n",
640
       "      <td>368</td>\n",
641
       "    </tr>\n",
642
       "    <tr>\n",
643
       "      <th>Borrelia</th>\n",
644
       "      <td>347</td>\n",
645
       "    </tr>\n",
646
       "    <tr>\n",
647
       "      <th>Campylobacter</th>\n",
648
       "      <td>339</td>\n",
649
       "    </tr>\n",
650
       "    <tr>\n",
651
       "      <th>Rhizobium</th>\n",
652
       "      <td>307</td>\n",
653
       "    </tr>\n",
654
       "    <tr>\n",
655
       "      <th>Enterobacter</th>\n",
656
       "      <td>298</td>\n",
657
       "    </tr>\n",
658
       "    <tr>\n",
659
       "      <th>Mycoplasma</th>\n",
660
       "      <td>290</td>\n",
661
       "    </tr>\n",
662
       "    <tr>\n",
663
       "      <th>Corynebacterium</th>\n",
664
       "      <td>258</td>\n",
665
       "    </tr>\n",
666
       "    <tr>\n",
667
       "      <th>Phaeobacter</th>\n",
668
       "      <td>256</td>\n",
669
       "    </tr>\n",
670
       "    <tr>\n",
671
       "      <th>Yersinia</th>\n",
672
       "      <td>243</td>\n",
673
       "    </tr>\n",
674
       "    <tr>\n",
675
       "      <th>Brucella</th>\n",
676
       "      <td>233</td>\n",
677
       "    </tr>\n",
678
       "    <tr>\n",
679
       "      <th>Clostridium</th>\n",
680
       "      <td>228</td>\n",
681
       "    </tr>\n",
682
       "    <tr>\n",
683
       "      <th>Listeria</th>\n",
684
       "      <td>227</td>\n",
685
       "    </tr>\n",
686
       "    <tr>\n",
687
       "      <th>Streptomyces</th>\n",
688
       "      <td>225</td>\n",
689
       "    </tr>\n",
690
       "    <tr>\n",
691
       "      <th>Chlamydia</th>\n",
692
       "      <td>202</td>\n",
693
       "    </tr>\n",
694
       "    <tr>\n",
695
       "      <th>Helicobacter</th>\n",
696
       "      <td>187</td>\n",
697
       "    </tr>\n",
698
       "    <tr>\n",
699
       "      <th>Candidatus</th>\n",
700
       "      <td>187</td>\n",
701
       "    </tr>\n",
702
       "    <tr>\n",
703
       "      <th>...</th>\n",
704
       "      <td>...</td>\n",
705
       "    </tr>\n",
706
       "    <tr>\n",
707
       "      <th>'Deinococcus</th>\n",
708
       "      <td>1</td>\n",
709
       "    </tr>\n",
710
       "    <tr>\n",
711
       "      <th>Mariniflexile</th>\n",
712
       "      <td>1</td>\n",
713
       "    </tr>\n",
714
       "    <tr>\n",
715
       "      <th>Marinithermus</th>\n",
716
       "      <td>1</td>\n",
717
       "    </tr>\n",
718
       "    <tr>\n",
719
       "      <th>Lactobacillales</th>\n",
720
       "      <td>1</td>\n",
721
       "    </tr>\n",
722
       "    <tr>\n",
723
       "      <th>Marinobacterium</th>\n",
724
       "      <td>1</td>\n",
725
       "    </tr>\n",
726
       "    <tr>\n",
727
       "      <th>Megamonas</th>\n",
728
       "      <td>1</td>\n",
729
       "    </tr>\n",
730
       "    <tr>\n",
731
       "      <th>Melioribacter</th>\n",
732
       "      <td>1</td>\n",
733
       "    </tr>\n",
734
       "    <tr>\n",
735
       "      <th>Melittangium</th>\n",
736
       "      <td>1</td>\n",
737
       "    </tr>\n",
738
       "    <tr>\n",
739
       "      <th>Methylobacillus</th>\n",
740
       "      <td>1</td>\n",
741
       "    </tr>\n",
742
       "    <tr>\n",
743
       "      <th>Methylocaldum</th>\n",
744
       "      <td>1</td>\n",
745
       "    </tr>\n",
746
       "    <tr>\n",
747
       "      <th>Magnetococcus</th>\n",
748
       "      <td>1</td>\n",
749
       "    </tr>\n",
750
       "    <tr>\n",
751
       "      <th>Mageeibacillus</th>\n",
752
       "      <td>1</td>\n",
753
       "    </tr>\n",
754
       "    <tr>\n",
755
       "      <th>Lysinimonas</th>\n",
756
       "      <td>1</td>\n",
757
       "    </tr>\n",
758
       "    <tr>\n",
759
       "      <th>Luteitalea</th>\n",
760
       "      <td>1</td>\n",
761
       "    </tr>\n",
762
       "    <tr>\n",
763
       "      <th>Lacunisphaera</th>\n",
764
       "      <td>1</td>\n",
765
       "    </tr>\n",
766
       "    <tr>\n",
767
       "      <th>Lautropia</th>\n",
768
       "      <td>1</td>\n",
769
       "    </tr>\n",
770
       "    <tr>\n",
771
       "      <th>Leadbetterella</th>\n",
772
       "      <td>1</td>\n",
773
       "    </tr>\n",
774
       "    <tr>\n",
775
       "      <th>Leminorella</th>\n",
776
       "      <td>1</td>\n",
777
       "    </tr>\n",
778
       "    <tr>\n",
779
       "      <th>Lentibacillus</th>\n",
780
       "      <td>1</td>\n",
781
       "    </tr>\n",
782
       "    <tr>\n",
783
       "      <th>Lentzea</th>\n",
784
       "      <td>1</td>\n",
785
       "    </tr>\n",
786
       "    <tr>\n",
787
       "      <th>Leptothrix</th>\n",
788
       "      <td>1</td>\n",
789
       "    </tr>\n",
790
       "    <tr>\n",
791
       "      <th>Levyella</th>\n",
792
       "      <td>1</td>\n",
793
       "    </tr>\n",
794
       "    <tr>\n",
795
       "      <th>Limnobaculum</th>\n",
796
       "      <td>1</td>\n",
797
       "    </tr>\n",
798
       "    <tr>\n",
799
       "      <th>Limnochorda</th>\n",
800
       "      <td>1</td>\n",
801
       "    </tr>\n",
802
       "    <tr>\n",
803
       "      <th>Litorilituus</th>\n",
804
       "      <td>1</td>\n",
805
       "    </tr>\n",
806
       "    <tr>\n",
807
       "      <th>Lonsdalea</th>\n",
808
       "      <td>1</td>\n",
809
       "    </tr>\n",
810
       "    <tr>\n",
811
       "      <th>Luteibacter</th>\n",
812
       "      <td>1</td>\n",
813
       "    </tr>\n",
814
       "    <tr>\n",
815
       "      <th>Luteipulveratus</th>\n",
816
       "      <td>1</td>\n",
817
       "    </tr>\n",
818
       "    <tr>\n",
819
       "      <th>secondary</th>\n",
820
       "      <td>1</td>\n",
821
       "    </tr>\n",
822
       "    <tr>\n",
823
       "      <th>plasmid1</th>\n",
824
       "      <td>0</td>\n",
825
       "    </tr>\n",
826
       "  </tbody>\n",
827
       "</table>\n",
828
       "<p>1120 rows × 1 columns</p>\n",
829
       "</div>"
830
      ],
831
      "text/plain": [
832
       "                 species\n",
833
       "genus                   \n",
834
       "Escherichia         2239\n",
835
       "Klebsiella          1718\n",
836
       "Salmonella          1183\n",
837
       "Bacillus            1172\n",
838
       "Lactobacillus        953\n",
839
       "Staphylococcus       889\n",
840
       "Burkholderia         650\n",
841
       "Enterococcus         626\n",
842
       "Pseudomonas          613\n",
843
       "Streptococcus        564\n",
844
       "Acinetobacter        531\n",
845
       "Bordetella           504\n",
846
       "Vibrio               474\n",
847
       "Xanthomonas          395\n",
848
       "Mycobacterium        368\n",
849
       "Borrelia             347\n",
850
       "Campylobacter        339\n",
851
       "Rhizobium            307\n",
852
       "Enterobacter         298\n",
853
       "Mycoplasma           290\n",
854
       "Corynebacterium      258\n",
855
       "Phaeobacter          256\n",
856
       "Yersinia             243\n",
857
       "Brucella             233\n",
858
       "Clostridium          228\n",
859
       "Listeria             227\n",
860
       "Streptomyces         225\n",
861
       "Chlamydia            202\n",
862
       "Helicobacter         187\n",
863
       "Candidatus           187\n",
864
       "...                  ...\n",
865
       "'Deinococcus           1\n",
866
       "Mariniflexile          1\n",
867
       "Marinithermus          1\n",
868
       "Lactobacillales        1\n",
869
       "Marinobacterium        1\n",
870
       "Megamonas              1\n",
871
       "Melioribacter          1\n",
872
       "Melittangium           1\n",
873
       "Methylobacillus        1\n",
874
       "Methylocaldum          1\n",
875
       "Magnetococcus          1\n",
876
       "Mageeibacillus         1\n",
877
       "Lysinimonas            1\n",
878
       "Luteitalea             1\n",
879
       "Lacunisphaera          1\n",
880
       "Lautropia              1\n",
881
       "Leadbetterella         1\n",
882
       "Leminorella            1\n",
883
       "Lentibacillus          1\n",
884
       "Lentzea                1\n",
885
       "Leptothrix             1\n",
886
       "Levyella               1\n",
887
       "Limnobaculum           1\n",
888
       "Limnochorda            1\n",
889
       "Litorilituus           1\n",
890
       "Lonsdalea              1\n",
891
       "Luteibacter            1\n",
892
       "Luteipulveratus        1\n",
893
       "secondary              1\n",
894
       "plasmid1               0\n",
895
       "\n",
896
       "[1120 rows x 1 columns]"
897
      ]
898
     },
899
     "execution_count": 30,
900
     "metadata": {},
901
     "output_type": "execute_result"
902
    }
903
   ],
904
   "source": [
905
    "inventory.groupby(\"genus\").agg({\"species\":\"count\"}).sort_values(\"species\",ascending=False)"
906
   ]
907
  },
908
  {
909
   "cell_type": "code",
910
   "execution_count": 24,
911
   "metadata": {
912
    "scrolled": false
913
   },
914
   "outputs": [
915
    {
916
     "data": {
917
      "text/html": [
918
       "<div>\n",
919
       "<style scoped>\n",
920
       "    .dataframe tbody tr th:only-of-type {\n",
921
       "        vertical-align: middle;\n",
922
       "    }\n",
923
       "\n",
924
       "    .dataframe tbody tr th {\n",
925
       "        vertical-align: top;\n",
926
       "    }\n",
927
       "\n",
928
       "    .dataframe thead th {\n",
929
       "        text-align: right;\n",
930
       "    }\n",
931
       "</style>\n",
932
       "<table border=\"1\" class=\"dataframe\">\n",
933
       "  <thead>\n",
934
       "    <tr style=\"text-align: right;\">\n",
935
       "      <th></th>\n",
936
       "      <th></th>\n",
937
       "      <th>count</th>\n",
938
       "    </tr>\n",
939
       "    <tr>\n",
940
       "      <th>genus</th>\n",
941
       "      <th>species</th>\n",
942
       "      <th></th>\n",
943
       "    </tr>\n",
944
       "  </thead>\n",
945
       "  <tbody>\n",
946
       "    <tr>\n",
947
       "      <th>'Catharanthus</th>\n",
948
       "      <th>roseus'</th>\n",
949
       "      <td>2</td>\n",
950
       "    </tr>\n",
951
       "    <tr>\n",
952
       "      <th>'Deinococcus</th>\n",
953
       "      <th>soli'</th>\n",
954
       "      <td>1</td>\n",
955
       "    </tr>\n",
956
       "    <tr>\n",
957
       "      <th>'Nostoc</th>\n",
958
       "      <th>azollae'</th>\n",
959
       "      <td>3</td>\n",
960
       "    </tr>\n",
961
       "    <tr>\n",
962
       "      <th>18,711,729</th>\n",
963
       "      <th>reads</th>\n",
964
       "      <td>1</td>\n",
965
       "    </tr>\n",
966
       "    <tr>\n",
967
       "      <th>Acaryochloris</th>\n",
968
       "      <th>marina</th>\n",
969
       "      <td>10</td>\n",
970
       "    </tr>\n",
971
       "    <tr>\n",
972
       "      <th rowspan=\"10\" valign=\"top\">Acetobacter</th>\n",
973
       "      <th>aceti</th>\n",
974
       "      <td>1</td>\n",
975
       "    </tr>\n",
976
       "    <tr>\n",
977
       "      <th>ascendens</th>\n",
978
       "      <td>1</td>\n",
979
       "    </tr>\n",
980
       "    <tr>\n",
981
       "      <th>orientalis</th>\n",
982
       "      <td>2</td>\n",
983
       "    </tr>\n",
984
       "    <tr>\n",
985
       "      <th>oryzifermentans</th>\n",
986
       "      <td>1</td>\n",
987
       "    </tr>\n",
988
       "    <tr>\n",
989
       "      <th>pasteurianus</th>\n",
990
       "      <td>91</td>\n",
991
       "    </tr>\n",
992
       "    <tr>\n",
993
       "      <th>persici</th>\n",
994
       "      <td>2</td>\n",
995
       "    </tr>\n",
996
       "    <tr>\n",
997
       "      <th>pomorum</th>\n",
998
       "      <td>7</td>\n",
999
       "    </tr>\n",
1000
       "    <tr>\n",
1001
       "      <th>senegalensis</th>\n",
1002
       "      <td>2</td>\n",
1003
       "    </tr>\n",
1004
       "    <tr>\n",
1005
       "      <th>sp.</th>\n",
1006
       "      <td>7</td>\n",
1007
       "    </tr>\n",
1008
       "    <tr>\n",
1009
       "      <th>tropicalis</th>\n",
1010
       "      <td>2</td>\n",
1011
       "    </tr>\n",
1012
       "    <tr>\n",
1013
       "      <th>Acetobacterium</th>\n",
1014
       "      <th>woodii</th>\n",
1015
       "      <td>1</td>\n",
1016
       "    </tr>\n",
1017
       "    <tr>\n",
1018
       "      <th>Acetohalobium</th>\n",
1019
       "      <th>arabaticum</th>\n",
1020
       "      <td>1</td>\n",
1021
       "    </tr>\n",
1022
       "    <tr>\n",
1023
       "      <th>Acetomicrobium</th>\n",
1024
       "      <th>mobile</th>\n",
1025
       "      <td>1</td>\n",
1026
       "    </tr>\n",
1027
       "    <tr>\n",
1028
       "      <th rowspan=\"4\" valign=\"top\">Acholeplasma</th>\n",
1029
       "      <th>axanthum</th>\n",
1030
       "      <td>1</td>\n",
1031
       "    </tr>\n",
1032
       "    <tr>\n",
1033
       "      <th>hippikon</th>\n",
1034
       "      <td>2</td>\n",
1035
       "    </tr>\n",
1036
       "    <tr>\n",
1037
       "      <th>laidlawii</th>\n",
1038
       "      <td>2</td>\n",
1039
       "    </tr>\n",
1040
       "    <tr>\n",
1041
       "      <th>oculi</th>\n",
1042
       "      <td>1</td>\n",
1043
       "    </tr>\n",
1044
       "    <tr>\n",
1045
       "      <th rowspan=\"5\" valign=\"top\">Achromobacter</th>\n",
1046
       "      <th>denitrificans</th>\n",
1047
       "      <td>3</td>\n",
1048
       "    </tr>\n",
1049
       "    <tr>\n",
1050
       "      <th>insolitus</th>\n",
1051
       "      <td>4</td>\n",
1052
       "    </tr>\n",
1053
       "    <tr>\n",
1054
       "      <th>sp.</th>\n",
1055
       "      <td>2</td>\n",
1056
       "    </tr>\n",
1057
       "    <tr>\n",
1058
       "      <th>spanius</th>\n",
1059
       "      <td>4</td>\n",
1060
       "    </tr>\n",
1061
       "    <tr>\n",
1062
       "      <th>xylosoxidans</th>\n",
1063
       "      <td>11</td>\n",
1064
       "    </tr>\n",
1065
       "    <tr>\n",
1066
       "      <th rowspan=\"2\" valign=\"top\">Acidaminococcus</th>\n",
1067
       "      <th>fermentans</th>\n",
1068
       "      <td>1</td>\n",
1069
       "    </tr>\n",
1070
       "    <tr>\n",
1071
       "      <th>intestini</th>\n",
1072
       "      <td>1</td>\n",
1073
       "    </tr>\n",
1074
       "    <tr>\n",
1075
       "      <th>Acidiferrobacter</th>\n",
1076
       "      <th>sp.</th>\n",
1077
       "      <td>1</td>\n",
1078
       "    </tr>\n",
1079
       "    <tr>\n",
1080
       "      <th>...</th>\n",
1081
       "      <th>...</th>\n",
1082
       "      <td>...</td>\n",
1083
       "    </tr>\n",
1084
       "    <tr>\n",
1085
       "      <th>Zymobacter</th>\n",
1086
       "      <th>palmae</th>\n",
1087
       "      <td>2</td>\n",
1088
       "    </tr>\n",
1089
       "    <tr>\n",
1090
       "      <th>Zymomonas</th>\n",
1091
       "      <th>mobilis</th>\n",
1092
       "      <td>49</td>\n",
1093
       "    </tr>\n",
1094
       "    <tr>\n",
1095
       "      <th>[Arcobacter]</th>\n",
1096
       "      <th>porcinus</th>\n",
1097
       "      <td>1</td>\n",
1098
       "    </tr>\n",
1099
       "    <tr>\n",
1100
       "      <th>[Bacillus</th>\n",
1101
       "      <th>thuringiensis]</th>\n",
1102
       "      <td>2</td>\n",
1103
       "    </tr>\n",
1104
       "    <tr>\n",
1105
       "      <th rowspan=\"2\" valign=\"top\">[Bacillus]</th>\n",
1106
       "      <th>caldolyticus</th>\n",
1107
       "      <td>2</td>\n",
1108
       "    </tr>\n",
1109
       "    <tr>\n",
1110
       "      <th>selenitireducens</th>\n",
1111
       "      <td>1</td>\n",
1112
       "    </tr>\n",
1113
       "    <tr>\n",
1114
       "      <th rowspan=\"2\" valign=\"top\">[Brevibacterium]</th>\n",
1115
       "      <th>flavum</th>\n",
1116
       "      <td>2</td>\n",
1117
       "    </tr>\n",
1118
       "    <tr>\n",
1119
       "      <th>frigoritolerans</th>\n",
1120
       "      <td>1</td>\n",
1121
       "    </tr>\n",
1122
       "    <tr>\n",
1123
       "      <th rowspan=\"8\" valign=\"top\">[Clostridium]</th>\n",
1124
       "      <th>bolteae</th>\n",
1125
       "      <td>2</td>\n",
1126
       "    </tr>\n",
1127
       "    <tr>\n",
1128
       "      <th>cellulolyticum</th>\n",
1129
       "      <td>1</td>\n",
1130
       "    </tr>\n",
1131
       "    <tr>\n",
1132
       "      <th>cellulosi</th>\n",
1133
       "      <td>1</td>\n",
1134
       "    </tr>\n",
1135
       "    <tr>\n",
1136
       "      <th>clariflavum</th>\n",
1137
       "      <td>1</td>\n",
1138
       "    </tr>\n",
1139
       "    <tr>\n",
1140
       "      <th>propionicum</th>\n",
1141
       "      <td>1</td>\n",
1142
       "    </tr>\n",
1143
       "    <tr>\n",
1144
       "      <th>saccharolyticum</th>\n",
1145
       "      <td>1</td>\n",
1146
       "    </tr>\n",
1147
       "    <tr>\n",
1148
       "      <th>scindens</th>\n",
1149
       "      <td>1</td>\n",
1150
       "    </tr>\n",
1151
       "    <tr>\n",
1152
       "      <th>stercorarium</th>\n",
1153
       "      <td>4</td>\n",
1154
       "    </tr>\n",
1155
       "    <tr>\n",
1156
       "      <th>[Enterobacter]</th>\n",
1157
       "      <th>lignolyticus</th>\n",
1158
       "      <td>1</td>\n",
1159
       "    </tr>\n",
1160
       "    <tr>\n",
1161
       "      <th rowspan=\"3\" valign=\"top\">[Eubacterium]</th>\n",
1162
       "      <th>eligens</th>\n",
1163
       "      <td>3</td>\n",
1164
       "    </tr>\n",
1165
       "    <tr>\n",
1166
       "      <th>hallii</th>\n",
1167
       "      <td>1</td>\n",
1168
       "    </tr>\n",
1169
       "    <tr>\n",
1170
       "      <th>rectale</th>\n",
1171
       "      <td>1</td>\n",
1172
       "    </tr>\n",
1173
       "    <tr>\n",
1174
       "      <th rowspan=\"2\" valign=\"top\">[Haemophilus]</th>\n",
1175
       "      <th>ducreyi</th>\n",
1176
       "      <td>14</td>\n",
1177
       "    </tr>\n",
1178
       "    <tr>\n",
1179
       "      <th>parasuis</th>\n",
1180
       "      <td>4</td>\n",
1181
       "    </tr>\n",
1182
       "    <tr>\n",
1183
       "      <th>[Mycobacterium]</th>\n",
1184
       "      <th>chelonae</th>\n",
1185
       "      <td>2</td>\n",
1186
       "    </tr>\n",
1187
       "    <tr>\n",
1188
       "      <th>[Pasteurella]</th>\n",
1189
       "      <th>aerogenes</th>\n",
1190
       "      <td>1</td>\n",
1191
       "    </tr>\n",
1192
       "    <tr>\n",
1193
       "      <th>[Polyangium]</th>\n",
1194
       "      <th>brachysporum</th>\n",
1195
       "      <td>1</td>\n",
1196
       "    </tr>\n",
1197
       "    <tr>\n",
1198
       "      <th>[Pseudomonas</th>\n",
1199
       "      <th>syringae]</th>\n",
1200
       "      <td>3</td>\n",
1201
       "    </tr>\n",
1202
       "    <tr>\n",
1203
       "      <th>[Pseudomonas]</th>\n",
1204
       "      <th>mesoacidophila</th>\n",
1205
       "      <td>4</td>\n",
1206
       "    </tr>\n",
1207
       "    <tr>\n",
1208
       "      <th>complete</th>\n",
1209
       "      <th>chromosome</th>\n",
1210
       "      <td>2</td>\n",
1211
       "    </tr>\n",
1212
       "    <tr>\n",
1213
       "      <th>gamma</th>\n",
1214
       "      <th>proteobacterium</th>\n",
1215
       "      <td>1</td>\n",
1216
       "    </tr>\n",
1217
       "    <tr>\n",
1218
       "      <th>secondary</th>\n",
1219
       "      <th>endosymbiont</th>\n",
1220
       "      <td>1</td>\n",
1221
       "    </tr>\n",
1222
       "  </tbody>\n",
1223
       "</table>\n",
1224
       "<p>3186 rows × 1 columns</p>\n",
1225
       "</div>"
1226
      ],
1227
      "text/plain": [
1228
       "                                   count\n",
1229
       "genus            species                \n",
1230
       "'Catharanthus    roseus'               2\n",
1231
       "'Deinococcus     soli'                 1\n",
1232
       "'Nostoc          azollae'              3\n",
1233
       "18,711,729       reads                 1\n",
1234
       "Acaryochloris    marina               10\n",
1235
       "Acetobacter      aceti                 1\n",
1236
       "                 ascendens             1\n",
1237
       "                 orientalis            2\n",
1238
       "                 oryzifermentans       1\n",
1239
       "                 pasteurianus         91\n",
1240
       "                 persici               2\n",
1241
       "                 pomorum               7\n",
1242
       "                 senegalensis          2\n",
1243
       "                 sp.                   7\n",
1244
       "                 tropicalis            2\n",
1245
       "Acetobacterium   woodii                1\n",
1246
       "Acetohalobium    arabaticum            1\n",
1247
       "Acetomicrobium   mobile                1\n",
1248
       "Acholeplasma     axanthum              1\n",
1249
       "                 hippikon              2\n",
1250
       "                 laidlawii             2\n",
1251
       "                 oculi                 1\n",
1252
       "Achromobacter    denitrificans         3\n",
1253
       "                 insolitus             4\n",
1254
       "                 sp.                   2\n",
1255
       "                 spanius               4\n",
1256
       "                 xylosoxidans         11\n",
1257
       "Acidaminococcus  fermentans            1\n",
1258
       "                 intestini             1\n",
1259
       "Acidiferrobacter sp.                   1\n",
1260
       "...                                  ...\n",
1261
       "Zymobacter       palmae                2\n",
1262
       "Zymomonas        mobilis              49\n",
1263
       "[Arcobacter]     porcinus              1\n",
1264
       "[Bacillus        thuringiensis]        2\n",
1265
       "[Bacillus]       caldolyticus          2\n",
1266
       "                 selenitireducens      1\n",
1267
       "[Brevibacterium] flavum                2\n",
1268
       "                 frigoritolerans       1\n",
1269
       "[Clostridium]    bolteae               2\n",
1270
       "                 cellulolyticum        1\n",
1271
       "                 cellulosi             1\n",
1272
       "                 clariflavum           1\n",
1273
       "                 propionicum           1\n",
1274
       "                 saccharolyticum       1\n",
1275
       "                 scindens              1\n",
1276
       "                 stercorarium          4\n",
1277
       "[Enterobacter]   lignolyticus          1\n",
1278
       "[Eubacterium]    eligens               3\n",
1279
       "                 hallii                1\n",
1280
       "                 rectale               1\n",
1281
       "[Haemophilus]    ducreyi              14\n",
1282
       "                 parasuis              4\n",
1283
       "[Mycobacterium]  chelonae              2\n",
1284
       "[Pasteurella]    aerogenes             1\n",
1285
       "[Polyangium]     brachysporum          1\n",
1286
       "[Pseudomonas     syringae]             3\n",
1287
       "[Pseudomonas]    mesoacidophila        4\n",
1288
       "complete         chromosome            2\n",
1289
       "gamma            proteobacterium       1\n",
1290
       "secondary        endosymbiont          1\n",
1291
       "\n",
1292
       "[3186 rows x 1 columns]"
1293
      ]
1294
     },
1295
     "execution_count": 24,
1296
     "metadata": {},
1297
     "output_type": "execute_result"
1298
    }
1299
   ],
1300
   "source": [
1301
    "# Count rows per (genus, species) pair. The result must be assigned back:\n",
    "# the lines below rename its single column to \"count\" and display it.\n",
    "inventory = inventory.groupby([\"genus\", \"species\"]).agg({\"species\": \"count\"})\n",
1302
    "inventory.columns=[\"count\"]\n",
1303
    "inventory"
1304
   ]
1305
  },
1306
  {
1307
   "cell_type": "code",
1308
   "execution_count": 117,
1309
   "metadata": {},
1310
   "outputs": [
1311
    {
1312
     "data": {
1313
      "text/html": [
1314
       "<div>\n",
1315
       "<style scoped>\n",
1316
       "    .dataframe tbody tr th:only-of-type {\n",
1317
       "        vertical-align: middle;\n",
1318
       "    }\n",
1319
       "\n",
1320
       "    .dataframe tbody tr th {\n",
1321
       "        vertical-align: top;\n",
1322
       "    }\n",
1323
       "\n",
1324
       "    .dataframe thead th {\n",
1325
       "        text-align: right;\n",
1326
       "    }\n",
1327
       "</style>\n",
1328
       "<table border=\"1\" class=\"dataframe\">\n",
1329
       "  <thead>\n",
1330
       "    <tr style=\"text-align: right;\">\n",
1331
       "      <th></th>\n",
1332
       "      <th>n_sequences</th>\n",
1333
       "      <th>species</th>\n",
1334
       "    </tr>\n",
1335
       "    <tr>\n",
1336
       "      <th>genus</th>\n",
1337
       "      <th></th>\n",
1338
       "      <th></th>\n",
1339
       "    </tr>\n",
1340
       "  </thead>\n",
1341
       "  <tbody>\n",
1342
       "    <tr>\n",
1343
       "      <th>Bacillus</th>\n",
1344
       "      <td>1132</td>\n",
1345
       "      <td>11</td>\n",
1346
       "    </tr>\n",
1347
       "    <tr>\n",
1348
       "      <th>Streptomyces</th>\n",
1349
       "      <td>743</td>\n",
1350
       "      <td>5</td>\n",
1351
       "    </tr>\n",
1352
       "    <tr>\n",
1353
       "      <th>Vibrio</th>\n",
1354
       "      <td>468</td>\n",
1355
       "      <td>5</td>\n",
1356
       "    </tr>\n",
1357
       "    <tr>\n",
1358
       "      <th>Rhizobium</th>\n",
1359
       "      <td>325</td>\n",
1360
       "      <td>6</td>\n",
1361
       "    </tr>\n",
1362
       "    <tr>\n",
1363
       "      <th>Pseudomonas</th>\n",
1364
       "      <td>304</td>\n",
1365
       "      <td>8</td>\n",
1366
       "    </tr>\n",
1367
       "    <tr>\n",
1368
       "      <th>Staphylococcus</th>\n",
1369
       "      <td>301</td>\n",
1370
       "      <td>6</td>\n",
1371
       "    </tr>\n",
1372
       "    <tr>\n",
1373
       "      <th>Clostridium</th>\n",
1374
       "      <td>259</td>\n",
1375
       "      <td>5</td>\n",
1376
       "    </tr>\n",
1377
       "    <tr>\n",
1378
       "      <th>Streptococcus</th>\n",
1379
       "      <td>222</td>\n",
1380
       "      <td>6</td>\n",
1381
       "    </tr>\n",
1382
       "    <tr>\n",
1383
       "      <th>Planktothrix</th>\n",
1384
       "      <td>179</td>\n",
1385
       "      <td>5</td>\n",
1386
       "    </tr>\n",
1387
       "    <tr>\n",
1388
       "      <th>Stenotrophomonas</th>\n",
1389
       "      <td>176</td>\n",
1390
       "      <td>5</td>\n",
1391
       "    </tr>\n",
1392
       "    <tr>\n",
1393
       "      <th>Escherichia</th>\n",
1394
       "      <td>133</td>\n",
1395
       "      <td>3</td>\n",
1396
       "    </tr>\n",
1397
       "    <tr>\n",
1398
       "      <th>Paenibacillus</th>\n",
1399
       "      <td>127</td>\n",
1400
       "      <td>3</td>\n",
1401
       "    </tr>\n",
1402
       "    <tr>\n",
1403
       "      <th>Listeria</th>\n",
1404
       "      <td>104</td>\n",
1405
       "      <td>4</td>\n",
1406
       "    </tr>\n",
1407
       "    <tr>\n",
1408
       "      <th>Corynebacterium</th>\n",
1409
       "      <td>103</td>\n",
1410
       "      <td>7</td>\n",
1411
       "    </tr>\n",
1412
       "    <tr>\n",
1413
       "      <th>Klebsiella</th>\n",
1414
       "      <td>16</td>\n",
1415
       "      <td>3</td>\n",
1416
       "    </tr>\n",
1417
       "    <tr>\n",
1418
       "      <th>Shigella</th>\n",
1419
       "      <td>14</td>\n",
1420
       "      <td>3</td>\n",
1421
       "    </tr>\n",
1422
       "    <tr>\n",
1423
       "      <th>Salmonella</th>\n",
1424
       "      <td>4</td>\n",
1425
       "      <td>2</td>\n",
1426
       "    </tr>\n",
1427
       "    <tr>\n",
1428
       "      <th>Enterobacter</th>\n",
1429
       "      <td>1</td>\n",
1430
       "      <td>1</td>\n",
1431
       "    </tr>\n",
1432
       "  </tbody>\n",
1433
       "</table>\n",
1434
       "</div>"
1435
      ],
1436
      "text/plain": [
1437
       "                  n_sequences  species\n",
1438
       "genus                                 \n",
1439
       "Bacillus                 1132       11\n",
1440
       "Streptomyces              743        5\n",
1441
       "Vibrio                    468        5\n",
1442
       "Rhizobium                 325        6\n",
1443
       "Pseudomonas               304        8\n",
1444
       "Staphylococcus            301        6\n",
1445
       "Clostridium               259        5\n",
1446
       "Streptococcus             222        6\n",
1447
       "Planktothrix              179        5\n",
1448
       "Stenotrophomonas          176        5\n",
1449
       "Escherichia               133        3\n",
1450
       "Paenibacillus             127        3\n",
1451
       "Listeria                  104        4\n",
1452
       "Corynebacterium           103        7\n",
1453
       "Klebsiella                 16        3\n",
1454
       "Shigella                   14        3\n",
1455
       "Salmonella                  4        2\n",
1456
       "Enterobacter                1        1"
1457
      ]
1458
     },
1459
     "execution_count": 117,
1460
     "metadata": {},
1461
     "output_type": "execute_result"
1462
    }
1463
   ],
1464
   "source": [
1465
    "# Per genus: sum of the per-species counts and number of species rows.\n",
    "# Use an ordered list of aggregations (a set literal iterates in arbitrary order),\n",
    "# because the next line assigns the column names positionally.\n",
    "counts = inventory.reset_index().groupby(\"genus\")[\"count\"].agg([\"sum\", \"count\"])\n",
1466
    "counts.columns=[\"n_sequences\",\"species\"]\n",
1467
    "counts.sort_values(\"n_sequences\", ascending=False)"
1468
   ]
1469
  },
1470
  {
1471
   "cell_type": "code",
1472
   "execution_count": null,
1473
   "metadata": {},
1474
   "outputs": [],
1475
   "source": []
1476
  }
1477
 ],
1478
 "metadata": {
1479
  "kernelspec": {
1480
   "display_name": "Python 3",
1481
   "language": "python",
1482
   "name": "python3"
1483
  },
1484
  "language_info": {
1485
   "codemirror_mode": {
1486
    "name": "ipython",
1487
    "version": 3
1488
   },
1489
   "file_extension": ".py",
1490
   "mimetype": "text/x-python",
1491
   "name": "python",
1492
   "nbconvert_exporter": "python",
1493
   "pygments_lexer": "ipython3",
1494
   "version": "3.6.8"
1495
  },
1496
  "varInspector": {
1497
   "cols": {
1498
    "lenName": 16,
1499
    "lenType": 16,
1500
    "lenVar": 40
1501
   },
1502
   "kernels_config": {
1503
    "python": {
1504
     "delete_cmd_postfix": "",
1505
     "delete_cmd_prefix": "del ",
1506
     "library": "var_list.py",
1507
     "varRefreshCmd": "print(var_dic_list())"
1508
    },
1509
    "r": {
1510
     "delete_cmd_postfix": ") ",
1511
     "delete_cmd_prefix": "rm(",
1512
     "library": "var_list.r",
1513
     "varRefreshCmd": "cat(var_dic_list()) "
1514
    }
1515
   },
1516
   "types_to_exclude": [
1517
    "module",
1518
    "function",
1519
    "builtin_function_or_method",
1520
    "instance",
1521
    "_Feature"
1522
   ],
1523
   "window_display": false
1524
  }
1525
 },
1526
 "nbformat": 4,
1527
 "nbformat_minor": 2
1528
}