{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "#export\n",
    "from gensim.models import Word2Vec\n",
    "from gensim.models.word2vec import LineSentence\n",
    "from glob import glob\n",
    "import os\n",
    "import gzip\n",
    "import re\n",
    "from functools import partial\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "import seaborn as sns; sns.set(color_codes=True)\n",
    "from sklearn.decomposition import PCA\n",
    "from sklearn.manifold import TSNE\n",
    "from sklearn import manifold,neighbors\n",
    "from sklearn.model_selection import train_test_split\n",
    "from scipy.cluster.hierarchy import dendrogram, linkage, to_tree, fcluster,distance \n",
    "from matplotlib import pyplot as plt\n",
    "from mpl_toolkits.mplot3d import Axes3D\n",
    "from Bio import SeqIO\n",
    "from Bio.Align import MultipleSeqAlignment\n",
    "from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor\n",
    "from Bio import Phylo\n",
    "from tqdm import tqdm\n",
    "from sklearn.preprocessing import MinMaxScaler\n",
    "import gc\n",
    "import random\n",
    "import multiprocessing as mp\n",
    "from gensim.models import KeyedVectors"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Sample Ksent "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "#export\n",
    "# Folder with the gzipped NCBI FASTA downloads; set GENOMES_DIR to point elsewhere.\n",
    "FASTA = os.environ.get(\"GENOMES_DIR\", \"/home/serge/data/genomes/ncbi-genomes-2019-04-07/\")\n",
    "labels = []\n",
    "# os.path.join tolerates a folder given without a trailing slash;\n",
    "# sorted() makes the scan order reproducible between runs.\n",
    "ff = sorted(glob(os.path.join(FASTA, \"*.gz\")))\n",
    "\n",
    "\n",
    "def get_spicies(d):\n",
    "    \"\"\"Return the species label found in a FASTA description line.\n",
    "\n",
    "    The description is split on whitespace and the third token is used.\n",
    "    When that token is the placeholder 'sp.', the next token is glued to\n",
    "    it (e.g. 'sp.JS').  Trailing ',' and ';' are removed so that\n",
    "    'sp. JS,' and 'sp. JS' end up under the same label.\n",
    "\n",
    "    NOTE(review): assumes headers look like '<id> <genus> <species> ...'\n",
    "    -- confirm against the input files.\n",
    "\n",
    "    Raises IndexError when the description has fewer than three tokens.\n",
    "    \"\"\"\n",
    "    tokens = d.split()\n",
    "    sp = tokens[2]\n",
    "    if sp == \"sp.\":\n",
    "        sp = \"\".join(tokens[2:4])\n",
    "    return sp.rstrip(\",;\")\n",
    "\n",
    "\n",
    "def clean(d):\n",
    "    \"\"\"Remove braces, square brackets and single/double quotes from `d`.\"\"\"\n",
    "    return re.sub(r'[{}\\[\\]\\\"\\']', \"\", d)\n",
    "\n",
    "\n",
    "def get_description(d, c=True):\n",
    "    \"\"\"Return the description, run through `clean` unless `c` is false.\"\"\"\n",
    "    return clean(d) if c else d\n",
    "\n",
    "\n",
    "def get_genus(d):\n",
    "    \"\"\"Return the second whitespace-separated token of the description.\n",
    "\n",
    "    Raises IndexError when the description has fewer than two tokens.\n",
    "    \"\"\"\n",
    "    return d.split()[1]\n",
    "\n",
    "\n",
    "def get_family(d):\n",
    "    \"\"\"Return the genus and species labels joined by a single space.\n",
    "\n",
    "    Despite its name this is the genus/species pair, not a taxonomic family.\n",
    "    \"\"\"\n",
    "    return \" \".join([get_genus(d), get_spicies(d)])\n",
    "\n",
    "\n",
    "def keep(k, d):\n",
    "    \"\"\"Tell whether description `d` contains the keyword(s) `k`.\n",
    "\n",
    "    `k` is either a single string (plain substring test) or an iterable\n",
    "    of strings, in which case one hit is enough.\n",
    "    \"\"\"\n",
    "    if isinstance(k, str):\n",
    "        return k in d\n",
    "    return any(x in d for x in k)\n",
    "\n",
    "\n",
    "def read_fastas(files, label, k=\" \", seq=True, d=\"plasmid\", compressed=True):\n",
    "    \"\"\"Yield one tuple per FASTA record that passes the keyword filters.\n",
    "\n",
    "    files      -- paths of the FASTA files to scan\n",
    "    label      -- callable turning a record description into its label\n",
    "    k          -- keyword(s) a description must contain (see `keep`)\n",
    "    seq        -- true: yield (sequence, label, length); false: (label, length)\n",
    "    d          -- keyword(s) that exclude a record (see `keep`)\n",
    "    compressed -- read the files through gzip in text mode instead of open()\n",
    "    \"\"\"\n",
    "    _open = partial(gzip.open, mode=\"rt\") if compressed else open\n",
    "    for file in tqdm(files):\n",
    "        with _open(file) as handle:\n",
    "            for record in SeqIO.parse(handle, \"fasta\"):\n",
    "                description = record.description\n",
    "                # A record needs a hit on `k` and no hit on `d`.\n",
    "                if not keep(k, description) or keep(d, description):\n",
    "                    continue\n",
    "                if seq:\n",
    "                    yield str(record.seq), label(description), len(record.seq)\n",
    "                else:\n",
    "                    yield label(description), len(record.seq)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "inventory = pd.DataFrame(read_fastas(ff,label=get_description, seq=False), columns=[\"description\",\"seq_len\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 191,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "inventory[\"genus\"] = [get_genus(x) for x in inventory.description.values]\n",
    "inventory[\"spicies\"]=[get_spicies(x) for x in inventory.description.values]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 208,
   "metadata": {},
   "outputs": [],
   "source": [
    "b= inventory.groupby([\"genus\", \"spicies\"]).agg({\"description\": \"count\" })"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 210,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
\n", " | description | \n", "
---|---|
spicies | \n", "\n", " |
subtilis | \n", "107 | \n", "
velezensis | \n", "72 | \n", "
anthracis | \n", "51 | \n", "
cereus | \n", "49 | \n", "
thuringiensis | \n", "45 | \n", "
amyloliquefaciens | \n", "36 | \n", "
licheniformis | \n", "25 | \n", "
cytotoxicus | \n", "11 | \n", "
pumilus | \n", "9 | \n", "
megaterium | \n", "9 | \n", "
coagulans | \n", "8 | \n", "
paralicheniformis | \n", "7 | \n", "
safensis | \n", "5 | \n", "
cellulasensis | \n", "4 | \n", "
altitudinis | \n", "4 | \n", "
clausii | \n", "3 | \n", "
vallismortis | \n", "3 | \n", "
atrophaeus | \n", "3 | \n", "
glycinifermentans | \n", "3 | \n", "
simplex | \n", "2 | \n", "
mycoides | \n", "2 | \n", "
circulans | \n", "2 | \n", "
weihenstephanensis | \n", "2 | \n", "
pseudomycoides | \n", "2 | \n", "
sp.JS, | \n", "1 | \n", "
sp.IHB | \n", "1 | \n", "
sp.HBCD-sjtu | \n", "1 | \n", "
sp.H15-1 | \n", "1 | \n", "
sp.BH072, | \n", "1 | \n", "
sp.FJAT-14266 | \n", "1 | \n", "
... | \n", "... | \n", "
flexus | \n", "1 | \n", "
filamentosus | \n", "1 | \n", "
cohnii | \n", "1 | \n", "
cellulosilyticus | \n", "1 | \n", "
caldolyticus | \n", "1 | \n", "
butanolivorans | \n", "1 | \n", "
bombysepticus | \n", "1 | \n", "
beveridgei | \n", "1 | \n", "
asahii | \n", "1 | \n", "
aryabhattai | \n", "1 | \n", "
horikoshii | \n", "1 | \n", "
kochii | \n", "1 | \n", "
sonorensis | \n", "1 | \n", "
krulwichiae | \n", "1 | \n", "
smithii | \n", "1 | \n", "
siamensis | \n", "1 | \n", "
selenitireducens | \n", "1 | \n", "
albus | \n", "1 | \n", "
pseudofirmus | \n", "1 | \n", "
oceanisediminis | \n", "1 | \n", "
muralis | \n", "1 | \n", "
mobilis | \n", "1 | \n", "
methylotrophicus | \n", "1 | \n", "
methanolicus | \n", "1 | \n", "
mesonae | \n", "1 | \n", "
marisflavi | \n", "1 | \n", "
litoralis | \n", "1 | \n", "
lentus | \n", "1 | \n", "
lehensis | \n", "1 | \n", "
xiamenensis | \n", "1 | \n", "
94 rows × 1 columns
\n", "\n", " | description | \n", "
---|---|
spicies | \n", "\n", " |
aeruginosa | \n", "182 | \n", "
chlororaphis | \n", "43 | \n", "
syringae | \n", "29 | \n", "
putida | \n", "29 | \n", "
fluorescens | \n", "22 | \n", "
stutzeri | \n", "17 | \n", "
protegens | \n", "7 | \n", "
synxantha | \n", "6 | \n", "
mendocina | \n", "6 | \n", "
orientalis | \n", "5 | \n", "
monteilii | \n", "5 | \n", "
brassicacearum | \n", "4 | \n", "
pseudoalcaligenes | \n", "3 | \n", "
mesoacidophila | \n", "3 | \n", "
koreensis | \n", "3 | \n", "
fulva | \n", "3 | \n", "
frederiksbergensis | \n", "3 | \n", "
parafulva | \n", "3 | \n", "
entomophila | \n", "3 | \n", "
citronellolis | \n", "2 | \n", "
fragi | \n", "2 | \n", "
amygdali | \n", "2 | \n", "
plecoglossicida | \n", "2 | \n", "
mosselii | \n", "2 | \n", "
taetrolens | \n", "2 | \n", "
azotoformans | \n", "2 | \n", "
savastanoi | \n", "2 | \n", "
sp.URMO17WK12:I11 | \n", "2 | \n", "
sp.TCU-HL1 | \n", "1 | \n", "
sp.R11-23-07 | \n", "1 | \n", "
... | \n", "... | \n", "
balearica | \n", "1 | \n", "
avellanae | \n", "1 | \n", "
antarctica | \n", "1 | \n", "
alkylphenolica | \n", "1 | \n", "
alcaliphila | \n", "1 | \n", "
rhizosphaerae | \n", "1 | \n", "
soli | \n", "1 | \n", "
sp.LPH1 | \n", "1 | \n", "
sp.FGI182, | \n", "1 | \n", "
sp.LH1G9 | \n", "1 | \n", "
sp.LG1E9 | \n", "1 | \n", "
sp.LG1D9 | \n", "1 | \n", "
sp.LBUM920 | \n", "1 | \n", "
sp.L10.10, | \n", "1 | \n", "
sp.K2W31S-8 | \n", "1 | \n", "
sp.JY-Q, | \n", "1 | \n", "
sp.HLS-6 | \n", "1 | \n", "
alcaligenes | \n", "1 | \n", "
sp.FDAARGOS_380 | \n", "1 | \n", "
sp.31-12 | \n", "1 | \n", "
sp.DY-1 | \n", "1 | \n", "
sp.DTU12.3 | \n", "1 | \n", "
sp.DR | \n", "1 | \n", "
sp.CMR5c | \n", "1 | \n", "
sp.CMR12a | \n", "1 | \n", "
sp.CCOS | \n", "1 | \n", "
sp.CC6-YY-74 | \n", "1 | \n", "
sp.AK6U | \n", "1 | \n", "
sp.58 | \n", "1 | \n", "
yamanorum | \n", "1 | \n", "
118 rows × 1 columns
\n", "