Switch to unified view

a b/ipynb/sequences to dataframe.ipynb
1
{
2
 "cells": [
3
  {
4
   "cell_type": "code",
5
   "execution_count": 1,
6
   "metadata": {},
7
   "outputs": [],
8
   "source": [
9
    "%load_ext autoreload\n",
10
    "%autoreload 2"
11
   ]
12
  },
13
  {
14
   "cell_type": "code",
15
   "execution_count": 31,
16
   "metadata": {},
17
   "outputs": [],
18
   "source": [
19
    "#export\n",
20
    "import sys\n",
21
    "sys.path.append(\"..\")\n",
22
    "from faigen.data import sequence \n",
23
    "from faigen.data.sequence import regex_filter, count_filter, Dna2VecDataBunch,Dna2VecList, seq_record\n",
24
    "from functools import partial\n",
25
    "import pandas as pd\n",
26
    "import numpy as np\n",
27
    "from sklearn.decomposition import PCA\n",
28
    "from sklearn import manifold,neighbors\n",
29
    "from scipy.cluster.hierarchy import dendrogram, linkage  \n",
30
    "from matplotlib import pyplot as plt\n",
31
    "import seaborn as sns; sns.set(color_codes=True)\n",
32
    "import plotly.plotly as py\n",
33
    "import plotly.graph_objs as go\n",
34
    "from fastai import *\n",
35
    "from fastai.data_block import *\n",
36
    "from fastai.basic_train import *\n",
37
    "from fastai.layers import *\n",
38
    "from fastai.metrics import *\n",
39
    "from fastai.text import *\n",
40
    "from gensim.models import Word2Vec\n",
41
    "import torch \n",
42
    "import torch.nn as nn\n",
43
    "import torch.nn.functional as F\n",
44
    "import gc \n",
45
    "from itertools import islice\n",
46
    "from tqdm import tqdm\n"
47
   ]
48
  },
49
  {
50
   "cell_type": "code",
51
   "execution_count": 4,
52
   "metadata": {},
53
   "outputs": [
54
    {
55
     "name": "stdout",
56
     "output_type": "stream",
57
     "text": [
58
      "Loading embedding\n"
59
     ]
60
    }
61
   ],
62
   "source": [
63
    "#export\n",
64
    "print(\"Loading embedding\")\n",
65
    "word_vectors = Word2Vec.load_word2vec_format('/data/genomes/embeddings/dna2vec-20190612-1611-k10to10-100d-10c-4870Mbp-sliding-kPR.w2v') "
66
   ]
67
  },
68
  {
69
   "cell_type": "code",
70
   "execution_count": 5,
71
   "metadata": {},
72
   "outputs": [
73
    {
74
     "data": {
75
      "text/plain": [
76
       "14495"
77
      ]
78
     },
79
     "execution_count": 5,
80
     "metadata": {},
81
     "output_type": "execute_result"
82
    }
83
   ],
84
   "source": [
85
    "#export\n",
86
    "# DB=\"/data/genomes/GenSeq_fastas\"\n",
87
    "# DB='/home/serge/development/genomes/ncbi-genomes-2019-04-07/bacterial genomes'\n",
88
    "# DB=\"/home/serge/database/data/genomes/ncbi-genomes-2019-04-07/Bacillus\"\n",
89
    "DB=\"/home/serge/database/data/genomes/bacillus/ncbi-genomes-2019-06-25\"\n",
90
    "data, X, dfx = None,None,None\n",
91
    "bunch=None\n",
92
    "learner=None\n",
93
    "gc.collect()"
94
   ]
95
  },
96
  {
97
   "cell_type": "code",
98
   "execution_count": 6,
99
   "metadata": {},
100
   "outputs": [
101
    {
102
     "name": "stderr",
103
     "output_type": "stream",
104
     "text": [
105
      "100%|██████████| 3964/3964 [06:13<00:00, 15.06it/s]\n",
106
      "100%|██████████| 3964/3964 [05:12<00:00, 14.89it/s]\n"
107
     ]
108
    }
109
   ],
110
   "source": [
111
    "filters=[partial(regex_filter, rx=\"plasmid\", keep=False)]\n",
112
    "data = sequence.Dna2VecList.from_folder(DB,filters=filters,n_cpus=7,emb=word_vectors,recurse=True)\n",
113
    "sequence.GSFileProcessor().process(data)\n"
114
   ]
115
  },
116
  {
117
   "cell_type": "code",
118
   "execution_count": null,
119
   "metadata": {},
120
   "outputs": [],
121
   "source": []
122
  },
123
  {
124
   "cell_type": "code",
125
   "execution_count": 58,
126
   "metadata": {},
127
   "outputs": [],
128
   "source": [
129
    "dfseq = pd.DataFrame.from_dict({\"seq\": list(map(str, data.items)),\n",
130
    "                                \"description\": data.descriptions, \n",
131
    "                                \"file\":data.files,\n",
132
    "                                \"id\":data.ids, \n",
133
    "                                \"name\":data.names})"
134
   ]
135
  },
136
  {
137
   "cell_type": "code",
138
   "execution_count": null,
139
   "metadata": {},
140
   "outputs": [],
141
   "source": []
142
  },
143
  {
144
   "cell_type": "code",
145
   "execution_count": 60,
146
   "metadata": {},
147
   "outputs": [],
148
   "source": [
149
    "dfseq.to_pickle(\"/home/serge/database/data/genomes/bacillus/ncbi-genomes-2019-06-25/all_sequences-no-plasmid.pkl\")"
150
   ]
151
  },
152
  {
153
   "cell_type": "code",
154
   "execution_count": 9,
155
   "metadata": {},
156
   "outputs": [],
157
   "source": [
158
    "def k_mers(sequence, k):\n",
159
    "    it = iter(sequence)\n",
160
    "    result = tuple(islice(it, k))\n",
161
    "    if len(result) == k:\n",
162
    "        yield \"\".join(result)\n",
163
    "    for elem in it:\n",
164
    "        result = result[1:] + (elem,)\n",
165
    "        yield \"\".join(result)"
166
   ]
167
  },
168
  {
169
   "cell_type": "code",
170
   "execution_count": 7,
171
   "metadata": {},
172
   "outputs": [
173
    {
174
     "data": {
175
      "text/plain": [
176
       "Seq('ATTTCCCATGAAATAGGTTCGGTTCTGTTAGTAAAAAATTCGAAATATAGTAAG...NNN', SingleLetterAlphabet())"
177
      ]
178
     },
179
     "execution_count": 7,
180
     "metadata": {},
181
     "output_type": "execute_result"
182
    }
183
   ],
184
   "source": [
185
    "data.items[0]"
186
   ]
187
  },
188
  {
189
   "cell_type": "code",
190
   "execution_count": 13,
191
   "metadata": {
192
    "scrolled": true
193
   },
194
   "outputs": [],
195
   "source": [
196
    "mers = np.asarray([word_vectors[x] for x in k_mers(str(data.items[0]), 10) if set(x) == set('ATGC')])"
197
   ]
198
  },
199
  {
200
   "cell_type": "code",
201
   "execution_count": 17,
202
   "metadata": {},
203
   "outputs": [
204
    {
205
     "data": {
206
      "text/plain": [
207
       "(100,)"
208
      ]
209
     },
210
     "execution_count": 17,
211
     "metadata": {},
212
     "output_type": "execute_result"
213
    }
214
   ],
215
   "source": [
216
    "mers.mean(axis=0).shape"
217
   ]
218
  },
219
  {
220
   "cell_type": "code",
221
   "execution_count": 44,
222
   "metadata": {},
223
   "outputs": [],
224
   "source": [
225
    "class Vectorizer:\n",
226
    "    def __init__(self,texts=None,  ngram=10, skip=0, n_cpus=7, chunksize=1000):\n",
227
    "        self.texts, self.ngram, self.skip, self.n_cpus, self.chunksize = texts, ngram, skip, n_cpus, chunksize\n",
228
    "                    \n",
229
    "    def vectorizer(self, t):\n",
230
    "        if self.ngram == 1:\n",
231
    "            toks = list(t)\n",
232
    "            if self.skip > 0:\n",
233
    "                toks = toks[::2] if self.skip == 1 else toks[::self.skip]\n",
234
    "        else:\n",
235
    "            toks = [t[i:i + self.ngram] for i in range(0, len(t), self.ngram + self.skip) if i+self.ngram < len(t)] \n",
236
    "        res = np.asarray(word_vectors[filter(lambda x: set(x) == set(\"ACGT\"), toks)]).mean(axis=0)\n",
237
    "        toks=None\n",
238
    "        return res\n",
239
    "\n",
240
    "    def _process_all_1(self, texts):\n",
241
    "        return [self.vectorizer(str(t)) for t in texts]\n",
242
    "\n",
243
    "    def process_all(self, texts):\n",
244
    "        if self.n_cpus <= 1: return self._process_all_1(texts)\n",
245
    "        with ProcessPoolExecutor(self.n_cpus) as e:\n",
246
    "            res = sum(e.map(self._process_all_1, partition_by_cores(texts, self.n_cpus)), [])\n",
247
    "        return res\n",
248
    "        \n",
249
    "    def vectorize(self,texts=None):\n",
250
    "        texts = self.texts if self.texts is not None else texts\n",
251
    "        vectors = []\n",
252
    "        chunks = len(texts) // self.chunksize + 1\n",
253
    "        for i in tqdm(range(chunks)):\n",
254
    "            advance = min((len(texts) - i * self.chunksize), self.chunksize)\n",
255
    "            vectors += self.process_all(texts[i:i + advance])\n",
256
    "        return vectors"
257
   ]
258
  },
259
  {
260
   "cell_type": "code",
261
   "execution_count": 3,
262
   "metadata": {},
263
   "outputs": [
264
    {
265
     "ename": "TypeError",
266
     "evalue": "can only concatenate list (not \"int\") to list",
267
     "output_type": "error",
268
     "traceback": [
269
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
270
      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
271
      "\u001b[0;32m<ipython-input-3-b170b0d83771>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
272
      "\u001b[0;31mTypeError\u001b[0m: can only concatenate list (not \"int\") to list"
273
     ]
274
    }
275
   ],
276
   "source": [
277
    "sum(list([1]),[])"
278
   ]
279
  },
280
  {
281
   "cell_type": "code",
282
   "execution_count": 6,
283
   "metadata": {},
284
   "outputs": [
285
    {
286
     "name": "stdout",
287
     "output_type": "stream",
288
     "text": [
289
      "Object `ProcessPoolExecutor` not found.\n"
290
     ]
291
    }
292
   ],
293
   "source": [
294
    "ProcessPoolExecutor??"
295
   ]
296
  },
297
  {
298
   "cell_type": "code",
299
   "execution_count": 5,
300
   "metadata": {},
301
   "outputs": [
302
    {
303
     "ename": "NameError",
304
     "evalue": "name 'partition_by_cores' is not defined",
305
     "output_type": "error",
306
     "traceback": [
307
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
308
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
309
      "\u001b[0;32m<ipython-input-5-9f45c6633e01>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mres\u001b[0m\u001b[0;34m=\u001b[0m \u001b[0mpartition_by_cores\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m7\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
310
      "\u001b[0;31mNameError\u001b[0m: name 'partition_by_cores' is not defined"
311
     ]
312
    }
313
   ],
314
   "source": [
315
    "res= partition_by_cores(data.items, 7)"
316
   ]
317
  },
318
  {
319
   "cell_type": "code",
320
   "execution_count": 51,
321
   "metadata": {},
322
   "outputs": [
323
    {
324
     "data": {
325
      "text/plain": [
326
       "7"
327
      ]
328
     },
329
     "execution_count": 51,
330
     "metadata": {},
331
     "output_type": "execute_result"
332
    }
333
   ],
334
   "source": [
335
    "len(res)"
336
   ]
337
  },
338
  {
339
   "cell_type": "code",
340
   "execution_count": 45,
341
   "metadata": {},
342
   "outputs": [
343
    {
344
     "name": "stderr",
345
     "output_type": "stream",
346
     "text": [
347
      "\n",
348
      "\n",
349
      "\n",
350
      "\n",
351
      "\n",
352
      "\n",
353
      "\n",
354
      "  0%|          | 0/448 [00:00<?, ?it/s]\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A"
355
     ]
356
    },
357
    {
358
     "ename": "TypeError",
359
     "evalue": "zip argument #2 must support iteration",
360
     "output_type": "error",
361
     "traceback": [
362
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
363
      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
364
      "\u001b[0;32m<ipython-input-45-cacfbd0a088c>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mvectors\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mVectorizer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvectorize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
365
      "\u001b[0;32m<ipython-input-44-aef08dea39e1>\u001b[0m in \u001b[0;36mvectorize\u001b[0;34m(self, texts)\u001b[0m\n\u001b[1;32m     30\u001b[0m         \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtqdm\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchunks\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     31\u001b[0m             \u001b[0madvance\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtexts\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mi\u001b[0m \u001b[0;34m*\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mchunksize\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mchunksize\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 32\u001b[0;31m             \u001b[0mvectors\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprocess_all\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtexts\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mi\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0madvance\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     33\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mvectors\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
366
      "\u001b[0;32m<ipython-input-44-aef08dea39e1>\u001b[0m in \u001b[0;36mprocess_all\u001b[0;34m(self, texts)\u001b[0m\n\u001b[1;32m     21\u001b[0m         \u001b[0;32mwith\u001b[0m \u001b[0mProcessPoolExecutor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mn_cpus\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     22\u001b[0m             res = sum(e.map(self._process_all_1,\n\u001b[0;32m---> 23\u001b[0;31m                              partition_by_cores(texts, self.n_cpus),5), [])\n\u001b[0m\u001b[1;32m     24\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mres\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     25\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
367
      "\u001b[0;32m~/anaconda3/envs/bio/lib/python3.6/concurrent/futures/process.py\u001b[0m in \u001b[0;36mmap\u001b[0;34m(self, fn, timeout, chunksize, *iterables)\u001b[0m\n\u001b[1;32m    494\u001b[0m         results = super().map(partial(_process_chunk, fn),\n\u001b[1;32m    495\u001b[0m                               \u001b[0m_get_chunks\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0miterables\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mchunksize\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mchunksize\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 496\u001b[0;31m                               timeout=timeout)\n\u001b[0m\u001b[1;32m    497\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0m_chain_from_iterable_of_lists\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresults\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    498\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
368
      "\u001b[0;32m~/anaconda3/envs/bio/lib/python3.6/concurrent/futures/_base.py\u001b[0m in \u001b[0;36mmap\u001b[0;34m(self, fn, timeout, chunksize, *iterables)\u001b[0m\n\u001b[1;32m    573\u001b[0m             \u001b[0mend_time\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtimeout\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmonotonic\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    574\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 575\u001b[0;31m         \u001b[0mfs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msubmit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0margs\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mzip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0miterables\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    576\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    577\u001b[0m         \u001b[0;31m# Yield must be hidden in closure so that the futures are submitted\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
369
      "\u001b[0;32m~/anaconda3/envs/bio/lib/python3.6/concurrent/futures/_base.py\u001b[0m in \u001b[0;36m<listcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m    573\u001b[0m             \u001b[0mend_time\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtimeout\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmonotonic\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    574\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 575\u001b[0;31m         \u001b[0mfs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msubmit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0margs\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mzip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0miterables\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    576\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    577\u001b[0m         \u001b[0;31m# Yield must be hidden in closure so that the futures are submitted\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
370
      "\u001b[0;32m~/anaconda3/envs/bio/lib/python3.6/concurrent/futures/process.py\u001b[0m in \u001b[0;36m_get_chunks\u001b[0;34m(chunksize, *iterables)\u001b[0m\n\u001b[1;32m    135\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_get_chunks\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0miterables\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mchunksize\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    136\u001b[0m     \u001b[0;34m\"\"\" Iterates over zip()ed iterables in chunks. \"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 137\u001b[0;31m     \u001b[0mit\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mzip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0miterables\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    138\u001b[0m     \u001b[0;32mwhile\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    139\u001b[0m         \u001b[0mchunk\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtuple\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mitertools\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mislice\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mit\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mchunksize\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
371
      "\u001b[0;31mTypeError\u001b[0m: zip argument #2 must support iteration"
372
     ]
373
    }
374
   ],
375
   "source": [
376
    "vectors = Vectorizer().vectorize(data.items)"
377
   ]
378
  },
379
  {
380
   "cell_type": "code",
381
   "execution_count": 26,
382
   "metadata": {
383
    "collapsed": true
384
   },
385
   "outputs": [
386
    {
387
     "name": "stderr",
388
     "output_type": "stream",
389
     "text": [
390
      "\n",
391
      "  0%|          | 0/447112 [00:00<?, ?it/s]\u001b[A"
392
     ]
393
    },
394
    {
395
     "ename": "KeyboardInterrupt",
396
     "evalue": "",
397
     "output_type": "error",
398
     "traceback": [
399
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
400
      "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
401
      "\u001b[0;32m<ipython-input-26-1645d2202b3e>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0mvectors\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0msequence\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtqdm\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m     \u001b[0mmers\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0masarray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mword_vectors\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mk_mers\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msequence\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m10\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'ATGC'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      4\u001b[0m     \u001b[0mvectors\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmers\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmean\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
402
      "\u001b[0;32m<ipython-input-26-1645d2202b3e>\u001b[0m in \u001b[0;36m<listcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0mvectors\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0msequence\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtqdm\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m     \u001b[0mmers\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0masarray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mword_vectors\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mk_mers\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msequence\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m10\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'ATGC'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      4\u001b[0m     \u001b[0mvectors\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmers\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmean\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
403
      "\u001b[0;32m<ipython-input-9-dff9effaec09>\u001b[0m in \u001b[0;36mk_mers\u001b[0;34m(sequence, k)\u001b[0m\n\u001b[1;32m      5\u001b[0m         \u001b[0;32myield\u001b[0m \u001b[0;34m\"\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      6\u001b[0m     \u001b[0;32mfor\u001b[0m \u001b[0melem\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mit\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m         \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0melem\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      8\u001b[0m         \u001b[0;32myield\u001b[0m \u001b[0;34m\"\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
404
      "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
405
     ]
406
    }
407
   ],
408
   "source": [
409
    "vectors = []\n",
410
    "for sequence in tqdm(data.items):\n",
411
    "    mers = np.asarray([word_vectors[x] for x in k_mers(sequence, 10) if set(x) == set('ATGC')])\n",
412
    "    vectors.append(mers.mean(axis=0))\n",
413
    "    "
414
   ]
415
  },
416
  {
417
   "cell_type": "code",
418
   "execution_count": null,
419
   "metadata": {},
420
   "outputs": [],
421
   "source": []
422
  }
423
 ],
424
 "metadata": {
425
  "kernelspec": {
426
   "display_name": "Python [conda env:bio] *",
427
   "language": "python",
428
   "name": "conda-env-bio-py"
429
  },
430
  "language_info": {
431
   "codemirror_mode": {
432
    "name": "ipython",
433
    "version": 3
434
   },
435
   "file_extension": ".py",
436
   "mimetype": "text/x-python",
437
   "name": "python",
438
   "nbconvert_exporter": "python",
439
   "pygments_lexer": "ipython3",
440
   "version": "3.6.8"
441
  }
442
 },
443
 "nbformat": 4,
444
 "nbformat_minor": 2
445
}