|
exp/nb_BacteriaClassifier.py
|
#################################################
### THIS FILE WAS AUTOGENERATED! DO NOT EDIT! ###
#################################################
# file to edit: dev_nb/BacteriaClassifier.ipynb
|
import sys
sys.path.append("..")
from faigen.data import sequence
from faigen.data.sequence import regex_filter, count_filter, Dna2VecDataBunch
from functools import partial
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn import manifold, neighbors
from scipy.cluster.hierarchy import dendrogram, linkage
from matplotlib import pyplot as plt
import seaborn as sns; sns.set(color_codes=True)
import plotly.plotly as py
import plotly.graph_objs as go
from fastai import *
from fastai.data_block import *
from fastai.basic_train import *
from fastai.layers import *
from fastai.metrics import *
from gensim.models import KeyedVectors  # loads embeddings saved in word2vec format
import torch
import torch.nn as nn
import torch.nn.functional as F

if __name__ == '__main__':

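    # Overall flow: load pretrained dna2vec k-mer embeddings, turn each bacterial
    # genome FASTA record into a fixed-length embedding vector via Dna2VecDataBunch,
    # and train a small fully connected classifier on those vectors.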
    print("Loading embedding")
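    # Going by the file name, the embedding was trained on 8-mers only (k8to8),
    # has 100-dimensional vectors (100d) and covers ~4870 Mbp of sequence sampled
    # with a sliding window.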
    word_vectors = KeyedVectors.load_word2vec_format('../faigen/pretrained/embeddings/dna2vec-20190611-1940-k8to8-100d-10c-4870Mbp-sliding-LmP.w2v')

    print("Loading Data")
    DB = "/data/genomes/GenSeq_fastas"
    # DB = '/home/serge/development/genomes/ncbi-genomes-2019-04-07/bacterial genomes'

    # Keep only records from the four genera of interest and drop records whose
    # description marks them as plasmids.
    filters = [partial(regex_filter, rx="Bacillus|Staphylococcus|Vibrio|Rhizobium"),
               partial(regex_filter, rx=r"plasmid?\s", keep=False)]
               # partial(count_filter, num_fastas=(1, 1), keep=1)]

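    # Dna2VecDataBunch.from_folder (faigen) presumably walks the FASTA folder,
    # applies the filters, splits each sequence into 8-mers (ngram=8, matching the
    # embedding), looks the 8-mers up in word_vectors and reduces them with agg
    # (here a plain mean over axis 0), so every record becomes one 100-d feature
    # vector; labeler appears to take the second whitespace-separated token of the
    # record description as its class label.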
    bunch = Dna2VecDataBunch.from_folder(DB, test="test",
                                         filters=filters,
                                         labeler=lambda x: x.split()[1],
                                         emb=word_vectors, ngram=8, skip=0,
                                         n_cpus=7, agg=partial(np.mean, axis=0))

    print("Creating Learner")
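    # Classifier head: input width is the embedding size (bunch.train_dl.x.c,
    # presumably 100 here), one hidden layer of 10 units, and one output per genus
    # (bunch.train_dl.y.c). SequentialEx is fastai's Sequential variant whose
    # sub-modules can also access the block's original input; a plain nn.Sequential
    # would behave identically for this simple stack.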
    layers = [nn.Linear(bunch.train_dl.x.c, 10), nn.ReLU(),
              nn.Linear(10, bunch.train_dl.y.c)]
    bac_classifier = SequentialEx(*layers)
    print(bac_classifier)
    learn = Learner(bunch, bac_classifier, metrics=[accuracy])

    print("Training")
    learn.fit_one_cycle(3, 5e-2)  # 3 epochs with the 1cycle policy, max_lr=5e-2