Switch to unified view

a b/exp/nb_BacteriaClassifier.py
1
2
#################################################
3
### THIS FILE WAS AUTOGENERATED! DO NOT EDIT! ###
4
#################################################
5
# file to edit: dev_nb/BacteriaClassifier.ipynb
6
7
import sys
8
sys.path.append("..")
9
from faigen.data import sequence
10
from faigen.data.sequence import regex_filter, count_filter, Dna2VecDataBunch
11
from functools import partial
12
import pandas as pd
13
import numpy as np
14
from sklearn.decomposition import PCA
15
from sklearn import manifold,neighbors
16
from scipy.cluster.hierarchy import dendrogram, linkage
17
from matplotlib import pyplot as plt
18
import seaborn as sns; sns.set(color_codes=True)
19
import plotly.plotly as py
20
import plotly.graph_objs as go
21
from fastai import *
22
from fastai.data_block import *
23
from fastai.basic_train import *
24
from fastai.layers import *
25
from fastai.metrics import *
26
from gensim.models import Word2Vec
27
import torch
28
import torch.nn as nn
29
import torch.nn.functional as F
30
31
if __name__ == '__main__':
    # Script entry point. In order, it:
    #   1. loads a pretrained word2vec-format embedding from disk,
    #   2. builds a Dna2VecDataBunch from a folder of FASTA files,
    #   3. assembles a two-layer fully connected classifier,
    #   4. trains it with fastai's one-cycle schedule.

    print("Loading embedding")
    # NOTE(review): newer gensim releases expose this loader on KeyedVectors
    # instead of Word2Vec -- confirm against the gensim version in use.
    word_vectors = Word2Vec.load_word2vec_format(
        '../faigen/pretrained/embeddings/dna2vec-20190611-1940-k8to8-100d-10c-4870Mbp-sliding-LmP.w2v'
    )

    print("Loading Data")
    DB = "/data/genomes/GenSeq_fastas"
    # DB='/home/serge/development/genomes/ncbi-genomes-2019-04-07/bacterial genomes'

    # The patterns are raw strings so that regex escapes such as \s reach the
    # regex engine untouched; in a plain string literal "\s" is an invalid
    # escape sequence and triggers a warning on modern Python.
    filters = [
        # Select records by the listed genus names.
        partial(regex_filter, rx=r"Bacillus|Staphylococcus|Vibrio|Rhizobium"),
        # keep=False presumably discards records matching the pattern.
        # NOTE(review): "plasmid?" makes the final "d" optional; if the goal
        # was to also match "plasmids", the pattern should be "plasmids?\s"
        # -- confirm intent before changing.
        partial(regex_filter, rx=r"plasmid?\s", keep=False),
        # partial(count_filter,num_fastas=(1,1), keep=1)
    ]

    bunch = Dna2VecDataBunch.from_folder(
        DB,
        test="test",
        filters=filters,
        # The label is the second whitespace-separated token of the input
        # string (presumably the genus in a FASTA description -- verify).
        labeler=lambda x: x.split()[1],
        emb=word_vectors,
        ngram=8,
        skip=0,
        n_cpus=7,
        # Aggregate along axis 0 by taking the mean.
        agg=partial(np.mean, axis=0),
    )

    print("Creating Learner")
    # Input and output widths come from the data bunch; the hidden layer
    # has 10 units.
    layers = [
        nn.Linear(bunch.train_dl.x.c, 10),
        nn.ReLU(),
        nn.Linear(10, bunch.train_dl.y.c),
    ]
    bac_classifier = SequentialEx(*layers)
    print(bac_classifier)
    learn = Learner(bunch, bac_classifier, metrics=[accuracy])

    print("Training")
    # fit_one_cycle(cyc_len, max_lr): 3 epochs, peak learning rate 5e-2.
    learn.fit_one_cycle(3, 5e-2)