svm.py

import csv
import json
import random

import numpy as np
import pandas as pd
import tensorflow as tf

from functions import quan_detector, most_repeared_promoter, dataset
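
# Helper signatures inferred from their use below (functions.py is not part
# of this listing, so treat these as assumptions rather than a verified API):
#   quan_detector(X, Y)          -> (per_zeros, p_zeros, n_zeros)
#   most_repeared_promoter(X, Y) -> (most_vector, max_count, (_, p_count, n_count))
#   dataset(X, Y, test_ratio)    -> (x_train, y_train, x_test, y_test)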

from sklearn import svm
from sklearn.metrics import confusion_matrix

# Seed all RNG sources for reproducibility; TensorFlow is imported only for this.
np.random.seed(42)
tf.random.set_seed(42)  # TF2 API; on TF 1.x use tf.set_random_seed(42)
random.seed(42)

out_put_header = ['Promoter region', 'Positive_zeros', 'Negative_zeros', 'Sum_zeros',
                  'Positive_freq', 'Negative_freq', 'Sum_freq',
                  'Sum_all', 'Percent_all', 'Vector_freq',
                  'True positive', 'False positive', 'True negative', 'False negative',
                  'Accuracy', '>50%']

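# Write the CSV header once; one result row per promoter is appended below.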
output_file_name = 'output_svm.csv'
with open(output_file_name, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(out_put_header)

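# Phenotype labels per individual; the FID column holds the sample IDs.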
labels_file = 'labes.csv'
labels_df = pd.read_csv(labels_file, index_col=0)
ids_csv = labels_df.FID.tolist()

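# Score each chromosome-22 promoter region independently.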
promoters_list = range(100)
for promoter_num in promoters_list:
    print(promoter_num)
    promoter_file = 'promoters/chr22_' + str(promoter_num) + '.json'
    # Read this promoter's variant map: {individual ID: variant string}.
    with open(promoter_file) as json_data:
        ind_var = json.load(json_data)

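    # Build one variant row per individual, in the same order as the label file.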
    var_num = []
    for i in ids_csv:
        id_name = str(i)
        temp = ind_var[id_name]
        var_seq = list(map(int, temp))  # list() is required on Python 3
        var_num.append(var_seq)

    labels_df['vars'] = var_num
    lab_num = {1: [1, 0],  # positive
               2: [0, 1]}  # negative

    # One-hot encode the phenotype labels.
    pheno_new = []
    for i in labels_df.Pheno.tolist():
        pheno_new.append(lab_num[i])
    d = {"Pheno": pheno_new, "Vars": labels_df.vars}
    dataset_ = pd.DataFrame(d)

    dataset_X = np.array(dataset_.Vars.tolist())
    dataset_Y = np.array(dataset_.Pheno.tolist())
    # Keep only the individuals listed in the training-ID file.
    with open('train_id.txt') as id_file:
        t_idx = [int(line.strip()) for line in id_file]
    dataset_X = dataset_X[t_idx]
    dataset_Y = dataset_Y[t_idx]

    N = len(dataset_X)

    # Repeat information: how many individuals carry no variant at all, and
    # which variant vector occurs most often in each class.
    per_zeros, p_zeros, n_zeros = quan_detector(dataset_X, dataset_Y)
    count_zeros = p_zeros + n_zeros  # individuals without any variants

    most_vector, max_count, count_vector = most_repeared_promoter(dataset_X, dataset_Y)
    _, p_count, n_count = count_vector

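    # Record which positions in the most frequent vector carry a variant.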
    vart_pos = []
    for i in range(len(most_vector)):
        if most_vector[i] != '0':
            vart_pos.append(i)

    # Re-seed so every promoter uses the same train/test partition.
    np.random.seed(42)
    tf.random.set_seed(42)  # TF2 API; on TF 1.x use tf.set_random_seed(42)
    random.seed(42)

    # SVM accuracy

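    # Hold out 10% of individuals for testing; labels come back one-hot.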
    x_train, y_train, x_test, y_test = dataset(dataset_X, dataset_Y, test_ratio=0.1)
    # Collapse one-hot labels to class indices for scikit-learn.
    y_train = np.argmax(y_train, axis=1)
    y_test = np.argmax(y_test, axis=1)

    # Create an SVM classifier with an RBF kernel
    lsvm = svm.SVC(kernel='rbf', gamma=0.7, C=1.0)

    # Train the model using the training set
    lsvm.fit(x_train, y_train)

    # Make predictions on the held-out test set
    y_pred = lsvm.predict(x_test)
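    # For binary labels, confusion_matrix returns [[tn, fp], [fn, tp]],
    # so ravel() yields tn, fp, fn, tp in that order.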
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

    acc = (tp + tn) * 1. / (tp + fp + tn + fn)

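    # One output row per promoter; column order matches out_put_header above.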
    info = ['promoter ' + str(promoter_num), p_zeros, n_zeros, count_zeros,
            p_count, n_count, max_count,
            max_count + count_zeros, (max_count + count_zeros) * 1. / N, vart_pos,
            tp, fp, tn, fn, acc, acc > 0.5]

    with open(output_file_name, 'a', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(info)

print("Done")