kernel_classifier.py

import csv
import json
import random

import numpy as np
import pandas as pd
import tensorflow as tf

from functions import quan_detector, most_repeared_promoter, dataset
from sklearn.metrics import confusion_matrix

# Seed every source of randomness (NumPy, TensorFlow, Python) up front so
# runs are reproducible.
np.random.seed(42)
tf.set_random_seed(42)
random.seed(42)


def get_input_fn(dataset_split, batch_size, capacity=10000, min_after_dequeue=3000):
    """Build an input_fn that shuffles and batches a (features, labels) split."""

    def _input_fn():
        images_batch, labels_batch = tf.train.shuffle_batch(
            tensors=dataset_split,
            batch_size=batch_size,
            capacity=capacity,
            min_after_dequeue=min_after_dequeue,
            enqueue_many=True,
            num_threads=4)
        # The key must match the name of the real_valued_column defined below.
        features_map = {'images': images_batch}
        return features_map, labels_batch

    return _input_fn
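
# Note: tf.train.shuffle_batch builds a queue-based input pipeline. The
# contrib.learn estimator used below starts the required queue runners itself
# inside fit()/evaluate(), so no explicit tf.train.start_queue_runners call
# should be needed here.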

out_put_header = ['Promoter region', 'Positive_zeros', 'Negative_zeros', 'Sum_zeros',
                  'Positive_freq', 'Negative_freq', 'Sum_freq',
                  'Sum_all', 'Percent_all', 'Vector_freq',
                  'AUC', 'Accuracy',
                  '>50%']

output_file_name = 'output_kernelC.csv'
# Write the header once before the first run; rows are appended per promoter below.
# with open(output_file_name, 'w') as f:
#     writer = csv.writer(f)
#     writer.writerow(out_put_header)

labels_file = 'labes.csv'
labels_df = pd.read_csv(labels_file, index_col=0)
ids_csv = labels_df.FID.tolist()
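# The labels file is assumed (from its use below) to hold one row per
# individual, with an FID column carrying the individual id and a Pheno
# column coded 1 = positive / 2 = negative.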


promoters_list = range(100)
for promoter_num in promoters_list:
    print(promoter_num)
    promoter_file = 'promoters/chr22_' + str(promoter_num) + '.json'
    # Read the per-individual variant sequences for this promoter region.
    with open(promoter_file) as json_data:
        ind_var = json.load(json_data)
    ids_json = ind_var.keys()

    # Collect each individual's variant vector in the order of the labels file.
    var_num = []
    for i in ids_csv:
        id_name = str(i)
        temp = ind_var[id_name]
        var_seq = list(map(int, temp))  # list() so this also works on Python 3
        var_num.append(var_seq)

    labels_df['vars'] = var_num
    # One-hot encoding of the phenotype labels.
    lab_num = {1: [1, 0],  # positive
               2: [0, 1]}  # negative

    pheno_new = []
    for i in labels_df.Pheno.tolist():
        pheno_new.append(lab_num[i])
    d = {"Pheno": pheno_new, "Vars": labels_df.vars}
    dataset_ = pd.DataFrame(d)

    dataset_X = np.array(dataset_.Vars.tolist(), dtype=np.float32)
    dataset_Y = np.array(dataset_.Pheno.tolist(), dtype=np.float32)
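
    # The feature column and kernel mapper below are hard-coded to 64 inputs;
    # this assert (an added sanity check, assuming each promoter region is
    # encoded as a length-64 vector) fails fast if a promoter has a
    # different length.
    assert dataset_X.shape[1] == 64, dataset_X.shape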

    N = len(dataset_X)

    # Repeat information: counts of all-zero vectors and of the most frequent
    # variant vector, split by phenotype group.
    per_zeros, p_zeros, n_zeros = quan_detector(dataset_X, dataset_Y)
    count_zeros = p_zeros + n_zeros  # number of individuals without any variants

    most_vector, max_count, count_vector = most_repeared_promoter(dataset_X, dataset_Y)
    _, p_count, n_count = count_vector

    # Positions at which the most frequent vector carries a variant.
    vart_pos = []
    for i in range(len(most_vector)):
        if most_vector[i] != '0.0':
            vart_pos.append(i)

    # Re-seed before every training run so each promoter is trained under an
    # identical random state.
    np.random.seed(42)
    tf.set_random_seed(42)
    random.seed(42)

    # Network accuracy.

    x_train, y_train, x_test, y_test = dataset(dataset_X, dataset_Y, test_ratio=0.1)
    # Convert the one-hot labels back to class ids, as the classifier expects.
    y_train = np.argmax(y_train, axis=1)
    y_test = np.argmax(y_test, axis=1)

    data = {}
    data['train'] = [x_train, y_train]
    data['test'] = [x_test, y_test]

    train_input_fn = get_input_fn(data['train'], batch_size=256)
    eval_input_fn = get_input_fn(data['test'], batch_size=len(y_test))
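
    # Explicit kernel approximation: RandomFourierFeatureMapper lifts each
    # 64-d input into a 2000-d random-feature space whose inner products
    # approximate an RBF (Gaussian) kernel with bandwidth stddev=5.0, so the
    # linear classifier trained on the mapped features behaves like a
    # nonlinear kernel method.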

    # The feature column is named 'images' to match the key produced by
    # get_input_fn above.
    image_column = tf.contrib.layers.real_valued_column('images', dimension=64)
    optimizer = tf.train.FtrlOptimizer(
        learning_rate=50.0, l2_regularization_strength=0.001)

    kernel_mapper = tf.contrib.kernel_methods.RandomFourierFeatureMapper(
        input_dim=64, output_dim=2000, stddev=5.0, name='rffm')
    kernel_mappers = {image_column: [kernel_mapper]}
    estimator = tf.contrib.kernel_methods.KernelLinearClassifier(
        n_classes=2, optimizer=optimizer, kernel_mappers=kernel_mappers)

    estimator.fit(input_fn=train_input_fn, steps=2000)
    eval_metrics = estimator.evaluate(input_fn=eval_input_fn, steps=1)
    # print(eval_metrics.items())
    # # Make predictions using the testing set
    # y_pred = estimator.predict(input_fn=eval_input_fn)
    # # y_pred = np.argmax(y_pred, axis=1)
    # y_test_num = y_test
    # tn, fp, fn, tp = confusion_matrix(y_test_num, y_pred).ravel()
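    # A sketch of how the commented-out prediction step could be completed
    # (an untested assumption, not part of the original run): predict() on a
    # contrib.learn estimator yields per-example class ids from the queue
    # pipeline, and shuffle_batch reorders examples, so predictions taken via
    # eval_input_fn would not line up with y_test. A non-shuffling input_fn
    # would be needed first, e.g.:
    # pred_input_fn = tf.estimator.inputs.numpy_input_fn(
    #     x={'images': x_test}, y=y_test, batch_size=len(y_test),
    #     shuffle=False, num_epochs=1)
    # y_pred = np.array(list(estimator.predict(input_fn=pred_input_fn)))
    # tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()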

    acc = eval_metrics['accuracy']
    auc = eval_metrics['auc_precision_recall']

    # One summary row per promoter region, matching out_put_header.
    info = ['promoter ' + str(promoter_num), p_zeros, n_zeros, count_zeros,
            p_count, n_count, max_count,
            max_count + count_zeros, (max_count + count_zeros) * 1. / N, vart_pos,
            auc, acc, acc > 0.5]

    with open(output_file_name, 'a') as f:
        writer = csv.writer(f)
        writer.writerow(info)

print("Done")