# kernel_classifier.py
import json
import csv
import random

import numpy as np
import pandas as pd
import tensorflow as tf

# Project-local helpers (functions.py).
from functions import quan_detector, most_repeared_promoter, dataset
from sklearn.metrics import confusion_matrix

# Seed everything once for reproducible shuffling and initialization.
np.random.seed(42)
tf.set_random_seed(42)
random.seed(42)


def get_input_fn(dataset_split, batch_size, capacity=10000, min_after_dequeue=3000):
  """Build an input_fn yielding shuffled (features, labels) batches."""

  def _input_fn():
    images_batch, labels_batch = tf.train.shuffle_batch(
        tensors=dataset_split,
        batch_size=batch_size,
        capacity=capacity,
        min_after_dequeue=min_after_dequeue,
        enqueue_many=True,
        num_threads=4)
    features_map = {'images': images_batch}
    return features_map, labels_batch

  return _input_fn

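# Usage sketch (illustrative, not part of the pipeline): the returned closure
# follows the tf.contrib.learn input_fn contract -- called with no arguments,
# it returns (features_dict, labels) tensors fed from a shuffle queue:
#
#   fn = get_input_fn([x_train, y_train], batch_size=32)
#   features, labels = fn()
#   # features['images'] : float32 tensor, shape [32, n_positions]
#   # labels             : tensor, shape [32]
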
output_header = ['Promoter region', 'Positive_zeros', 'Negative_zeros', 'Sum_zeros',
                 'Positive_freq', 'Negative_freq', 'Sum_freq',
                 'Sum_all', 'Percent_all', 'Vector_freq',
                 'AUC', 'Accuracy',
                 '>50%']

output_file_name = 'output_kernelC.csv'
# Write the header once on the first run, then leave this commented out so
# later runs only append rows:
# with open(output_file_name, 'w') as f:
#     writer = csv.writer(f)
#     writer.writerow(output_header)

# Per-individual phenotype labels; expects FID (individual id) and
# Pheno (1 = positive, 2 = negative) columns.
labels_file = 'labes.csv'
labels_df = pd.read_csv(labels_file, index_col=0)
ids_csv = labels_df.FID.tolist()


promoters_list = range(100)
for promoter_num in promoters_list:
    print(promoter_num)
    promoter_file = 'promoters/chr22_' + str(promoter_num) + '.json'
    # Read this promoter's per-individual variant vectors.
    with open(promoter_file) as json_data:
        ind_var = json.load(json_data)
    ids_json = ind_var.keys()

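    # Assumed JSON layout, inferred from the alignment loop below (not
    # verified against the data files): {"<individual_id>": "0100...", ...},
    # i.e. one int-convertible 0/1 flag per promoter position.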
    # Align the JSON variant vectors to the CSV ordering of individuals.
    var_num = []
    for i in ids_csv:
        id_name = str(i)
        temp = ind_var[id_name]
        var_seq = list(map(int, temp))  # list() so Python 3's lazy map is materialized
        var_num.append(var_seq)

    labels_df['vars'] = var_num
    # Map the 1/2 phenotype coding to one-hot vectors,
    # e.g. Pheno [1, 2, 1] -> [[1, 0], [0, 1], [1, 0]].
    lab_num = {1: [1, 0],  # positive
               2: [0, 1]}  # negative

    pheno_new = []
    for i in labels_df.Pheno.tolist():
        pheno_new.append(lab_num[i])
    d = {"Pheno": pheno_new, "Vars": labels_df.vars}
    dataset_ = pd.DataFrame(d)

    dataset_X = np.array(dataset_.Vars.tolist(), dtype=np.float32)
    dataset_Y = np.array(dataset_.Pheno.tolist(), dtype=np.float32)

    N = len(dataset_X)

    # Repetition statistics: how many individuals carry no variant at all,
    # and which variant vector occurs most often.
    per_zeros, p_zeros, n_zeros = quan_detector(dataset_X, dataset_Y)
    count_zeros = p_zeros + n_zeros  # sum of individuals without any variants

    most_vector, max_count, count_vector = most_repeared_promoter(dataset_X, dataset_Y)
    _, p_count, n_count = count_vector

    # Positions at which the most frequent vector carries a variant.
    vart_pos = []
    for i in range(len(most_vector)):
        if most_vector[i] != '0.0':
            vart_pos.append(i)

    # Re-seed inside the loop so every promoter trains from the same state.
    np.random.seed(42)
    tf.set_random_seed(42)
    random.seed(42)

    # Network accuracy: split the data and convert one-hot labels to class ids.
    x_train, y_train, x_test, y_test = dataset(dataset_X, dataset_Y, test_ratio=0.1)
    y_train = np.argmax(y_train, axis=1)
    y_test = np.argmax(y_test, axis=1)

    data = {}
    data['train'] = [x_train, y_train]
    data['test'] = [x_test, y_test]


    train_input_fn = get_input_fn(data['train'], batch_size=256)
    # One evaluation batch covering the whole test set.
    eval_input_fn = get_input_fn(data['test'], batch_size=len(y_test))

    # Feature column, explicit kernel mapper, and linear classifier. The
    # 'images' naming follows TensorFlow's kernel-methods example; `dimension`
    # and `input_dim` must both equal the variant-vector length (64 here).
    image_column = tf.contrib.layers.real_valued_column('images', dimension=64)
    optimizer = tf.train.FtrlOptimizer(
        learning_rate=50.0, l2_regularization_strength=0.001)

    kernel_mapper = tf.contrib.kernel_methods.RandomFourierFeatureMapper(
        input_dim=64, output_dim=2000, stddev=5.0, name='rffm')
    kernel_mappers = {image_column: [kernel_mapper]}
    estimator = tf.contrib.kernel_methods.KernelLinearClassifier(
        n_classes=2, optimizer=optimizer, kernel_mappers=kernel_mappers)

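    # Background (standard random Fourier features, Rahimi & Recht 2007): the
    # mapper approximates an RBF kernel k(x, y) = exp(-||x-y||^2 / (2*sigma^2))
    # with the explicit map z(x) = sqrt(2/D) * cos(Omega.x + b), where
    # Omega_ij ~ N(0, 1/sigma^2) and b_j ~ U[0, 2*pi], so z(x).z(y) ~= k(x, y)
    # and a linear model on z(x) behaves like a kernel machine. A minimal
    # NumPy illustration (D, d, sigma mirror output_dim, input_dim, stddev):
    #
    #   D, d, sigma = 2000, 64, 5.0
    #   omega = np.random.normal(scale=1.0 / sigma, size=(d, D))
    #   b = np.random.uniform(0, 2 * np.pi, size=D)
    #   z = lambda x: np.sqrt(2.0 / D) * np.cos(x.dot(omega) + b)
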
    estimator.fit(input_fn=train_input_fn, steps=2000)
    eval_metrics = estimator.evaluate(input_fn=eval_input_fn, steps=1)
    # print(eval_metrics.items())
    # # Make predictions using the testing set
    # y_pred = estimator.predict(input_fn=eval_input_fn)
    # # y_pred = np.argmax(y_pred, axis=1)
    # y_test_num = y_test
    # tn, fp, fn, tp = confusion_matrix(y_test_num, y_pred).ravel()

    acc = eval_metrics['accuracy']
    auc = eval_metrics['auc_precision_recall']

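    # Sketch for recovering a confusion matrix from the commented block above
    # (assumes, as with other tf.contrib.learn classifiers, that predict()
    # yields one class id per example; the queue-based input_fn produces
    # batches indefinitely, so the iterator must be truncated explicitly):
    #
    #   import itertools
    #   y_pred = list(itertools.islice(
    #       estimator.predict(input_fn=eval_input_fn), len(y_test)))
    #   tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
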
    # One CSV row per promoter, matching output_header above.
    info = ['promoter ' + str(promoter_num), p_zeros, n_zeros, count_zeros,
            p_count, n_count, max_count,
            max_count + count_zeros, (max_count + count_zeros) * 1. / N, vart_pos,
            auc, acc, acc > 0.5]

    with open(output_file_name, 'a') as f:
        writer = csv.writer(f)
        writer.writerow(info)

print("Done")