# linear-reg.py
import json
import csv
import numpy as np
import os
import pandas as pd
import random
import tensorflow as tf

from functions import quan_detector, most_repeared_promoter, dataset
from sklearn.metrics import confusion_matrix

from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

# Seed the NumPy, TensorFlow and stdlib random generators.
np.random.seed(42)
# ``tf.set_random_seed`` only exists in TensorFlow 1.x; fall back to the
# TensorFlow 2.x spelling so the script does not die on an AttributeError.
if hasattr(tf, 'set_random_seed'):
    tf.set_random_seed(42)
else:
    tf.random.set_seed(42)
random.seed(42)


def _load_promoter_vars(promoter_num, sample_ids):
    """Return one list of ints per sample id, read from a promoter JSON file.

    The file ``promoters/chr22_<promoter_num>.json`` is parsed as a mapping
    keyed by ``str(sample_id)``; each value is iterated and every element is
    converted with ``int``.

    Raises:
        IOError/OSError: if the promoter file cannot be opened.
        KeyError: if a sample id is missing from the file.
    """
    promoter_file = 'promoters/chr22_' + str(promoter_num) + '.json'
    with open(promoter_file) as json_data:
        ind_var = json.load(json_data)
    # Build real lists (not ``map`` objects) so the rows are concrete under
    # Python 3 as well as Python 2.
    return [[int(v) for v in ind_var[str(sample_id)]]
            for sample_id in sample_ids]


def _class_indices(y):
    """Return class indices for ``y``.

    A 2-D (one-hot style) array is reduced with ``argmax`` over axis 1; a
    1-D array is assumed to already hold class indices and is returned as
    an array unchanged.
    """
    y = np.asarray(y)
    if y.ndim > 1:
        return np.argmax(y, axis=1)
    return y


def _binary_accuracy(y_true, y_pred):
    """Return (tp + tn) / total from the 2x2 confusion matrix of 0/1 labels."""
    # ``labels=[0, 1]`` keeps the matrix 2x2 even when the test split happens
    # to contain a single class, so the four-way unpacking cannot fail.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    return (tp + tn) * 1. / (tp + fp + tn + fn)


# NOTE(review): the file name is spelled 'labes.csv' -- confirm it matches the
# file on disk.
labels_file = 'labes.csv'
labels_df = pd.read_csv(labels_file, index_col=0)
ids_csv = labels_df.FID.tolist()

# One entry per promoter file; each entry holds one int sequence per sample,
# i.e. the nested list is indexed [promoter][sample][position].
promoters_list = range(1, 2484)
dataset_X = [_load_promoter_vars(promoter_num, ids_csv)
             for promoter_num in promoters_list]

# Phenotype code -> two-column one-hot target.  The encoding does not depend
# on the promoter, so it is built once instead of once per promoter file.
lab_num = {1: [1, 0],  # positive
           2: [0, 1]}  # negative
pheno_new = [lab_num[pheno] for pheno in labels_df.Pheno.tolist()]
dataset_Y = np.array(pheno_new)

# NOTE(review): this reshape is kept exactly as written, but it looks suspect:
#   * the list above is promoter-major, and a plain reshape does not move the
#     sample axis to the front (a transpose would be needed for that);
#   * 2483 promoter files are read while the shape mentions 2580;
#   * scikit-learn estimators expect a 2-D X, this array is 3-D.
# Confirm what ``functions.dataset`` does with it before trusting the scores.
dataset_X = np.array(dataset_X).reshape(11908, 64 * 2580, 1)
N = len(dataset_X)

# ---- Linear regression on the one-hot targets ----
x_train, y_train, x_test, y_test = dataset(dataset_X, dataset_Y, test_ratio=0.1)

# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(x_train, y_train)

# Make predictions using the testing set; the larger of the two regressed
# outputs picks the class.
y_pred = np.argmax(regr.predict(x_test), axis=1)
y_test_num = _class_indices(y_test)

acc = _binary_accuracy(y_test_num, y_pred)
print(acc)

# ---- Logistic regression on class indices ----
dataset_Y = np.argmax(dataset_Y, axis=-1)
x_train, y_train, x_test, y_test = dataset(dataset_X, dataset_Y, test_ratio=0.1)
logisticRegr = linear_model.LogisticRegression()

# Fit and evaluate the logistic model itself; the previous code re-used the
# linear ``regr`` object here, so ``logisticRegr`` was never trained.
logisticRegr.fit(x_train, y_train)

# Make predictions using the testing set
y_pred = logisticRegr.predict(x_test)
# Labels are already class indices at this point (argmax applied above), so
# only reduce them again if ``dataset`` handed back a 2-D array.
y_test_num = _class_indices(y_test)

acc = _binary_accuracy(y_test_num, y_pred)
print("LogisticRegression acc is: %s" % acc)