--- a +++ b/linear-reg.py @@ -0,0 +1,87 @@ +import json +import csv +import numpy as np +import os +import pandas as pd +import random +import tensorflow as tf + +from functions import quan_detector, most_repeared_promoter,dataset +from sklearn.metrics import confusion_matrix + +from sklearn import datasets, linear_model +from sklearn.metrics import mean_squared_error, r2_score + +np.random.seed(42) +tf.set_random_seed(42) +random.seed(42) + + +labels_file = 'labes.csv' +labels_df = pd.read_csv(labels_file, index_col=0) +ids_csv = labels_df.FID.tolist() + + +promoters_list = range(1,2484) +dataset_X = [] +for promoter_num in promoters_list: + promoter_file = 'promoters/chr22_'+str(promoter_num)+'.json' + # # read files + with open(promoter_file) as json_data: + ind_var = json.load(json_data) + ids_json = ind_var.keys() + + var_num = [] + for i in ids_csv: + id_name = str(i) + temp = ind_var[id_name] + var_seq = map(int, temp) + var_num.append(var_seq) + + labels_df['vars'] = var_num + lab_num = {1: [1, 0], # positive + 2: [0, 1]} # negative + + pheno_new = [] + for i in labels_df.Pheno.tolist(): + pheno_new.append(lab_num[i]) + d = {"Pheno": pheno_new, "Vars":labels_df.vars} + dataset_ = pd.DataFrame(d) + + dataset_X .append(dataset_.Vars.tolist()) + dataset_Y = np.array(dataset_.Pheno.tolist()) + +dataset_X = np.array(dataset_X).reshape(11908,64*2580,1) +N = len(dataset_X) + + +# network accuracy +x_train, y_train,x_test,y_test = dataset(dataset_X,dataset_Y,test_ratio=0.1) + +# Create linear regression object +regr = linear_model.LinearRegression() + +# Train the model using the training sets +regr.fit(x_train, y_train) + +# Make predictions using the testing set +y_pred = regr.predict(x_test) +y_pred = np.argmax(y_pred,axis=1) +y_test_num = np.argmax(y_test,axis=1) +tn, fp, fn, tp = confusion_matrix(y_test_num, y_pred).ravel() + +acc = (tp+tn)*1./(tp+fp+tn+fn) +print acc +dataset_Y = np.argmax(dataset_Y,axis=-1) +x_train, y_train,x_test,y_test = dataset(dataset_X,dataset_Y,test_ratio=0.1) +logisticRegr = linear_model.LogisticRegression() + +regr.fit(x_train, y_train) + +# Make predictions using the testing set +y_pred = regr.predict(x_test) +y_test_num = np.argmax(y_test,axis=1) +tn, fp, fn, tp = confusion_matrix(y_test_num, y_pred).ravel() + +acc = (tp+tn)*1./(tp+fp+tn+fn) +print "LogisticRegression acc is:",acc