[d2c46b]: / other compare / Covariate Shift_svm.py

Download this file

186 lines (173 with data), 11.3 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 19 11:07:40 2021
@author: zhaoxt
"""
#https://www.cnblogs.com/MiQing4in/p/13397596.html
#https://zhuanlan.zhihu.com/p/205183444
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import roc_curve
from sklearn.svm import LinearSVC
from sklearn.metrics import auc
import random
# --- Build the adversarial-validation data set ---------------------------
# Rows from the training file are labelled 0 and rows from the test file are
# labelled 1; a classifier is later trained to tell the two sources apart.
name = "AdversarialValidation1"  # dmp_lassoxgboost;age1Coefficient5EHR
names = ""
TRAIN_FILE = "data\\20210819deepfm_feature_%s%s.csv" % (name, names)
TEST_FILE = "data\\20210819deepfm_feature_%s%s_test.csv" % (name, names)
train = pd.read_csv(TRAIN_FILE)
test = pd.read_csv(TEST_FILE)

# Every column of the training file except the identifier and the label.
cols = [c for c in train.columns if c not in ['ID', 'target']]  # 'Diuretic',"Age","BMI","Gender","Heartfailure"

# ``assign`` returns a new frame, so the source label is added without
# writing into a frame derived from ``train`` / ``test`` (the original
# ``x_train['target'] = 0`` pattern is what raises SettingWithCopyWarning).
x_train = train[cols].assign(target=0)
x_test = test[cols].assign(target=1)

data = pd.concat([x_train, x_test], axis=0)
# ``pop`` removes the label column from ``data`` in place and returns it.
train_label = data.pop('target')
train_data = data
# kf=StratifiedKFold(n_splits=6,shuffle=True,random_state=123)
x, y = pd.DataFrame(train_data), pd.DataFrame(train_label)
def _get(x, l):
    """Return the items of ``x`` at the positions listed in ``l`` as a list.

    Kept for backward compatibility; the fold loop below uses NumPy
    indexing directly instead.
    """
    return [x[i] for i in l]


# Kept from the original script. Note that this seeds only Python's stdlib
# RNG; scikit-learn estimators are controlled through ``random_state``.
random.seed(1000)
folds = list(StratifiedKFold(n_splits=10, shuffle=True, random_state=100).split(x, y))

features = x.values
labels = y.values.ravel()  # 1-D label vector, as scikit-learn expects

# Out-of-fold decision scores. Allocated ONCE, before the loop, so that each
# fold fills in its own validation rows and the scores of earlier folds are
# still present when the pooled metrics are computed after the loop.
y_train_meta = np.zeros((x.shape[0], 1), dtype=float)

for i, (train_idx, valid_idx) in enumerate(folds):
    print("第", i + 1, "次")
    X_train_, y_train_ = features[train_idx], labels[train_idx]
    X_valid_, y_valid_ = features[valid_idx], labels[valid_idx]

    # A fixed random_state makes the solver's internal shuffling repeatable.
    algorithm = LinearSVC(random_state=1000)
    algorithm.fit(X_train_, y_train_)

    # Signed distance to the separating hyperplane for the held-out rows.
    y_train_meta[valid_idx, 0] = algorithm.decision_function(X_valid_)

    fpr, tpr, thresholds = roc_curve(y_valid_, y_train_meta[valid_idx, 0])
    roc_auc = auc(fpr, tpr)
    print(roc_auc)

    # Cut-off that maximises (TPR - FPR) on this fold.
    threshold = thresholds[np.argmax(tpr - fpr)]
    print(threshold)

    pre = (np.array(y_train_meta[valid_idx, 0]) >= threshold) * 1
    print(matthews_corrcoef(y_valid_, pre))
# --- Pooled metrics over all out-of-fold scores --------------------------
# One seed call is kept so the stdlib RNG is left in the same state as
# before; the metric functions below are deterministic and do not use it.
random.seed(1000)

# Flatten both column vectors once so every metric receives 1-D input.
oof_true = np.asarray(y).ravel()
oof_score = np.asarray(y_train_meta).ravel()

fpr, tpr, thresholds = roc_curve(oof_true, oof_score)
roc_auc = auc(fpr, tpr)
print(roc_auc)

# Cut-off that maximises (TPR - FPR) over the pooled scores.
threshold = thresholds[np.argmax(tpr - fpr)]
print(threshold)

pre = (oof_score >= threshold) * 1
print(matthews_corrcoef(oof_true, pre))
import matplotlib.pyplot as plt
# Draw the ROC curve held in ``fpr`` / ``tpr`` together with the diagonal
# from (0, 0) to (1, 1), then write the figure to a PDF file.
lw = 2
axis_font = {'family': 'Times New Roman', 'size': 10}
legend_font = {'family': 'Times New Roman', 'size': 8}

fig, ax = plt.subplots()
ax.plot(
    fpr,
    tpr,
    color='r',
    label='SVM model:AUC = %0.3f' % roc_auc,
    lw=lw,
    alpha=1.0,
    markersize=12,
)
# Dashed reference diagonal.
ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='navy', alpha=.8)

# Small margins around the unit square so the curve does not touch the frame.
ax.set_xlim([-0.02, 1.05])
ax.set_ylim([-0.02, 1.05])

ax.set_xlabel('False Positive Rate', fontdict=axis_font)
ax.set_ylabel('True Positive Rate', fontdict=axis_font)
ax.set_title('Covariate Shift: Receiver operating characteristic', fontdict=axis_font)
ax.legend(loc=4, fontsize="x-small", prop=legend_font)

fig.savefig("D:\\anaconda-python\\UMN_JHU_alldata\\trainUMN_testJHU\\new_result\\deepfm\\output\\new_1126\\0827Covariate Shift_SVMmodel.pdf")
from scipy import stats
# Two-sample Kolmogorov-Smirnov test for every feature, train vs. test.
# Columns are taken by NAME from ``cols`` so that the printed label always
# matches the data being tested. Indexing ``train.values`` positionally over
# ``range(train.shape[1])`` would be off whenever ``train`` carries the
# excluded 'ID' / 'target' columns, or when ``test`` orders its columns
# differently, and would run past the end of ``cols``.
for col in cols:
    print(col, stats.ks_2samp(train[col].values, test[col].values))
#Ejectionfraction Ks_2sampResult(statistic=0.08365434707639026, pvalue=0.2609097363123599)
#Omega3 Ks_2sampResult(statistic=0.06128244073169121, pvalue=0.6390575068630184)
#Statin Ks_2sampResult(statistic=0.3663665646760146, pvalue=7.702447215575547e-17)
#Thiazides Ks_2sampResult(statistic=0.0503936545672001, pvalue=0.8458457594871575)
#Diuretic Ks_2sampResult(statistic=0.04548489584479811, pvalue=0.9173622344243776)
#Potassium Ks_2sampResult(statistic=0.01716231188594657, pvalue=0.9999999999390203)
#Aldosterone Ks_2sampResult(statistic=0.07308840901920212, pvalue=0.4164055906224715)
#Amiodarone Ks_2sampResult(statistic=0.002509410288582183, pvalue=1)
#Vasodilators Ks_2sampResult(statistic=0.07308840901920212, pvalue=0.4164055906224715)
#CoQ10 Ks_2sampResult(statistic=0.005400368340340605, pvalue=1.0)
#Betablocking Ks_2sampResult(statistic=0.28422373373836096, pvalue=1.537433513831843e-10)
#AngiotensinIIantagonists Ks_2sampResult(statistic=0.0354252423195169, pvalue=0.9913388160235281)
#ACEI Ks_2sampResult(statistic=0.2010976835648301, pvalue=1.812635338116486e-05)
#Warfarin Ks_2sampResult(statistic=0.020860390205962417, pvalue=0.9999998447726758)
#Clopidogrel Ks_2sampResult(statistic=0.08144577252415858, pvalue=0.2893322471829566)
#Aspirin Ks_2sampResult(statistic=0.3045044648425749, pvalue=4.637179529254354e-12)
#Folicacid Ks_2sampResult(statistic=0.04254991305113474, pvalue=0.9493437976118437)
#Coronaryheartdisease Ks_2sampResult(statistic=0.2539273738507708, pvalue=1.7458116730040274e-08)
#Heartfailure Ks_2sampResult(statistic=0.11310689941080221, pvalue=0.04982309719818101)
#Myocardialinfarction Ks_2sampResult(statistic=0.11904290211098638, pvalue=0.03359616169888213)
#Diabetes Ks_2sampResult(statistic=0.1596337141473508, pvalue=0.0013210483535550654)
#Atrialfibrillation Ks_2sampResult(statistic=0.06847314857616647, pvalue=0.4987550593777357)
#Stroke Ks_2sampResult(statistic=0.060563369947243685, pvalue=0.6534678500427418)
#Gender Ks_2sampResult(statistic=0.33370020618254126, pvalue=1.84297022087776e-14)
#Age Ks_2sampResult(statistic=0.23963400764563017, pvalue=1.3553634048424357e-07)
#Bloodglucose Ks_2sampResult(statistic=0.3572240932737532, pvalue=4.961710018877332e-16)
#BMI Ks_2sampResult(statistic=0.3935445053453374, pvalue=2.296050582735732e-19)
#LDLcholesterol Ks_2sampResult(statistic=0.11179349461063784, pvalue=0.05421687930314956)
#Numberofcigarettessmoked Ks_2sampResult(statistic=0.037685179070637696, pvalue=0.983110598034503)
#Creatinineserum Ks_2sampResult(statistic=0.3313668948615789, pvalue=2.964295475749168e-14)
#Smoking Ks_2sampResult(statistic=0.041874867008592165, pvalue=0.9556522270110792)
#Averagediastolicbloodpressure Ks_2sampResult(statistic=0.2689251359263906, pvalue=1.8039574278816417e-09)
#Leftventricularhypertrophy Ks_2sampResult(statistic=0.018820577164366373, pvalue=0.9999999964562477)
#Fastingbloodglucose Ks_2sampResult(statistic=0.3576496657788344, pvalue=4.554354248555585e-16)
#HDLcholesterol Ks_2sampResult(statistic=0.13840645109218047, pvalue=0.0080748492287539)
#Hight Ks_2sampResult(statistic=0.20721712268961823, pvalue=8.799157428129867e-06)
#Averagesystolicbloodpressure Ks_2sampResult(statistic=0.04901421265417832, pvalue=0.8682853133177235)
#Totalcholesterol Ks_2sampResult(statistic=0.06500253142265953, pvalue=0.5653457232461865)
#Triglycerides Ks_2sampResult(statistic=0.2722196541122778, pvalue=1.0795365712468197e-09)
#Ventricularrate Ks_2sampResult(statistic=0.09802842530835663, pvalue=0.12378319532780258)
#Waist Ks_2sampResult(statistic=0.26697337236860447, pvalue=2.422331002449596e-09)
#Weight Ks_2sampResult(statistic=0.09239325834452296, pvalue=0.1682000880336827)
#Treatedforhypertension Ks_2sampResult(statistic=0.2730928114933926, pvalue=9.365788145032639e-10)
#Treatedforlipids Ks_2sampResult(statistic=0.3897143527996067, pvalue=5.343842146548376e-19)
#Drinkbeer Ks_2sampResult(statistic=0.004585910615099019, pvalue=1.0)
#Drinkwine Ks_2sampResult(statistic=0.06048265792041794, pvalue=0.6550674876456024)
#Drinkliquor Ks_2sampResult(statistic=0.02085305274897826, pvalue=0.9999998466364483)
#Sleep Ks_2sampResult(statistic=0.8328747422718235, pvalue=2.9538317780452976e-85)
#Albuminurine Ks_2sampResult(statistic=0.2173208009568044, pvalue=2.5643255505691798e-06)
#Creatinineurine Ks_2sampResult(statistic=0.07927388525684768, pvalue=0.3194218512727901)
#HemoglobinA1cwholeblood Ks_2sampResult(statistic=0.17688407551710728, pvalue=0.0002507598683951784)
#Atrialenlargement Ks_2sampResult(statistic=0.031727163999501054, pvalue=0.9979368675275346)
#Rightventricularhypertrophy Ks_2sampResult(statistic=0.030112923462986198, pvalue=0.9990795355209018)
#Rheumatic Ks_2sampResult(statistic=0.002934982793663372, pvalue=1.0)
#Aorticvalve Ks_2sampResult(statistic=0.041720780411924833, pvalue=0.9567642484703144)
#Mitralvalve Ks_2sampResult(statistic=0.04089164777271493, pvalue=0.963473049387389)
#Arrhythmia Ks_2sampResult(statistic=0.03753109247397037, pvalue=0.9837094687308644)
#Dementia Ks_2sampResult(statistic=0.0033385429277920857, pvalue=1.0)
#Parkinson Ks_2sampResult(statistic=0.00878293601003764, pvalue=1.0)
#Adultseizuredisorder Ks_2sampResult(statistic=0.0020838377835009944, pvalue=1)
#Neurological Ks_2sampResult(statistic=0.0226287173391446, pvalue=0.9999981469345283)
#Thyroid Ks_2sampResult(statistic=0.022408593629619847, pvalue=0.9999985882436547)
#Endocrine Ks_2sampResult(statistic=0.01635519161768914, pvalue=0.999999999994527)
#Renal Ks_2sampResult(statistic=0.03125756675251491, pvalue=0.9983454855648579)
#Gynecologic Ks_2sampResult(statistic=0.053974333575469415, pvalue=0.7827843700957668)
#Emphysema Ks_2sampResult(statistic=0.0028909580517584217, pvalue=1)
#Pneumonia Ks_2sampResult(statistic=0.00378612780382575, pvalue=1.0)
#Asthma Ks_2sampResult(statistic=0.028498682926471345, pvalue=0.9996416415491035)
#Pulmonary Ks_2sampResult(statistic=0.035175768782055514, pvalue=0.9920759452269916)
#Gout Ks_2sampResult(statistic=0.03130159149441986, pvalue=0.9983090337765563)
#Degenerative Ks_2sampResult(statistic=0.04640941542480207, pvalue=0.9057835812395012)
#Rheumatoidarthritis Ks_2sampResult(statistic=0.006699098226536647, pvalue=1.0)
#Musculoskeletal Ks_2sampResult(statistic=0.037369668420318886, pvalue=0.9844785486754418)
#Gallbladder Ks_2sampResult(statistic=0.0028909580517584217, pvalue=1)
#Gerd Ks_2sampResult(statistic=0.017793333186584194, pvalue=0.9999999996704556)
#Liver Ks_2sampResult(statistic=0.00963408102020002, pvalue=1.0)
#Gidisease Ks_2sampResult(statistic=0.0437385810825684, pvalue=0.9375106153701078)
#Hematologicdisorder Ks_2sampResult(statistic=0.026708343422336685, pvalue=0.9998961167257331)
#Bleedingdisorder Ks_2sampResult(statistic=0.009186496144166355, pvalue=1.0)
#Eye Ks_2sampResult(statistic=0.015305935268954485, pvalue=0.9999999999998629)
#Ent Ks_2sampResult(statistic=0.02249664311342975, pvalue=0.9999984143318319)
#Skin Ks_2sampResult(statistic=0.06376983864932091, pvalue=0.5895417406394831)
#Depression Ks_2sampResult(statistic=0.023523887091211927, pvalue=0.9999946489887167)
#Anxiety Ks_2sampResult(statistic=0.039769016854138695, pvalue=0.971597846158273)
#Psychosis Ks_2sampResult(statistic=0.0012547051442910915, pvalue=1)
#Prostate Ks_2sampResult(statistic=0.11727457497780419, pvalue=0.03786243088245611)
#Infectious Ks_2sampResult(statistic=0.022225157205015885, pvalue=0.9999988975416942)
#Fever Ks_2sampResult(statistic=0.07724874712921996, pvalue=0.3494029972161632)
#Chronicbronchitis Ks_2sampResult(statistic=0.045594957699560484, pvalue=0.9162528876871665)
#COPD Ks_2sampResult(statistic=0.0113143586695723, pvalue=1.0)
#Creactiveprotein Ks_2sampResult(statistic=0.3299801154915729, pvalue=3.963496197911809e-14)