Diff of /RMD.m [000000] .. [9c69dd]

Switch to unified view

a b/RMD.m
1
%Random matrix classification algorithm
2
%Alpha Lee 26/11/17
3
%
4
%Inputs
5
%
6
%training_binding: The training set for the binders 
7
%verification_binding: The test set for the binders 
8
%training_decoy: The training set for the decoys
9
%verification_decoy: The test set for the decoys 
10
%
11
%thres: half-width of the decision-threshold sweep; it must be large enough
12
%that the entire ROC curve is traced out before the AUC is computed (typically 100) 
13
%
14
%The datasets should be formatted as Nxp matrices, where N is the number of
15
%samples in the set and p is the number of descriptors per sample 
16
17
function AUC = RMD(training_binding,verification_binding,training_decoy,verification_decoy,thres)
%RMD  Random-matrix classification of binders versus decoys.
%
%   AUC = RMD(training_binding, verification_binding, training_decoy, ...
%             verification_decoy, thres)
%
%   Inputs
%     training_binding      training set for the binders
%     verification_binding  test set for the binders
%     training_decoy        training set for the decoys
%     verification_decoy    test set for the decoys
%     thres                 half-width of the decision-threshold sweep; the
%                           sweep runs over -thres:0.01:thres and must be
%                           wide enough for the ROC curve to be traced out
%                           completely (typically 100)
%
%   Every data set is an N-by-p matrix (N samples, p descriptors) and all
%   four must have the same number of columns.
%
%   Output
%     AUC   area under the ROC curve (trapezoidal rule over the sweep)
%
%   Method: for each training set the descriptors are z-scored, constant
%   descriptors are dropped, and the eigenvectors of the covariance matrix
%   whose eigenvalues exceed the Marchenko-Pastur bound (1+sqrt(p/n))^2 are
%   kept. A test sample is scored by its residual distance to each of the
%   two subspaces; it is called a binder when its distance to the binder
%   subspace is smaller than its distance to the decoy subspace plus the
%   current threshold.

% Fail early with a clear message instead of an indexing error further down.
num_desc = size(training_binding, 2);
if size(verification_binding, 2) ~= num_desc || ...
        size(training_decoy, 2) ~= num_desc || ...
        size(verification_decoy, 2) ~= num_desc
    error('RMD:dimensionMismatch', ...
        'All data sets must have the same number of descriptors (columns).');
end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% learn the significant subspace of each training set
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
binder_model = rmd_fit_subspace(training_binding);
decoy_model  = rmd_fit_subspace(training_decoy);

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% distance of every verification sample to both subspaces
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
norm_test           = rmd_residual_norm(verification_binding, binder_model);
norm_test_neg       = rmd_residual_norm(verification_binding, decoy_model);
norm_test_decoy     = rmd_residual_norm(verification_decoy,   binder_model);
norm_test_decoy_neg = rmd_residual_norm(verification_decoy,   decoy_model);

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% sweep the decision threshold to trace the ROC curve
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
threshold = -thres:0.01:thres;
true_pos  = zeros(1, numel(threshold));   % preallocated: no growth in the loop
false_pos = zeros(1, numel(threshold));

for ii = 1:numel(threshold)
    % true positive rate: binders classified as binders
    true_pos(ii)  = sum(norm_test < (norm_test_neg + threshold(ii))) / numel(norm_test);
    % false positive rate: decoys classified as binders
    false_pos(ii) = sum(norm_test_decoy < (norm_test_decoy_neg + threshold(ii))) / numel(norm_test_decoy);
end

AUC = trapz(false_pos, true_pos);

end

function model = rmd_fit_subspace(training_set)
% Fit the z-score statistics and the significant eigenvector basis of one training set.

% compute z score along the sample dimension (explicit dim so that a
% single-row input is not standardised across descriptors by mistake)
[z, mu, sigma] = zscore(training_set, 0, 1);

% get rid of descriptors that have the same value for every member of the set
is_constant = (sigma == 0);
z(:, is_constant)  = [];
mu(is_constant)    = [];
sigma(is_constant) = [];

n = size(z, 1);
p = size(z, 2);   % descriptor count AFTER cleaning: this is the size of the
                  % matrix whose spectrum is compared with the MP bound

% get covariance matrix and eigenvalues; symmetrise so that eig treats the
% matrix as symmetric and returns real eigenpairs
covar = (z' * z) / n;
covar = (covar + covar') / 2;
[v, d] = eig(covar);

% eig does not promise an ordering, so sort ascending explicitly
[l, order] = sort(diag(d), 'ascend');
v = v(:, order);

% use the MP bound to get the number of significant eigenvalues
num_eig = sum(l > (1 + sqrt(p / n))^2);

model.keep  = ~is_constant;              % descriptors retained for this model
model.mu    = mu;
model.sigma = sigma;
model.basis = v(:, end-num_eig+1:end);   % the significant eigenvectors
end

function dist = rmd_residual_norm(samples, model)
% Distance of each row of samples to the subspace spanned by model.basis.

% mean center and scale w.r.t. the training set the model was fitted on
x = samples(:, model.keep);
x = bsxfun(@rdivide, bsxfun(@minus, x, model.mu), model.sigma);

% project onto the significant eigenvectors and back into descriptor space
coeff = x * model.basis;
proj  = coeff * model.basis';

dist = sqrt(sum((proj - x).^2, 2));
end