|
a |
|
b/RMD.m |
|
|
1 |
%Random matrix classification algorithm |
|
|
2 |
%Alpha Lee 26/11/17 |
|
|
3 |
% |
|
|
4 |
%Inputs |
|
|
5 |
% |
|
|
6 |
%training_binding: The training set for the binders |
|
|
7 |
%verification_binding: The test set for the binders |
|
|
8 |
%training_decoy: The training set for the decoys |
|
|
9 |
%verification_decoy: The test set for the decoys |
|
|
10 |
% |
|
|
11 |
%thres: half-width of the threshold sweep; it needs to be tuned such that the entire ROC curve |
|
|
12 |
%is plotted (typically 100) |
|
|
13 |
% |
|
|
14 |
%The datasets should be formatted as Nxp matrices, where N is the number of |
|
|
15 |
%samples in the set and p is the number of descriptors per sample |
|
|
16 |
|
|
|
17 |
function AUC = RMD(training_binding,verification_binding,training_decoy,verification_decoy,thres)
%RMD Random-matrix discriminant classifier; returns the area under the ROC curve.
%
%   AUC = RMD(training_binding, verification_binding, ...
%             training_decoy, verification_decoy, thres)
%
%   For each class (binders, decoys) the training set is standardised
%   column-wise, descriptors with zero variance are dropped, and the
%   eigenvectors of the resulting covariance matrix whose eigenvalues exceed
%   the Marchenko-Pastur upper edge (1+sqrt(p/n))^2 are kept as that class'
%   "significant" subspace.  Every verification sample is then standardised
%   with each class' training statistics and scored by the Euclidean norm of
%   its residual after projection onto that class' subspace.  A sample is
%   called a binder when
%       residual_to_binders < residual_to_decoys + threshold
%   and the threshold is swept over -thres:0.01:thres to trace the ROC curve.
%
%   Inputs
%     training_binding      N1 x p matrix, training set for the binders
%     verification_binding  N2 x p matrix, test set for the binders
%     training_decoy        N3 x p matrix, training set for the decoys
%     verification_decoy    N4 x p matrix, test set for the decoys
%     thres                 half-width of the threshold sweep; must be large
%                           enough that the ROC curve reaches both (0,0) and
%                           (1,1)
%
%   Output
%     AUC                   trapezoidal area under the (false positive rate,
%                           true positive rate) curve
%
%   Notes
%     * The Marchenko-Pastur bound is evaluated with the number of
%       descriptors that are actually retained (non-zero variance), i.e. the
%       dimension of the covariance matrix whose spectrum is being tested.
%     * Significant eigenvectors are selected by eigenvalue, not by their
%       position in the output of EIG, so the result does not depend on the
%       order in which EIG returns the eigenpairs.

% All four sets must describe samples with the same descriptors.
p_all = size(training_binding,2);
if size(verification_binding,2) ~= p_all || size(training_decoy,2) ~= p_all || ...
        size(verification_decoy,2) ~= p_all
    error('RMD:descriptorMismatch', ...
        'All input sets must have the same number of descriptors (columns).');
end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% fit one subspace model per class
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
[mu,   sigma,   keep,   vv]       = rmd_fit_subspace(training_binding);
[mu_d, sigma_d, keep_d, vv_decoy] = rmd_fit_subspace(training_decoy);

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% residual of every verification sample w.r.t. both models
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% binders: distance to the binder subspace / to the decoy subspace
norm_test           = rmd_residual_norm(verification_binding, mu,   sigma,   keep,   vv);
norm_test_neg       = rmd_residual_norm(verification_binding, mu_d, sigma_d, keep_d, vv_decoy);

% decoys: distance to the binder subspace / to the decoy subspace
norm_test_decoy     = rmd_residual_norm(verification_decoy,   mu,   sigma,   keep,   vv);
norm_test_decoy_neg = rmd_residual_norm(verification_decoy,   mu_d, sigma_d, keep_d, vv_decoy);

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% sweep the decision threshold and integrate the ROC curve
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
threshold = -thres:0.01:thres;
true_pos  = zeros(1, numel(threshold));
false_pos = zeros(1, numel(threshold));

for ii = 1:numel(threshold)
    % fraction of binders / decoys that are classified as binders
    true_pos(ii)  = nnz(norm_test       < (norm_test_neg       + threshold(ii))) / numel(norm_test);
    false_pos(ii) = nnz(norm_test_decoy < (norm_test_decoy_neg + threshold(ii))) / numel(norm_test_decoy);
end

AUC = trapz(false_pos, true_pos);

end


function [mu, sigma, keep, basis] = rmd_fit_subspace(tset)
%RMD_FIT_SUBSPACE Standardise a training set and extract its significant eigenvectors.
%
%   mu, sigma : 1 x p' column means / standard deviations of the retained
%               descriptors
%   keep      : 1 x p logical mask of descriptors with non-zero variance
%   basis     : p' x k matrix whose columns are the eigenvectors of the
%               covariance matrix with eigenvalue above the
%               Marchenko-Pastur bound

n = size(tset,1);

% column statistics, always taken along the sample dimension
mu    = mean(tset,1);
sigma = std(tset,0,1);

% get rid of descriptors that have the same value for every member of the
% dataset
keep  = (sigma ~= 0);
mu    = mu(keep);
sigma = sigma(keep);

% z-score of the retained descriptors
z = bsxfun(@rdivide, bsxfun(@minus, tset(:,keep), mu), sigma);

% covariance matrix; symmetrise so that EIG takes the symmetric code path
% and returns real eigenpairs
covar = (z'*z)/n;
covar = (covar + covar')/2;
[v, d] = eig(covar);
l = diag(d);

% Marchenko-Pastur upper edge for the matrix that was actually decomposed
p = size(z,2);
mp_bound = (1 + sqrt(p/n))^2;

% keep the significant eigenvectors, selected by eigenvalue
basis = v(:, l > mp_bound);

end


function r = rmd_residual_norm(vset, mu, sigma, keep, basis)
%RMD_RESIDUAL_NORM Distance of each sample from a class subspace.
%
%   The samples in vset are mean centred and scaled with the training
%   statistics (mu, sigma) of the class, projected onto the span of basis and
%   projected back; r(i) is the Euclidean norm of what the projection misses
%   for sample i.

z     = bsxfun(@rdivide, bsxfun(@minus, vset(:,keep), mu), sigma);
coeff = z*basis;            % coordinates in the significant subspace
proj  = coeff*basis';       % project back into the descriptor space
r     = sqrt(sum((proj - z).^2, 2));

end