[9c69dd]: / RMD.m

Download this file

143 lines (110 with data), 5.2 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
%Random matrix classification algorithm
%Alpha Lee 26/11/17
%
%Inputs
%
%training_binding: The training set for the binders
%verification_binding: The test set for the binders
%training_decoy: The training set for the decoys
%verification_decoy: The test set for the decoys
%
%thres: a parameter that needs to be tuned such that the entire AUC curve
%is plotted (typically 100)
%
%The datasets should be formatted as Nxp matrices, where N is the number of
%samples in the set and p is the number of descriptors per sample
function AUC = RMD(training_binding,verification_binding,training_decoy,verification_decoy,thres)
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%processing the binding training set
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
tset_cleaned = training_binding;
%compute z score
[tset_cleaned_z, mu, sigma] = zscore(tset_cleaned);
indzero_bind = find(sigma==0); %get rid of descriptors that have the same value for every member of the dataset
tset_cleaned_z(:,indzero_bind) = [];
mu(indzero_bind) = [];
sigma(indzero_bind) = [];
%get covarience matrix and eigenvalues
covar =tset_cleaned_z'*tset_cleaned_z/size(training_binding,1);
[v, d] =eig(covar);
%Use the MP bound to get the number of significant eigenvalues
p = size(tset_cleaned,2);
n = size(tset_cleaned,1);
l = diag(d);
num_eig = length(l(find(l>(1+sqrt(p/n))^2)));
%get the significnt eigenvectors
vv = v(:,end-num_eig+1:end);
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%processing the decoy training set
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%get rid of columns with the same entry
tset_d_cleaned = training_decoy;
%compute z score
[tset_d_cleaned_z, mu_d, sigma_d] = zscore(tset_d_cleaned);
indzero_bind_d = find(sigma_d==0);
tset_d_cleaned_z(:,indzero_bind_d) = [];
mu_d(indzero_bind_d) = [];
sigma_d(indzero_bind_d) = [];
%get covarience matrix and eigenvalues
covar_d =tset_d_cleaned_z'*tset_d_cleaned_z/size(training_decoy,1);
[v_decoy, d_decoy] =eig(covar_d);
%Use the MP bound to get the number of significant eigenvalues
p = size(tset_d_cleaned,2);
n = size(tset_d_cleaned,1);
l_decoy = diag(d_decoy);
num_eig_d = length(l_decoy(find(l_decoy>(1+sqrt(p/n))^2)));
%get the significnt eigenvectors
vv_decoy = v_decoy(:,end-num_eig_d+1:end);
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%processing the binding verification set
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% first look at how close is the verification set to the binding set
verification_binding1 = verification_binding;
verification_binding2 = verification_binding;
verification_binding1(:,indzero_bind) = [];
verification_binding2(:,indzero_bind_d) = [];
%first look at how close the compounds are to the active training set
%mean center and scale the verification set w.r.t. the active training set
veriset_mu = (verification_binding1-repmat(mu,size(verification_binding1,1),1))./repmat(sigma,size(verification_binding1,1),1);
coeff = veriset_mu*vv;
%project back into the ligand space
proj_vect = (vv*coeff')';
norm_test = sqrt(sum((proj_vect-veriset_mu).^2,2));
%now look at how close the compounds are to the decoy training set
%mean center and scale the verification set w.r.t. the decoy training
%set
veriset_mu = (verification_binding2-repmat(mu_d,size(verification_binding2,1),1))./repmat(sigma_d,size(verification_binding2,1),1);
coeff = veriset_mu*vv_decoy;
%project back into the ligand space
proj_vect = (vv_decoy*coeff')';
norm_test_neg = sqrt(sum((proj_vect-veriset_mu).^2,2));
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%processing the decoy verification set
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
verification_decoy1 = verification_decoy;
verification_decoy2 = verification_decoy;
verification_decoy1(:,indzero_bind) = [];
verification_decoy2(:,indzero_bind_d) = [];
%first look at how close the compounds are to the active training set
%mean center and scale the verification set w.r.t. the active training set
veriset_d_mu = (verification_decoy1-repmat(mu,size(verification_decoy1,1),1))./repmat(sigma,size(verification_decoy1,1),1);
coeff_d = veriset_d_mu*vv;
%project back into the ligand space
proj_vect_decoy = (vv*coeff_d')';
norm_test_decoy = sqrt(sum((proj_vect_decoy-veriset_d_mu).^2,2));
%now look at how close the compounds are to the decoy training set
%mean center and scale the verification set w.r.t. the decoy training
%set
veriset_d_mu = (verification_decoy2-repmat(mu_d,size(verification_decoy2,1),1))./repmat(sigma_d,size(verification_decoy2,1),1);
coeff_d = veriset_d_mu*vv_decoy;
%project back into the ligand space
proj_vect_decoy = (vv_decoy*coeff_d')';
norm_test_decoy_neg = sqrt(sum((proj_vect_decoy-veriset_d_mu).^2,2));
threshold = -thres:0.01:thres;
for ii = 1:length(threshold)
% compute false negative and false positve
true_pos(ii) = length(find(norm_test < (norm_test_neg + threshold(ii))))/length(norm_test);
false_pos(ii) = length(find(norm_test_decoy < (norm_test_decoy_neg + threshold(ii)) ))/ length(norm_test_decoy);
end
AUC = trapz(false_pos,true_pos);
end