[422372]: / functions / studyfunc / robust_kmeans.m

Download this file

119 lines (107 with data), 4.7 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
% ROBUST_KMEANS - an extension of Matlab KMEANS that removes outlier
% components from all clusters.
% This is a helper function called from POP_CLUST.
% Copyright (C) 2006 UCSD
%
% This file is part of EEGLAB, see http://www.eeglab.org
% for the documentation and details.
%
% Redistribution and use in source and binary forms, with or without
% modification, are permitted provided that the following conditions are met:
%
% 1. Redistributions of source code must retain the above copyright notice,
% this list of conditions and the following disclaimer.
%
% 2. Redistributions in binary form must reproduce the above copyright notice,
% this list of conditions and the following disclaimer in the documentation
% and/or other materials provided with the distribution.
%
% THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
% AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
% IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
% ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
% LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
% CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
% SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
% INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
% CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
% ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
% THE POSSIBILITY OF SUCH DAMAGE.
function [IDX,C,sumd,D,outliers] = robust_kmeans(data,clus_num,STD,MAXiter,method)
% data - pre-clustering data matrix.
% N - number of wanted clusters.
if nargin < 5
method = 'kmeans';
end
flag = 1;
not_outliers = 1:size(data,1);
old_outliers = [];
if strcmpi(method, 'kmeans')
[IDX,C,sumd,D] = kmeans(data,clus_num,'replicates',30,'emptyaction','drop'); % Cluster using K-means algorithm
elseif strcmpi(method,'optimal_kmeans')
[IDX,C,sumd,D] = optimal_kmeans(data,clus_num); % Cluster using optimal K-menas
else
[IDX,C,sumd,D] = kmeanscluster(data,clus_num); % Cluster using K-means algorithm
end;
if STD >= 2 % STD for returned outlier
rSTD = STD -1;
else
rSTD = STD;
end
loop = 0;
while flag
loop = loop + 1;
std_all = [];
ref_D = 0;
N = size(C,1); % Should be dynamically assigned because of "optimal_kmeans"
for k = 1:N
tmp = ['cls' num2str(k) ' = find(IDX==' num2str(k) ')''; ' ]; %find the component indices belonging to each cluster (cls1 = ...).
eval(tmp);
tmp = ['std' num2str(k) ' = std(D(cls' num2str(k) ' ,' num2str(k) ')); ' ]; %compute the std of each cluster
eval(tmp);
std_all = [std_all ['std' num2str(k) ' ']];
tmp = [ 'ref_D = ' num2str(ref_D) ' + mean(D(cls' num2str(k) ' ,' num2str(k) '));' ];
eval(tmp);
end
std_all = [ '[ ' std_all ' ]' ];
std_all = eval(std_all);
% Find the outliers
% Outlier definition - its distance from its cluster center is bigger
% than STD times the std of the cluster, as long as the distance is bigger
% than the mean distance times STD (avoid problems where all points turn to be outliers).
outliers = [];
ref_D = ref_D/N;
for k = 1:N
tmp = ['cls' num2str(k) '(find(D(find(IDX==' num2str(k) ')'' , ' num2str(k) ') > ' num2str(STD) '*std' num2str(k) ')); ' ];
optionalO = eval(tmp);
Oind = find(D(optionalO,k) > ref_D*STD);
outliers = [outliers optionalO(Oind)];
end
if isempty(outliers) || (loop == MAXiter)
flag = 0;
end
l = length(old_outliers);
returned_outliers = [];
for k = 1:l
tmp = sum((C-ones(N,1)*data(old_outliers(k),:)).^2,2)'; % Find the distance of each former outlier to the current cluster
if isempty(find(tmp <= std_all*rSTD)) %Check if the outlier is still an outlier (far from each cluster center more than STD-1 times its std).
returned_outliers = [returned_outliers old_outliers(k)];
end
end
outliers = not_outliers(outliers);
outliers = [outliers returned_outliers ];
tmp = ones(1,size(data,1));
tmp(outliers) = 0;
not_outliers = (find(tmp==1));
if strcmpi(method, 'kmeans')
[IDX,C,sumd,D] = kmeans(data(not_outliers,:),N,'replicates',30,'emptyaction','drop');
elseif strcmpi(method,'optimal_kmeans')
[IDX,C,sumd,D] = optimal_kmeans(data(not_outliers,:),clus_num);
else
[IDX,C,sumd,D] = kmeanscluster(data(not_outliers,:),N);
end
old_outliers = outliers;
old_IDX = zeros(size(data,1),1);
old_IDX(sort(not_outliers)) = IDX;
end
IDX = old_IDX;