eeglab / Git / [422372] /functions/miscfunc/kmeans

Models:
ReneeD/
eeglab
Downloads: 1
[422372]: / functions / miscfunc / kmeans_st.m
History
Download this file
244 lines (200 with data), 8.1 kB

function [centr,clst,sse] = kmeans_st(X,k,restarts)
% KMEANS: K-means clustering of n points into k clusters so that the
%         within-cluster sum of squares is minimized.  Based on Algorithm
%         AS136, which seeks a local optimum such that no movement of a
%         point from one cluster to another will reduced the within-cluster
%         sum of squares.  Tends to find spherical clusters.
%         Not globally optimal against all partitions, which is an NP-complete 
%         problem; thus allows for multiple restarts.
%
%     Usage: [centr,clst,sse] = kmeans(X,k,restarts)
%
%         X =        [n x p] data matrix.
%         k =        number of desired clusters.
%         restarts = optional number of restarts, after finding a new minimum 
%                     sse, needed to end search [default=0].
%         --------------------------------------------------------------------
%         centr =    [k x p] matrix of cluster centroids.
%         clst =     [n x 1] vector of cluster memberships.
%         sse =      total within-cluster sum of squared deviations.
%

% Hartigan,JA & MA Wong. 1979.  Algorithm AS 136: A k-means clustering
%   algorithm.  Appl. Stat. 200:100-108.
% Milligan,G.W. & L. Sokol. 1980. A two-stage clustering algorithm with
%   robustness recovery characteristics.  Educational and Psychological
%   Measurement 40:755-759.

% RE Strauss, 8/26/98
%   8/21/99 -  changed misc statements for Matlab v5.
%   10/14/00 - use MEANS rather than MEAN to update cluster centers;
%                remove references to TRUE and FALSE.

  if (nargin<3) restarts = []; end
  
  if (isempty(restarts))
    restarts = 0;
  end

  [n,p] = size(X);
  if (k<1 || k>n)
    error('KMEANS: k out of range 1-N');
  end

  max_iter = 50;
  highval = 10e8;
  upgma_flag = 1;                     % Do UPGMA first time thru

  % Repeat optimization until have 'restart' attempts with no better solution
  restart_iter = restarts + 1;
  best_sse = highval;

  while (restart_iter > 0)
    restart_iter = restart_iter - 1;

    c1 = zeros(n,1);
    c2 = c1;
    clst = c1;
    nclst = zeros(k,1);

    % Estimate initial cluster centers via UPGMA the first time, and randomly
    % thereafter.

    if (upgma_flag)                     % If UPGMA,
      upgma_flag = 0;
      dist = eucl(X);                     % All interpoint distances
      topo = upgma(dist,[],0);            % UPGMA dendrogram topology

      grp = [1:n]';                       % Identify obs in each of k grps
      for step = 1:(n-k)
        t1 = topo(step,1);
        t2 = topo(step,2);
        t3 = topo(step,3);
        indx = find(grp==t1);
        grp(indx) = t3 * ones(length(indx),1);
        indx = find(grp==t2);
        grp(indx) = t3 * ones(length(indx),1);
      end

      grpid = uniquef(grp);                % Unique group identifiers
      centr = zeros(k,p);
      for kk = 1:length(grpid);           % Calculate group centroids
        g = grpid(kk);
        indx = find(grp == g);
        Xk = X(indx,:);
        if (size(Xk,1)>1)
          centr(kk,:) = mean(Xk);
        else
          centr(kk,:) = Xk;
        end
      end

    else                                % Else if randomized,
      rndprm = randperm(n);               % Pull out random set of points
      centr = X(rndprm(1:k),:);
    end

    % For each point i, find its two closest centers, c1(i) and c2(i), and
    % assign it to c1(i)

    dist = zeros(n,k);
    for kk = 1:k                        % Dists to centers of all clusters
      dist(:,kk) = eucl(X,centr(kk,:));
    end
    [d,indx] = sort(dist');             % Sort points separately

    c1 = indx(1,:)';
    c2 = indx(2,:)';

    % Update cluster centers to be the centroids of points contained within them

    for kk = 1:k
      indx = find(c1==kk);
      len_indx = length(indx);
%      if (len_indx>1)
%        centr(kk,:) = mean(X(indx,:));
%      else
%        centr(kk,:) = X(indx,:);
%      end
      centr(kk,:) = means(X(indx,:));
      nclst(kk) = len_indx;
    end

    % Initialize working matrices

    live = ones(k,1);
    R1 = zeros(n,1);
    R2 = zeros(k,1);
    adj1 = nclst ./ (nclst+1);
    adj2 = nclst ./ (nclst-1+eps);
    update = n * ones(k,1);

    for i = 1:n               % Adjusted dist from each pt to own cluster center
      R1(i) = adj2(c1(i)) * eucl(X(i,:),centr(c1(i),:))^2;
    end

    % Iterate

    iter = 0;
    while ((iter < max_iter) & (any(live)))
      iter = iter+1;

      % Optimal-transfer stage

      i = 0;
      while ((i<n) & any(live))         % Cycle thru points
        i = i+1;
        update = update-1;                % Decrement cluster update counters

        L1 = c1(i);                       % Pt i is in cluster L1
        if (nclst(L1)>1)                  % If point i is not sole member of L1,
          R2 = adj1(L1) .* eucl(centr,X(i,:)).^2; % Dists from pt to cluster centers

          if (live(L1))                     % If L1 is in live set,
            R2(L1) = highval;               %   find min R2 over all clusters
            [r2min,L2] = min(R2);
          else                              % If L1 is not in live set,
            indx = find(~live);             %   find min R2 over live clusters only
            R2([L1;indx]) = highval * ones(length(indx)+1,1);
            [r2min,L2] = min(R2);
          end

          if (r2min >= R1(i))             % No reallocation necessary
            c2(i) = L2;
          else                            % Else reallocate pt to new cluster
            c1(i) = L2;
            c2(i) = L1;

            for kk = [L1,L2]                % Recalc cluster centers and sizes
              indx = find(c1==kk);
              len_indx = length(indx);
              if (len_indx>1)
                centr(kk,:) = mean(X(indx,:));
              else
                centr(kk,:) = X(indx,:);
              end
              nclst(kk) = len_indx;
            end

            adj1([L1,L2]) = nclst([L1,L2]) ./ (nclst([L1,L2])+1);
            adj2([L1,L2]) = nclst([L1,L2]) ./ (nclst([L1,L2])-1+eps);
            R1(i) = adj2(c1(i)) * eucl(X(i,:),centr(c1(i),:))^2;

            live([L1,L2]) = [1,1];          % Put into live set
            update([L1,L2]) = [n,n];        % Reinitialize update counters
          end
        end;  % if nclst(L1)>1

        indx = find(update<1);          % Remove clusters from live set
        if (~isempty(indx))             %   that haven't been recently updated
          live(indx) = zeros(length(indx),1);
        end
      end;  % while (i<n & any(live))

      if (any(live))

        % Quick transfer stage

        for i = 1:n                         % Cycle thru points
          L1 = c1(i);
          L2 = c2(i);
          if (any(live([L1,L2])) & nclst(L1)>1)
            R2 = adj1(L2) .* eucl(centr(L2,:),X(i,:)).^2; % Dists from pt to L2 center
            if (R1>=R2)
              temp = c1(i);                   % Switch L1 & L2
              c1(i) = c2(i);
              c2(i) = temp;

              for kk = [L1,L2]                % Recalc cluster centers and sizes
                indx = find(c1==kk);
                centr(kk,:) = mean(X(indx,:));
                nclst(kk) = length(indx);
              end

              adj1([L1,L2]) = nclst([L1,L2]) ./ (nclst([L1,L2])+1);
              adj2([L1,L2]) = nclst([L1,L2]) ./ (nclst([L1,L2])-1+eps);
              R1(i) = adj2(c1(i)) * eucl(X(i,:),centr(c1(i),:))^2;

              live([L1,L2]) = [1,1];          % Put into live set
              update([L1,L2]) = [n,n];        % Reinitialize update counters
            end
          end
        end
      end;  % if any(live)
    end;  % while iter<max_iter & any(live)

    sse = 0;                          % Calc final total sum-of-squares
    for i=1:n
      sse = sse + eucl(X(i,:),centr(c1(i),:))^2;
    end

    if (sse < best_sse)               % Update best solution so far
      best_sse = sse;
      best_c1 = c1;
      best_centr = centr;
      restart_iter = restarts;
    end
  end;  % while (restart_iter > 0)

  clst =  best_c1;
  centr = best_centr;
  sse =   best_sse;

  return;