Diff of /tree_LeastR_MT.m [000000] .. [d8e26d]

--- a
+++ b/tree_LeastR_MT.m
@@ -0,0 +1,444 @@
+function [x_MT, funVal, ValueL_MT]=tree_LeastR_MT(As, ys, z, opts)
+%
+%%
+% Function tree_LeastR_MT
+%      Least Squares Loss with the
+%           tree structured group Lasso Regularization in Multi-Task
+%           Learning
+%
+%% Problem
+%
+%  min  1/2 sum_t || A_t x_t - y_t||^2 + z * sum_j w_j ||x_{G_j}||
+%
+%  G_j's are nodes with tree structure
+%
+%  The tree overlapping group information is contained in
+%  opts.ind_MT, which is a 3 x nodes matrix, where nodes denotes the number of
+%  nodes of the tree.
+%
+%  opts.ind_MT(1,:) contains the starting index
+%  opts.ind_MT(2,:) contains the ending index
+%  opts.ind_MT(3,:) contains the corresponding weight (w_j)
+%
+%  Note:
+%  1) If each element of x is a leaf node of the tree and the weights for
+%  these leaf nodes are the same, we provide an alternative "efficient" input
+%  for this kind of node, by creating a "super node" with
+%  opts.ind_MT(1,1)=-1; opts.ind_MT(2,1)=-1; and opts.ind_MT(3,1)=the common weight.
+%
+%  2) If the features are well ordered, in that the feature indices of the
+%  left subtree are always smaller than those of the right subtree,
+%  opts.ind_MT(1,:) and opts.ind_MT(2,:) contain the "real" starting and
+%  ending indices. That is to say, x( opts.ind_MT(1,j):opts.ind_MT(2,j) )
+%  denotes x_{G_j}. In this case, the entries in opts.ind_MT(1:2,:) are
+%  within 1 and n.
+%
+%
+%  If the features are not well ordered, please use the input opts.G_MT for
+%  specifying the index so that
+%   x( opts.G_MT ( opts.ind_MT(1,j):opts.ind_MT(2,j) ) ) denotes x_{G_j}.
+%  In this case, the entries of opts.G_MT are within 1 and n, and the entries
+%  of opts.ind_MT(1:2,:) are within 1 and length(opts.G_MT).
+%
+% The following example shows how G and ind work:
+%
+% G={ {1, 2}, {4, 5}, {3, 6}, {7, 8},
+%     {1, 2, 3, 6}, {4, 5, 7, 8},
+%     {1, 2, 3, 4, 5, 6, 7, 8} }.
+%
+% ind={ [1, 2, 100]', [3, 4, 100]', [5, 6, 100]', [7, 8, 100]',
+%       [9, 12, 100]', [13, 16, 100]', [17, 24, 100]' },
+%
+% where each node has a weight of 100.
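+%
+% As a sketch, this example could be encoded in MATLAB as follows (the
+% flattened G lists the group members node by node, and ind(1:2,:) index
+% into this flattened list):
+%
+%   opts.G_MT   = [1 2  4 5  3 6  7 8  1 2 3 6  4 5 7 8  1 2 3 4 5 6 7 8];
+%   opts.ind_MT = [  1   3   5   7    9   13   17
+%                    2   4   6   8   12   16   24
+%                  100 100 100 100  100  100  100 ];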
+%
+%
+%% Input parameters:
+%
+%  As -        A cell array of matrices, each of size m_t x n
+%                each A_t can be a dense matrix,
+%                         a sparse matrix,
+%                         or a DCT matrix
+%  ys -        A cell array of response vectors (each of size m_t x 1)
+%  z -         Tree structured regularization parameter (z >= 0)
+%  opts-      optional inputs (default value: opts=[])
+%
+%% Output parameters:
+%
+%  x_MT -      The solution: the T per-task solutions interleaved in a
+%              single vector of length n*T
+%  funVal -    Function values during iterations
+%  ValueL_MT - The line search estimates of L during iterations
+%
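+%% Usage sketch
+%
+% A hypothetical calling sequence (all sizes and values below are made up
+% for illustration); every feature is treated as a leaf node here via the
+% "super node" convention described above:
+%
+%   As   = {randn(50, 10), randn(50, 10)};  % T = 2 tasks, n = 10 features
+%   ys   = {randn(50, 1),  randn(50, 1)};
+%   opts = [];
+%   opts.ind_MT = [-1; -1; 1];              % all leaves, common weight 1
+%   [x_MT, funVal, ValueL_MT] = tree_LeastR_MT(As, ys, 0.1, opts);
+%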
+%% Copyright (C) 2010-2011 Jun Liu, and Jieping Ye
+%
+% We suggest that you first read the Manual.
+%
+% For any problem, please contact Jun Liu via j.liu@asu.edu
+%
+% Last modified on October 3, 2010.
+%
+%% Related papers
+%
+% [1] Jun Liu and Jieping Ye, Moreau-Yosida Regularization for
+%     Grouped Tree Structure Learning, NIPS 2010
+%
+%% Related functions:
+%
+%  sll_opts
+%
+%%
+
+%% Verify and initialize the parameters
+%%
+if (nargin <4)
+    error('\n Inputs: As, ys, z, and opts.ind_MT should be specified!\n');
+end
+
+[m_MT,n_MT]=size(As{1});
+T_num = length(ys);
+
+% the solver assumes every task has the same number of samples m
+for idx_t = 1:T_num
+    if (length(ys{idx_t}) ~= m_MT)
+        error('\n Check the length of ys!\n');
+    end
+end
+
+if (z<0)
+    error('\n z should be nonnegative!\n');
+end
+
+opts=sll_opts(opts); % run sll_opts to set default values (flags) 
+
+% restart the program for better efficiency
+%  this is a newly added function
+if (~isfield(opts,'rStartNum'))
+    opts.rStartNum=opts.maxIter;
+else
+    if (opts.rStartNum<=0)
+        opts.rStartNum=opts.maxIter;
+    end
+end
+
+%% Detailed initialization
+%% Normalization
+
+% Please refer to sll_opts for the definitions of mu, nu and nFlag
+%
+% If .nFlag =1, the input matrix A is normalized to
+%                     A= ( A- repmat(mu, m,1) ) * diag(nu)^{-1}
+%
+% If .nFlag =2, the input matrix A is normalized to
+%                     A= diag(nu)^{-1} * ( A- repmat(mu, m,1) )
+%
+% Such normalization is done implicitly
+%     This implicit normalization is suggested for the sparse matrix
+%                                    but not for the dense matrix
+%
+
+
+
+if (~issparse(As{1})) && (opts.nFlag~=0)
+    fprintf('\n -----------------------------------------------------');
+    fprintf('\n The data is not sparse or not stored in sparse format');
+    fprintf('\n The code still works.');
+    fprintf('\n But we suggest that you normalize the data directly,');
+    fprintf('\n to achieve better efficiency.');
+    fprintf('\n -----------------------------------------------------');
+end
+
+%% Group & Others
+
+% Initialize ind
+if (~isfield(opts,'ind_MT'))
+    error('\n In tree_LeastR_MT, the field .ind_MT should be specified');
+else
+    ind_MT=opts.ind_MT;
+    
+    if (size(ind_MT,1)~=3)
+        error('\n Check opts.ind_MT');
+    end
+end
+
+GFlag_MT=0;
+% if GFlag_MT=1, we shall apply general_altra
+if (isfield(opts,'G_MT'))
+    GFlag_MT=1;
+    
+    G_MT=opts.G_MT;
+    if (max(G_MT) > n_MT*T_num || max(G_MT) < 1)
+        error('\n The input G_MT is incorrect. Its entries should be within %d and %d', 1, n_MT*T_num);
+    end
+end
+
+%% Starting point initialization
+% compute AT y
+if (opts.nFlag==0)
+    %ATy =A'*y;
+    AsTys_MT = Xsys_MT_cal(As, ys);
+end
+
+% process the regularization parameter
+if (opts.rFlag==0)
+    lambda=z;
+else % z here is the scaling factor lying in [0,1]
+    % Note: the lambda_max computation below has not been ported to this
+    % multi-task variant, so lambda would be left undefined; fail loudly
+    % rather than with a confusing "undefined variable" error later.
+    error('\n opts.rFlag~=0 is not supported in tree_LeastR_MT; set opts.rFlag=0 and pass the regularization value directly as z.');
+    %     if (z<0 || z>1)
+    %         error('\n opts.rFlag=1, and z should be in [0,1]');
+    %     end
+    
+    %     computedFlag=0;
+    %     if (isfield(opts,'lambdaMax'))
+    %         if (opts.lambdaMax~=-1)
+    %             lambda=z*opts.lambdaMax;
+    %             computedFlag=1;
+    %         end
+    %     end
+    %
+    %     if (~computedFlag)
+    %         if (GFlag==0)
+    %             lambda_max=findLambdaMax(ATy, n, ind, size(ind,2));
+    %         else
+    %             lambda_max=general_findLambdaMax(ATy, n, G, ind, size(ind,2));
+    %         end
+    %
+    %         % As .rFlag=1, we set lambda as a ratio of lambda_max
+    %         lambda=z*lambda_max;
+    %     end
+end
+
+
+% The following is for computing lambdaMax
+% we use opts.lambdaMax=-1 to show that we need the computation.
+%
+% One can use this for setting up opts.lambdaMax
+% if (isfield(opts,'lambdaMax'))
+%
+%     if (opts.lambdaMax==-1)
+%         if (GFlag==0)
+%             lambda_max=findLambdaMax(ATy, n, ind, size(ind,2));
+%         else
+%             lambda_max=general_findLambdaMax(ATy, n, G, ind, size(ind,2));
+%         end
+%
+%         x=lambda_max;
+%         funVal=lambda_max;
+%         ValueL=lambda_max;
+%
+%         return;
+%     end
+% end
+
+
+% initialize a starting point
+if opts.init==2
+    x_MT=zeros(n_MT*T_num,1);
+else
+    if isfield(opts,'x0')
+        x_MT=opts.x0;
+        if (length(x_MT)~=n_MT*T_num)
+            error('\n Check the input .x0');
+        end
+    else
+        x_MT=AsTys_MT;  % if .x0 is not specified, we use A'y as the
+        % starting point
+    end
+end
+
+% compute A x
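+% (x_MT stores the T_num per-task solutions interleaved: the coefficient of
+%  feature j for task t sits at index (j-1)*T_num + t, so each task's
+%  sub-vector is read off by striding through x_MT with step T_num)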
+if (opts.nFlag==0)
+    Ax_MT = [];
+    idx_row = [1:T_num:length(x_MT)];
+    for idx_t = 1:T_num
+        Ax_MT = [Ax_MT;As{idx_t}*x_MT(idx_row)];
+        idx_row = idx_row +1;
+    end
+    %Ax=A* x;
+    % elseif (opts.nFlag==1)
+    %     invNu=x./nu; mu_invNu=mu * invNu;
+    %     Ax=A*invNu -repmat(mu_invNu, m, 1);
+    % else
+    %     Ax=A*x-repmat(mu*x, m, 1);     Ax=Ax./nu;
+end
+
+if (opts.init==0)
+    % ------  The initFactor-based initialization is not available here
+    %
+    % In the single-task code, .init=0 sets x=ratio*x via "initFactor";
+    % please refer to the function initFactor for details.
+    %
+    
+    % Here, we only support starting from zero, due to the complex tree
+    % structure
+    
+    x_MT=zeros(n_MT*T_num,1);
+end
+
+%% The main program
+% The Armijo-Goldstein line search scheme + accelerated gradient descent
+
+bFlag_MT=0; % this flag tests whether the gradient step changes the solution only a little
+
+if (opts.mFlag==0 && opts.lFlag==0)
+    L=1;
+    % We assume that the largest eigenvalue of A'A is at least 1
+    
+    % assign xp with x, and Axp with Ax
+    xp_MT=x_MT; Axp_MT=Ax_MT; xxp_MT=zeros(n_MT*T_num,1);
+    
+    alphap=0; alpha=1;
+    
+    for iterStep=1:opts.maxIter
+        % --------------------------- step 1 ---------------------------
+        % compute search point s based on xp and x (with beta)
+        beta=(alphap-1)/alpha;    s_MT=x_MT + beta* xxp_MT;
+        
+        % --------------------------- step 2 ---------------------------
+        % line search for L and compute the new approximate solution x
+        
+        % compute A*s at the search point s (needed for the gradient g)
+        As_MT=Ax_MT + beta* (Ax_MT-Axp_MT);
+        
+        % compute AT As
+        if (opts.nFlag==0)
+            
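+            % scatter each A_t' * (A s)_t back into the same interleaved
+            % layout as x_MT (feature j of task t at index (j-1)*T_num + t);
+            % the slicing below assumes every task has length(ys{1}) samples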
+            ATAs_MT = zeros(n_MT*T_num,1);
+            idx_row = [0:n_MT-1]*T_num+1;
+            for idx_t = 1:T_num
+                ATAs_MT(idx_row) = As{idx_t}'*As_MT((idx_t-1)*length(ys{1})+1:idx_t*length(ys{1}));
+                idx_row = idx_row+1;
+            end
+            %ATAs=A'*As;
+            %         elseif (opts.nFlag==1)
+            %             ATAs=A'*As - sum(As) * mu';  ATAs=ATAs./nu;
+            %         else
+            %             invNu=As./nu;                ATAs=A'*invNu-sum(invNu)*mu';
+        end
+        
+        % obtain the gradient g
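+        %   g = A'(A s - y) = A'A s - A'y, the gradient of the smooth
+        %   least squares term at the search point s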
+        g_MT=ATAs_MT-AsTys_MT;
+        
+        % copy x and Ax to xp and Axp
+        xp_MT=x_MT;    Axp_MT=Ax_MT;
+        
+        while (1)
+            % take a step from s in the negative gradient direction to get v,
+            % and then do the tree-structured group Lasso projection
+            v_MT=s_MT-g_MT/L;
+            
+            % tree overlapping group Lasso projection
+            ind_work_MT(1:2,:)=ind_MT(1:2,:);
+            ind_work_MT(3,:)=ind_MT(3,:) * (lambda / L);
+            
+            if (GFlag_MT==0)
+                x_MT=altra(v_MT, n_MT*T_num, ind_work_MT, size(ind_work_MT,2));
+            else
+                x_MT=general_altra(v_MT, n_MT*T_num, G_MT, ind_work_MT, size(ind_work_MT,2));
+            end
+            
+            v_MT=x_MT-s_MT;  % the difference between the new approximate solution x
+            % and the search point s
+            
+            % compute A x
+            if (opts.nFlag==0)
+                %Ax=A* x;
+                Ax_MT = [];
+                idx_row = [1:T_num:n_MT*T_num];
+                for idx_t = 1:T_num
+                    Ax_MT = [Ax_MT;As{idx_t}*x_MT(idx_row)];
+                    idx_row = idx_row +1;
+                end
+%             elseif (opts.nFlag==1)
+%                 invNu=x./nu; mu_invNu=mu * invNu;
+%                 Ax=A*invNu -repmat(mu_invNu, m, 1);
+%             else
+%                 Ax=A*x-repmat(mu*x, m, 1);     Ax=Ax./nu;
+            end
+            
+            Av_MT=Ax_MT -As_MT;
+            r_sum_MT=v_MT'*v_MT; l_sum_MT=Av_MT'*Av_MT;
+            
+            if (r_sum_MT <=1e-20)
+                bFlag_MT=1; % this shows that the gradient step makes little improvement
+                break;
+            end
+            
+            % the condition is ||Av||_2^2 <= L * ||v||_2^2
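+            % equivalently ||A(x-s)||_2^2 <= L * ||x-s||_2^2, which ensures
+            % f(x) <= f(s) + <g, x-s> + (L/2)||x-s||_2^2 for the least
+            % squares loss, so L is a valid local Lipschitz estimate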
+            if(l_sum_MT <= r_sum_MT * L)
+                break;
+            else
+                L=max(2*L, l_sum_MT/r_sum_MT);
+                %fprintf('\n L=%5.6f',L);
+            end
+        end
+        
+        % --------------------------- step 3 ---------------------------
+        % update alpha and alphap, and check whether converge
+        alphap=alpha; alpha= (1+ sqrt(4*alpha*alpha +1))/2;
+        
+        xxp_MT=x_MT-xp_MT;
+        y_all = [];  % stack the task responses (could be precomputed once)
+        for idx_t = 1:T_num
+            y_all = [y_all; ys{idx_t}];
+        end
+        Axy_MT=Ax_MT-y_all;
+        
+        ValueL_MT(iterStep)=L;
+        
+        % compute the regularization part
+        if (GFlag_MT==0)
+            tree_norm_MT=treeNorm(x_MT, n_MT*T_num, ind_MT, size(ind_MT,2));
+        else
+            tree_norm_MT=general_treeNorm(x_MT, n_MT*T_num, G_MT, ind_MT, size(ind_MT,2));
+        end
+        
+        % function value = loss + regularization
+        funVal(iterStep)=Axy_MT'* Axy_MT/2 + lambda * tree_norm_MT;
+        
+        if (bFlag_MT)
+            % fprintf('\n The program terminates as the gradient step changes the solution very small.');
+            break;
+        end
+        
+        switch(opts.tFlag)
+            case 0
+                if iterStep>=2
+                    if (abs( funVal(iterStep) - funVal(iterStep-1) ) <= opts.tol)
+                        break;
+                    end
+                end
+            case 1
+                if iterStep>=2
+                    if (abs( funVal(iterStep) - funVal(iterStep-1) ) <=...
+                            opts.tol* funVal(iterStep-1))
+                        break;
+                    end
+                end
+            case 2
+                if ( funVal(iterStep)<= opts.tol)
+                    break;
+                end
+            case 3
+                norm_xxp_MT=sqrt(xxp_MT'*xxp_MT);
+                if ( norm_xxp_MT <=opts.tol)
+                    break;
+                end
+            case 4
+                norm_xp_MT=sqrt(xp_MT'*xp_MT);    norm_xxp_MT=sqrt(xxp_MT'*xxp_MT);
+                if ( norm_xxp_MT <=opts.tol * max(norm_xp_MT,1))
+                    break;
+                end
+            case 5
+                if iterStep>=opts.maxIter
+                    break;
+                end
+        end
+        
+        % restart the program every opts.rStartNum
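+        % (resetting alphap/alpha drops the momentum term, and halving L
+        %  lets the line search re-estimate the local Lipschitz constant)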
+        if (~mod(iterStep, opts.rStartNum))
+            alphap=0; alpha=1;
+            xp_MT=x_MT; Axp_MT=Ax_MT; xxp_MT=zeros(n_MT*T_num,1); L =L/2;
+        end
+    end
+else
+    error('\n The function only supports opts.mFlag=0 and opts.lFlag=0!');
+end
+