--- /dev/null
+++ b/tree_LeastR_MT.m
@@ -0,0 +1,444 @@
+function [x_MT, funVal, ValueL_MT]=tree_LeastR_MT(As, ys, z, opts)
+%
+%%
+% Function tree_LeastR_MT
+%      Least Squares Loss with the
+%      tree structured group Lasso Regularization in Multi-Task Learning
+%
+%% Problem
+%
+%  min  1/2 sum_t || A_t x_t - y_t ||^2 + z * sum_j w_j ||x_{G_j}||
+%
+%  where the G_j's are the nodes of a tree structure.
+%
+%  The tree overlapping group information is contained in
+%  opts.ind_MT, which is a 3 x nodes matrix, where nodes denotes the
+%  number of nodes of the tree.
+%
+%  opts.ind_MT(1,:) contains the starting indices
+%  opts.ind_MT(2,:) contains the ending indices
+%  opts.ind_MT(3,:) contains the corresponding weights (w_j)
+%
+%  Note:
+%  1) If each element of x is a leaf node of the tree and the weights of
+%     these leaf nodes are identical, we provide an alternative
+%     "efficient" input for this kind of node, by creating a "super node"
+%     with opts.ind_MT(1,1)=-1, opts.ind_MT(2,1)=-1, and
+%     opts.ind_MT(3,1)=the common weight.
+%
+%  2) If the features are well ordered, in that the indices of the
+%     features in any left subtree are always smaller than those in the
+%     right subtree, opts.ind_MT(1,:) and opts.ind_MT(2,:) contain the
+%     "real" starting and ending indices. That is to say,
+%     x( opts.ind_MT(1,j):opts.ind_MT(2,j) ) denotes x_{G_j}. In this
+%     case, the entries of opts.ind_MT(1:2,:) are within 1 and n*T
+%     (T being the number of tasks).
+%
+%     If the features are not well ordered, please use the input
+%     opts.G_MT for specifying the indices so that
+%     x( opts.G_MT( opts.ind_MT(1,j):opts.ind_MT(2,j) ) ) denotes x_{G_j}.
+%     In this case, the entries of opts.G_MT are within 1 and n*T, and
+%     the entries of opts.ind_MT(1:2,:) are within 1 and length(opts.G_MT).
+%
+%  The following example shows how G and ind work:
+%
+%  G={ {1, 2}, {4, 5}, {3, 6}, {7, 8},
+%      {1, 2, 3, 6}, {4, 5, 7, 8},
+%      {1, 2, 3, 4, 5, 6, 7, 8} },
+%
+%  ind={ [1, 2, 100]', [3, 4, 100]', [5, 6, 100]', [7, 8, 100]',
+%        [9, 12, 100]', [13, 16, 100]', [17, 24, 100]' },
+%
+%  where each node has a weight of 100.
+%
+%% Input parameters:
+%
+%  As-   A cell array of matrices; each A_t is of size m_t x n.
+%        A_t can be a dense matrix,
+%        a sparse matrix,
+%        or a DCT matrix.
+%  ys-   A cell array of response vectors (each of size m_t x 1)
+%  z-    Tree structure regularization parameter (z >= 0)
+%  opts- Optional inputs (default value: opts=[])
+%
+%% Output parameters:
+%
+%  x_MT-      The solution (the per-task solutions in a single vector)
+%  funVal-    Function value during iterations
+%  ValueL_MT- The value of L during iterations
+%
+%% Copyright (C) 2010-2011 Jun Liu, and Jieping Ye
+%
+% We suggest that you first read the Manual.
+%
+% For any problem, please contact Jun Liu via j.liu@asu.edu
+%
+% Last modified on October 3, 2010.
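+%
+%% Example (illustrative sketch; not part of the original interface)
+%
+% The data below is made up, and z=0.01 is an arbitrary choice. Two tasks
+% (T=2) share n=4 features; every feature is a leaf node (encoded via the
+% "super node" convention of Note 1), and the T copies of feature j form
+% an internal node spanning entries (j-1)*T+1 : j*T of the interleaved
+% solution vector (see the layout note before the initialization below).
+% Note that, as implemented, all tasks must have the same sample size m.
+%
+%    n=4; T=2; m=10;
+%    As={randn(m,n), randn(m,n)};           % one data matrix per task
+%    ys={randn(m,1), randn(m,1)};           % one response vector per task
+%    opts=[];
+%    opts.rFlag=0;                          % z is the regularization value itself
+%    opts.ind_MT=[ [-1,-1,1]', ...          % super node: all leaves, weight 1
+%        [1,2,1]', [3,4,1]', [5,6,1]', [7,8,1]' ];  % one node per feature
+%    [x_MT, funVal, ValueL_MT]=tree_LeastR_MT(As, ys, 0.01, opts);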
+%
+%% Related papers
+%
+% [1] Jun Liu and Jieping Ye, Moreau-Yosida Regularization for
+%     Grouped Tree Structure Learning, NIPS 2010
+%
+%% Related functions:
+%
+%  sll_opts
+%
+%%
+
+%% Verify and initialize the parameters
+%%
+if (nargin <4)
+    error('\n Inputs: As, ys, z, and opts.ind_MT should be specified!\n');
+end
+
+[m_MT,n_MT]=size(As{1});
+T_num=length(ys);
+
+% the computations below assume that all tasks share the same data sizes
+for idx_t=1:T_num
+    if (any(size(As{idx_t})~=[m_MT,n_MT]) || length(ys{idx_t})~=m_MT)
+        error('\n Check As and ys: all tasks should have the same m and n!\n');
+    end
+end
+
+if (z<0)
+    error('\n z should be nonnegative!\n');
+end
+
+opts=sll_opts(opts); % run sll_opts to set default values (flags)
+
+% restart the program for better efficiency
+% this is a newly added function
+if (~isfield(opts,'rStartNum'))
+    opts.rStartNum=opts.maxIter;
+else
+    if (opts.rStartNum<=0)
+        opts.rStartNum=opts.maxIter;
+    end
+end
+
+%% Detailed initialization
+%% Normalization
+
+% Please refer to sll_opts for the definitions of mu, nu and nFlag
+%
+% If .nFlag =1, the input matrix A is normalized to
+%                     A= ( A- repmat(mu, m,1) ) * diag(nu)^{-1}
+%
+% If .nFlag =2, the input matrix A is normalized to
+%                     A= diag(nu)^{-1} * ( A- repmat(mu, m,1) )
+%
+% Such normalization is done implicitly.
+% This implicit normalization is suggested for sparse matrices,
+% but not for dense matrices.
+
+if (~issparse(As{1})) && (opts.nFlag~=0)
+    fprintf('\n -----------------------------------------------------');
+    fprintf('\n The data is not sparse or not stored in sparse format');
+    fprintf('\n The code still works.');
+    fprintf('\n But we suggest you normalize the data directly,');
+    fprintf('\n for achieving better efficiency.');
+    fprintf('\n -----------------------------------------------------');
+end
+
+%% Group & Others
+
+% Initialize ind_MT
+if (~isfield(opts,'ind_MT'))
+    error('\n In tree_LeastR_MT, the field .ind_MT should be specified');
+else
+    ind_MT=opts.ind_MT;
+
+    if (size(ind_MT,1)~=3)
+        error('\n Check opts.ind_MT');
+    end
+end
+
+GFlag_MT=0;
+% if GFlag_MT=1, we shall apply general_altra
+if (isfield(opts,'G_MT'))
+    GFlag_MT=1;
+
+    G_MT=opts.G_MT;
+    if (max(G_MT)>n_MT*T_num || min(G_MT)<1)
+        error('\n The input G_MT is incorrect. Its entries should be within %d and %d',1,n_MT*T_num);
+    end
+end
+
+%% Starting point initialization
+
+% compute AT y
+if (opts.nFlag==0)
+    %ATy =A'*y;
+    AsTys_MT=Xsys_MT_cal(As, ys);
+end
+
+% process the regularization parameter
+if (opts.rFlag==0)
+    lambda=z;
+else % z here is the scaling factor lying in [0,1]
+    % The computation of lambda_max (see the commented code below) has not
+    % been ported to the multi-task setting, so lambda would be left
+    % undefined on this branch.
+    error('\n opts.rFlag~=0 is not supported by tree_LeastR_MT; set opts.rFlag=0 and pass the regularization value in z.');
+
+    % if (z<0 || z>1)
+    %     error('\n opts.rFlag=1, and z should be in [0,1]');
+    % end
+    %
+    % computedFlag=0;
+    % if (isfield(opts,'lambdaMax'))
+    %     if (opts.lambdaMax~=-1)
+    %         lambda=z*opts.lambdaMax;
+    %         computedFlag=1;
+    %     end
+    % end
+    %
+    % if (~computedFlag)
+    %     if (GFlag==0)
+    %         lambda_max=findLambdaMax(ATy, n, ind, size(ind,2));
+    %     else
+    %         lambda_max=general_findLambdaMax(ATy, n, G, ind, size(ind,2));
+    %     end
+    %
+    %     % As .rFlag=1, we set lambda as a ratio of lambda_max
+    %     lambda=z*lambda_max;
+    % end
+end
+
+% The following is for computing lambdaMax.
+% We use opts.lambdaMax=-1 to show that we need the computation.
+%
+% One can use this for setting up opts.lambdaMax:
+% if (isfield(opts,'lambdaMax'))
+%
+%     if (opts.lambdaMax==-1)
+%         if (GFlag==0)
+%             lambda_max=findLambdaMax(ATy, n, ind, size(ind,2));
+%         else
+%             lambda_max=general_findLambdaMax(ATy, n, G, ind, size(ind,2));
+%         end
+%
+%         x=lambda_max;
+%         funVal=lambda_max;
+%         ValueL=lambda_max;
+%
+%         return;
+%     end
+% end
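+
+%% Variable layout (note added for clarity; inferred from the indexing below)
+% The per-task solutions x_t (each of length n_MT) are interleaved into the
+% single vector x_MT of length n_MT*T_num, so that
+%
+%     x_MT((j-1)*T_num + t) = x_t(j),   t=1..T_num, j=1..n_MT.
+%
+% Equivalently, if X=[x_1, ..., x_T] is the n_MT x T_num solution matrix,
+%
+%     x_MT = reshape(X', n_MT*T_num, 1);
+%
+% With this layout, the T_num copies of feature j occupy the contiguous
+% entries (j-1)*T_num+1 : j*T_num, which is what allows opts.ind_MT to
+% describe multi-task groups with contiguous start/end indices.
+% In contrast, Ax_MT and the responses are stacked task by task:
+%
+%     Ax_MT = [A_1*x_1; A_2*x_2; ...; A_T*x_T];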
+
+% initialize the starting point
+if opts.init==2
+    x_MT=zeros(n_MT*T_num,1);
+else
+    if isfield(opts,'x0')
+        x_MT=opts.x0;
+        if (length(x_MT)~=n_MT*T_num)
+            error('\n Check the input .x0');
+        end
+    else
+        x_MT=AsTys_MT;  % if .x0 is not specified, we start from A'y
+                        % (overridden below when opts.init==0)
+    end
+end
+
+% compute A x
+if (opts.nFlag==0)
+    %Ax=A*x;
+    Ax_MT=zeros(m_MT*T_num,1);
+    idx_row=1:T_num:n_MT*T_num;  % entries of task 1 in the interleaved x_MT
+    for idx_t=1:T_num
+        Ax_MT((idx_t-1)*m_MT+1:idx_t*m_MT)=As{idx_t}*x_MT(idx_row);
+        idx_row=idx_row+1;       % shift to the entries of the next task
+    end
+% elseif (opts.nFlag==1)
+%     invNu=x./nu; mu_invNu=mu * invNu;
+%     Ax=A*invNu -repmat(mu_invNu, m, 1);
+% else
+%     Ax=A*x-repmat(mu*x, m, 1); Ax=Ax./nu;
+end
+
+if (opts.init==0)
+    % ------ The initFactor function is not available here
+    %
+    % If .init=0, we would set x=ratio*x by "initFactor";
+    % please refer to the function initFactor for details.
+    %
+    % Here, we only support starting from zero, due to the complex tree
+    % structure.
+
+    x_MT=zeros(n_MT*T_num,1);
+end
+
+%% The main program
+
+% The Armijo-Goldstein line search scheme + accelerated gradient descent
+
+bFlag_MT=0; % this flag tests whether the gradient step only changes x a little
+
+if (opts.mFlag==0 && opts.lFlag==0)
+    L=1;
+    % We assume that the maximum eigenvalue of A'A is over 1
+
+    % assign xp with x, and Axp with Ax
+    xp_MT=x_MT; Axp_MT=Ax_MT; xxp_MT=zeros(n_MT*T_num,1);
+
+    alphap=0; alpha=1;
+
+    % stack the responses once; they do not change over the iterations
+    y_all=vertcat(ys{:});
+
+    for iterStep=1:opts.maxIter
+        % --------------------------- step 1 ---------------------------
+        % compute the search point s based on xp and x (with beta)
+        beta=(alphap-1)/alpha;    s_MT=x_MT + beta*xxp_MT;
+
+        % --------------------------- step 2 ---------------------------
+        % line search for L and compute the new approximate solution x
+
+        % compute A*s at the search point s
+        % (As_MT is a vector; it is unrelated to the cell array As)
+        As_MT=Ax_MT + beta*(Ax_MT-Axp_MT);
+
+        % compute AT As
+        if (opts.nFlag==0)
+            %ATAs=A'*As;
+            ATAs_MT=zeros(n_MT*T_num,1);
+            idx_row=(0:n_MT-1)*T_num+1;
+            for idx_t=1:T_num
+                ATAs_MT(idx_row)=As{idx_t}'*As_MT((idx_t-1)*m_MT+1:idx_t*m_MT);
+                idx_row=idx_row+1;
+            end
+        % elseif (opts.nFlag==1)
+        %     ATAs=A'*As - sum(As) * mu'; ATAs=ATAs./nu;
+        % else
+        %     invNu=As./nu; ATAs=A'*invNu-sum(invNu)*mu';
+        end
+
+        % obtain the gradient g
+        g_MT=ATAs_MT-AsTys_MT;
+
+        % copy x and Ax to xp and Axp
+        xp_MT=x_MT;    Axp_MT=Ax_MT;
+
+        while (1)
+            % take a step from s in the direction of the negative
+            % gradient to get v, and then do the tree-structured projection
+            v_MT=s_MT-g_MT/L;
+
+            % tree overlapping group Lasso projection
+            ind_work_MT(1:2,:)=ind_MT(1:2,:);
+            ind_work_MT(3,:)=ind_MT(3,:) * (lambda/L);
+
+            if (GFlag_MT==0)
+                x_MT=altra(v_MT, n_MT*T_num, ind_work_MT, size(ind_work_MT,2));
+            else
+                x_MT=general_altra(v_MT, n_MT*T_num, G_MT, ind_work_MT, size(ind_work_MT,2));
+            end
+
+            v_MT=x_MT-s_MT; % the difference between the new approximate
+                            % solution x and the search point s
+
+            % compute A x
+            if (opts.nFlag==0)
+                %Ax=A*x;
+                Ax_MT=zeros(m_MT*T_num,1);
+                idx_row=1:T_num:n_MT*T_num;
+                for idx_t=1:T_num
+                    Ax_MT((idx_t-1)*m_MT+1:idx_t*m_MT)=As{idx_t}*x_MT(idx_row);
+                    idx_row=idx_row+1;
+                end
+            % elseif (opts.nFlag==1)
+            %     invNu=x./nu; mu_invNu=mu * invNu;
+            %     Ax=A*invNu -repmat(mu_invNu, m, 1);
+            % else
+            %     Ax=A*x-repmat(mu*x, m, 1); Ax=Ax./nu;
+            end
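+
+            % Note (added for clarity): the acceptance test below is the
+            % sufficient-decrease condition of the Armijo-Goldstein scheme.
+            % For the least squares loss, the condition
+            %     f(x) <= f(s) + <g, x-s> + (L/2)*||x-s||^2
+            % reduces exactly to ||A(x-s)||^2 <= L*||x-s||^2, i.e.
+            % l_sum_MT <= L * r_sum_MT, because
+            %     f(x) - f(s) - <g, x-s> = (1/2)*||A(x-s)||^2.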
+            Av_MT=Ax_MT-As_MT;
+            r_sum_MT=v_MT'*v_MT;    l_sum_MT=Av_MT'*Av_MT;
+
+            if (r_sum_MT<=1e-20)
+                bFlag_MT=1; % the gradient step makes little improvement
+                break;
+            end
+
+            % the condition is ||Av||_2^2 <= L * ||v||_2^2
+            if (l_sum_MT <= r_sum_MT*L)
+                break;
+            else
+                L=max(2*L, l_sum_MT/r_sum_MT);
+                %fprintf('\n L=%5.6f',L);
+            end
+        end
+
+        % --------------------------- step 3 ---------------------------
+        % update alpha and alphap, and check for convergence
+        alphap=alpha;    alpha=(1+sqrt(4*alpha*alpha+1))/2;
+
+        xxp_MT=x_MT-xp_MT;
+        Axy_MT=Ax_MT-y_all;
+
+        ValueL_MT(iterStep)=L;
+
+        % compute the regularization part
+        if (GFlag_MT==0)
+            tree_norm_MT=treeNorm(x_MT, n_MT*T_num, ind_MT, size(ind_MT,2));
+        else
+            tree_norm_MT=general_treeNorm(x_MT, n_MT*T_num, G_MT, ind_MT, size(ind_MT,2));
+        end
+
+        % function value = loss + regularization
+        funVal(iterStep)=Axy_MT'*Axy_MT/2 + lambda*tree_norm_MT;
+
+        if (bFlag_MT)
+            % fprintf('\n The program terminates as the gradient step changes the solution very little.');
+            break;
+        end
+
+        switch(opts.tFlag)
+            case 0
+                if iterStep>=2
+                    if (abs( funVal(iterStep) - funVal(iterStep-1) ) <= opts.tol)
+                        break;
+                    end
+                end
+            case 1
+                if iterStep>=2
+                    if (abs( funVal(iterStep) - funVal(iterStep-1) ) <=...
+                            opts.tol* funVal(iterStep-1))
+                        break;
+                    end
+                end
+            case 2
+                if ( funVal(iterStep)<= opts.tol)
+                    break;
+                end
+            case 3
+                norm_xxp_MT=sqrt(xxp_MT'*xxp_MT);
+                if ( norm_xxp_MT <=opts.tol)
+                    break;
+                end
+            case 4
+                norm_xp_MT=sqrt(xp_MT'*xp_MT);    norm_xxp_MT=sqrt(xxp_MT'*xxp_MT);
+                if ( norm_xxp_MT <=opts.tol * max(norm_xp_MT,1))
+                    break;
+                end
+            case 5
+                if iterStep>=opts.maxIter
+                    break;
+                end
+        end
+
+        % restart the program every opts.rStartNum iterations
+        if (~mod(iterStep, opts.rStartNum))
+            alphap=0; alpha=1;
+            xp_MT=x_MT; Axp_MT=Ax_MT; xxp_MT=zeros(n_MT*T_num,1); L=L/2;
+        end
+    end
+else
+    error('\n This function only supports opts.mFlag=0 and opts.lFlag=0!');
+end