Switch to unified view

a b/matlab_source/netics_fun.m
1
function [ ranked_list_genes, scores ] = netics_fun( varargin )
2
3
% - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -  
4
%   INPUTS:
5
%   filenameMu(varargin{1}):         input file that contains the genetically aberrant genes of each sample. 
6
%                                    It contains two columns that map every gene (1st column) to the samples that 
7
%                                    it is genetically aberrant (2nd column)
8
%   adj(varargin{2}):                adjacency matrix of the directed interaction network
9
%   restart_prob(varargin{3}):       restart probability for the insulated diffusion. For (Wu et al., 2010) network use 0.4.
10
%   rank_method_str(varargin{4}):    'MEDIAN' uses the median of the sample-specific ranks
11
%                                    'RRA' uses the Robust Rank Aggregation method to integrate sample-specific ranked lists
12
%                                    'SUM' uses the sum of the sample-specific ranks
13
%   filenameNet(varargin{5}):        input file that contains the list of the genes that are present in the network. 
14
%                                    They should be in the same order as in the rows of the adjacency matrix adj. 
15
%                                    An example file is given that contains the gene names of the network described in (Wu et al, 2010).
16
%   filenameRNA(varargin{6}):        tab delimited file with two columns. First column contains the genes for which differential expression 
17
%                                    between the tumor and normal samples at the RNA level was measured. Second column contains the p-values 
18
%                                    of these measurements. This file can be the result of a tool for differential expression analysis such as DESeq2. 
19
%                                    Each gene in this file should have only one entry.
20
%   filenamePR(varargin{7}):         tab delimited file with two columns. First column contains the proteins for which differential expression between 
21
%                                    the tumor and normal samples at the protein level was measured. Second column contains the p-values of these 
22
%                                    measurements. Each gene in this file should have only one entry.
23
% - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -  
24
%   OUTPUTS:
25
%   ranked_list_genes:  list with the ranked gene names
26
%   scores:             list with the scores of the ranked gene names
27
% - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -  
28
29
    filenameRNA = '';
30
    filenamePR = '';
31
    ranked_list_genes={};
32
33
    % Accept arguments    
34
    if nargin < 5,
35
        error('Error: Missing input arguments.')
36
    elseif nargin > 7,
37
        error('Error: Too many input arguments.')
38
    else            
39
        mutation_data = read_mutations( varargin{1} );
40
        adj = varargin{2};
41
        restart_prob = varargin{3};
42
        rank_method_str = varargin{4};
43
        filenameNet = varargin{5};
44
        choose_mut_diff = 1;
45
    end
46
    if nargin >= 6,            
47
        filenameRNA = varargin{6};
48
        choose_mut_diff = 3;        
49
    end
50
    if nargin >= 7,
51
        filenamePR = varargin{7};
52
    end
53
54
    %read network genes
55
    fidnet = fopen(filenameNet,'r');
56
    g = textscan(fidnet, '%s', 'delimiter', '\n');
57
    network_genes = g{1};
58
    fclose(fidnet);
59
    
60
    %Extract first token
61
    rank_method_str1 = rank_method_str(1:strfind(rank_method_str,'='));
62
    
63
    %Handle input
64
    rank_method_str = rank_method_str(strfind(rank_method_str,'=')+1:end);
65
    
66
    %Check input - Token 1
67
    if ~strcmp(rank_method_str1,'RANK_AGGREG='),
68
        disp(strcat('Wrong input: ',rank_method_str1));
69
        disp('Input should be RANK_AGGREG=');       
70
        return; 
71
    end
72
73
    %Check input - Token 2
74
    if  ~strcmp(rank_method_str,'SUM') && ~strcmp(rank_method_str,'MEDIAN') && ~strcmp(rank_method_str,'RRA'),
75
        disp(strcat('Wrong input: ',rank_method_str));
76
        disp(strcat('Input should be MEDIAN, RRA or SUM'));
77
        return;     
78
    end
79
80
    %copy data from input mutation_data 
81
    samples = {};
82
    for i = 1:length(mutation_data),
83
        samples.DNA{i} = upper(mutation_data{i});
84
        samples.DNA_network{i} = upper(intersect(network_genes,samples.DNA{i}));
85
    end
86
    clear mutation_data;
87
88
    %read differentially expressed genes
89
    diff_expr_genes = {};
90
    if choose_mut_diff~=1,
91
        diff_expr_genes = read_diff_expr( filenameRNA, filenamePR );
92
        if isempty(diff_expr_genes),
93
            choose_mut_diff=1;
94
        end
95
    else
96
        disp('No Input files were provided for differentially expressed genes.');        
97
    end
98
    for i = 1:length(samples.DNA),
99
        samples.RNA{i} = diff_expr_genes;
100
        samples.RNA_network{i} = upper(intersect(network_genes,samples.RNA{i}));
101
    end
102
103
    %compute diffused matrix
104
    disp('Computing diffused matrix...');
105
    F = insulated_diff(norm_adj(adj), restart_prob);
106
    F_opp = insulated_diff(norm_adj(adj'), restart_prob);    
107
    disp('Running NetICS...');
108
    [ranked_list_genes, scores] = prioritization( samples, F, F_opp, network_genes, choose_mut_diff, rank_method_str);
109
110
end
111
112
function [ mutation_data ] = read_mutations( filename )
113
    fid = fopen(filename,'r');
114
    g = textscan(fid, '%s%s', 'delimiter', '\t');
115
    fclose(fid);
116
    unique_samples = unique(g{2});
117
    mutation_data = cell(length(unique_samples),1);
118
    for i = 1:length(unique_samples),
119
        mutation_data{i} = g{1}(ismember(g{2},unique_samples{i}));
120
    end
121
end
122
123
function [ diff_expr_genes ] = read_diff_expr( filenameDE, filenamePR )
124
125
    diff_expr_genes ={}; RNA_names={}; rppa_names={};
126
127
    if ~isempty(filenamePR),
128
        [rppa_names, rppa_pval] = read_file(filenamePR);
129
    else
130
        disp('No Input file was provided for differentially expressed genes at the proteome level.');        
131
    end
132
133
    if ~isempty(filenameDE),
134
        [RNA_names, RNA_pval] = read_file(filenameDE);        
135
    else
136
        disp('No Input file was provided for differentially expressed genes at the RNA level.');        
137
    end
138
139
    %check for duplicate genes in the input files
140
    if (length(unique(RNA_names)) ~= length(RNA_names)) || (length(unique(rppa_names)) ~= length(rppa_names)),
141
        disp('Input files for differentially expressed genes should contain only one entry per gene.');
142
        disp('Input files for differentially expressed genes were ignored.');        
143
        return;         
144
    end
145
    %check if the input files for differential expression are empty    
146
    if isempty(rppa_names) && isempty(RNA_names),
147
        disp('Only genetically aberrant genes are used for diffusion.');
148
        return;
149
    %only differential expressed genes at the RNA level are provided
150
    elseif isempty(rppa_names) && ~isempty(RNA_names),
151
        [~,~,pval_all] = fdr(RNA_pval,0.05);
152
        diff_expr_genes = RNA_names(pval_all < 0.05);
153
    %only differential expressed genes at the protein level are provided        
154
    elseif isempty(RNA_names) && ~isempty(rppa_names),
155
        [~,~,pval_all] = fdr(rppa_pval,0.05);
156
        diff_expr_genes = rppa_names(pval_all < 0.05);
157
    else
158
        names_intersection = intersect(RNA_names,rppa_names);
159
        names_setdiff_rna = setdiff(RNA_names,rppa_names);
160
        names_setdiff_pr = setdiff(rppa_names,RNA_names);
161
162
        rna_pval_names_intersection = RNA_pval(ismember(RNA_names,names_intersection));
163
        rppa_pval_names_intersection = rppa_pval(ismember(rppa_names,names_intersection));
164
165
        rna_pval_names_setdiff_rna = RNA_pval(ismember(RNA_names,names_setdiff_rna));
166
        pr_pval_names_setdiff_pr = rppa_pval(ismember(rppa_names,names_setdiff_pr));
167
168
        all_names = [names_intersection' names_setdiff_rna' names_setdiff_pr'];
169
        %Fishers method
170
        pval_all = (1-pchisq(-2*(log(rna_pval_names_intersection)+log(rppa_pval_names_intersection)),4));
171
        pval_all = [pval_all' rna_pval_names_setdiff_rna' pr_pval_names_setdiff_pr'];    
172
173
        [~,~,pval_all] = fdr(pval_all,0.05);
174
        diff_expr_genes = all_names(pval_all < 0.05);
175
    end
176
end
177
178
function [names, pval] = read_file(filenamePR)
179
    fid = fopen(filenamePR,'r');
180
    g = textscan(fid, '%s%s', 'delimiter', '\t');
181
    fclose(fid);
182
    names = g{1}; pval = str2double(g{2}); 
183
end