[11a160]: / matlab_source / netics_fun.m

Download this file

184 lines (160 with data), 8.1 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
function [ ranked_list_genes, scores ] = netics_fun( varargin )
% - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
% INPUTS:
% filenameMu(varargin{1}): input file that contains the genetically aberrant genes of each sample.
% It contains two columns that map every gene (1st column) to the samples that
% it is genetically aberrant (2nd column)
% adj(varargin{2}): adjacency matrix of the directed interaction network
% restart_prob(varargin{3}): restart probability for the insulated diffusion. For (Wu et al., 2010) network use 0.4.
% rank_method_str(varargin{4}): 'MEDIAN' uses the median of the sample-specific ranks
% 'RRA' uses the Robust Rank Aggregation method to integrate sample-specific ranked lists
% 'SUM' uses the sum of the sample-specific ranks
% filenameNet(varargin{5}): input file that contains the list of the genes that are present in the network.
% They should be in the same order as in the rows of the adjacency matrix adj.
% An example file is given that contains the gene names of the network described in (Wu et al, 2010).
% filenameRNA(varargin{6}): tab delimited file with two columns. First column contains the genes for which differential expression
% between the tumor and normal samples at the RNA level was measured. Second column contains the p-values
% of these measurements. This file can be the result of a tool for differential expression analysis such as DESeq2.
% Each gene in this file should have only one entry.
% filenamePR(varargin{7}): tab delimited file with two columns. First column contains the proteins for which differential expression between
% the tumor and normal samples at the protein level was measured. Second column contains the p-values of these
% measurements. Each gene in this file should have only one entry.
% - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
% OUTPUTS:
% ranked_list_genes: list with the ranked gene names
% scores: list with the scores of the ranked gene names
% - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
filenameRNA = '';
filenamePR = '';
ranked_list_genes={};
% Accept arguments
if nargin < 5,
error('Error: Missing input arguments.')
elseif nargin > 7,
error('Error: Too many input arguments.')
else
mutation_data = read_mutations( varargin{1} );
adj = varargin{2};
restart_prob = varargin{3};
rank_method_str = varargin{4};
filenameNet = varargin{5};
choose_mut_diff = 1;
end
if nargin >= 6,
filenameRNA = varargin{6};
choose_mut_diff = 3;
end
if nargin >= 7,
filenamePR = varargin{7};
end
%read network genes
fidnet = fopen(filenameNet,'r');
g = textscan(fidnet, '%s', 'delimiter', '\n');
network_genes = g{1};
fclose(fidnet);
%Extract first token
rank_method_str1 = rank_method_str(1:strfind(rank_method_str,'='));
%Handle input
rank_method_str = rank_method_str(strfind(rank_method_str,'=')+1:end);
%Check input - Token 1
if ~strcmp(rank_method_str1,'RANK_AGGREG='),
disp(strcat('Wrong input: ',rank_method_str1));
disp('Input should be RANK_AGGREG=');
return;
end
%Check input - Token 2
if ~strcmp(rank_method_str,'SUM') && ~strcmp(rank_method_str,'MEDIAN') && ~strcmp(rank_method_str,'RRA'),
disp(strcat('Wrong input: ',rank_method_str));
disp(strcat('Input should be MEDIAN, RRA or SUM'));
return;
end
%copy data from input mutation_data
samples = {};
for i = 1:length(mutation_data),
samples.DNA{i} = upper(mutation_data{i});
samples.DNA_network{i} = upper(intersect(network_genes,samples.DNA{i}));
end
clear mutation_data;
%read differentially expressed genes
diff_expr_genes = {};
if choose_mut_diff~=1,
diff_expr_genes = read_diff_expr( filenameRNA, filenamePR );
if isempty(diff_expr_genes),
choose_mut_diff=1;
end
else
disp('No Input files were provided for differentially expressed genes.');
end
for i = 1:length(samples.DNA),
samples.RNA{i} = diff_expr_genes;
samples.RNA_network{i} = upper(intersect(network_genes,samples.RNA{i}));
end
%compute diffused matrix
disp('Computing diffused matrix...');
F = insulated_diff(norm_adj(adj), restart_prob);
F_opp = insulated_diff(norm_adj(adj'), restart_prob);
disp('Running NetICS...');
[ranked_list_genes, scores] = prioritization( samples, F, F_opp, network_genes, choose_mut_diff, rank_method_str);
end
function [ mutation_data ] = read_mutations( filename )
fid = fopen(filename,'r');
g = textscan(fid, '%s%s', 'delimiter', '\t');
fclose(fid);
unique_samples = unique(g{2});
mutation_data = cell(length(unique_samples),1);
for i = 1:length(unique_samples),
mutation_data{i} = g{1}(ismember(g{2},unique_samples{i}));
end
end
function [ diff_expr_genes ] = read_diff_expr( filenameDE, filenamePR )
diff_expr_genes ={}; RNA_names={}; rppa_names={};
if ~isempty(filenamePR),
[rppa_names, rppa_pval] = read_file(filenamePR);
else
disp('No Input file was provided for differentially expressed genes at the proteome level.');
end
if ~isempty(filenameDE),
[RNA_names, RNA_pval] = read_file(filenameDE);
else
disp('No Input file was provided for differentially expressed genes at the RNA level.');
end
%check for duplicate genes in the input files
if (length(unique(RNA_names)) ~= length(RNA_names)) || (length(unique(rppa_names)) ~= length(rppa_names)),
disp('Input files for differentially expressed genes should contain only one entry per gene.');
disp('Input files for differentially expressed genes were ignored.');
return;
end
%check if the input files for differential expression are empty
if isempty(rppa_names) && isempty(RNA_names),
disp('Only genetically aberrant genes are used for diffusion.');
return;
%only differential expressed genes at the RNA level are provided
elseif isempty(rppa_names) && ~isempty(RNA_names),
[~,~,pval_all] = fdr(RNA_pval,0.05);
diff_expr_genes = RNA_names(pval_all < 0.05);
%only differential expressed genes at the protein level are provided
elseif isempty(RNA_names) && ~isempty(rppa_names),
[~,~,pval_all] = fdr(rppa_pval,0.05);
diff_expr_genes = rppa_names(pval_all < 0.05);
else
names_intersection = intersect(RNA_names,rppa_names);
names_setdiff_rna = setdiff(RNA_names,rppa_names);
names_setdiff_pr = setdiff(rppa_names,RNA_names);
rna_pval_names_intersection = RNA_pval(ismember(RNA_names,names_intersection));
rppa_pval_names_intersection = rppa_pval(ismember(rppa_names,names_intersection));
rna_pval_names_setdiff_rna = RNA_pval(ismember(RNA_names,names_setdiff_rna));
pr_pval_names_setdiff_pr = rppa_pval(ismember(rppa_names,names_setdiff_pr));
all_names = [names_intersection' names_setdiff_rna' names_setdiff_pr'];
%Fishers method
pval_all = (1-pchisq(-2*(log(rna_pval_names_intersection)+log(rppa_pval_names_intersection)),4));
pval_all = [pval_all' rna_pval_names_setdiff_rna' pr_pval_names_setdiff_pr'];
[~,~,pval_all] = fdr(pval_all,0.05);
diff_expr_genes = all_names(pval_all < 0.05);
end
end
function [names, pval] = read_file(filenamePR)
fid = fopen(filenamePR,'r');
g = textscan(fid, '%s%s', 'delimiter', '\t');
fclose(fid);
names = g{1}; pval = str2double(g{2});
end