|
exseek/scripts/feature_selection.py
#! /usr/bin/env python
import argparse, sys, os, errno
import logging
logging.basicConfig(level=logging.INFO, format='[%(asctime)s] [%(levelname)s] %(name)s: %(message)s')

command_handlers = {}

def command_handler(f):
    '''Register a function as a subcommand handler, keyed by its name'''
    command_handlers[f.__name__] = f
    return f

# note: np, pd and logger are module-level globals bound in the __main__ block below
def select_samples_by_class(matrix, sample_classes, positive_class=None, negative_class=None):
    '''Select positive and negative samples from a feature matrix

    Args:
        matrix:
            pandas DataFrame: [n_samples, n_features]
        sample_classes:
            pandas Series. Index are sample ids. Values are sample classes.
        positive_class, negative_class:
            comma-separated class names. If omitted, the two unique classes
            found in sample_classes are used.
    Returns:
        X: pandas DataFrame of selected samples
        y: ndarray of labels (0 for positive-class samples, 1 for negative-class samples)
    '''
    if (positive_class is not None) and (negative_class is not None):
        positive_class = positive_class.split(',')
        negative_class = negative_class.split(',')
    else:
        unique_classes = np.unique(sample_classes.values)
        if len(unique_classes) != 2:
            raise ValueError('expected 2 classes but found {} classes'.format(len(unique_classes)))
        positive_class, negative_class = unique_classes
    positive_class = np.atleast_1d(positive_class)
    negative_class = np.atleast_1d(negative_class)

    logger.info('positive class: {}, negative class: {}'.format(positive_class, negative_class))
    X_pos = matrix.loc[sample_classes[sample_classes.isin(positive_class)].index.values]
    X_neg = matrix.loc[sample_classes[sample_classes.isin(negative_class)].index.values]
    logger.info('number of positive samples: {}, negative samples: {}, class ratio: {}'.format(
        X_pos.shape[0], X_neg.shape[0], float(X_pos.shape[0])/X_neg.shape[0]))
    X = pd.concat([X_pos, X_neg], axis=0)
    y = np.zeros(X.shape[0], dtype=np.int32)
    y[X_pos.shape[0]:] = 1
    del X_pos
    del X_neg

    return X, y
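
# A minimal usage sketch of select_samples_by_class (hypothetical data, for
# illustration only):
#   >>> mat = pd.DataFrame(np.random.rand(4, 2), index=['s1', 's2', 's3', 's4'])
#   >>> classes = pd.Series(['tumor', 'normal', 'tumor', 'normal'],
#   ...                     index=['s1', 's2', 's3', 's4'])
#   >>> X, y = select_samples_by_class(mat, classes, positive_class='tumor',
#   ...                                negative_class='normal')
#   >>> y   # positive samples come first (label 0), then negative (label 1)
#   array([0, 0, 1, 1], dtype=int32)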
|
|
@command_handler
def preprocess_features(args):
    import numpy as np
    import pandas as pd
    from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, MaxAbsScaler

    logger.info('read feature matrix: ' + args.matrix)
    X = pd.read_table(args.matrix, index_col=0, sep='\t')
    if args.transpose:
        logger.info('transpose feature matrix')
        X = X.T
    logger.info('{} samples, {} features'.format(X.shape[0], X.shape[1]))
    if args.remove_zero_features is not None:
        logger.info('remove features with zero fraction larger than {}'.format(args.remove_zero_features))
        X = X.loc[:, ~(np.isclose(X, 0).sum(axis=0) > (X.shape[0]*args.remove_zero_features))]
    if args.rpkm_top is not None:
        logger.info('select top {} features ranked by RPKM'.format(args.rpkm_top))
        feature_info = X.columns.to_series().str.split('|', expand=True)
        feature_info.columns = ['gene_id', 'gene_type', 'gene_name', 'feature_id', 'transcript_id', 'start', 'end']
        feature_info['start'] = feature_info['start'].astype('int')
        feature_info['end'] = feature_info['end'].astype('int')
        feature_info['length'] = feature_info['end'] - feature_info['start']
        # reads per kilobase: scale each feature by 1000/length
        rpkm = 1e3*X.div(feature_info['length'], axis=1)
        # geometric mean across samples, with a pseudocount to tolerate zeros
        mean_rpkm = np.exp(np.log(rpkm + 0.01).mean(axis=0)) - 0.01
        features_select = mean_rpkm.sort_values(ascending=False)[:args.rpkm_top].index.values
        X = X.loc[:, features_select]
    elif args.expr_top is not None:
        logger.info('select top {} features ranked by raw expression value'.format(args.expr_top))
        mean_expr = np.exp(np.log(X + 0.01).mean(axis=0)) - 0.01
        features_select = mean_expr.sort_values(ascending=False)[:args.expr_top].index.values
        X = X.loc[:, features_select]

    feature_names = X.columns.values
    logger.info('{} samples, {} features'.format(X.shape[0], X.shape[1]))
    logger.info('samples: {} ...'.format(str(X.index.values[:3])))
    logger.info('features: {} ...'.format(str(X.columns.values[:3])))

    n_samples, n_features = X.shape
    sample_ids = X.index.values

    if args.use_log:
        logger.info('apply log2 to feature matrix')
        X = np.log2(X + 0.001)

    if args.scaler == 'zscore':
        logger.info('scale features using z-score normalization')
        X = StandardScaler().fit_transform(X)
    elif args.scaler == 'robust':
        logger.info('scale features using robust normalization')
        X = RobustScaler().fit_transform(X)
    elif args.scaler == 'min_max':
        logger.info('scale features using min-max normalization')
        X = MinMaxScaler().fit_transform(X)
    elif args.scaler == 'max_abs':
        logger.info('scale features using max-abs normalization')
        X = MaxAbsScaler().fit_transform(X)

    X = pd.DataFrame(X, index=sample_ids, columns=feature_names)
    X.index.name = 'sample'
    X.to_csv(args.output_file, sep='\t', header=True, index=True, na_rep='NA')
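
# Example invocation (hypothetical file names, for illustration only):
#   python feature_selection.py preprocess_features \
#       --matrix counts.txt --remove-zero-features 0.8 \
#       --rpkm-top 5000 --use-log --scaler zscore \
#       --output-file features.txt
# (--rpkm-top assumes feature names follow the 7-field
#  'gene_id|gene_type|gene_name|feature_id|transcript_id|start|end' convention
#  parsed above)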
|
|
@command_handler
def evaluate(args):
    import numpy as np
    import pandas as pd
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.svm import LinearSVC
    from sklearn.metrics import roc_auc_score, accuracy_score
    from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, MaxAbsScaler
    from sklearn.model_selection import GridSearchCV
    from sklearn.feature_selection import RFE, RFECV
    from sklearn.utils.class_weight import compute_sample_weight
    from sklearn.model_selection import KFold, StratifiedKFold, ShuffleSplit, LeaveOneOut, \
        RepeatedKFold, RepeatedStratifiedKFold, StratifiedShuffleSplit
    import pickle
    from estimators import RobustEstimator
    from tqdm import tqdm
    import h5py

    logger.info('read feature matrix: ' + args.matrix)
    m = pd.read_table(args.matrix, index_col=0, sep='\t')
    feature_names = m.columns.values
    logger.info('{} samples, {} features'.format(m.shape[0], m.shape[1]))
    logger.info('samples: {} ...'.format(str(m.index.values[:3])))
    logger.info('features: {} ...'.format(str(m.columns.values[:3])))

    logger.info('read sample classes: ' + args.sample_classes)
    sample_classes = pd.read_table(args.sample_classes, index_col=0, sep='\t')
    sample_classes = sample_classes.iloc[:, 0]
    sample_classes = sample_classes.loc[m.index.values]
    logger.info('sample_classes: {}'.format(sample_classes.shape[0]))

    # select samples
    if (args.positive_class is not None) and (args.negative_class is not None):
        positive_class = args.positive_class.split(',')
        negative_class = args.negative_class.split(',')
    else:
        unique_classes = np.unique(sample_classes.values)
        if len(unique_classes) != 2:
            raise ValueError('expected 2 classes but found {} classes'.format(len(unique_classes)))
        positive_class, negative_class = unique_classes
    positive_class = np.atleast_1d(positive_class)
    negative_class = np.atleast_1d(negative_class)

    logger.info('positive class: {}, negative class: {}'.format(positive_class, negative_class))
    X_pos = m.loc[sample_classes[sample_classes.isin(positive_class)].index.values]
    X_neg = m.loc[sample_classes[sample_classes.isin(negative_class)].index.values]
    logger.info('number of positive samples: {}, negative samples: {}, class ratio: {}'.format(
        X_pos.shape[0], X_neg.shape[0], float(X_pos.shape[0])/X_neg.shape[0]))
    X = pd.concat([X_pos, X_neg], axis=0)
    y = np.zeros(X.shape[0], dtype=np.int32)
    y[X_pos.shape[0]:] = 1
    del X_pos
    del X_neg
    n_samples, n_features = X.shape
    sample_ids = X.index.values

    if not os.path.isdir(args.output_dir):
        logger.info('create output directory: ' + args.output_dir)
        os.makedirs(args.output_dir)

    logger.info('save sample ids')
    X.index.to_series().to_csv(os.path.join(args.output_dir, 'samples.txt'),
        sep='\t', header=False, index=False)
    logger.info('save sample classes')
    np.savetxt(os.path.join(args.output_dir, 'classes.txt'), y, fmt='%d')

    # get numpy array from DataFrame
    X = X.values

    # check NaN values
    if np.any(np.isnan(X)):
        logger.warning('NaN values found in features')
    estimator = None
    grid_search = None
    logger.info('use {} to select features'.format(args.method))
    if args.method == 'logistic_regression':
        estimator = LogisticRegression()
        grid_search = {'C': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 1e2, 1e3, 1e4, 1e5]}
    elif args.method == 'random_forest':
        estimator = RandomForestClassifier()
        grid_search = {'n_estimators': [25, 50, 75], 'max_depth': list(range(2, 8))}
    elif args.method == 'linear_svm':
        estimator = LinearSVC()
        grid_search = {'C': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 1e2, 1e3, 1e4, 1e5]}
    else:
        raise ValueError('unknown feature selection method: {}'.format(args.method))

    def get_splitter(splitter, n_splits=5, n_repeats=5, test_size=0.2):
        if splitter == 'kfold':
            return KFold(n_splits=n_splits)
        elif splitter == 'stratified_kfold':
            return StratifiedKFold(n_splits=n_splits)
        elif splitter == 'repeated_stratified_kfold':
            return RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats)
        elif splitter == 'shuffle_split':
            return ShuffleSplit(n_splits=n_splits, test_size=test_size)
        elif splitter == 'stratified_shuffle_split':
            return StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size)
        elif splitter == 'leave_one_out':
            return LeaveOneOut()
        else:
            raise ValueError('unknown splitter: {}'.format(splitter))
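
    # For example, get_splitter('stratified_kfold', n_splits=5) yields 5
    # (train, test) index pairs with class proportions preserved in every fold;
    # get_splitter('leave_one_out') yields one split per sample.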
|
|
    def score_function(estimator):
        '''Get a method of an estimator that predicts a continuous score for each sample

        The returned callable always produces a 1-D array of scores, whether the
        estimator exposes predict_proba() (the probability of label 1 is taken)
        or only decision_function().
        '''
        if hasattr(estimator, 'predict_proba'):
            return lambda X: estimator.predict_proba(X)[:, 1]
        elif hasattr(estimator, 'decision_function'):
            return estimator.decision_function
        else:
            raise ValueError('the estimator should have either a decision_function() method or a predict_proba() method')

    def feature_importances(estimator):
        '''Get the feature importance attribute of an estimator'''
        if hasattr(estimator, 'coef_'):
            return np.ravel(estimator.coef_)
        elif hasattr(estimator, 'feature_importances_'):
            return np.ravel(estimator.feature_importances_)
        else:
            raise ValueError('the estimator should have either a coef_ or a feature_importances_ attribute')

    def get_scorer(scoring):
        if scoring == 'roc_auc':
            return roc_auc_score
        else:
            raise ValueError('unknown scoring: {}'.format(scoring))
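
    # Only ROC-AUC is wired up above; any metric with the (y_true, y_score)
    # signature could be added here, e.g. (an untested sketch):
    #   elif scoring == 'average_precision':
    #       from sklearn.metrics import average_precision_score
    #       return average_precision_score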
|
|
    splitter = get_splitter(args.splitter, n_splits=args.n_splits, n_repeats=args.n_repeats)
    metrics = []
    # predictions[i, j]: continuous score of sample j from the model trained in split i
    predictions = np.full((splitter.get_n_splits(X), X.shape[0]), np.nan)
    predicted_labels = np.full((splitter.get_n_splits(X), X.shape[0]), np.nan)
    train_index_matrix = np.zeros((splitter.get_n_splits(X), X.shape[0]), dtype=bool)
    feature_selection_matrix = None
    if args.n_select is not None:
        feature_selection_matrix = np.zeros((splitter.get_n_splits(X), X.shape[1]), dtype=bool)
    if args.rfe:
        if args.rfe_step is None:
            # default to eliminating one feature per step
            rfe_step = 1
        elif 0.0 < args.rfe_step < 1.0:
            # a fraction: e.g. with 1000 features and --rfe-step 0.1,
            # eliminate 100 features per step
            rfe_step = int(max(1, args.rfe_step*n_features))
        else:
            rfe_step = int(args.rfe_step)
        rfe_scores = None
    i_split = 0
    scorer = get_scorer(args.scorer)
    data_splits = list(splitter.split(X, y))
    # append a final pseudo-split that trains on all samples (no test set)
    data_splits.append((np.arange(n_samples), None))
    for train_index, test_index in tqdm(data_splits, total=splitter.get_n_splits(X) + 1, unit='fold'):
        X_train, y_train = X[train_index], y[train_index]
        if test_index is not None:
            X_test, y_test = X[test_index], y[test_index]
        # optimize hyper-parameters
        if grid_search is not None:
            cv = GridSearchCV(estimator, grid_search, cv=5)
            cv.fit(X_train, y_train)
            estimator = cv.best_estimator_

        sample_weight = np.ones(X_train.shape[0])
        if args.compute_sample_weight:
            sample_weight = compute_sample_weight('balanced', y_train)
        # feature selection
        if args.n_select is not None:
            if args.robust_select:
                resampler_args = {}
                if args.robust_resample_method == 'jackknife':
                    resampler_args = {'max_runs': args.robust_max_runs,
                        'remove': args.robust_jackknife_remove
                    }
                elif args.robust_resample_method == 'bootstrap':
                    resampler_args = {'max_runs': args.robust_max_runs}
                robust_estimator = RobustEstimator(estimator, n_select=args.n_select,
                    resample_method=args.robust_resample_method,
                    rfe=args.rfe, **resampler_args)
                robust_estimator.fit(X_train, y_train, sample_weight=sample_weight)
                estimator = robust_estimator.estimator_
                features = robust_estimator.features_
            # RFE feature selection
            elif args.rfe:
                rfe = RFE(estimator, n_features_to_select=args.n_select, step=rfe_step)
                if i_split < splitter.get_n_splits(X):
                    if args.splitter == 'leave_one_out':
                        # AUC is undefined for only one test sample
                        step_score = lambda estimator, features: np.nan
                    else:
                        step_score = lambda estimator, features: scorer(y_test,
                            score_function(estimator)(X[test_index][:, features]))
                else:
                    step_score = None
                # note: RFE._fit() is a private scikit-learn API; it is used here
                # to record a score after each elimination step
                rfe._fit(X_train, y_train, step_score=step_score)
                features = np.nonzero(rfe.ranking_ == 1)[0]
                if i_split < splitter.get_n_splits(X):
                    if rfe_scores is None:
                        rfe_n_steps = len(rfe.scores_)
                        rfe_n_features_step = np.maximum(n_features - rfe_step*np.arange(rfe_n_steps), 1)
                        rfe_scores = np.zeros((splitter.get_n_splits(X), rfe_n_steps))
                    rfe_scores[i_split] = rfe.scores_
                estimator = rfe.estimator_
            # no robust selection and no RFE: rank features by model importance
            else:
                # train the model
                estimator.fit(X_train, y_train, sample_weight=sample_weight)
                features = np.argsort(-feature_importances(estimator))[:args.n_select]
            if i_split < splitter.get_n_splits(X):
                feature_selection_matrix[i_split, features] = True
        else:
            # no feature selection
            features = np.arange(n_features, dtype=np.int64)

        estimator.fit(X_train[:, features], y_train, sample_weight=sample_weight)
        if i_split != splitter.get_n_splits(X):
            predictions[i_split] = score_function(estimator)(X[:, features])
            predicted_labels[i_split] = estimator.predict(X[:, features])
            metric = {}
            metric['train_{}'.format(args.scorer)] = scorer(y_train, predictions[i_split, train_index])
            # AUC is undefined for only one test sample
            if args.splitter != 'leave_one_out':
                metric['test_{}'.format(args.scorer)] = scorer(y_test, predictions[i_split, test_index])
            if args.splitter in ('repeated_kfold', 'repeated_stratified_kfold'):
                # splits are generated repeat by repeat, n_splits per repeat
                metric['repeat'] = i_split//args.n_splits
                metric['split'] = i_split%args.n_splits
            else:
                metric['split'] = i_split
            metrics.append(metric)
            train_index_matrix[i_split, train_index] = True
        i_split += 1
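
    # Bookkeeping illustration (hypothetical numbers): with stratified 5-fold
    # CV on 100 samples, `predictions` is a (5, 100) array holding each model's
    # scores for all samples (training and test), `train_index_matrix` marks the
    # ~80 training samples of each fold, and the extra final iteration only
    # refits the estimator on all samples without recording predictions.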
|
|
    metrics = pd.DataFrame.from_records(metrics)
    if args.splitter == 'leave_one_out':
        # take the diagonal: the held-out prediction of each sample
        metrics['test_{}'.format(args.scorer)] = scorer(y, predictions[np.r_[:n_samples], np.r_[:n_samples]])

    logger.info('save best model')
    with open(os.path.join(args.output_dir, 'best_model.pkl'), 'wb') as f:
        pickle.dump(estimator, f)

    logger.info('save features')
    data = pd.Series(features, index=feature_names[features])
    data.to_csv(os.path.join(args.output_dir, 'features.txt'), sep='\t', header=False)

    logger.info('save feature importances')
    data = pd.Series(feature_importances(estimator), index=feature_names[features])
    data.to_csv(os.path.join(args.output_dir, 'feature_importances.txt'), sep='\t', header=False)

    logger.info('save evaluations')
    with h5py.File(os.path.join(args.output_dir, 'evaluation.{}.h5'.format(args.splitter)), 'w') as f:
        f.create_dataset('train_index', data=train_index_matrix)
        f.create_dataset('predictions', data=predictions)
        if feature_selection_matrix is not None:
            f.create_dataset('feature_selection', data=feature_selection_matrix)
        if args.rfe:
            f.create_dataset('rfe_n_features_step', data=rfe_n_features_step)
            f.create_dataset('rfe_scores', data=rfe_scores)
        f.create_dataset('labels', data=y)
        f.create_dataset('predicted_labels', data=predicted_labels)

    logger.info('save metrics')
    metrics.to_csv(os.path.join(args.output_dir, 'metrics.txt'), sep='\t', header=True, index=False)
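
# Example invocation (hypothetical file names, for illustration only):
#   python feature_selection.py evaluate \
#       --matrix features.txt --sample-classes classes.txt \
#       --positive-class tumor --negative-class normal \
#       --method logistic_regression --splitter stratified_kfold \
#       --n-splits 5 --rfe --rfe-step 0.1 --n-select 10 \
#       --output-dir output/evaluate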
|
|
@command_handler
def calculate_clustering_score(args):
    import numpy as np
    import pandas as pd
    from evaluation import uca_score, knn_score
    from ioutils import open_file_or_stdout

    logger.info('read feature matrix: ' + args.matrix)
    X = pd.read_table(args.matrix, index_col=0, sep='\t')

    if args.transpose:
        logger.info('transpose feature matrix')
        X = X.T
    if args.use_log:
        logger.info('apply log2 to feature matrix')
        X = np.log2(X + 0.25)

    logger.info('calculate clustering score')
    if args.method == 'uca_score':
        if args.sample_classes is None:
            raise ValueError('argument --sample-classes is required for uca_score')
        logger.info('read sample classes: ' + args.sample_classes)
        sample_classes = pd.read_table(args.sample_classes, index_col=0, sep='\t').iloc[:, 0]
        y = sample_classes[X.index.values].values
        score = uca_score(X, y)
    elif args.method == 'knn_score':
        if args.batch is None:
            raise ValueError('argument --batch is required for knn_score')
        if args.batch_index is None:
            raise ValueError('argument --batch-index is required for knn_score')
        logger.info('read batch information: ' + args.batch)
        # --batch-index is 1-based; convert to a 0-based column index
        batch = pd.read_table(args.batch, index_col=0, sep='\t').iloc[:, args.batch_index - 1]
        batch = batch[X.index.values].values
        score = knn_score(X, batch)
    else:
        raise ValueError('unknown clustering score method: ' + args.method)
    with open_file_or_stdout(args.output_file) as fout:
        fout.write('{}'.format(score))
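
# Example invocation (hypothetical file names, for illustration only):
#   python feature_selection.py calculate_clustering_score \
#       --matrix features.txt --sample-classes classes.txt \
#       --method uca_score --use-log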
|
|
@command_handler
def evaluate_single_features(args):
    import numpy as np
    import pandas as pd
    from tqdm import tqdm
    from sklearn.metrics import roc_auc_score

    logger.info('read feature matrix: ' + args.matrix)
    matrix = pd.read_table(args.matrix, index_col=0, sep='\t')
    logger.info('read sample classes: ' + args.sample_classes)
    sample_classes = pd.read_table(args.sample_classes, index_col=0, sep='\t').iloc[:, 0]
    if args.transpose:
        logger.info('transpose feature matrix')
        matrix = matrix.T
    logger.info('select positive and negative samples')
    X, y = select_samples_by_class(matrix, sample_classes,
        positive_class=args.positive_class,
        negative_class=args.negative_class)
    n_samples, n_features = X.shape
    logger.info('evaluate single features')
    scorers = [('roc_auc', roc_auc_score)]
    scores = np.zeros((n_features, len(scorers)))
    for i in tqdm(range(n_features), unit='feature'):
        for j in range(len(scorers)):
            scores[i, j] = scorers[j][1](y, X.iloc[:, i])
            # features can be predictive in either direction:
            # if AUC < 0.5, use 1 - AUC
            scores[i, j] = max(scores[i, j], 1 - scores[i, j])
    scores = pd.DataFrame(scores, index=X.columns.values, columns=[name for name, scorer in scorers])
    scores.index.name = 'feature'
    logger.info('write scores to file: ' + args.output_file)
    scores.to_csv(args.output_file, sep='\t', index=True, header=True, na_rep='NA')
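
# Example invocation (hypothetical file names, for illustration only):
#   python feature_selection.py evaluate_single_features \
#       --matrix features.txt --sample-classes classes.txt \
#       --positive-class tumor --negative-class normal \
#       --output-file single_feature_auc.txt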
|
|
if __name__ == '__main__':
    main_parser = argparse.ArgumentParser(description='Feature selection module')
    subparsers = main_parser.add_subparsers(dest='command')

    parser = subparsers.add_parser('preprocess_features')
    parser.add_argument('--matrix', '-i', type=str, required=True,
        help='input feature matrix (rows are samples and columns are features)')
    parser.add_argument('--use-log', action='store_true',
        help='apply log2 to feature matrix')
    parser.add_argument('--transpose', action='store_true', default=False,
        help='transpose the feature matrix')
    parser.add_argument('--scaler', type=str,
        choices=('zscore', 'robust', 'max_abs', 'min_max', 'none'),
        help='method for scaling features')
    parser.add_argument('--output-file', '-o', type=str, required=True,
        help='output file name')
    parser.add_argument('--remove-zero-features', type=float,
        help='remove features whose fraction of zero values is above this value')
    parser.add_argument('--rpkm-top', type=int,
        help='maximum number of top features to select, ranked by RPKM')
    parser.add_argument('--expr-top', type=int,
        help='maximum number of top features to select, ranked by raw expression value')

    parser = subparsers.add_parser('evaluate')
    parser.add_argument('--matrix', '-i', type=str, required=True,
        help='input feature matrix (rows are samples and columns are features)')
    parser.add_argument('--sample-classes', type=str, required=True,
        help='input file containing sample classes with 2 columns: sample_id, sample_class')
    parser.add_argument('--positive-class', type=str,
        help='comma-separated list of sample classes to use as positive class')
    parser.add_argument('--negative-class', type=str,
        help='comma-separated list of sample classes to use as negative class')
    parser.add_argument('--method', type=str, default='logistic_regression',
        choices=('logistic_regression', 'random_forest', 'linear_svm'),
        help='feature selection method')
    parser.add_argument('--output-dir', '-o', type=str, required=True,
        help='output directory')
    parser.add_argument('--compute-sample-weight', action='store_true',
        help='compute sample weight to balance classes')
    parser.add_argument('--splitter', type=str, default='stratified_kfold',
        choices=('kfold', 'stratified_kfold', 'shuffle_split', 'repeated_stratified_kfold', 'leave_one_out', 'stratified_shuffle_split'))
    parser.add_argument('--rfe', action='store_true', default=False,
        help='use RFE to select features')
    parser.add_argument('--rfe-step', type=float,
        help='number/fraction of features to eliminate in each step')
    parser.add_argument('--n-select', type=int,
        help='number of features to select')
    parser.add_argument('--n-splits', type=int, default=5,
        help='number of splits for kfold, stratified_kfold and shuffle_split')
    parser.add_argument('--n-repeats', type=int, default=10,
        help='number of repeats for repeated_stratified_kfold and repeated_kfold')
    parser.add_argument('--test-size', type=float, default=0.2,
        help='fraction/number of samples for testing')
    parser.add_argument('--scorer', type=str, default='roc_auc',
        help='metric to use')
    parser.add_argument('--robust-select', action='store_true', default=False,
        help='use robust feature selection by selecting recurring features across resampling runs')
    parser.add_argument('--robust-resample-method', type=str, default='jackknife',
        choices=('bootstrap', 'jackknife'), help='resampling method for robust feature selection')
    parser.add_argument('--robust-max-runs', type=int, default=50,
        help='number of resampling runs for robust feature selection')
    parser.add_argument('--robust-jackknife-remove', type=float, default=1,
        help='number/fraction of samples to remove during each resampling run for robust feature selection')
    parser.add_argument('--remove-zero-features', type=float,
        help='remove features whose fraction of zero values is above this value')

    parser = subparsers.add_parser('calculate_clustering_score',
        help='evaluate a normalized matrix by clustering score')
    parser.add_argument('--matrix', '-i', type=str, required=True,
        help='input feature matrix (rows are samples and columns are features)')
    parser.add_argument('--sample-classes', type=str, required=False,
        help='input file containing sample classes with 2 columns: sample_id, sample_class')
    parser.add_argument('--batch', type=str, required=False,
        help='batch information')
    parser.add_argument('--batch-index', type=int,
        help='column index (1-based) to use in the batch information table')
    parser.add_argument('--output-file', '-o', type=str, default='-',
        help='output file for the score')
    parser.add_argument('--transpose', action='store_true', default=False,
        help='transpose the feature matrix')
    parser.add_argument('--method', type=str, required=True, choices=('uca_score', 'knn_score'),
        help='score method')
    parser.add_argument('--use-log', action='store_true',
        help='apply log2 to feature matrix')

    parser = subparsers.add_parser('evaluate_single_features')
    parser.add_argument('--matrix', '-i', type=str, required=True)
    parser.add_argument('--sample-classes', type=str, required=True,
        help='input file containing sample classes with 2 columns: sample_id, sample_class')
    parser.add_argument('--positive-class', type=str,
        help='comma-separated list of sample classes to use as positive class')
    parser.add_argument('--negative-class', type=str,
        help='comma-separated list of sample classes to use as negative class')
    parser.add_argument('--transpose', action='store_true', default=False,
        help='transpose the feature matrix')
    parser.add_argument('--output-file', '-o', type=str, required=True,
        help='output file for the scores')

    args = main_parser.parse_args()
    if args.command is None:
        print('Error: missing command', file=sys.stderr)
        main_parser.print_help()
        sys.exit(1)
    logger = logging.getLogger('feature_selection.' + args.command)

    # global imports shared by all command handlers
    import numpy as np
    import pandas as pd

    command_handlers.get(args.command)(args)
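
# Subcommands registered via @command_handler are dispatched above; run, e.g.
#   python feature_selection.py evaluate --help
# to list the options of a single subcommand.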