exseek/scripts/feature_selection.py
#! /usr/bin/env python
import argparse, sys, os, errno
import logging
logging.basicConfig(level=logging.INFO, format='[%(asctime)s] [%(levelname)s] %(name)s: %(message)s')

command_handlers = {}
def command_handler(f):
    '''Register a function as a subcommand handler'''
    command_handlers[f.__name__] = f
    return f

def select_samples_by_class(matrix, sample_classes, positive_class=None, negative_class=None):
    '''Select positive and negative samples from a feature matrix

    Args:
        matrix:
            pandas DataFrame of shape [n_samples, n_features]
        sample_classes:
            pandas Series. Index are sample ids. Values are sample classes.
        positive_class, negative_class:
            comma-separated lists of class names. If omitted, the two unique
            values found in sample_classes are used.
    Returns:
        X: pandas DataFrame with positive samples first, then negative samples
        y: ndarray of int32 labels (0 for positive-class samples, 1 for negative-class samples)
    '''
    if (positive_class is not None) and (negative_class is not None):
        positive_class = positive_class.split(',')
        negative_class = negative_class.split(',')
    else:
        unique_classes = np.unique(sample_classes.values)
        if len(unique_classes) != 2:
            raise ValueError('expected 2 classes but {} classes found'.format(len(unique_classes)))
        positive_class, negative_class = unique_classes
    positive_class = np.atleast_1d(positive_class)
    negative_class = np.atleast_1d(negative_class)

    logger.info('positive class: {}, negative class: {}'.format(positive_class, negative_class))
    X_pos = matrix.loc[sample_classes[sample_classes.isin(positive_class)].index.values]
    X_neg = matrix.loc[sample_classes[sample_classes.isin(negative_class)].index.values]
    logger.info('number of positive samples: {}, negative samples: {}, class ratio: {}'.format(
        X_pos.shape[0], X_neg.shape[0], float(X_pos.shape[0])/X_neg.shape[0]))
    X = pd.concat([X_pos, X_neg], axis=0)
    y = np.zeros(X.shape[0], dtype=np.int32)
    y[X_pos.shape[0]:] = 1
    del X_pos
    del X_neg

    return X, y
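
# Example (hypothetical data; a minimal sketch of the returned labels, which shows
# that positive-class samples receive label 0 and negative-class samples label 1):
#   >>> mat = pd.DataFrame(np.random.rand(4, 2), index=['s1', 's2', 's3', 's4'])
#   >>> classes = pd.Series(['tumor', 'tumor', 'normal', 'normal'], index=mat.index)
#   >>> X, y = select_samples_by_class(mat, classes, positive_class='tumor', negative_class='normal')
#   >>> y
#   array([0, 0, 1, 1], dtype=int32)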

@command_handler
def preprocess_features(args):
    import numpy as np
    import pandas as pd
    from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, MaxAbsScaler

    logger.info('read feature matrix: ' + args.matrix)
    X = pd.read_table(args.matrix, index_col=0, sep='\t')
    if args.transpose:
        logger.info('transpose feature matrix')
        X = X.T
    logger.info('{} samples, {} features'.format(X.shape[0], X.shape[1]))
    if args.remove_zero_features is not None:
        logger.info('remove features with zero fraction larger than {}'.format(args.remove_zero_features))
        X = X.loc[:, ~(np.isclose(X, 0).sum(axis=0) > (X.shape[0]*args.remove_zero_features))]
    if args.rpkm_top is not None:
        logger.info('select top {} features ranked by RPKM'.format(args.rpkm_top))
        # feature names are expected to be '|'-separated fields:
        # gene_id|gene_type|gene_name|feature_id|transcript_id|start|end
        feature_info = X.columns.to_series().str.split('|', expand=True)
        feature_info.columns = ['gene_id', 'gene_type', 'gene_name', 'feature_id', 'transcript_id', 'start', 'end']
        feature_info['start'] = feature_info['start'].astype('int')
        feature_info['end'] = feature_info['end'].astype('int')
        feature_info['length'] = feature_info['end'] - feature_info['start']
        rpkm = 1e3*X.div(feature_info['length'], axis=1)
        # rank features by the geometric mean of RPKM across samples,
        # using a pseudo-count of 0.01 to tolerate zeros
        mean_rpkm = np.exp(np.log(rpkm + 0.01).mean(axis=0)) - 0.01
        features_select = mean_rpkm.sort_values(ascending=False)[:args.rpkm_top].index.values
        X = X.loc[:, features_select]
    elif args.expr_top is not None:
        logger.info('select top {} features ranked by raw expression value'.format(args.expr_top))
        # geometric mean of expression across samples, with a pseudo-count of 0.01
        mean_expr = np.exp(np.log(X + 0.01).mean(axis=0)) - 0.01
        features_select = mean_expr.sort_values(ascending=False)[:args.expr_top].index.values
        X = X.loc[:, features_select]

    feature_names = X.columns.values
    logger.info('{} samples, {} features'.format(X.shape[0], X.shape[1]))
    logger.info('samples: {} ...'.format(str(X.index.values[:3])))
    logger.info('features: {} ...'.format(str(X.columns.values[:3])))

    n_samples, n_features = X.shape
    sample_ids = X.index.values

    if args.use_log:
        logger.info('apply log2 to feature matrix')
        X = np.log2(X + 0.001)

    if args.scaler == 'zscore':
        logger.info('scale features using z-score normalization')
        X = StandardScaler().fit_transform(X)
    elif args.scaler == 'robust':
        logger.info('scale features using robust normalization')
        X = RobustScaler().fit_transform(X)
    elif args.scaler == 'min_max':
        logger.info('scale features using min-max normalization')
        X = MinMaxScaler().fit_transform(X)
    elif args.scaler == 'max_abs':
        logger.info('scale features using max-abs normalization')
        X = MaxAbsScaler().fit_transform(X)

    X = pd.DataFrame(X, index=sample_ids, columns=feature_names)
    X.index.name = 'sample'
    X.to_csv(args.output_file, sep='\t', header=True, index=True, na_rep='NA')
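
# Example invocation (a sketch; file names are placeholders):
#   python feature_selection.py preprocess_features -i count_matrix.txt \
#       --remove-zero-features 0.5 --rpkm-top 5000 --use-log --scaler zscore \
#       -o normalized_matrix.txt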

@command_handler
def evaluate(args):
    import numpy as np
    import pandas as pd
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.svm import LinearSVC
    from sklearn.metrics import roc_auc_score, accuracy_score
    from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, MaxAbsScaler
    from sklearn.model_selection import GridSearchCV
    from sklearn.feature_selection import RFE, RFECV
    from sklearn.utils.class_weight import compute_sample_weight
    from sklearn.model_selection import KFold, StratifiedKFold, ShuffleSplit, LeaveOneOut, \
        RepeatedKFold, RepeatedStratifiedKFold, StratifiedShuffleSplit
    import pickle
    from estimators import RobustEstimator
    from tqdm import tqdm
    import h5py

    logger.info('read feature matrix: ' + args.matrix)
    m = pd.read_table(args.matrix, index_col=0, sep='\t')
    feature_names = m.columns.values
    logger.info('{} samples, {} features'.format(m.shape[0], m.shape[1]))
    logger.info('samples: {} ...'.format(str(m.index.values[:3])))
    logger.info('features: {} ...'.format(str(m.columns.values[:3])))

    logger.info('read sample classes: ' + args.sample_classes)
    sample_classes = pd.read_table(args.sample_classes, index_col=0, sep='\t')
    sample_classes = sample_classes.iloc[:, 0]
    sample_classes = sample_classes.loc[m.index.values]
    logger.info('sample_classes: {}'.format(sample_classes.shape[0]))

    # select positive and negative samples
    X, y = select_samples_by_class(m, sample_classes,
        positive_class=args.positive_class,
        negative_class=args.negative_class)
    n_samples, n_features = X.shape
    sample_ids = X.index.values

    if not os.path.isdir(args.output_dir):
        logger.info('create output directory: ' + args.output_dir)
        os.makedirs(args.output_dir)

    logger.info('save sample ids')
    X.index.to_series().to_csv(os.path.join(args.output_dir, 'samples.txt'),
        sep='\t', header=False, index=False)
    logger.info('save sample classes')
    np.savetxt(os.path.join(args.output_dir, 'classes.txt'), y, fmt='%d')

    # get numpy array from DataFrame
    X = X.values

    # check NaN values
    if np.any(np.isnan(X)):
        logger.info('nan values found in features')
    estimator = None
    grid_search = None
    logger.info('use {} to select features'.format(args.method))
    if args.method == 'logistic_regression':
        estimator = LogisticRegression()
        grid_search = {'C': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 1e2, 1e3, 1e4, 1e5]}
    elif args.method == 'random_forest':
        estimator = RandomForestClassifier()
        grid_search = {'n_estimators': [25, 50, 75], 'max_depth': list(range(2, 8))}
    elif args.method == 'linear_svm':
        estimator = LinearSVC()
        grid_search = {'C': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 1e2, 1e3, 1e4, 1e5]}
    else:
        raise ValueError('unknown feature selection method: {}'.format(args.method))

    def get_splitter(splitter, n_splits=5, n_repeats=5, test_size=0.2):
        if splitter == 'kfold':
            return KFold(n_splits=n_splits)
        elif splitter == 'stratified_kfold':
            return StratifiedKFold(n_splits=n_splits)
        elif splitter == 'repeated_stratified_kfold':
            return RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats)
        elif splitter == 'shuffle_split':
            return ShuffleSplit(n_splits=n_splits, test_size=test_size)
        elif splitter == 'stratified_shuffle_split':
            return StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size)
        elif splitter == 'leave_one_out':
            return LeaveOneOut()
        else:
            raise ValueError('unknown splitter: {}'.format(splitter))
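
    # For example, repeated_stratified_kfold with --n-splits 5 and --n-repeats 10
    # yields splitter.get_n_splits() == 50 train/test partitions.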

    def score_function(estimator):
        '''Get a method of an estimator that predicts a continuous score for each sample

        The returned callable yields a 1D array of scores: the probability of
        label 1 for estimators with predict_proba(), or the decision value for
        estimators such as LinearSVC that only provide decision_function().
        '''
        if hasattr(estimator, 'predict_proba'):
            return lambda X: estimator.predict_proba(X)[:, 1]
        elif hasattr(estimator, 'decision_function'):
            return estimator.decision_function
        else:
            raise ValueError('the estimator should have either a predict_proba() method or a decision_function() method')

    def feature_importances(estimator):
        '''Get the feature importance attribute of an estimator'''
        if hasattr(estimator, 'coef_'):
            return np.ravel(estimator.coef_)
        elif hasattr(estimator, 'feature_importances_'):
            return np.ravel(estimator.feature_importances_)
        else:
            raise ValueError('the estimator should have either a coef_ or a feature_importances_ attribute')

    def get_scorer(scoring):
        if scoring == 'roc_auc':
            return roc_auc_score
        else:
            raise ValueError('unknown scoring: {}'.format(scoring))

    splitter = get_splitter(args.splitter, n_splits=args.n_splits, n_repeats=args.n_repeats,
        test_size=args.test_size)
    metrics = []
    predictions = np.full((splitter.get_n_splits(X), X.shape[0]), np.nan)
    predicted_labels = np.full((splitter.get_n_splits(X), X.shape[0]), np.nan)
    train_index_matrix = np.zeros((splitter.get_n_splits(X), X.shape[0]), dtype=bool)
    feature_selection_matrix = None
    if args.n_select is not None:
        feature_selection_matrix = np.zeros((splitter.get_n_splits(X), X.shape[1]), dtype=bool)
    if args.rfe:
        if args.rfe_step is None:
            # eliminate one feature per step by default (the same default as sklearn's RFE)
            rfe_step = 1
        elif 0.0 < args.rfe_step < 1.0:
            # a fraction is interpreted relative to the total number of features
            rfe_step = int(max(1, args.rfe_step*n_features))
        else:
            rfe_step = int(args.rfe_step)
        rfe_scores = None
    i_split = 0
    scorer = get_scorer(args.scorer)
    data_splits = list(splitter.split(X, y))
    # append an extra round that trains on all samples to produce the final model
    data_splits.append((np.arange(n_samples), None))
    for train_index, test_index in tqdm(data_splits, total=splitter.get_n_splits(X) + 1, unit='fold'):
        X_train, y_train = X[train_index], y[train_index]
        if test_index is not None:
            X_test, y_test = X[test_index], y[test_index]
        else:
            # the extra round trains on all samples and has no test set
            X_test, y_test = None, None
        # optimize hyper-parameters by grid search on the training set
        if grid_search is not None:
            cv = GridSearchCV(estimator, grid_search, cv=5)
            cv.fit(X_train, y_train)
            estimator = cv.best_estimator_

        sample_weight = np.ones(X_train.shape[0])
        if args.compute_sample_weight:
            sample_weight = compute_sample_weight('balanced', y_train)
        # feature selection
        if args.n_select is not None:
            if args.robust_select:
                resampler_args = {}
                if args.robust_resample_method == 'jackknife':
                    resampler_args = {'max_runs': args.robust_max_runs,
                        'remove': args.robust_jackknife_remove
                    }
                elif args.robust_resample_method == 'bootstrap':
                    resampler_args = {'max_runs': args.robust_max_runs}
                robust_estimator = RobustEstimator(estimator, n_select=args.n_select,
                    resample_method=args.robust_resample_method,
                    rfe=args.rfe, **resampler_args)
                robust_estimator.fit(X_train, y_train, sample_weight=sample_weight)
                estimator = robust_estimator.estimator_
                features = robust_estimator.features_
            # RFE feature selection
            elif args.rfe:
                rfe = RFE(estimator, n_features_to_select=args.n_select, step=rfe_step)
                if i_split < splitter.get_n_splits(X):
                    if args.splitter == 'leave_one_out':
                        # AUC is undefined for only one test sample
                        step_score = lambda estimator, features: np.nan
                    else:
                        step_score = lambda estimator, features: scorer(y_test,
                            score_function(estimator)(X[test_index][:, features]))
                else:
                    step_score = None
                # note: _fit() is a private sklearn API, used here to record a
                # test score after each elimination step
                rfe._fit(X_train, y_train, step_score=step_score)
                features = np.nonzero(rfe.ranking_ == 1)[0]
                if i_split < splitter.get_n_splits(X):
                    if rfe_scores is None:
                        rfe_n_steps = len(rfe.scores_)
                        rfe_n_features_step = np.maximum(n_features - rfe_step*np.arange(rfe_n_steps), 1)
                        rfe_scores = np.zeros((splitter.get_n_splits(X), rfe_n_steps))
                    rfe_scores[i_split] = rfe.scores_
                estimator = rfe.estimator_
            # feature selection by importance ranking
            else:
                # train the model and keep the features with the largest importances
                estimator.fit(X_train, y_train, sample_weight=sample_weight)
                features = np.argsort(-feature_importances(estimator))[:args.n_select]
            if i_split < splitter.get_n_splits(X):
                feature_selection_matrix[i_split, features] = True
        else:
            # no feature selection
            features = np.arange(n_features, dtype=np.int64)

        estimator.fit(X_train[:, features], y_train, sample_weight=sample_weight)
        if i_split != splitter.get_n_splits(X):
            # predict scores and labels for all samples using this fold's model
            predictions[i_split] = score_function(estimator)(X[:, features])
            predicted_labels[i_split] = estimator.predict(X[:, features])
            metric = {}
            metric['train_{}'.format(args.scorer)] = scorer(y_train, predictions[i_split, train_index])
            # AUC is undefined for only one test sample
            if args.splitter != 'leave_one_out':
                metric['test_{}'.format(args.scorer)] = scorer(y_test, predictions[i_split, test_index])
            if args.splitter in ('repeated_kfold', 'repeated_stratified_kfold'):
                metric['repeat'] = i_split//args.n_splits
                metric['split'] = i_split%args.n_splits
            else:
                metric['split'] = i_split
            metrics.append(metric)
            train_index_matrix[i_split, train_index] = True
        i_split += 1
    metrics = pd.DataFrame.from_records(metrics)
    if args.splitter == 'leave_one_out':
        # for leave-one-out, compute the test score once over all held-out
        # predictions (the diagonal of the prediction matrix)
        metrics['test_{}'.format(args.scorer)] = scorer(y, predictions[np.r_[:n_samples], np.r_[:n_samples]])

    logger.info('save best model')
    with open(os.path.join(args.output_dir, 'best_model.pkl'), 'wb') as f:
        pickle.dump(estimator, f)

    # the estimator and features from the final round are trained on all samples
    logger.info('save features')
    data = pd.Series(features, index=feature_names[features])
    data.to_csv(os.path.join(args.output_dir, 'features.txt'), sep='\t', header=False)

    logger.info('save feature importances')
    data = pd.Series(feature_importances(estimator), index=feature_names[features])
    data.to_csv(os.path.join(args.output_dir, 'feature_importances.txt'), sep='\t', header=False)

    logger.info('save evaluations')
    with h5py.File(os.path.join(args.output_dir, 'evaluation.{}.h5'.format(args.splitter)), 'w') as f:
        f.create_dataset('train_index', data=train_index_matrix)
        f.create_dataset('predictions', data=predictions)
        if feature_selection_matrix is not None:
            f.create_dataset('feature_selection', data=feature_selection_matrix)
        if args.rfe:
            f.create_dataset('rfe_n_features_step', data=rfe_n_features_step)
            f.create_dataset('rfe_scores', data=rfe_scores)
        f.create_dataset('labels', data=y)
        f.create_dataset('predicted_labels', data=predicted_labels)

    logger.info('save metrics')
    metrics.to_csv(os.path.join(args.output_dir, 'metrics.txt'), sep='\t', header=True, index=False)
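
# Reading the evaluation results back (a minimal sketch; the dataset names match the
# create_dataset() calls above, and 'stratified_kfold' stands for the chosen splitter):
#   >>> import h5py
#   >>> with h5py.File('output_dir/evaluation.stratified_kfold.h5', 'r') as f:
#   ...     predictions = f['predictions'][:]    # shape: (n_splits, n_samples)
#   ...     train_index = f['train_index'][:]    # boolean mask of training samples per split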

@command_handler
def calculate_clustering_score(args):
    import numpy as np
    import pandas as pd
    from evaluation import uca_score, knn_score
    from ioutils import open_file_or_stdout

    logger.info('read feature matrix: ' + args.matrix)
    X = pd.read_table(args.matrix, index_col=0, sep='\t')

    if args.transpose:
        logger.info('transpose feature matrix')
        X = X.T
    if args.use_log:
        logger.info('apply log2 to feature matrix')
        X = np.log2(X + 0.25)

    logger.info('calculate clustering score')
    if args.method == 'uca_score':
        if args.sample_classes is None:
            raise ValueError('argument --sample-classes is required for uca_score')
        logger.info('read sample classes: ' + args.sample_classes)
        sample_classes = pd.read_table(args.sample_classes, index_col=0, sep='\t').iloc[:, 0]
        y = sample_classes[X.index.values].values
        score = uca_score(X, y)
    elif args.method == 'knn_score':
        if args.batch is None:
            raise ValueError('argument --batch is required for knn_score')
        if args.batch_index is None:
            raise ValueError('argument --batch-index is required for knn_score')
        logger.info('read batch information: ' + args.batch)
        batch = pd.read_table(args.batch, index_col=0, sep='\t').iloc[:, args.batch_index - 1]
        batch = batch[X.index.values].values
        score = knn_score(X, batch)
    else:
        raise ValueError('unknown clustering score method: ' + args.method)
    with open_file_or_stdout(args.output_file) as fout:
        fout.write('{}'.format(score))
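
# Example invocation (a sketch; file names are placeholders, and '-o -' writes
# the score to stdout):
#   python feature_selection.py calculate_clustering_score -i normalized_matrix.txt \
#       --method uca_score --sample-classes sample_classes.txt -o -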

@command_handler
def evaluate_single_features(args):
    import numpy as np
    import pandas as pd
    from tqdm import tqdm
    from sklearn.metrics import roc_auc_score

    logger.info('read feature matrix: ' + args.matrix)
    matrix = pd.read_table(args.matrix, index_col=0, sep='\t')
    logger.info('read sample classes: ' + args.sample_classes)
    sample_classes = pd.read_table(args.sample_classes, index_col=0, sep='\t').iloc[:, 0]
    if args.transpose:
        logger.info('transpose feature matrix')
        matrix = matrix.T
    logger.info('select positive and negative samples')
    X, y = select_samples_by_class(matrix, sample_classes,
        positive_class=args.positive_class,
        negative_class=args.negative_class)
    n_samples, n_features = X.shape
    logger.info('evaluate single features')
    scorers = [('roc_auc', roc_auc_score)]
    scores = np.zeros((n_features, len(scorers)))
    for i in tqdm(range(n_features), unit='feature'):
        for j in range(len(scorers)):
            scores[i, j] = scorers[j][1](y, X.iloc[:, i])
            # the direction of a feature is unknown, so if AUC < 0.5, use 1 - AUC
            scores[i, j] = max(scores[i, j], 1 - scores[i, j])
    scores = pd.DataFrame(scores, index=X.columns.values, columns=[name for name, scorer in scorers])
    scores.index.name = 'feature'
    logger.info('write scores to file: ' + args.output_file)
    scores.to_csv(args.output_file, sep='\t', index=True, header=True, na_rep='NA')
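
# Example invocation (a sketch; file names and class names are placeholders):
#   python feature_selection.py evaluate_single_features -i count_matrix.txt \
#       --sample-classes sample_classes.txt --positive-class tumor \
#       --negative-class normal -o single_feature_auc.txt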

if __name__ == '__main__':
    main_parser = argparse.ArgumentParser(description='Feature selection module')
    subparsers = main_parser.add_subparsers(dest='command')

    parser = subparsers.add_parser('preprocess_features')
    parser.add_argument('--matrix', '-i', type=str, required=True,
        help='input feature matrix (rows are samples and columns are features)')
    parser.add_argument('--use-log', action='store_true',
        help='apply log2 to feature matrix')
    parser.add_argument('--transpose', action='store_true', default=False,
        help='transpose the feature matrix')
    parser.add_argument('--scaler', type=str,
        choices=('zscore', 'robust', 'max_abs', 'min_max', 'none'),
        help='method for scaling features')
    parser.add_argument('--output-file', '-o', type=str, required=True,
        help='output file name')
    parser.add_argument('--remove-zero-features', type=float,
        help='remove features that have a fraction of zero values above this value')
    parser.add_argument('--rpkm-top', type=int,
        help='maximum number of top features to select ranked by RPKM')
    parser.add_argument('--expr-top', type=int,
        help='maximum number of top features to select ranked by raw expression value')

    parser = subparsers.add_parser('evaluate')
    parser.add_argument('--matrix', '-i', type=str, required=True,
        help='input feature matrix (rows are samples and columns are features)')
    parser.add_argument('--sample-classes', type=str, required=True,
        help='input file containing sample classes with 2 columns: sample_id, sample_class')
    parser.add_argument('--positive-class', type=str,
        help='comma-separated list of sample classes to use as positive class')
    parser.add_argument('--negative-class', type=str,
        help='comma-separated list of sample classes to use as negative class')
    parser.add_argument('--method', type=str, default='logistic_regression',
        choices=('logistic_regression', 'random_forest', 'linear_svm'),
        help='feature selection method')
    parser.add_argument('--output-dir', '-o', type=str, required=True,
        help='output directory')
    parser.add_argument('--compute-sample-weight', action='store_true',
        help='compute sample weight to balance classes')
    parser.add_argument('--splitter', type=str, default='stratified_kfold',
        choices=('kfold', 'stratified_kfold', 'shuffle_split', 'repeated_stratified_kfold', 'leave_one_out', 'stratified_shuffle_split'))
    parser.add_argument('--rfe', action='store_true', default=False,
        help='use RFE to select features')
    parser.add_argument('--rfe-step', type=float,
        help='number/fraction of features to eliminate in each step')
    parser.add_argument('--n-select', type=int,
        help='number of features to select')
    parser.add_argument('--n-splits', type=int, default=5,
        help='number of splits for kfold, stratified_kfold and shuffle_split')
    parser.add_argument('--n-repeats', type=int, default=10,
        help='number of repeats for repeated_stratified_kfold and repeated_kfold')
    parser.add_argument('--test-size', type=float, default=0.2,
        help='fraction/number of samples for testing')
    parser.add_argument('--scorer', type=str, default='roc_auc',
        help='metric to use')
    parser.add_argument('--robust-select', action='store_true', default=False,
        help='use robust feature selection by selecting recurring features across resampling runs')
    parser.add_argument('--robust-resample-method', type=str, default='jackknife',
        choices=('bootstrap', 'jackknife'), help='resampling method for robust feature selection')
    parser.add_argument('--robust-max-runs', type=int, default=50,
        help='number of resampling runs for robust feature selection')
    parser.add_argument('--robust-jackknife-remove', type=float, default=1,
        help='number/fraction of samples to remove during each resampling run for robust feature selection')
    parser.add_argument('--remove-zero-features', type=float,
        help='remove features that have a fraction of zero values above this value')

    parser = subparsers.add_parser('calculate_clustering_score',
        help='evaluate a normalized matrix by clustering score')
    parser.add_argument('--matrix', '-i', type=str, required=True,
        help='input feature matrix (rows are samples and columns are features)')
    parser.add_argument('--sample-classes', type=str, required=False,
        help='input file containing sample classes with 2 columns: sample_id, sample_class')
    parser.add_argument('--batch', type=str, required=False,
        help='batch information')
    parser.add_argument('--batch-index', type=int,
        help='column index (1-based) to use in the batch information table')
    parser.add_argument('--output-file', '-o', type=str, default='-',
        help='output file for the score')
    parser.add_argument('--transpose', action='store_true', default=False,
        help='transpose the feature matrix')
    parser.add_argument('--method', type=str, required=True, choices=('uca_score', 'knn_score'),
        help='score method')
    parser.add_argument('--use-log', action='store_true',
        help='apply log2 to feature matrix')

    parser = subparsers.add_parser('evaluate_single_features')
    parser.add_argument('--matrix', '-i', type=str, required=True)
    parser.add_argument('--sample-classes', type=str, required=True,
        help='input file containing sample classes with 2 columns: sample_id, sample_class')
    parser.add_argument('--positive-class', type=str,
        help='comma-separated list of sample classes to use as positive class')
    parser.add_argument('--negative-class', type=str,
        help='comma-separated list of sample classes to use as negative class')
    parser.add_argument('--transpose', action='store_true', default=False,
        help='transpose the feature matrix')
    parser.add_argument('--output-file', '-o', type=str, required=True,
        help='output file for the score')

    args = main_parser.parse_args()
    if args.command is None:
        print('Error: missing command', file=sys.stderr)
        main_parser.print_help()
        sys.exit(1)
    logger = logging.getLogger('feature_selection.' + args.command)

    # global imports used by the command handlers
    import numpy as np
    import pandas as pd

    command_handlers.get(args.command)(args)