# exseek/scripts/estimators.py
import os
import json
import pickle
import logging

import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns
sns.set()
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import roc_auc_score, accuracy_score, roc_curve
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, MaxAbsScaler
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE
from scipy.cluster import hierarchy
from scipy.spatial.distance import pdist, squareform
from tqdm import tqdm

logging.basicConfig(level=logging.INFO, format='[%(asctime)s] [%(levelname)s] %(name)s: %(message)s')

def bootstrap(*arrays):
    '''Perform bootstrap resampling

    Parameters
    ----------
    arrays: list of array-like objects
        Data to resample. Bootstrap resampling is performed on the first dimension of each array

    Returns
    -------
    arrays: tuple of array-like objects
        Resampled data
    '''
    n_samples = arrays[0].shape[0]
    if not all(n_samples == a.shape[0] for a in arrays):
        raise ValueError('the first dimension of all arrays should be equal')
    indices = np.random.randint(n_samples, size=n_samples)
    # return a tuple rather than a generator so the result matches the docstring
    return tuple(np.take(a, indices, axis=0) for a in arrays)

def jackknife(*arrays, remove=1, indices=None):
    '''Perform jackknife resampling

    Parameters
    ----------
    arrays: list of array-like objects
        Data to resample. Jackknife resampling is performed on the first dimension of each array

    remove: int or float
        If `remove` is an integer, remove that number of samples.
        If `remove` is a float, remove that fraction of samples.

    indices: array-like
        Indices of samples to remove. If given, the `remove` parameter is ignored.

    Returns
    -------
    arrays: tuple of array-like objects
        Resampled data
    '''
    n_samples = arrays[0].shape[0]
    if not all(n_samples == a.shape[0] for a in arrays):
        raise ValueError('the first dimension of all arrays should be equal')
    if indices is None:
        if remove < 1:
            n_remove = round(remove*n_samples)
        else:
            n_remove = remove
        indices = np.random.choice(n_samples, replace=False, size=n_remove)
    return tuple(np.delete(a, indices, axis=0) for a in arrays)
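
# A minimal usage sketch (hypothetical arrays): both resamplers draw a single
# set of row indices and apply it to every input, so sample/label pairing is
# preserved across arrays.
#
#     X = np.random.randn(10, 5)
#     y = np.random.randint(2, size=10)
#     X_b, y_b = bootstrap(X, y)            # 10 rows sampled with replacement
#     X_j, y_j = jackknife(X, y, remove=2)  # 8 rows left, 2 removed at random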

class RobustEstimator(BaseEstimator, ClassifierMixin):
    '''A wrapper to select robust features based on feature recurrence

    Parameters
    ----------
    estimator: sklearn.base.BaseEstimator object
        Base estimator for feature selection

    n_select: int
        Number of features to select

    resample_method: str, choices: ('jackknife', 'bootstrap')
        Resampling method

    remove: float or int
        Fraction (float, 0.0-1.0) or number (int, >= 1) of samples to remove for each round of resampling

    max_runs: int
        Maximum rounds of resampling

    recurring_fraction: float
        Minimum required fraction of resampling runs in which a feature must recur to be selected

    grid_search: dict or None
        Parameter grid for optimizing hyper-parameters using GridSearchCV

    rfe: bool
        Whether to use RFE to select features rather than selecting the features with highest importance during each run

    Attributes
    ----------
    feature_recurrence_: array-like, shape (n_features,)
        Number of resampling rounds in which each feature is retained

    features_: array-like, shape (n_select,)
        Indices of selected features

    feature_selection_matrix_: array-like, shape (max_runs, n_select), dtype int
        Indicator matrix for selected features during each run

    feature_importances_matrix_: array-like, shape (max_runs, n_features)
        Feature importance during each resampling run

    feature_importances_mean_: array-like, shape (n_features,)
        Feature importances averaged across resampling runs

    feature_importances_std_: array-like, shape (n_features,)
        Standard deviation of feature importances across resampling runs

    estimator_: sklearn.base.BaseEstimator object
        Estimator fitted on all data using selected features

    feature_importances_: array-like, shape (n_select,)
        Feature importances of the estimator refitted on all data using selected features
    '''
    def __init__(self, estimator, n_select=None, resample_method='jackknife',
            remove=1, max_runs=50, recurring_fraction=0.5,
            grid_search=None, rfe=False):
        self.estimator = estimator
        self.remove = remove
        self.max_runs = max_runs
        self.n_select = n_select
        self.resample_method = resample_method
        self.recurring_fraction = recurring_fraction
        self.grid_search = grid_search
        self.rfe = rfe

    def fit(self, X, y, sample_weight=None):
        n_samples, n_features = X.shape
        if self.remove == 1:
            max_runs = n_samples
        else:
            max_runs = self.max_runs
        n_select = self.n_select
        if n_select is None:
            n_select = n_features
        feature_rank_matrix = np.zeros((max_runs, n_features))
        feature_importances_matrix = np.zeros((max_runs, n_features))
        if sample_weight is None:
            sample_weight = np.ones(n_samples)
        for i_run in range(max_runs):
            if self.resample_method == 'bootstrap':
                X_, y_, sample_weight_ = bootstrap(X, y, sample_weight)
            elif self.resample_method == 'jackknife':
                indices_remove = None
                if self.remove == 1:
                    # leave-one-out: remove each sample exactly once
                    indices_remove = i_run
                X_, y_, sample_weight_ = jackknife(X, y, sample_weight,
                    remove=self.remove, indices=indices_remove)
            else:
                raise ValueError('unknown resample method: {}'.format(self.resample_method))
            if self.grid_search is not None:
                cv = GridSearchCV(self.estimator, param_grid=self.grid_search, cv=3)
                cv.fit(X_, y_, sample_weight=sample_weight_)
                self.estimator = cv.best_estimator_
            else:
                self.estimator.fit(X_, y_, sample_weight=sample_weight_)
            if self.rfe:
                rfe = RFE(self.estimator, n_features_to_select=n_select, step=0.1)
                rfe.fit(X_, y_)
                feature_importances_matrix[i_run] = rfe.support_.astype('float')
                feature_rank_matrix[i_run] = rfe.ranking_
            else:
                if hasattr(self.estimator, 'coef_'):
                    feature_importances = np.square(self.estimator.coef_.flatten())
                else:
                    feature_importances = self.estimator.feature_importances_
                feature_importances_matrix[i_run] = feature_importances
                feature_orders = np.argsort(-feature_importances)
                feature_rank_matrix[i_run, feature_orders] = np.arange(n_features)
        if self.rfe:
            # RFE assigns rank 1 to selected features
            feature_selection_matrix = (feature_rank_matrix == 1).astype(np.int32)
        else:
            # ranks are 0-based, so the n_select top-ranked features are selected
            feature_selection_matrix = (feature_rank_matrix < n_select).astype(np.int32)
        feature_recurrence = np.sum(feature_selection_matrix, axis=0)
        #robust_features = np.nonzero(feature_recurrence > round(n_samples)*self.recurring_fraction)[0]
        #robust_features = robust_features[np.argsort(feature_recurrence[robust_features])][::-1]
        robust_features = np.argsort(-feature_recurrence)[:n_select]
        feature_selection_matrix = feature_selection_matrix[:, robust_features]

        # refit the estimator on all samples using only the selected features
        if self.grid_search is not None:
            cv = GridSearchCV(self.estimator, param_grid=self.grid_search, cv=3)
            cv.fit(X[:, robust_features], y, sample_weight=sample_weight)
            self.estimator = cv.best_estimator_
        else:
            self.estimator.fit(X[:, robust_features], y, sample_weight=sample_weight)

        # save attributes
        self.feature_recurrence_ = feature_recurrence
        self.features_ = robust_features
        self.feature_selection_matrix_ = feature_selection_matrix
        self.feature_importances_matrix_ = feature_importances_matrix
        self.feature_importances_mean_ = np.mean(feature_importances_matrix, axis=0)
        self.feature_importances_std_ = np.std(feature_importances_matrix, axis=0)
        self.estimator_ = self.estimator
        if hasattr(self.estimator, 'coef_'):
            self.feature_importances_ = np.square(self.estimator.coef_.flatten())
        else:
            self.feature_importances_ = self.estimator.feature_importances_

        return self
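
    # How the recurrence selection above works, in brief (illustrative numbers):
    # with max_runs=3 and n_select=2, per-run 0-based ranks like
    #     run 0: [0, 2, 1]  -> features 0 and 2 selected (rank < 2)
    #     run 1: [1, 0, 2]  -> features 0 and 1 selected
    #     run 2: [0, 1, 2]  -> features 0 and 1 selected
    # give recurrence counts [3, 2, 1], so features 0 and 1 are kept.
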
    def predict(self, X):
        # predict with the refitted estimator on the selected feature subset
        return self.estimator_.predict(X[:, self.features_])

    def score(self, X, y):
        return self.estimator_.score(X[:, self.features_], y)
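
# A minimal usage sketch (hypothetical data): wrap a linear model and keep the
# 10 features that recur most often across leave-one-out jackknife runs.
#
#     X = np.random.randn(50, 100)
#     y = np.random.randint(2, size=50)
#     selector = RobustEstimator(LogisticRegression(), n_select=10,
#                                resample_method='jackknife', remove=1)
#     selector.fit(X, y)
#     selector.features_            # indices of the 10 selected features
#     selector.feature_recurrence_  # per-feature selection counts across runs
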
def define_step(inputs=None, outputs=None):
    '''Decorate a pipeline step to check required attributes before and after running it'''
    inputs = inputs if inputs is not None else ()
    outputs = outputs if outputs is not None else ()

    def wrapper_generator(f):
        def wrapper(self, *args, **kwargs):
            for input in inputs:
                if not hasattr(self, input):
                    raise ValueError('missing input "{}" for step "{}"'.format(input, f.__name__))
            r = f(self, *args, **kwargs)
            for output in outputs:
                if not hasattr(self, output):
                    raise ValueError('missing output "{}" for step "{}"'.format(output, f.__name__))
            return r
        return wrapper
    return wrapper_generator
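
# Sketch of the contract enforced by define_step: a step declares the
# attributes it reads and the attributes it must set, and the wrapper raises
# a ValueError naming the step if either is missing.
#
#     class Pipeline(object):
#         @define_step(inputs=['raw'], outputs=['clean'])
#         def preprocess(self):
#             self.clean = self.raw.strip()
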
class FeatureSelection(object):
    def __init__(self, matrix,
            sample_classes,
            output_dir,
            remove_zero_features=None,
            top_features_by_median=None,
            transpose=False,
            positive_class=None,
            negative_class=None,
            use_log=False,
            scaler=None,
            compute_sample_weight=False,
            method='logistic_regression',
            rfe=False,
            n_select=None,
            resample_method='jackknife',
            jackknife_max_runs=100,
            jackknife_remove=0.2,
            bootstrap_max_runs=100,
            **kwargs):
        self.matrix_file = matrix
        self.sample_classes_file = sample_classes
        self.output_dir = output_dir
        self.remove_zero_features = remove_zero_features
        self.top_features_by_median = top_features_by_median
        self.transpose = transpose
        self.positive_class = positive_class
        self.negative_class = negative_class
        self.use_log = use_log
        self.scaler = scaler
        self.compute_sample_weight = compute_sample_weight
        self.rfe = rfe
        self.method = method
        self.n_select = n_select
        self.resample_method = resample_method
        self.jackknife_max_runs = jackknife_max_runs
        self.jackknife_remove = jackknife_remove
        self.bootstrap_max_runs = bootstrap_max_runs
        self.logger = logging.getLogger('FeatureSelection')

        if not os.path.exists(self.output_dir):
            self.logger.info('create output directory: ' + self.output_dir)
            os.makedirs(self.output_dir)

    @define_step(inputs=['matrix_file'], outputs=['X', 'sample_classes'])
    def read_data(self):
        self.X = pd.read_table(self.matrix_file, index_col=0)
        if self.transpose:
            self.logger.info('transpose feature matrix')
            self.X = self.X.T
        self.logger.info('{} samples, {} features'.format(*self.X.shape))
        self.logger.info('read sample classes: ' + self.sample_classes_file)
        self.sample_classes = pd.read_table(self.sample_classes_file, header=None,
            names=['sample_id', 'sample_class'], index_col=0)
        self.sample_classes = self.sample_classes.iloc[:, 0]

    @define_step(inputs=['X'], outputs=['feature_names', 'n_features'])
    def filter_features(self):
        if self.remove_zero_features is not None:
            # keep features that are zero in fewer than the given fraction of samples
            self.X = self.X.loc[:, np.isclose(self.X, 0).sum(axis=0) < (self.X.shape[0]*self.remove_zero_features)]
        if self.top_features_by_median is not None:
            # rank features by the geometric mean of counts over nonzero samples
            nonzero_samples = (self.X > 0).sum(axis=0)
            counts_geomean = np.exp(np.sum(np.log(np.maximum(self.X, 1)), axis=0)/nonzero_samples)
            self.X = self.X.loc[:, counts_geomean.sort_values(ascending=False)[:self.top_features_by_median].index.values]
        self.feature_names = self.X.columns.values
        self.n_features = self.X.shape[1]
        self.logger.info('{} features after filtering'.format(self.n_features))
        self.logger.info('features: {} ...'.format(str(self.feature_names[:3])))

    @define_step(inputs=['X', 'positive_class', 'negative_class'],
        outputs=['X', 'y', 'n_samples', 'sample_ids', 'X_raw'])
    def select_samples(self):
        if (self.positive_class is not None) and (self.negative_class is not None):
            self.positive_class = self.positive_class.split(',')
            self.negative_class = self.negative_class.split(',')
        else:
            unique_classes = np.unique(self.sample_classes.values)
            if len(unique_classes) != 2:
                raise ValueError('expect 2 classes but {} classes found'.format(len(unique_classes)))
            self.positive_class, self.negative_class = unique_classes
        self.positive_class = np.atleast_1d(self.positive_class)
        self.negative_class = np.atleast_1d(self.negative_class)

        self.logger.info('positive class: {}, negative class: {}'.format(self.positive_class, self.negative_class))
        X_pos = self.X.loc[self.sample_classes[self.sample_classes.isin(self.positive_class)].index.values]
        X_neg = self.X.loc[self.sample_classes[self.sample_classes.isin(self.negative_class)].index.values]
        self.logger.info('number of positive samples: {}, negative samples: {}, class ratio: {}'.format(
            X_pos.shape[0], X_neg.shape[0], float(X_pos.shape[0])/X_neg.shape[0]))
        self.X = pd.concat([X_pos, X_neg], axis=0)
        # label positive samples 1 and negative samples 0
        self.y = np.zeros(self.X.shape[0], dtype=np.int32)
        self.y[:X_pos.shape[0]] = 1
        self.X_raw = self.X
        self.n_samples = self.X.shape[0]
        self.sample_ids = self.X.index.values

    @define_step(inputs=['X', 'use_log', 'scaler'], outputs=['X'])
    def scale_features(self):
        if self.use_log:
            self.logger.info('apply log2 to feature matrix')
            self.X = np.log2(self.X + 1)

        if self.scaler == 'zscore':
            scaler = StandardScaler()
        elif self.scaler == 'robust':
            scaler = RobustScaler()
        elif self.scaler == 'min_max':
            scaler = MinMaxScaler()
        elif self.scaler == 'max_abs':
            scaler = MaxAbsScaler()
        else:
            raise ValueError('unknown scaler: {}'.format(self.scaler))
        self.logger.info('scale features using {}'.format(self.scaler))
        self.X = scaler.fit_transform(self.X)

    @define_step(inputs=['X', 'method', 'resample_method'],
        outputs=['estimator', 'robust_estimator', 'compute_sample_weight'])
    def train_model(self):
        if np.any(np.isnan(self.X)):
            self.logger.info('nan values found in features')
        self.estimator = None
        grid_search = None
        self.logger.info('use {} to select features'.format(self.method))
        if self.method == 't_test':
            # TTestEstimator is expected to be provided elsewhere; it is not defined in this module
            self.estimator = TTestEstimator()
        elif self.method == 'logistic_regression':
            self.estimator = LogisticRegression()
            grid_search = {'C': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 1e2, 1e3, 1e4, 1e5]}
        elif self.method == 'random_forest':
            self.estimator = RandomForestClassifier()
            grid_search = {'max_depth': list(range(2, 10))}
        elif self.method == 'linear_svm':
            self.estimator = LinearSVC()
            grid_search = {'C': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 1e2, 1e3, 1e4, 1e5]}
        else:
            raise ValueError('unknown feature selection method: {}'.format(self.method))

        resampler_args = {}
        if self.resample_method == 'jackknife':
            resampler_args = {'max_runs': self.jackknife_max_runs,
                'remove': self.jackknife_remove
            }
        elif self.resample_method == 'bootstrap':
            resampler_args = {'max_runs': self.bootstrap_max_runs}

        self.robust_estimator = RobustEstimator(self.estimator, n_select=self.n_select,
            grid_search=grid_search, resample_method=self.resample_method,
            rfe=self.rfe, **resampler_args)

        sample_weight = None
        if self.compute_sample_weight:
            sample_weight = compute_sample_weight('balanced', self.y)
        self.robust_estimator.fit(self.X, self.y, sample_weight=sample_weight)
        self.estimator = self.robust_estimator.estimator_

    @define_step(inputs=['estimator', 'X', 'y'])
    def evaluate_model(self):
        self.logger.info('evaluate the model')
        features = pd.read_table(os.path.join(self.output_dir, 'features.txt'),
            header=None).iloc[:, 0].values
        feature_index = pd.Series(np.arange(self.n_features),
            index=self.feature_names).loc[features]

        y_pred = self.estimator.predict(self.X[:, feature_index])
        y_score = self.estimator.predict_proba(self.X[:, feature_index])[:, 1]
        roc_auc = roc_auc_score(self.y, y_score)
        fpr, tpr, thresholds = roc_curve(self.y, y_score)

        sns.set_style('whitegrid')
        fig, ax = plt.subplots(figsize=(7, 7))
        ax.plot(fpr, tpr, label='AUC = {:.4f}'.format(roc_auc))
        ax.plot([0, 1], [0, 1], linestyle='dashed', color='gray', linewidth=0.8)
        ax.set_xlabel('False positive rate')
        ax.set_ylabel('True positive rate')
        ax.legend()
        plt.savefig(os.path.join(self.output_dir, 'roc_curve.pdf'))
        plt.close()

    @define_step(inputs=['estimator'])
    def save_model(self):
        self.logger.info('save model')
        with open(os.path.join(self.output_dir, 'model.pkl'), 'wb') as f:
            pickle.dump(self.estimator, f)

    @define_step(outputs=['estimator'])
    def load_model(self):
        self.logger.info('load model')
        with open(os.path.join(self.output_dir, 'model.pkl'), 'rb') as f:
            self.estimator = pickle.load(f)

    @define_step(inputs=['estimator'])
    def save_params(self):
        self.logger.info('save model parameters')
        with open(os.path.join(self.output_dir, 'params.json'), 'w') as f:
            json.dump(self.estimator.get_params(), f, indent=2)

    def save_matrix(self):
        self.logger.info('save matrix')
        df = pd.DataFrame(self.X, columns=self.feature_names, index=self.sample_ids)
        df.index.name = 'sample'
        df.to_csv(os.path.join(self.output_dir, 'matrix.txt'),
            sep='\t', index=True, header=True)
        data = pd.Series(self.y, index=self.sample_ids)
        data.to_csv(os.path.join(self.output_dir, 'labels.txt'),
            sep='\t', index=True, header=False)

    @define_step(inputs=['output_dir', 'n_features', 'feature_names', 'X', 'y'])
    def single_feature_metrics(self):
        self.logger.info('compute metrics for selected features independently')
        features = pd.read_table(os.path.join(self.output_dir, 'features.txt'),
            header=None).iloc[:, 0].values
        feature_index = pd.Series(np.arange(self.n_features),
            index=self.feature_names).loc[features]
        metrics = {}
        scorers = {'roc_auc': roc_auc_score}
        for metric in ['roc_auc']:
            metrics[metric] = np.zeros(len(features))
            for i, i_feature in enumerate(feature_index):
                metrics[metric][i] = scorers[metric](self.y, self.X[:, i_feature])
        metrics = pd.DataFrame(metrics, index=features)
        metrics.index.name = 'feature'
        metrics.to_csv(os.path.join(self.output_dir, 'single_feature_metrics.txt'),
            sep='\t', index=True, header=True)

    @define_step(inputs=['estimator'])
    def plot_feature_importance(self):
        self.logger.info('plot feature importance')
        features = pd.read_table(os.path.join(self.output_dir, 'features.txt'),
            header=None).iloc[:, 0].values

        feature_importance = pd.read_table(os.path.join(self.output_dir, 'feature_importances.txt'),
            names=['feature', 'feature_importance'], header=None)
        feature_importance = feature_importance.iloc[np.argsort(-feature_importance['feature_importance'].values), :]
        sns.set_style('whitegrid')
        fig, ax = plt.subplots(figsize=(15, 20))
        sns.barplot('feature_importance', 'feature', color='gray',
            data=feature_importance, ax=ax)
        plt.subplots_adjust(left=0.3)
        plt.savefig(os.path.join(self.output_dir, 'feature_importances_refitted.pdf'))
        plt.close()

        feature_importance_matrix = pd.read_table(os.path.join(self.output_dir, 'feature_importance_matrix.txt'))
        feature_importance_matrix = feature_importance_matrix.loc[:, features]
        feature_importance_matrix = feature_importance_matrix.iloc[:, np.argsort(-feature_importance_matrix.mean(axis=0).values)]
        data = pd.melt(feature_importance_matrix, var_name='feature', value_name='feature_importance')
        sns.set_style('whitegrid')
        fig, ax = plt.subplots(figsize=(15, 25))
        sns.barplot('feature_importance', 'feature', color='gray', ci='sd',
            data=data, ax=ax, errwidth=1, capsize=0.2)
        plt.subplots_adjust(left=0.2)
        plt.savefig(os.path.join(self.output_dir, 'feature_importances_estimate.pdf'))
        plt.close()
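
# Sketch of the intended step order (hypothetical driver, not part of this
# module). Each step validates its inputs/outputs via @define_step, so calling
# steps out of order raises a ValueError.
#
#     fs = FeatureSelection(matrix='matrix.txt', sample_classes='classes.txt',
#                           output_dir='output', method='logistic_regression',
#                           scaler='zscore', n_select=10)
#     fs.read_data()
#     fs.filter_features()
#     fs.select_samples()
#     fs.scale_features()
#     fs.train_model()
#     fs.save_matrix()
#     fs.save_model()
#     fs.save_params()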