
--- a
+++ b/exseek/scripts/estimators.py
@@ -0,0 +1,497 @@
+import numpy as np
+import pandas as pd
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+from matplotlib.backends.backend_pdf import PdfPages
+import seaborn as sns
+sns.set()
+from sklearn.base import BaseEstimator, ClassifierMixin
+from sklearn.linear_model import LogisticRegression
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.svm import LinearSVC
+from sklearn.metrics import roc_auc_score, accuracy_score, roc_curve
+from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, MaxAbsScaler
+from sklearn.utils.class_weight import compute_sample_weight
+from sklearn.model_selection import GridSearchCV
+from sklearn.feature_selection import RFE
+import os
+import functools
+from tqdm import tqdm
+import pickle
+import json
+from scipy.cluster import hierarchy
+from scipy.spatial.distance import pdist, squareform
+import logging
+logging.basicConfig(level=logging.INFO, format='[%(asctime)s] [%(levelname)s] %(name)s: %(message)s')
+
+def bootstrap(*arrays):
+    '''Perform bootstrap resampling
+
+    Parameters
+    ----------
+    arrays: list of array-like objects
+        Data to resample. Bootstrap resampling is performed on the first dimension of each array
+    
+    Returns
+    -------
+    arrays: tuple of array-like objects
+        Resampled data
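+
+    Examples
+    --------
+    A minimal usage sketch (hypothetical data):
+
+    >>> X = np.arange(12).reshape(4, 3)
+    >>> y = np.arange(4)
+    >>> X_b, y_b = bootstrap(X, y)
+    >>> X_b.shape, y_b.shape
+    ((4, 3), (4,))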
+    '''
+    n_samples = arrays[0].shape[0]
+    if not all(n_samples == a.shape[0] for a in arrays):
+        raise ValueError('the first dimension of all arrays should be equal')
+    indices = np.random.randint(n_samples, size=n_samples)
+    return tuple(np.take(a, indices, axis=0) for a in arrays)
+
+def jackknife(*arrays, remove=1, indices=None):
+    '''Perform jackknife resampling
+
+    Parameters
+    ----------
+    arrays: list of array-like objects
+        Data to resample. Jackknife resampling is performed on the first dimension of each array
+    
+    remove: int or float
+        If `remove` is an integer, remove that number of samples.
+        If `remove` is a float, remove that fraction of samples.
+
+    indices: array-like
+        Indices of samples to remove. The `remove` parameter will be ignored.
+
+    Returns
+    -------
+    arrays: tuple of array-like objects
+        Resampled data
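+
+    Examples
+    --------
+    A minimal usage sketch (hypothetical data):
+
+    >>> X = np.arange(30).reshape(10, 3)
+    >>> y = np.arange(10)
+    >>> X_j, y_j = jackknife(X, y, remove=0.2)
+    >>> X_j.shape, y_j.shape
+    ((8, 3), (8,))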
+    '''
+    n_samples = arrays[0].shape[0]
+    if not all(n_samples == a.shape[0] for a in arrays):
+        raise ValueError('the first dimension of all arrays should be equal')
+    if indices is None:
+        if remove < 1:
+            n_remove = int(round(remove*n_samples))
+        else:
+            n_remove = int(remove)
+        indices = np.random.choice(n_samples, replace=False, size=n_remove)
+    return tuple(np.delete(a, indices, axis=0) for a in arrays)
+
+class RobustEstimator(BaseEstimator, ClassifierMixin):
+    '''A wrapper to select robust features based on feature recurrence
+
+    Parameters
+    ----------
+    estimator: sklearn.base.BaseEstimator object
+        Base estimator for feature selection
+    
+    n_select: int
+        Number of features to select
+
+    resample_method: str, choices: ('jackknife', 'bootstrap')
+        Resampling method
+
+    remove: float or int
+        Fraction (float, 0.0-1.0) or number (int, >= 1) of samples to remove for each round of resampling
+    
+    max_runs: int
+        Maximum number of resampling rounds
+    
+    recurring_fraction: float
+        Minimum fraction of resampling runs in which a feature must be selected
+    
+    grid_search: dict or None
+        Parameter grid for optimizing hyper-parameters using GridSearchCV
+    
+    rfe: bool
+        Whether to use RFE to select features rather than selecting the features with highest importance during each run
+
+    Attributes
+    ----------
+    feature_recurrence_: array-like, shape (n_features,)
+        Number of resampling rounds in which each feature is retained
+    
+    features_: array-like, shape (n_features,)
+        Indices of selected features
+    
+    feature_selection_matrix_: array-like, (max_runs, n_select), type int
+        Indicator matrix for selected features during each run
+
+    feature_importances_matrix_: array-like, (max_runs, n_features)
+        Feature importance during each resampling run
+    
+    feature_importances_mean_: array-like, (n_features,)
+        Feature importances averaged across resampling runs
+    
+    feature_importances_std_: array-like, (n_features,)
+        Standard deviation of feature importances across resampling runs
+    
+    estimator_: sklearn.base.BaseEstimator object
+        Estimator fitted on all data using selected features
+    
+    feature_importances_: array-like, (n_select,)
+        Feature importances trained on all data using selected features
+
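+    Examples
+    --------
+    A minimal sketch using a linear model (hypothetical data):
+
+    >>> from sklearn.linear_model import LogisticRegression
+    >>> np.random.seed(0)
+    >>> X = np.random.randn(20, 10)
+    >>> y = np.random.randint(2, size=20)
+    >>> estimator = RobustEstimator(LogisticRegression(), n_select=3,
+    ...     resample_method='bootstrap', max_runs=10)
+    >>> estimator = estimator.fit(X, y)
+    >>> estimator.features_.shape
+    (3,)
+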
+    '''
+    def __init__(self, estimator, n_select=None, resample_method='jackknife',
+            remove=1, max_runs=50, recurring_fraction=0.5,
+            grid_search=None, rfe=False):
+        self.estimator = estimator
+        self.remove = remove
+        self.max_runs = max_runs
+        self.n_select = n_select
+        self.resample_method = resample_method
+        self.recurring_fraction = recurring_fraction
+        self.grid_search = grid_search
+        self.rfe = rfe
+    
+    def fit(self, X, y, sample_weight=None):
+        n_samples, n_features = X.shape
+        if self.resample_method == 'jackknife' and self.remove == 1:
+            # leave-one-out jackknife: run once per sample
+            max_runs = n_samples
+        else:
+            max_runs = self.max_runs
+        n_select = self.n_select
+        if n_select is None:
+            n_select = n_features
+        feature_rank_matrix = np.zeros((max_runs, n_features))
+        feature_importances_matrix = np.zeros((max_runs, n_features))
+        if sample_weight is None:
+            sample_weight = np.ones(n_samples)
+        for i_run in range(max_runs):
+            if self.resample_method == 'bootstrap':
+                X_, y_, sample_weight_ = bootstrap(X, y, sample_weight)
+            elif self.resample_method == 'jackknife':  
+                indices_remove = None
+                if self.remove == 1:
+                    indices_remove = i_run
+                X_, y_, sample_weight_ = jackknife(X, y, sample_weight,
+                    remove=self.remove, indices=indices_remove)
+            if self.grid_search is not None:
+                cv = GridSearchCV(self.estimator, param_grid=self.grid_search, cv=3)
+                cv.fit(X_, y_, sample_weight=sample_weight_)
+                self.estimator = cv.best_estimator_
+            else:
+                self.estimator.fit(X_, y_, sample_weight=sample_weight_)
+            if self.rfe:
+                rfe = RFE(self.estimator, n_features_to_select=n_select, step=0.1)
+                rfe.fit(X_, y_)
+                feature_importances_matrix[i_run] = rfe.support_.astype('float')
+                feature_rank_matrix[i_run] = rfe.ranking_
+            else:
+                if hasattr(self.estimator, 'coef_'):
+                    feature_importances = np.square(self.estimator.coef_.flatten())
+                else:
+                    feature_importances = self.estimator.feature_importances_
+                feature_importances_matrix[i_run] = feature_importances
+                feature_orders = np.argsort(-feature_importances)
+                feature_rank_matrix[i_run, feature_orders] = np.arange(n_features)
+        if self.rfe:
+            feature_selection_matrix = (feature_rank_matrix == 1).astype(np.int32)
+        else:
+            feature_selection_matrix = (feature_rank_matrix < n_select).astype(np.int32)
+        feature_recurrence = np.sum(feature_selection_matrix, axis=0)
+        #robust_features = np.nonzero(feature_recurrence > round(n_samples)*self.recurring_fraction)[0]
+        #robust_features = robust_features[np.argsort(feature_recurrence[robust_features])][::-1]
+        robust_features = np.argsort(-feature_recurrence)[:n_select]
+        feature_selection_matrix = feature_selection_matrix[:, robust_features]
+
+        # refit the estimator
+        if self.grid_search is not None:
+            cv = GridSearchCV(self.estimator, param_grid=self.grid_search, cv=3)
+            cv.fit(X[:, robust_features], y, sample_weight=sample_weight)
+            self.estimator = cv.best_estimator_
+        else:
+            self.estimator.fit(X[:, robust_features], y, sample_weight=sample_weight)
+
+        # save attributes
+        self.feature_recurrence_ = feature_recurrence
+        self.features_ = robust_features
+        self.feature_selection_matrix_ = feature_selection_matrix
+        self.feature_importances_matrix_ = feature_importances_matrix
+        self.feature_importances_mean_ = np.mean(feature_importances_matrix, axis=0)
+        self.feature_importances_std_ = np.std(feature_importances_matrix, axis=0)
+        self.estimator_ = self.estimator
+        if hasattr(self.estimator, 'coef_'):
+            self.feature_importances_ = np.square(self.estimator.coef_.flatten())
+        else:
+            self.feature_importances_ = self.estimator.feature_importances_
+        
+        return self
+
+    def predict(self, X):
+        # the refitted estimator was trained on the selected features only
+        return self.estimator_.predict(X[:, self.features_])
+    
+    def score(self, X, y):
+        return self.estimator_.score(X[:, self.features_], y)
+
+def define_step(inputs=None, outputs=None):
+    inputs = inputs if inputs is not None else ()
+    outputs = outputs if outputs is not None else ()
+
+    def wrapper_generator(f):
+        @functools.wraps(f)
+        def wrapper(self, *args, **kwargs):
+            for input in inputs:
+                if not hasattr(self, input):
+                    raise ValueError('missing input "{}" for step "{}"'.format(input, f.__name__))
+            r = f(self, *args, **kwargs)
+            for output in outputs:
+                if not hasattr(self, output):
+                    raise ValueError('missing output "{}" for step "{}"'.format(output, f.__name__))
+            return r
+        return wrapper
+    return wrapper_generator
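+
+# A minimal sketch of how `define_step` is meant to be used
+# (hypothetical class, not part of this module):
+#
+#     class Steps(object):
+#         @define_step(inputs=['X'], outputs=['y'])
+#         def make_labels(self):
+#             self.y = np.zeros(self.X.shape[0])
+#
+# Calling `make_labels()` raises ValueError if `self.X` is missing before
+# the call or if `self.y` was not created by it.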
+
+class FeatureSelection(object):
+    def __init__(self, matrix, 
+            sample_classes,
+            output_dir,
+            remove_zero_features=None,
+            top_features_by_median=None,
+            transpose=False,
+            positive_class=None,
+            negative_class=None,
+            use_log=False,
+            scaler=None,
+            compute_sample_weight=False,
+            method='logistic_regression',
+            rfe=False,
+            n_select=None,
+            resample_method='jackknife',
+            jackknife_max_runs=100,
+            jackknife_remove=0.2,
+            bootstrap_max_runs=100,
+            **kwargs):
+        self.matrix_file = matrix
+        self.sample_classes_file = sample_classes
+        self.output_dir = output_dir
+        self.remove_zero_features = remove_zero_features
+        self.top_features_by_median = top_features_by_median
+        self.transpose = transpose
+        self.positive_class = positive_class
+        self.negative_class = negative_class
+        self.use_log = use_log
+        self.scaler = scaler
+        self.compute_sample_weight = compute_sample_weight
+        self.rfe = rfe
+        self.method = method
+        self.n_select = n_select
+        self.resample_method = resample_method
+        self.jackknife_max_runs = jackknife_max_runs
+        self.jackknife_remove = jackknife_remove
+        self.bootstrap_max_runs = bootstrap_max_runs
+        self.logger = logging.getLogger('FeatureSelection')
+
+        if not os.path.exists(self.output_dir):
+            self.logger.info('create output directory: ' + self.output_dir)
+            os.makedirs(self.output_dir)
+    
+    @define_step(inputs=['matrix_file'], outputs=['X', 'sample_classes'])
+    def read_data(self):
+        self.X = pd.read_table(self.matrix_file, index_col=0)
+        if self.transpose:
+            self.logger.info('transpose feature matrix')
+            self.X = self.X.T
+        self.logger.info('{} samples, {} features'.format(*self.X.shape))
+        self.logger.info('read sample classes: ' + self.sample_classes_file)
+        self.sample_classes = pd.read_table(self.sample_classes_file, header=None, 
+            names=['sample_id', 'sample_class'], index_col=0)
+        self.sample_classes = self.sample_classes.iloc[:, 0]
+    
+    @define_step(inputs=['X'], outputs=['feature_names', 'n_features'])
+    def filter_features(self):
+        if self.remove_zero_features is not None:
+            # keep features whose fraction of zero values is below the threshold
+            self.X = self.X.loc[:, np.isclose(self.X, 0).sum(axis=0) < (self.X.shape[0]*self.remove_zero_features)]
+        if self.top_features_by_median is not None:
+            nonzero_samples = (self.X > 0).sum(axis=0)
+            # geometric mean of counts, averaged over samples with nonzero counts
+            counts_geomean = np.exp(np.sum(np.log(np.maximum(self.X, 1)), axis=0)/nonzero_samples)
+            self.X = self.X.loc[:, counts_geomean.sort_values(ascending=False)[:self.top_features_by_median].index.values]
+        self.feature_names = self.X.columns.values
+        self.n_features = self.X.shape[1]
+        self.logger.info('{} features after filtering'.format(self.n_features))
+        self.logger.info('features: {} ...'.format(str(self.feature_names[:3])))
+    
+    @define_step(inputs=['X', 'positive_class', 'negative_class'],
+        outputs=['X', 'y', 'n_samples', 'sample_ids', 'X_raw'])
+    def select_samples(self):
+        if (self.positive_class is not None) and (self.negative_class is not None):
+            self.positive_class = self.positive_class.split(',')
+            self.negative_class = self.negative_class.split(',')
+        else:
+            unique_classes = np.unique(self.sample_classes.values)
+            if len(unique_classes) != 2:
+                raise ValueError('expect 2 classes but {} classes found'.format(len(unique_classes)))
+            self.positive_class, self.negative_class = unique_classes
+        self.positive_class = np.atleast_1d(self.positive_class)
+        self.negative_class = np.atleast_1d(self.negative_class)
+
+        self.logger.info('positive class: {}, negative class: {}'.format(self.positive_class, self.negative_class))
+        X_pos = self.X.loc[self.sample_classes[self.sample_classes.isin(self.positive_class)].index.values]
+        X_neg = self.X.loc[self.sample_classes[self.sample_classes.isin(self.negative_class)].index.values]
+        self.logger.info('number of positive samples: {}, negative samples: {}, class ratio: {}'.format(
+            X_pos.shape[0], X_neg.shape[0], float(X_pos.shape[0])/X_neg.shape[0]))
+        self.X = pd.concat([X_pos, X_neg], axis=0)
+        self.y = np.zeros(self.X.shape[0], dtype=np.int32)
+        # samples from the positive class come first and are labeled 1
+        self.y[:X_pos.shape[0]] = 1
+        self.X_raw = self.X
+        self.n_samples = self.X.shape[0]
+        self.sample_ids = self.X.index.values
+
+    @define_step(inputs=['X', 'use_log', 'scaler'], outputs=['X'])
+    def scale_features(self):
+        if self.use_log:
+            self.logger.info('apply log2 to feature matrix')
+            self.X = np.log2(self.X + 1)
+
+        scaler = None
+        if self.scaler == 'zscore':
+            scaler = StandardScaler()
+        elif self.scaler == 'robust':
+            scaler = RobustScaler()
+        elif self.scaler == 'min_max':
+            scaler = MinMaxScaler()
+        elif self.scaler == 'max_abs':
+            scaler = MaxAbsScaler()
+        elif self.scaler is not None:
+            raise ValueError('unknown scaler: {}'.format(self.scaler))
+        if scaler is not None:
+            self.logger.info('scale features using {}'.format(self.scaler))
+            self.X = scaler.fit_transform(self.X)
+
+    @define_step(inputs=['X', 'method', 'resample_method'], 
+        outputs=['estimator', 'robust_estimator', 'compute_sample_weight'])
+    def train_model(self):
+        if np.any(np.isnan(self.X)):
+            self.logger.warning('nan values found in features')
+        self.estimator = None
+        grid_search = None
+        self.logger.info('use {} to select features'.format(self.method))
+        if self.method == 't_test':
+            # TTestEstimator is not defined in this module
+            raise NotImplementedError('t_test feature selection is not implemented')
+        elif self.method == 'logistic_regression':
+            self.estimator = LogisticRegression()
+            grid_search = {'C': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4, 1e5]}
+        elif self.method == 'random_forest':
+            self.estimator = RandomForestClassifier()
+            grid_search = {'max_depth': list(range(2, 10))}
+        elif self.method == 'linear_svm':
+            self.estimator = LinearSVC()
+            grid_search = {'C': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4, 1e5]}
+        else:
+            raise ValueError('unknown feature selection method: {}'.format(self.method))
+
+        resampler_args = {}
+        if self.resample_method == 'jackknife':
+            resampler_args = {'max_runs': self.jackknife_max_runs,
+                'remove': self.jackknife_remove
+            }
+        elif self.resample_method == 'bootstrap':
+            resampler_args = {'max_runs': self.bootstrap_max_runs}
+
+        self.robust_estimator = RobustEstimator(self.estimator, n_select=self.n_select, 
+            grid_search=grid_search, resample_method=self.resample_method,
+            rfe=self.rfe, **resampler_args)
+
+        sample_weight = None
+        if self.compute_sample_weight:
+            sample_weight = compute_sample_weight('balanced', self.y)
+        self.robust_estimator.fit(self.X, self.y, sample_weight=sample_weight)
+        self.estimator = self.robust_estimator.estimator_
+
+    @define_step(inputs=['estimator', 'X', 'y'])
+    def evaluate_model(self):
+        self.logger.info('evaluate the model')
+        features = pd.read_table(os.path.join(self.output_dir, 'features.txt'),
+            header=None).iloc[:, 0].values
+        feature_index = pd.Series(np.arange(self.n_features),
+            index=self.feature_names).loc[features].values
+        
+        y_pred = self.estimator.predict(self.X[:, feature_index])
+        if hasattr(self.estimator, 'predict_proba'):
+            y_score = self.estimator.predict_proba(self.X[:, feature_index])[:, 1]
+        else:
+            # LinearSVC does not implement predict_proba; fall back to the decision function
+            y_score = self.estimator.decision_function(self.X[:, feature_index])
+        roc_auc = roc_auc_score(self.y, y_score)
+        fpr, tpr, thresholds = roc_curve(self.y, y_score)
+
+        sns.set_style('whitegrid')
+        fig, ax = plt.subplots(figsize=(7, 7))
+        ax.plot(fpr, tpr, label='AUC = {:.4f}'.format(roc_auc))
+        ax.plot([0, 1], [0, 1], linestyle='dashed', color='gray', linewidth=0.8)
+        ax.set_xlabel('False positive rate')
+        ax.set_ylabel('True positive rate')
+        ax.legend()
+        plt.savefig(os.path.join(self.output_dir, 'roc_curve.pdf'))
+        plt.close()
+
+    @define_step(inputs=['estimator'])
+    def save_model(self):
+        self.logger.info('save model')
+        with open(os.path.join(self.output_dir, 'model.pkl'), 'wb') as f:
+            pickle.dump(self.estimator, f)
+    
+    @define_step(outputs=['estimator'])
+    def load_model(self):
+        self.logger.info('load model')
+        with open(os.path.join(self.output_dir, 'model.pkl'), 'rb') as f:
+            self.estimator = pickle.load(f)
+    
+    @define_step(inputs=['estimator'])
+    def save_params(self):
+        self.logger.info('save model parameters')
+        with open(os.path.join(self.output_dir, 'params.json'), 'w') as f:
+            json.dump(self.estimator.get_params(), f, indent=2)
+    
+    def save_matrix(self):
+        self.logger.info('save matrix')
+        df = pd.DataFrame(self.X, columns=self.feature_names, index=self.sample_ids)
+        df.index.name = 'sample'
+        df.to_csv(os.path.join(self.output_dir, 'matrix.txt'),
+            sep='\t', index=True, header=True)
+        data = pd.Series(self.y, index=self.sample_ids)
+        data.to_csv(os.path.join(self.output_dir, 'labels.txt'),
+            sep='\t', index=True, header=False)
+    
+    @define_step(inputs=['output_dir', 'n_features', 'feature_names', 'X', 'y'])
+    def single_feature_metrics(self):
+        self.logger.info('compute metrics for selected features independently')
+        features = pd.read_table(os.path.join(self.output_dir, 'features.txt'),
+            header=None).iloc[:, 0].values
+        feature_index = pd.Series(np.arange(self.n_features),
+            index=self.feature_names).loc[features].values
+        metrics = {}
+        scorers = {'roc_auc': roc_auc_score}
+        for metric in ['roc_auc']:
+            metrics[metric] = np.zeros(len(features))
+            for i, i_feature in enumerate(feature_index):
+                metrics[metric][i] = scorers[metric](self.y, self.X[:, i_feature])
+        metrics = pd.DataFrame(metrics, index=features)
+        metrics.index.name = 'feature'
+        metrics.to_csv(os.path.join(self.output_dir, 'single_feature_metrics.txt'),
+            sep='\t', index=True, header=True)
+
+    @define_step(inputs=['estimator'])
+    def plot_feature_importance(self):
+        self.logger.info('plot feature importance')
+        features = pd.read_table(os.path.join(self.output_dir, 'features.txt'),
+            header=None).iloc[:, 0].values
+        
+        feature_importance = pd.read_table(os.path.join(self.output_dir, 'feature_importances.txt'),
+            names=['feature', 'feature_importance'], header=None)
+        feature_importance = feature_importance.iloc[np.argsort(-feature_importance['feature_importance'].values), :]
+        sns.set_style('whitegrid')
+        fig, ax = plt.subplots(figsize=(15, 20))
+        sns.barplot(x='feature_importance', y='feature', color='gray',
+            data=feature_importance, ax=ax)
+        plt.subplots_adjust(left=0.3)
+        plt.savefig(os.path.join(self.output_dir, 'feature_importances_refitted.pdf'))
+        plt.close()
+
+        feature_importance_matrix = pd.read_table(os.path.join(self.output_dir, 'feature_importance_matrix.txt'))
+        feature_importance_matrix = feature_importance_matrix.loc[:, features]
+        feature_importance_matrix = feature_importance_matrix.iloc[:, np.argsort(-feature_importance_matrix.mean(axis=0).values)]
+        data = pd.melt(feature_importance_matrix, var_name='feature', value_name='feature_importance')
+        sns.set_style('whitegrid')
+        fig, ax = plt.subplots(figsize=(15, 25))
+        sns.barplot(x='feature_importance', y='feature', color='gray', ci='sd',
+            data=data, ax=ax, errwidth=1, capsize=0.2)
+        plt.subplots_adjust(left=0.2)
+        plt.savefig(os.path.join(self.output_dir, 'feature_importances_estimate.pdf'))
+        plt.close()
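+
+# A minimal sketch of driving the pipeline steps in order
+# (hypothetical input paths):
+#
+#     feature_selection = FeatureSelection('matrix.txt', 'sample_classes.txt',
+#         'output_dir', method='logistic_regression', scaler='zscore',
+#         n_select=10)
+#     feature_selection.read_data()
+#     feature_selection.filter_features()
+#     feature_selection.select_samples()
+#     feature_selection.scale_features()
+#     feature_selection.train_model()
+#     feature_selection.save_model()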