--- a
+++ b/exseek/scripts/estimators.py
@@ -0,0 +1,497 @@
+import numpy as np
+import pandas as pd
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+from matplotlib.backends.backend_pdf import PdfPages
+import seaborn as sns
+sns.set()
+from sklearn.base import BaseEstimator, ClassifierMixin
+from sklearn.linear_model import LogisticRegression
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.svm import LinearSVC
+from sklearn.metrics import roc_auc_score, accuracy_score, roc_curve
+from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, MaxAbsScaler
+from sklearn.utils.class_weight import compute_sample_weight
+from sklearn.model_selection import GridSearchCV
+from sklearn.feature_selection import RFE
+import os
+from tqdm import tqdm
+import pickle
+import json
+from scipy.cluster import hierarchy
+from scipy.spatial.distance import pdist, squareform
+import logging
+logging.basicConfig(level=logging.INFO, format='[%(asctime)s] [%(levelname)s] %(name)s: %(message)s')
+
+def bootstrap(*arrays):
+    '''Perform bootstrap resampling
+
+    Parameters
+    ----------
+    arrays: list of array-like objects
+        Data to resample. Bootstrap resampling is performed on the first dimension of each array
+
+    Returns
+    -------
+    arrays: generator of array-like objects
+        Resampled data
+    '''
+    n_samples = arrays[0].shape[0]
+    if not all(n_samples == a.shape[0] for a in arrays):
+        raise ValueError('the first dimension of all arrays should be equal')
+    indices = np.random.randint(n_samples, size=n_samples)
+    return (np.take(a, indices, axis=0) for a in arrays)
+
+def jackknife(*arrays, remove=1, indices=None):
+    '''Perform jackknife resampling
+
+    Parameters
+    ----------
+    arrays: list of array-like objects
+        Data to resample. Jackknife resampling is performed on the first dimension of each array
+
+    remove: int or float
+        If `remove` is an integer, remove that number of samples.
+        If `remove` is a float, remove that fraction of samples.
+
+    indices: array-like
+        Indices of samples to remove. If given, the `remove` parameter is ignored.
+
+    Returns
+    -------
+    arrays: generator of array-like objects
+        Resampled data
+    '''
+    n_samples = arrays[0].shape[0]
+    if not all(n_samples == a.shape[0] for a in arrays):
+        raise ValueError('the first dimension of all arrays should be equal')
+    if indices is None:
+        if remove < 1:
+            n_remove = round(remove*n_samples)
+        else:
+            n_remove = remove
+        indices = np.random.choice(n_samples, replace=False, size=n_remove)
+    return (np.delete(a, indices, axis=0) for a in arrays)
+
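+# Illustrative usage of the resamplers above (made-up arrays, not part of the
+# pipeline): all input arrays are resampled along their first dimension with the
+# same indices.
+#
+# >>> X_toy = np.arange(12).reshape(6, 2)
+# >>> y_toy = np.array([0, 0, 0, 1, 1, 1])
+# >>> X_b, y_b = bootstrap(X_toy, y_toy)            # 6 rows drawn with replacement
+# >>> X_j, y_j = jackknife(X_toy, y_toy, remove=2)  # 2 random rows removed
+# >>> X_j.shape
+# (4, 2)
+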
+class RobustEstimator(BaseEstimator, ClassifierMixin):
+    '''A wrapper that selects robust features based on feature recurrence across resampling runs
+
+    Parameters
+    ----------
+    estimator: sklearn.base.BaseEstimator object
+        Base estimator for feature selection
+
+    n_select: int
+        Number of features to select
+
+    resample_method: str, choices: ('jackknife', 'bootstrap')
+        Resampling method
+
+    remove: float or int
+        Fraction (float, 0.0-1.0) or number (int, >= 1) of samples to remove in each round of resampling
+
+    max_runs: int
+        Maximum number of resampling rounds
+
+    recurring_fraction: float
+        Minimum fraction of resampling runs in which a feature must recur to be selected
+        (currently unused; the top `n_select` most recurrent features are kept)
+
+    grid_search: dict or None
+        Parameter grid for optimizing hyper-parameters using GridSearchCV
+
+    rfe: bool
+        Whether to use RFE to select features rather than selecting the features with the
+        highest importance in each run
+
+    Attributes
+    ----------
+    feature_recurrence_: array-like, shape (n_features,)
+        Number of resampling rounds in which each feature is retained
+
+    features_: array-like, shape (n_select,)
+        Indices of selected features
+
+    feature_selection_matrix_: array-like, shape (max_runs, n_select), dtype int
+        Indicator matrix of the features selected in each run
+
+    feature_importances_matrix_: array-like, shape (max_runs, n_features)
+        Feature importances in each resampling run
+
+    feature_importances_mean_: array-like, shape (n_features,)
+        Feature importances averaged across resampling runs
+
+    feature_importances_std_: array-like, shape (n_features,)
+        Standard deviation of feature importances across resampling runs
+
+    estimator_: sklearn.base.BaseEstimator object
+        Estimator refitted on all data using only the selected features
+
+    feature_importances_: array-like, shape (n_select,)
+        Feature importances of the refitted estimator
+    '''
+    def __init__(self, estimator, n_select=None, resample_method='jackknife',
+                 remove=1, max_runs=50, recurring_fraction=0.5,
+                 grid_search=None, rfe=False):
+        self.estimator = estimator
+        self.remove = remove
+        self.max_runs = max_runs
+        self.n_select = n_select
+        self.resample_method = resample_method
+        self.recurring_fraction = recurring_fraction
+        self.grid_search = grid_search
+        self.rfe = rfe
+
+    def fit(self, X, y, sample_weight=None):
+        n_samples, n_features = X.shape
+        if self.remove == 1:
+            # with remove=1, run one round per sample (leave-one-out)
+            max_runs = n_samples
+        else:
+            max_runs = self.max_runs
+        n_select = self.n_select
+        if n_select is None:
+            n_select = n_features
+        feature_rank_matrix = np.zeros((max_runs, n_features))
+        feature_importances_matrix = np.zeros((max_runs, n_features))
+        if sample_weight is None:
+            sample_weight = np.ones(n_samples)
+        for i_run in range(max_runs):
+            if self.resample_method == 'bootstrap':
+                X_, y_, sample_weight_ = bootstrap(X, y, sample_weight)
+            elif self.resample_method == 'jackknife':
+                indices_remove = None
+                if self.remove == 1:
+                    indices_remove = i_run
+                X_, y_, sample_weight_ = jackknife(X, y, sample_weight,
+                                                   remove=self.remove, indices=indices_remove)
+            if self.grid_search is not None:
+                cv = GridSearchCV(self.estimator, param_grid=self.grid_search, cv=3)
+                cv.fit(X_, y_, sample_weight=sample_weight_)
+                self.estimator = cv.best_estimator_
+            else:
+                self.estimator.fit(X_, y_, sample_weight=sample_weight_)
+            if self.rfe:
+                rfe = RFE(self.estimator, n_select, step=0.1)
+                rfe.fit(X_, y_)
+                feature_importances_matrix[i_run] = rfe.support_.astype('float')
+                feature_rank_matrix[i_run] = rfe.ranking_
+            else:
+                if hasattr(self.estimator, 'coef_'):
+                    feature_importances = np.square(self.estimator.coef_.flatten())
+                else:
+                    feature_importances = self.estimator.feature_importances_
+                feature_importances_matrix[i_run] = feature_importances
+                feature_orders = np.argsort(-feature_importances)
+                feature_rank_matrix[i_run, feature_orders] = np.arange(n_features)
+        if self.rfe:
+            feature_selection_matrix = (feature_rank_matrix == 1).astype(np.int32)
+        else:
+            feature_selection_matrix = (feature_rank_matrix < n_select).astype(np.int32)
+        feature_recurrence = np.sum(feature_selection_matrix, axis=0)
+        #robust_features = np.nonzero(feature_recurrence > round(n_samples)*self.recurring_fraction)[0]
+        #robust_features = robust_features[np.argsort(feature_recurrence[robust_features])][::-1]
+        robust_features = np.argsort(-feature_recurrence)[:n_select]
+        feature_selection_matrix = feature_selection_matrix[:, robust_features]
+
+        # refit the estimator on all samples using only the selected features
+        if self.grid_search is not None:
+            cv = GridSearchCV(self.estimator, param_grid=self.grid_search, cv=3)
+            cv.fit(X[:, robust_features], y, sample_weight=sample_weight)
+            self.estimator = cv.best_estimator_
+        else:
+            self.estimator.fit(X[:, robust_features], y, sample_weight=sample_weight)
+
+        # save attributes
+        self.feature_recurrence_ = feature_recurrence
+        self.features_ = robust_features
+        self.feature_selection_matrix_ = feature_selection_matrix
+        self.feature_importances_matrix_ = feature_importances_matrix
+        self.feature_importances_mean_ = np.mean(feature_importances_matrix, axis=0)
+        self.feature_importances_std_ = np.std(feature_importances_matrix, axis=0)
+        self.estimator_ = self.estimator
+        if hasattr(self.estimator, 'coef_'):
+            self.feature_importances_ = np.square(self.estimator.coef_.flatten())
+        else:
+            self.feature_importances_ = self.estimator.feature_importances_
+
+        return self
+
+    def predict(self, X):
+        return self.estimator_.predict(X[:, self.features_])
+
+    def score(self, X, y):
+        return self.estimator_.score(X[:, self.features_], y)
+
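+# Minimal usage sketch for RobustEstimator (synthetic data, illustration only; the
+# pipeline below drives it through FeatureSelection.train_model):
+#
+# >>> rng = np.random.RandomState(0)
+# >>> X_demo = rng.normal(size=(40, 100))
+# >>> y_demo = rng.randint(2, size=40)
+# >>> selector = RobustEstimator(LogisticRegression(), n_select=10,
+# ...                            resample_method='jackknife', remove=0.2, max_runs=20)
+# >>> selector = selector.fit(X_demo, y_demo)
+# >>> selector.features_              # indices of the 10 most recurrent features
+# >>> selector.feature_recurrence_    # per-feature selection counts across runs
+# >>> y_hat = selector.predict(X_demo)
+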
+def define_step(inputs=None, outputs=None):
+    inputs = inputs if inputs is not None else ()
+    outputs = outputs if outputs is not None else ()
+
+    def wrapper_generator(f):
+        def wrapper(self, *args, **kwargs):
+            for input in inputs:
+                if not hasattr(self, input):
+                    raise ValueError('missing input "{}" for step "{}"'.format(input, f.__name__))
+            r = f(self, *args, **kwargs)
+            for output in outputs:
+                if not hasattr(self, output):
+                    raise ValueError('missing output "{}" for step "{}"'.format(output, f.__name__))
+            return r
+        return wrapper
+    return wrapper_generator
+
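+# A toy sketch of what define_step enforces (illustration only, not used by the
+# pipeline): the listed `inputs` must exist as attributes on the object before the
+# decorated step runs, and the listed `outputs` must exist afterwards.
+#
+# >>> class Demo(object):
+# ...     @define_step(inputs=['a'], outputs=['b'])
+# ...     def step(self):
+# ...         self.b = self.a + 1
+# >>> d = Demo()
+# >>> d.step()        # ValueError: missing input "a" for step "step"
+# >>> d.a = 1
+# >>> d.step()
+# >>> d.b
+# 2
+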
+class FeatureSelection(object):
+    def __init__(self, matrix,
+                 sample_classes,
+                 output_dir,
+                 remove_zero_features=None,
+                 top_features_by_median=None,
+                 transpose=False,
+                 positive_class=None,
+                 negative_class=None,
+                 use_log=False,
+                 scaler=None,
+                 compute_sample_weight=False,
+                 method='logistic_regression',
+                 rfe=False,
+                 n_select=None,
+                 resample_method='jackknife',
+                 jackknife_max_runs=100,
+                 jackknife_remove=0.2,
+                 bootstrap_max_runs=100,
+                 **kwargs):
+        self.matrix_file = matrix
+        self.sample_classes_file = sample_classes
+        self.output_dir = output_dir
+        self.remove_zero_features = remove_zero_features
+        self.top_features_by_median = top_features_by_median
+        self.transpose = transpose
+        self.positive_class = positive_class
+        self.negative_class = negative_class
+        self.use_log = use_log
+        self.scaler = scaler
+        self.compute_sample_weight = compute_sample_weight
+        self.rfe = rfe
+        self.method = method
+        self.n_select = n_select
+        self.resample_method = resample_method
+        self.jackknife_max_runs = jackknife_max_runs
+        self.jackknife_remove = jackknife_remove
+        self.bootstrap_max_runs = bootstrap_max_runs
+        self.logger = logging.getLogger('FeatureSelection')
+
+        if not os.path.exists(self.output_dir):
+            self.logger.info('create output directory: ' + self.output_dir)
+            os.makedirs(self.output_dir)
+
+    @define_step(inputs=['matrix_file'], outputs=['X', 'sample_classes'])
+    def read_data(self):
+        self.X = pd.read_table(self.matrix_file, index_col=0)
+        if self.transpose:
+            self.logger.info('transpose feature matrix')
+            self.X = self.X.T
+        self.logger.info('{} samples, {} features'.format(*self.X.shape))
+        self.logger.info('read sample classes: ' + self.sample_classes_file)
+        self.sample_classes = pd.read_table(self.sample_classes_file, header=None,
+                                            names=['sample_id', 'sample_class'], index_col=0)
+        self.sample_classes = self.sample_classes.iloc[:, 0]
+
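+    # Illustrative input layout (made-up values; the real orientation depends on
+    # the `transpose` flag). With transpose=True the matrix file has features as
+    # rows and sample ids as columns, tab-separated with the first column as index:
+    #
+    #   feature    sample1    sample2    sample3
+    #   miR-21     12.0       3.0        0.0
+    #   miR-155    7.5        0.0        1.0
+    #
+    # The sample-classes file has two tab-separated columns and no header line:
+    #
+    #   sample1    Cancer
+    #   sample2    Normal
+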
+    @define_step(inputs=['X'], outputs=['feature_names', 'n_features'])
+    def filter_features(self):
+        if self.remove_zero_features is not None:
+            # keep features that are zero in at most this fraction of samples
+            self.X = self.X.loc[:, np.isclose(self.X, 0).sum(axis=0) <= (self.X.shape[0]*self.remove_zero_features)]
+        if self.top_features_by_median is not None:
+            # rank features by their geometric mean over non-zero samples and keep the top ones
+            nonzero_samples = (self.X > 0).sum(axis=0)
+            counts_geomean = np.exp(np.sum(np.log(np.maximum(self.X, 1)), axis=0)/nonzero_samples)
+            self.X = self.X.loc[:, counts_geomean.sort_values(ascending=False)[:self.top_features_by_median].index.values]
+        self.feature_names = self.X.columns.values
+        self.n_features = self.X.shape[1]
+        self.logger.info('{} features after filtering'.format(self.n_features))
+        self.logger.info('features: {} ...'.format(str(self.feature_names[:3])))
+
+    @define_step(inputs=['X', 'positive_class', 'negative_class'],
+                 outputs=['X', 'y', 'n_samples', 'sample_ids', 'X_raw'])
+    def select_samples(self):
+        if (self.positive_class is not None) and (self.negative_class is not None):
+            self.positive_class = self.positive_class.split(',')
+            self.negative_class = self.negative_class.split(',')
+        else:
+            unique_classes = np.unique(self.sample_classes.values)
+            if len(unique_classes) != 2:
+                raise ValueError('expect 2 classes but {} classes found'.format(len(unique_classes)))
+            self.positive_class, self.negative_class = unique_classes
+        self.positive_class = np.atleast_1d(self.positive_class)
+        self.negative_class = np.atleast_1d(self.negative_class)
+
+        self.logger.info('positive class: {}, negative class: {}'.format(self.positive_class, self.negative_class))
+        X_pos = self.X.loc[self.sample_classes[self.sample_classes.isin(self.positive_class)].index.values]
+        X_neg = self.X.loc[self.sample_classes[self.sample_classes.isin(self.negative_class)].index.values]
+        self.logger.info('number of positive samples: {}, negative samples: {}, class ratio: {}'.format(
+            X_pos.shape[0], X_neg.shape[0], float(X_pos.shape[0])/X_neg.shape[0]))
+        self.X = pd.concat([X_pos, X_neg], axis=0)
+        # positive-class samples are labeled 1, negative-class samples 0
+        self.y = np.zeros(self.X.shape[0], dtype=np.int32)
+        self.y[:X_pos.shape[0]] = 1
+        self.X_raw = self.X
+        self.n_samples = self.X.shape[0]
+        self.sample_ids = self.X.index.values
+
+    @define_step(inputs=['X', 'use_log', 'scaler'], outputs=['X'])
+    def scale_features(self):
+        if self.use_log:
+            self.logger.info('apply log2 to feature matrix')
+            self.X = np.log2(self.X + 1)
+
+        if self.scaler == 'zscore':
+            scaler = StandardScaler()
+        elif self.scaler == 'robust':
+            scaler = RobustScaler()
+        elif self.scaler == 'min_max':
+            scaler = MinMaxScaler()
+        elif self.scaler == 'max_abs':
+            scaler = MaxAbsScaler()
+        else:
+            raise ValueError('unknown scaler: {}'.format(self.scaler))
+        self.logger.info('scale features using {}'.format(self.scaler))
+        self.X = scaler.fit_transform(self.X)
+
+    @define_step(inputs=['X', 'method', 'resample_method'],
+                 outputs=['estimator', 'robust_estimator', 'compute_sample_weight'])
+    def train_model(self):
+        if np.any(np.isnan(self.X)):
+            self.logger.warning('nan values found in features')
+        self.estimator = None
+        grid_search = None
+        self.logger.info('use {} to select features'.format(self.method))
+        if self.method == 'r_test':
+            # TTestEstimator is expected to be provided elsewhere in the package
+            self.estimator = TTestEstimator()
+        elif self.method == 'logistic_regression':
+            self.estimator = LogisticRegression()
+            grid_search = {'C': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 1e2, 1e3, 1e4, 1e5]}
+        elif self.method == 'random_forest':
+            self.estimator = RandomForestClassifier()
+            grid_search = {'max_depth': list(range(2, 10))}
+        elif self.method == 'linear_svm':
+            self.estimator = LinearSVC()
+            grid_search = {'C': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 1e2, 1e3, 1e4, 1e5]}
+        else:
+            raise ValueError('unknown feature selection method: {}'.format(self.method))
+
+        resampler_args = {}
+        if self.resample_method == 'jackknife':
+            resampler_args = {'max_runs': self.jackknife_max_runs,
+                              'remove': self.jackknife_remove}
+        elif self.resample_method == 'bootstrap':
+            resampler_args = {'max_runs': self.bootstrap_max_runs}
+
+        self.robust_estimator = RobustEstimator(self.estimator, n_select=self.n_select,
+                                                grid_search=grid_search,
+                                                resample_method=self.resample_method,
+                                                rfe=self.rfe, **resampler_args)
+
+        sample_weight = None
+        if self.compute_sample_weight:
+            sample_weight = compute_sample_weight('balanced', self.y)
+        self.robust_estimator.fit(self.X, self.y, sample_weight=sample_weight)
+        self.estimator = self.robust_estimator.estimator_
+
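+    # Note on class weighting (worked example, not pipeline output): with
+    # compute_sample_weight enabled, sklearn assigns each sample the weight
+    # n_samples / (n_classes * class_count), e.g. compute_sample_weight('balanced',
+    # [0, 0, 0, 1]) is approximately [0.67, 0.67, 0.67, 2.0], so both classes
+    # contribute equally to the fit in total.
+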
+    @define_step(inputs=['estimator', 'X', 'y'])
+    def evaluate_model(self):
+        self.logger.info('evaluate the model')
+        features = pd.read_table(os.path.join(self.output_dir, 'features.txt'),
+                                 header=None).iloc[:, 0].values
+        feature_index = pd.Series(np.arange(self.n_features),
+                                  index=self.feature_names).loc[features]
+
+        y_pred = self.estimator.predict(self.X[:, feature_index])
+        y_score = self.estimator.predict_proba(self.X[:, feature_index])[:, 1]
+        roc_auc = roc_auc_score(self.y, y_score)
+        fpr, tpr, thresholds = roc_curve(self.y, y_score)
+
+        sns.set_style('whitegrid')
+        fig, ax = plt.subplots(figsize=(7, 7))
+        ax.plot(fpr, tpr, label='AUC = {:.4f}'.format(roc_auc))
+        ax.plot([0, 1], [0, 1], linestyle='dashed', color='gray', linewidth=0.8)
+        ax.set_xlabel('False positive rate')
+        ax.set_ylabel('True positive rate')
+        ax.legend()
+        plt.savefig(os.path.join(self.output_dir, 'roc_curve.pdf'))
+        plt.close()
+
+    @define_step(inputs=['estimator'])
+    def save_model(self):
+        self.logger.info('save model')
+        with open(os.path.join(self.output_dir, 'model.pkl'), 'wb') as f:
+            pickle.dump(self.estimator, f)
+
+    @define_step(outputs=['estimator'])
+    def load_model(self):
+        self.logger.info('load model')
+        with open(os.path.join(self.output_dir, 'model.pkl'), 'rb') as f:
+            self.estimator = pickle.load(f)
+
+    @define_step(inputs=['estimator'])
+    def save_params(self):
+        self.logger.info('save model parameters')
+        with open(os.path.join(self.output_dir, 'params.json'), 'w') as f:
+            json.dump(self.estimator.get_params(), f, indent=2)
+
+    def save_matrix(self):
+        self.logger.info('save matrix')
+        df = pd.DataFrame(self.X, columns=self.feature_names, index=self.sample_ids)
+        df.index.name = 'sample'
+        df.to_csv(os.path.join(self.output_dir, 'matrix.txt'),
+                  sep='\t', index=True, header=True)
+        data = pd.Series(self.y, index=self.sample_ids)
+        data.to_csv(os.path.join(self.output_dir, 'labels.txt'),
+                    sep='\t', index=True, header=False)
+
+    @define_step(inputs=['output_dir', 'n_features', 'feature_names', 'X', 'y'])
+    def single_feature_metrics(self):
+        self.logger.info('compute metrics for selected features independently')
+        features = pd.read_table(os.path.join(self.output_dir, 'features.txt'),
+                                 header=None).iloc[:, 0].values
+        feature_index = pd.Series(np.arange(self.n_features),
+                                  index=self.feature_names).loc[features]
+        metrics = {}
+        scorers = {'roc_auc': roc_auc_score}
+        for metric in ['roc_auc']:
+            metrics[metric] = np.zeros(len(features))
+            for i, i_feature in enumerate(feature_index):
+                metrics[metric][i] = scorers[metric](self.y, self.X[:, i_feature])
+        metrics = pd.DataFrame(metrics, index=features)
+        metrics.index.name = 'feature'
+        metrics.to_csv(os.path.join(self.output_dir, 'single_feature_metrics.txt'),
+                       sep='\t', index=True, header=True)
+
+    @define_step(inputs=['estimator'])
+    def plot_feature_importance(self):
+        self.logger.info('plot feature importance')
+        features = pd.read_table(os.path.join(self.output_dir, 'features.txt'),
+                                 header=None).iloc[:, 0].values
+
+        feature_importance = pd.read_table(os.path.join(self.output_dir, 'feature_importances.txt'),
+                                           names=['feature', 'feature_importance'], header=None)
+        feature_importance = feature_importance.iloc[np.argsort(-feature_importance['feature_importance'].values), :]
+        print(feature_importance.head())
+        sns.set_style('whitegrid')
+        fig, ax = plt.subplots(figsize=(15, 20))
+        sns.barplot('feature_importance', 'feature', color='gray',
+                    data=feature_importance, ax=ax)
+        plt.subplots_adjust(left=0.3)
+        plt.savefig(os.path.join(self.output_dir, 'feature_importances_refitted.pdf'))
+        plt.close()
+
+        feature_importance_matrix = pd.read_table(os.path.join(self.output_dir, 'feature_importance_matrix.txt'))
+        feature_importance_matrix = feature_importance_matrix.loc[:, features]
+        feature_importance_matrix = feature_importance_matrix.iloc[:, np.argsort(-feature_importance_matrix.mean(axis=0).values)]
+        data = pd.melt(feature_importance_matrix, var_name='feature', value_name='feature_importance')
+        sns.set_style('whitegrid')
+        fig, ax = plt.subplots(figsize=(15, 25))
+        sns.barplot('feature_importance', 'feature', color='gray', ci='sd',
+                    data=data, ax=ax, errwidth=1, capsize=0.2)
+        plt.subplots_adjust(left=0.2)
+        plt.savefig(os.path.join(self.output_dir, 'feature_importances_estimate.pdf'))
+        plt.close()
+
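+# A hypothetical driver sketch (illustration only; the actual exSEEK command-line
+# entry point and file names may differ). evaluate_model(), single_feature_metrics()
+# and plot_feature_importance() additionally expect features.txt,
+# feature_importances.txt and feature_importance_matrix.txt in output_dir, which are
+# produced by other parts of the pipeline.
+#
+# >>> fs = FeatureSelection(matrix='matrix.txt', sample_classes='sample_classes.txt',
+# ...                       output_dir='output', transpose=True, scaler='zscore',
+# ...                       method='logistic_regression', n_select=10)
+# >>> fs.read_data()
+# >>> fs.filter_features()
+# >>> fs.select_samples()
+# >>> fs.scale_features()
+# >>> fs.train_model()
+# >>> fs.save_matrix()
+# >>> fs.save_model()
+# >>> fs.save_params()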