scPanel / Git / [f3902c] /src/scpanel/select

Models:
DanielG/
scPanel
Downloads: 1
[f3902c]: / src / scpanel / select_gene.py
History
Download this file
448 lines (347 with data), 14.8 kB

# import anndata
import itertools
import os
import pickle
import time

import matplotlib.pyplot as plt
from joblib import Parallel, delayed

# from sklearn.preprocessing import LabelEncoder
from sklearn import svm

# import scanpy as sc
# import numpy as np
# import pandas as pd
from sklearn.model_selection import StratifiedKFold

from .SVMRFECV import RFE, RFECV
from .utils_func import *
from anndata._core.anndata import AnnData
from matplotlib.axes._axes import Axes
from scpanel.SVMRFECV import RFECV
from typing import List, Optional, Tuple


def split_n_folds(adata_train: AnnData, nfold: int, out_dir: Optional[str]=None, random_state: int=2349) -> Tuple[List[List[int]], List[List[int]], List[List[float]]]:
    ## add: exclude patients without selected cell type
    n_cell_pat = adata_train.obs.groupby(["patient_id"])["ct"].count()
    exclude_pat = adata_train.obs["patient_id"].isin(n_cell_pat[n_cell_pat == 0].index)
    adata_train = adata_train[~exclude_pat]

    if sum(exclude_pat) > 0:
        print(
            n_cell_pat[n_cell_pat == 0].index.tolist(),
            "get excluded since no selected cell type appears",
        )

    ## split patients
    pat_meta_temp = adata_train.obs[["y", "patient_id"]].drop_duplicates().reset_index()
    cell_meta_temp = adata_train.obs.reset_index()

    patient_class = pat_meta_temp["y"].to_numpy()
    patient = pat_meta_temp["patient_id"].to_numpy()

    skf = StratifiedKFold(n_splits=nfold, shuffle=True, random_state=random_state)
    # sss = StratifiedShuffleSplit(n_splits=nfold, test_size = test_size)

    patient_train_id_list = []
    patient_val_id_list = []

    train_patient_list = []
    val_patient_list = []

    train_index_list = []
    val_index_list = []

    weight_list = []

    for train_index, val_index in skf.split(patient, patient_class):

        train_patient_list.append(train_index)
        val_patient_list.append(val_index)

        patient_train_id = patient[train_index]
        patient_val_id = patient[val_index]

        patient_train_id_list.append(patient_train_id)
        patient_val_id_list.append(patient_val_id)

        cell_meta_fold_train = cell_meta_temp[
            cell_meta_temp["patient_id"].isin(patient_train_id)
        ]
        cell_meta_fold_test = cell_meta_temp[
            cell_meta_temp["patient_id"].isin(patient_val_id)
        ]

        # compute weight for each cell in each fold's training set
        w_fold_train = compute_cell_weight(cell_meta_fold_train)
        weight_list.append(w_fold_train.tolist())

        # get positional index for train and test set in each fold
        cell_train_id = cell_meta_fold_train.index.tolist()
        cell_val_id = cell_meta_fold_test.index.tolist()

        # cell_train_id.sort()
        # cell_val_id.sort()

        if cell_train_id not in train_index_list:
            train_index_list.append(cell_train_id)

        if cell_val_id not in val_index_list:
            val_index_list.append(cell_val_id)

    ## check if weights (np.Series) have the same order as train_index_list
    # np.array_equiv([idx for fold in train_index_list for idx in fold],
    #                w_fold_train.index.values)

    if out_dir is not None:
        # Output
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

        ## Data and index
        X_train, y_train = get_X_y_from_ann(adata_train)
        with open(os.path.join(out_dir, "Data_X_y.pkl"), "wb") as f:
            d = {"features": X_train, "labels": y_train}
            pickle.dump(d, f, protocol=pickle.HIGHEST_PROTOCOL)
            f.close()
            del d

        with open(
            os.path.join(out_dir, "Data_" + str(nfold) + "fold_index.pkl"), "wb"
        ) as f:
            d = {
                "train": train_index_list,
                "val": val_index_list,
                "sample_weight": w_fold_train,
            }
            pickle.dump(d, f, protocol=pickle.HIGHEST_PROTOCOL)
            f.close()
            del d

        ## nfold splitting information
        # get n_cells, patient_id and class prop for each set
        all_list = train_index_list + val_index_list
        n_cells = [len(sublist) for sublist in all_list]

        patient_ids = patient_train_id_list + patient_val_id_list

        class_prop = [
            np.unique(y_train[index], return_counts=True)[1] for index in all_list
        ]

        patient_prop_train = [
            pat_meta_temp.loc[fold].y.value_counts().tolist()
            for fold in train_patient_list
        ]
        patient_prop_val = [
            pat_meta_temp.loc[fold].y.value_counts().tolist()
            for fold in val_patient_list
        ]
        patient_prop = patient_prop_train + patient_prop_val

        nfold_info = pd.DataFrame(
            {
                "n_cells": n_cells,
                "patient_ids": patient_ids,
                "class_prop": class_prop,
                "pt_prop": patient_prop,
            }
        )

        train_col = ["train_f" + str(i) for i in range(1, nfold + 1)]
        val_col = ["val_f" + str(i) for i in range(1, nfold + 1)]
        nfold_info.index = train_col + val_col
        nfold_info.to_csv(f"{out_dir}/split_nfold_info.csv")

    return train_index_list, val_index_list, weight_list


def gene_score(
    adata_train: AnnData,
    train_index_list: List[List[int]],
    val_index_list: List[List[int]],
    sample_weight_list: List[List[float]],
    out_dir: str,
    ncpus: int,
    step: float=0.03,
    metric: str="average_precision",
    verbose: bool=False,
) -> Tuple[AnnData, RFECV]:

    # metric: https://scikit-learn.org/stable/modules/model_evaluation.html

    X, y = get_X_y_from_ann(adata_train)

    # Fill NaN in numpy
    X = np.nan_to_num(X)
    y = np.nan_to_num(y)

    # model------------
    # model = svm.SVC(kernel="linear", class_weight = 'balanced', verbose=verbose, random_state = 123)
    model = svm.SVC(kernel="linear", verbose=verbose, random_state=123)

    rfecv = RFECV(
        estimator=model, step=step, scoring=metric, cv=10, n_jobs=ncpus, verbose=0
    )
    # X = StandardScaler().fit_transform(X)
    rfecv.fit(
        X, y, train_index_list, val_index_list, sample_weight_list=sample_weight_list
    )

    # organize dataframe for results
    n_gene = X.shape[1]
    cv_dict = rfecv.cv_results_.copy()
    # cv_dict.pop('mean_feature_ranking')
    cv_df = pd.DataFrame.from_dict(cv_dict)

    # find number of features selected in each iteration
    import math

    nfeat = n_gene
    step = step
    steps = [n_gene]
    while nfeat > 1:
        nstep = math.ceil(nfeat * step)
        nfeat = nfeat - nstep
        steps.append(nfeat)

    cv_df.index = steps[::-1]

    adata_train.uns["rfecv_result"] = cv_df
    adata_train.uns["rfecv_result_metric"] = rfecv.scoring

    if out_dir is not None:
        # save tmp output------------------
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

        model_file = f"{out_dir}/rfecv_ranking_by_{type(model).__name__}.sav"
        pickle.dump(rfecv, open(model_file, "wb"))

    return adata_train, rfecv


def plot_gene_score(adata_train: AnnData, n_genes_plot: int=200, width: int=5, height: int=4, k: Optional[int]=None) -> Axes:

    cv_df = adata_train.uns["rfecv_result"].filter(regex="mean|split")
    cv_df = cv_df.loc[:n_genes_plot,]
    cv_df.columns = cv_df.columns.str.rstrip("_test_score")

    scoring_metrics = adata_train.uns["rfecv_result_metric"]
    if scoring_metrics == "average_precision":
        ylabel = "AUPRC"
    elif scoring_metrics == "roc_auc":
        ylabel = "AUROC"
    else:
        ylabel = scoring_metrics

    fig, axes = plt.subplots(figsize=(width, height))
    for columnName, columnData in cv_df.items():
        if "mean" in columnName:
            axes.plot(columnData, label=columnName)
        else:
            axes.plot(columnData, label=columnName, linestyle="dashed", alpha=0.6)

    axes.spines[["right", "top"]].set_visible(False)

    plt.xlabel("Number of Genes")
    plt.ylabel(ylabel)
    plt.legend()

    if k is not None:
        k_score = cv_df.loc[k, "mean"]
        y_label_adjust = (cv_df["mean"].max() - cv_df["mean"].min()) / 2

        plt.axvline(x=k, color="r", linestyle=":")
        plt.text(
            x=k + 4, y=k_score - y_label_adjust, s=f"n={k}\n{ylabel}={k_score:.3f}"
        )

    return axes


def decide_k(adata_train: AnnData, n_genes_plot: int=100) -> int:
    cv_df = adata_train.uns["rfecv_result"]
    cv_df = cv_df.loc[:n_genes_plot, :]

    data = cv_df.reset_index()[["index", "mean_test_score"]].to_numpy()
    A = data[0]
    B = data[-1]
    # 利用ABC三点坐标计算三角形面积，利用AB边长倒推三角形的高
    Dist = dict()
    for i in range(1, len(data)):
        C = data[i]
        ngene = C[0]
        D = np.append(np.vstack((A, B, C)), [[1], [1], [1]], axis=1)
        S = 1 / 2 * np.linalg.det(D)
        Dist[ngene] = 2 * S / np.linalg.norm(A - B)

    top_n_feat = int(max(Dist, key=Dist.get))
    top_n_feat_auc = cv_df.loc[max(Dist, key=Dist.get), "mean_test_score"]

    # print(f'Number of genes to select = {top_n_feat}')

    return top_n_feat


def select_gene(
    adata_train: AnnData, out_dir: Optional[str]=None, step: float=0.03, top_n_feat: int=5, n_genes_plot: int=100, verbose: int=0
) -> AnnData:

    # retrieve top_n_feat from one SVM-RFE run
    X, y = get_X_y_from_ann(adata_train)

    # Fill NaN in numpy
    X = np.nan_to_num(X)
    y = np.nan_to_num(y)

    # model------------
    model = svm.SVC(kernel="linear", random_state=123)

    ## get ranking of all selected features
    selector = RFE(model, n_features_to_select=1, step=step, verbose=verbose)

    sample_weight = compute_cell_weight(adata_train)
    selector.fit(X, y, sample_weight=sample_weight)

    feature_ranking = pd.DataFrame(
        {"ranking": selector.ranking_}, index=adata_train.var_names
    ).sort_values(by="ranking")
    sig_list_ranked = feature_ranking.index[:top_n_feat].tolist()
    # print(sig_list_ranked)

    adata_train.uns["svm_rfe_genes"] = sig_list_ranked
    adata_train.var["ranking"] = selector.ranking_

    if out_dir is not None:
        # output gene list
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

        with open(f"{out_dir}/sig_svm.txt", "w") as f:
            for item in sig_list_ranked:
                f.write("%s\n" % item)

        # output adata_train_s with gene scores
        adata_train.write_h5ad(f"{out_dir}/adata_train_s.h5ad")

    return adata_train


def select_gene_stable(
    adata_train,
    n_iter=20,
    nfold=2,
    downsample_prop_list=[0.6, 0.8],
    num_cores=1,
    out_dir=None,
):

    def _single_fit(downsample_prop, i, adata_train, nfold, out_dir):

        downsample_size = round(adata_train.n_obs * downsample_prop)
        i = i + 1

        # create folder to output results for each iteration
        out_dir = f"{out_dir}/{downsample_size}/{i}"
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

        # metadata for stratified downsampling
        adata_train.obs["downsample_stratify"] = adata_train.obs[["patient_id"]].astype(
            "category"
        )

        down_index_i = resample(
            adata_train.obs_names,
            replace=False,
            n_samples=downsample_size,
            stratify=adata_train.obs["downsample_stratify"],
            random_state=i,
        )
        # downsampling
        adata_train_i = adata_train[adata_train.obs_names.isin(down_index_i),].copy()

        # QC for downsampled traninig data
        # 1. for each cell type, remove samples with <20 cells
        # 2. remove cell types with < 2 samples
        # 3. Remove 0-expressed genes
        # 4. Update training data

        min_cells = 20
        ## Number of cells in each patient
        n_cell_pt = adata_train_i.obs.groupby(
            ["patient_id"], observed=True, as_index=False
        ).size()
        # Remove paients with cells less than min_cells
        pt_keep = n_cell_pt.patient_id[n_cell_pt["size"] >= min_cells].tolist()

        ## Cell types with 0 patient has cells >= min_cells
        if len(pt_keep) > 0:
            adata_train_i = adata_train_i[
                adata_train_i.obs["patient_id"].isin(pt_keep),
            ]
            n_cell_pt = adata_train_i.obs.groupby(
                ["y", "patient_id"], observed=True, as_index=False
            ).size()
            ## Skip cell types with less than 2 patients in at least one condition
            if (n_cell_pt.y.nunique() >= 2) & ((n_cell_pt.y.value_counts() >= 2).all()):
                print("we have >= 2 samples in each condition...")

                ## Remove 0-expressed genes
                sc.pp.filter_genes(adata_train_i, min_cells=1)

                # Split downsampled train data into folds
                train_index_list, val_index_list, sample_weight_list = split_n_folds(
                    adata_train_i, nfold=nfold, out_dir=out_dir, random_state=2349
                )

                adata_train_i, rfecv_i = gene_score(
                    adata_train_i,
                    train_index_list,
                    val_index_list,
                    sample_weight_list=sample_weight_list,
                    step=0.03,
                    out_dir=out_dir,
                    ncpus=None,
                    verbose=False,
                )

                k = decide_k(adata_train_i, n_genes_plot=100)
                adata_train_i = select_gene(
                    adata_train_i, top_n_feat=k, step=0.03, out_dir=out_dir
                )
                sig_svm_i = adata_train_i.uns["svm_rfe_genes"]

        res_i = pd.DataFrame(sig_svm_i, columns=["gene"])
        res_i["downsample_prop"] = downsample_prop
        res_i["downsample_size"] = downsample_size
        res_i["n_iter"] = i

        return res_i

    start = time.time()

    paramlist = itertools.product(downsample_prop_list, range(n_iter))  # 2 nested loops
    res = Parallel(n_jobs=num_cores)(
        delayed(_single_fit)(
            downsample_prop, i, adata_train=adata_train, nfold=nfold, out_dir=out_dir
        )
        for downsample_prop, i in paramlist
    )
    end = time.time()

    res_df = pd.concat(res)
    gene_freq_df = res_df.groupby(
        ["downsample_prop", "downsample_size", "gene"], as_index=False
    ).size()
    adata_train.uns["scPanel_stable_rfecv_result"] = gene_freq_df

    gene_mean = gene_freq_df.groupby("gene")["size"].mean()
    gene_mean = gene_mean[gene_mean > round(n_iter * 0.5)].sort_values(ascending=False)
    adata_train.uns["svm_rfe_genes_stable"] = gene_mean.index.tolist()
    adata_train.uns["svm_rfe_genes_stable_time"] = end - start

    return adata_train