Diff of /modas/contrast.py [000000] .. [a43cea]

Switch to side-by-side view

--- a
+++ b/modas/contrast.py
@@ -0,0 +1,95 @@
+import numpy as np
+import pandas as pd
+from sklearn.decomposition import MiniBatchSparsePCA
+from sklearn.metrics import pairwise_distances
+from sklearn.linear_model import LinearRegression
+from scipy.stats import t
+import statsmodels.api as sm
+from statsmodels.formula.api import ols
+
+
+def standardize_phe(phe):
+    standardized_phe = (phe - np.mean(phe, axis=0)) / np.std(phe, axis=0)
+    return np.nan_to_num(standardized_phe)
+
+
+def phe_transform(phe):
+    transformed_phe = pd.DataFrame(pairwise_distances(phe + 1, metric='braycurtis'), index=phe.index, columns=phe.index)
+    transformed_phe[np.isnan(transformed_phe)] = 0
+    return transformed_phe
+
+
+def scpca(stress, control, phe_name, alpha, n_comp):
+    index = stress.index
+    stress = stress - np.mean(stress, axis=0)
+    stress = standardize_phe(stress)
+    control = control - np.mean(control, axis=0)
+    control = standardize_phe(control)
+    stress_cov = stress.T.dot(stress) / (stress.shape[0] - 1)
+    control_cov = control.T.dot(control) / (control.shape[0] - 1)
+    spca = MiniBatchSparsePCA(n_components=n_comp, random_state=3)
+    scpca_res = spca.fit_transform(stress_cov - alpha * control_cov)
+    scpca_res = pd.DataFrame(scpca_res, index=index, columns=[phe_name+'_CPCA_PC'+str(i) for i in range(1, n_comp + 1)])
+    return scpca_res
+
+
+def beta_test(scpca_pc, stress_phe, control_phe):
+    lr = LinearRegression()
+    lr.fit(scpca_pc.to_frame(), stress_phe)
+    stress_beta = lr.coef_[0]
+    rss_stress = ((stress_phe - lr.predict(scpca_pc.to_frame()))**2).sum()
+    lr.fit(scpca_pc.to_frame(), control_phe)
+    control_beta = lr.coef_[0]
+    rss_control = ((control_phe - lr.predict(scpca_pc.to_frame()))**2).sum()
+    s_scpca_phe = (rss_stress + rss_control) / ((stress_phe.shape[0] - 2) + (control_phe.shape[0] - 2))
+    s_con = np.sqrt(s_scpca_phe / ((scpca_pc - scpca_pc.mean())**2).sum() + s_scpca_phe / ((scpca_pc - scpca_pc.mean())**2).sum())
+    t_stat = (stress_beta - control_beta) / s_con
+    pvalue = (1 - t.cdf(abs(t_stat), (stress_phe.shape[0] - 2) + (control_phe.shape[0] - 2))) * 2
+    return [stress_beta, control_beta, t_stat, pvalue]
+
+
+def scpca_omics(stress_omics, control_omics, alpha, n_comp, beta_test_pvalue):
+    idx = list(set(stress_omics.columns.tolist()) & set(control_omics.columns.tolist()))
+    index = stress_omics.index.intersection(control_omics.index)
+    stress_omics = stress_omics.reindex(index)
+    control_omics = control_omics.reindex(index)
+    omics_scpca_phe = list()
+    omics_scpca_test = list()
+    for mol_feature in idx:
+        stress = phe_transform(stress_omics[[mol_feature]])
+        control = phe_transform(control_omics[[mol_feature]])
+        scpca_res = scpca(stress, control, mol_feature, alpha, n_comp)
+        omics_scpca_phe.append(scpca_res)
+        for scpca_phe_id in scpca_res:
+            omics_scpca_test.append([scpca_phe_id, mol_feature] + beta_test(scpca_res[scpca_phe_id], stress_omics[mol_feature], control_omics[mol_feature]))
+    omics_scpca_phe = pd.concat(omics_scpca_phe, axis=1)
+    omics_scpca_test = pd.DataFrame(omics_scpca_test)
+    omics_scpca_test.columns = ['scpca_phe', 'molecular_feature', 'stress_beta', 'control_beta', 'beta_test_stat', 'beta_test_pvalue']
+    if beta_test_pvalue is None:
+        beta_test_pvalue = 1.0 / omics_scpca_test.shape[0]
+    omics_scpca_phe = omics_scpca_phe.loc[:, omics_scpca_test.loc[omics_scpca_test.beta_test_pvalue <= beta_test_pvalue, 'scpca_phe']]
+    return omics_scpca_phe, omics_scpca_test
+
+
+def anova(stress_omics, control_omics, scpca_qtl_res, geno):
+    index = stress_omics.index.intersection(geno.index)
+    stress_omics = stress_omics.reindex(index)
+    control_omics = control_omics.reindex(index)
+    geno = geno.reindex(index)
+    anova_res = list()
+    for phe_name in scpca_qtl_res.phe_name.unique():
+        mol_feature = '_'.join(phe_name.split('_')[:-2]).replace('m.z', 'm/z')
+        res = pd.concat([stress_omics[mol_feature].to_frame().reset_index(), control_omics[mol_feature].to_frame().reset_index()])
+        res.columns = ['RIL', 'exp']
+        res['con'] = ['stress'] * stress_omics.shape[0] + ['control'] * control_omics.shape[0]
+        for snp in scpca_qtl_res.loc[scpca_qtl_res.phe_name == phe_name, 'SNP']:
+            res['genotype'] = 0
+            res.loc[res['RIL'].isin(geno.index[geno[snp] == 2]), 'genotype'] = 1
+            model = ols('exp ~ C(con) + C(genotype) +\
+                        C(con):C(genotype)', data=res).fit()
+            pvalue = sm.stats.anova_lm(model, typ=2).iloc[2, 3]
+            anova_res.append([phe_name, snp, pvalue])
+    anova_res = pd.DataFrame(anova_res)
+    anova_res.columns = ['phe_name', 'SNP', 'pvalue']
+    scpca_qtl_res = pd.merge(scpca_qtl_res, anova_res, on=['phe_name', 'SNP'])
+    return scpca_qtl_res