#!/usr/bin/env python
# Copyright (C) 2022 Emanuel Goncalves

import pandas as pd
from PhenPred.Utils import scale
from sklearn.preprocessing import StandardScaler


class DataImporter:
    """Read the CSV datasets stored under a single data directory.

    Each ``read_*`` method loads one file from ``data_dir`` and returns a
    ``pandas.DataFrame``. ``read_dataset`` selects the reader by name.
    """

    # Lower-cased dataset name accepted by ``read_dataset`` -> reader method
    # name. Readers are resolved with ``getattr`` at call time so subclasses
    # that override a ``read_*`` method are honoured.
    _READERS = {
        "metabolomics": "read_metabolomics",
        "drug_response": "read_drug_response",
        "proteomics": "read_proteomics",
        "tissue": "read_tissue_types",
        "methylation": "read_methylation",
        "transcriptomics": "read_transcriptomics",
        "genomics": "read_genomics",
        "essentiality": "read_essentiality",
    }

    def __init__(self, data_dir):
        """Store the directory that contains the dataset CSV files.

        Args:
            data_dir: Directory path; it is interpolated into
                ``f"{data_dir}/<file>.csv"`` when a dataset is read.
        """
        self.data_dir = data_dir

    def _read_csv(self, filename):
        """Read ``<data_dir>/<filename>`` using its first column as the index."""
        return pd.read_csv(f"{self.data_dir}/{filename}", index_col=0)

    def read_dataset(self, dataset_name):
        """Read any supported dataset by name (case-insensitive).

        Args:
            dataset_name: One of ``metabolomics``, ``drug_response``,
                ``proteomics``, ``tissue``, ``methylation``,
                ``transcriptomics``, ``genomics`` or ``essentiality``.

        Returns:
            The DataFrame produced by the matching ``read_*`` method, called
            with its default arguments.

        Raises:
            ValueError: If ``dataset_name`` is not a supported dataset name.
        """
        # Non-string names cannot match any key; report them as unsupported
        # instead of failing with AttributeError on ``.lower()``.
        key = dataset_name.lower() if isinstance(dataset_name, str) else None
        reader_name = self._READERS.get(key)

        if reader_name is None:
            supported = ", ".join(sorted(self._READERS))
            raise ValueError(
                f"Dataset {dataset_name} not supported. "
                f"Supported datasets: {supported}."
            )

        return getattr(self, reader_name)()

    def read_proteomics(self):
        """Return ``proteomics.csv`` transposed (CSV index becomes the columns)."""
        return self._read_csv("proteomics.csv").T

    def read_metabolomics(self):
        """Return ``metabolomics.csv`` transposed and standardised per column.

        The transposed table is passed through scikit-learn's
        ``StandardScaler`` and rebuilt with the same index and columns.
        """
        df = self._read_csv("metabolomics.csv").T
        scaled_values = StandardScaler().fit_transform(df)
        return pd.DataFrame(scaled_values, index=df.index, columns=df.columns)

    def read_drug_response(self, only_gdsc2=True):
        """Return ``drugresponse.csv`` transposed, optionally GDSC2 only.

        Args:
            only_gdsc2: When true, keep only the columns whose name has
                ``"GDSC2"`` as its third ``;``-separated field.
                NOTE(review): presumably the column names are
                ``<id>;<name>;<dataset>`` -- confirm against the data file.

        Returns:
            The (optionally filtered) transposed DataFrame.
        """
        df = self._read_csv("drugresponse.csv").T

        if only_gdsc2:
            gdsc2_columns = []
            for column in df:
                fields = column.split(";")
                # Names with fewer than three fields cannot be GDSC2; skip
                # them rather than raising IndexError.
                if len(fields) > 2 and fields[2] == "GDSC2":
                    gdsc2_columns.append(column)
            df = df[gdsc2_columns]

        return df

    def read_tissue_types(self):
        """Return the ``tissue`` column of ``samplesheet.csv`` as dummy columns."""
        samplesheet = self._read_csv("samplesheet.csv")
        return pd.get_dummies(samplesheet["tissue"])

    def read_methylation(self):
        """Return ``methylation.csv`` transposed (CSV index becomes the columns)."""
        return self._read_csv("methylation.csv").T

    def read_transcriptomics(self):
        """Return ``transcriptomics.csv`` transposed (CSV index becomes the columns)."""
        return self._read_csv("transcriptomics.csv").T

    def read_genomics(self):
        """Return ``genomics.csv`` transposed (CSV index becomes the columns)."""
        return self._read_csv("genomics.csv").T

    def read_essentiality(self, only_context_fitness_genes=True):
        """Return ``crisprcas9.csv`` scaled and transposed, optionally filtered.

        CSV columns containing any missing value are dropped, the table is
        passed through the project's ``scale`` helper and then transposed.

        Args:
            only_context_fitness_genes: When true, keep only the columns of
                the transposed table that have a value below ``-0.5`` in at
                least 10 rows and in fewer than half of all rows.
                NOTE(review): presumably columns are genes and rows are
                samples at this point -- confirm against the data file.

        Returns:
            The (optionally filtered) DataFrame.
        """
        df = self._read_csv("crisprcas9.csv").dropna(axis=1)
        # NOTE(review): ``scale`` comes from PhenPred.Utils; its exact
        # normalisation is not visible here.
        df = scale(df)
        df = df.T

        if only_context_fitness_genes:
            # Per column: number of rows scoring below the -0.5 threshold.
            hit_counts = (df < -0.5).sum()
            selective = (hit_counts < df.shape[0] * 0.5) & (hit_counts >= 10)
            df = df[hit_counts.loc[selective].index]

        return df