Diff of /PhenPred/DataImporter.py [000000] .. [305123]

Switch to side-by-side view

--- a
+++ b/PhenPred/DataImporter.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python
+# Copyright (C) 2022 Emanuel Goncalves
+
+import pandas as pd
+from PhenPred.Utils import scale
+from sklearn.preprocessing import StandardScaler
+
+
+class DataImporter:
+	def __init__(self, data_dir):
+		self.data_dir = data_dir
+
+	def read_dataset(self, dataset_name):
+		"""
+		Convenience function to input any dataset supported
+		"""
+		if dataset_name.lower() == "metabolomics":
+			return self.read_metabolomics()
+
+		elif dataset_name.lower() == "drug_response":
+			return self.read_drug_response()
+
+		elif dataset_name.lower() == "proteomics":
+			return self.read_proteomics()
+
+		elif dataset_name.lower() == "tissue":
+			return self.read_tissue_types()
+
+		elif dataset_name.lower() == "methylation":
+			return self.read_methylation()
+
+		elif dataset_name.lower() == "transcriptomics":
+			return self.read_transcriptomics()
+
+		elif dataset_name.lower() == "genomics":
+			return self.read_genomics()
+
+		elif dataset_name.lower() == "essentiality":
+			return self.read_essentiality()
+
+		else:
+			raise Exception(f"Dataset {dataset_name}, not suported.")
+
+	def read_proteomics(self):
+		return pd.read_csv(f"{self.data_dir}/proteomics.csv", index_col=0).T
+
+	def read_metabolomics(self):
+		df = pd.read_csv(f"{self.data_dir}/metabolomics.csv", index_col=0).T
+		df = pd.DataFrame(StandardScaler().fit_transform(df), index=df.index, columns=df.columns)
+		return df
+
+	def read_drug_response(self, only_gdsc2=True):
+		df = pd.read_csv(f"{self.data_dir}/drugresponse.csv", index_col=0).T
+
+		if only_gdsc2:
+			df = df[[c for c in df if c.split(";")[2] == "GDSC2"]]
+
+		return df
+
+	def read_tissue_types(self):
+		return pd.get_dummies(pd.read_csv(f"{self.data_dir}/samplesheet.csv", index_col=0)["tissue"])
+
+	def read_methylation(self):
+		return pd.read_csv(f"{self.data_dir}/methylation.csv", index_col=0).T
+
+	def read_transcriptomics(self):
+		return pd.read_csv(f"{self.data_dir}/transcriptomics.csv", index_col=0).T
+
+	def read_genomics(self):
+		return pd.read_csv(f"{self.data_dir}/genomics.csv", index_col=0).T
+
+	def read_essentiality(self, only_context_fitness_genes=True):
+		df = pd.read_csv(f"{self.data_dir}/crisprcas9.csv", index_col=0).dropna(axis=1)
+		df = scale(df)
+		df = df.T
+
+		if only_context_fitness_genes:
+			cf_genes = (df < -0.5).sum()
+			cf_genes = cf_genes.loc[(cf_genes < df.shape[0] * 0.5) & (cf_genes >= 10)]
+			df = df[cf_genes.index]
+
+		return df