[305123]: / PhenPred / DataImporter.py

Download this file

83 lines (58 with data), 2.4 kB

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#!/usr/bin/env python
# Copyright (C) 2022 Emanuel Goncalves
import pandas as pd
from PhenPred.Utils import scale
from sklearn.preprocessing import StandardScaler
class DataImporter:
    """Load the project's input tables from CSV files under one directory.

    Every reader parses ``<data_dir>/<name>.csv`` with the first CSV column
    as the index. Most readers return the table transposed, so the labels
    found in the CSV header become the row index of the returned frame.
    """

    # Lower-cased dataset name accepted by ``read_dataset`` -> reader method
    # name. Method names (not functions) are stored so that subclass
    # overrides of a reader are honoured by the dispatch.
    _READERS = {
        "metabolomics": "read_metabolomics",
        "drug_response": "read_drug_response",
        "proteomics": "read_proteomics",
        "tissue": "read_tissue_types",
        "methylation": "read_methylation",
        "transcriptomics": "read_transcriptomics",
        "genomics": "read_genomics",
        "essentiality": "read_essentiality",
    }

    def __init__(self, data_dir):
        """Remember the directory that holds the CSV files.

        Args:
            data_dir: Directory path; it is interpolated into
                ``f"{data_dir}/<file>.csv"`` when a reader is called.
        """
        self.data_dir = data_dir

    def _read_table(self, filename):
        """Read ``<data_dir>/<filename>`` using its first column as index."""
        return pd.read_csv(f"{self.data_dir}/{filename}", index_col=0)

    @staticmethod
    def _is_gdsc2(label):
        """Return True when the third ';'-separated field of *label* is GDSC2.

        Labels with fewer than three fields (or non-string labels) cannot be
        identified as GDSC2, so they are reported as False instead of raising.
        """
        fields = str(label).split(";")
        return len(fields) > 2 and fields[2] == "GDSC2"

    def read_dataset(self, dataset_name: str) -> pd.DataFrame:
        """Load any supported dataset by its (case-insensitive) name.

        Args:
            dataset_name: One of ``metabolomics``, ``drug_response``,
                ``proteomics``, ``tissue``, ``methylation``,
                ``transcriptomics``, ``genomics`` or ``essentiality``.

        Returns:
            The frame produced by the matching ``read_*`` method called with
            its default arguments.

        Raises:
            ValueError: If ``dataset_name`` is not a supported dataset.
        """
        reader_name = self._READERS.get(dataset_name.lower())
        if reader_name is None:
            supported = ", ".join(sorted(self._READERS))
            raise ValueError(
                f"Dataset {dataset_name!r} is not supported; "
                f"expected one of: {supported}."
            )
        return getattr(self, reader_name)()

    def read_proteomics(self) -> pd.DataFrame:
        """Return ``proteomics.csv`` transposed."""
        return self._read_table("proteomics.csv").T

    def read_metabolomics(self) -> pd.DataFrame:
        """Return ``metabolomics.csv`` transposed and standardised.

        The transposed table is passed through ``StandardScaler`` and the
        result is re-wrapped with the original index and column labels.
        """
        df = self._read_table("metabolomics.csv").T
        standardised = StandardScaler().fit_transform(df)
        return pd.DataFrame(standardised, index=df.index, columns=df.columns)

    def read_drug_response(self, only_gdsc2=True) -> pd.DataFrame:
        """Return ``drugresponse.csv`` transposed.

        Args:
            only_gdsc2: When true, keep only the columns whose label has
                ``GDSC2`` as its third ';'-separated field. Labels that do
                not have three fields are dropped rather than raising.
        """
        df = self._read_table("drugresponse.csv").T
        if only_gdsc2:
            df = df[[label for label in df if self._is_gdsc2(label)]]
        return df

    def read_tissue_types(self) -> pd.DataFrame:
        """Return the ``tissue`` column of ``samplesheet.csv`` as indicator
        (dummy) columns, one per distinct tissue value."""
        return pd.get_dummies(self._read_table("samplesheet.csv")["tissue"])

    def read_methylation(self) -> pd.DataFrame:
        """Return ``methylation.csv`` transposed."""
        return self._read_table("methylation.csv").T

    def read_transcriptomics(self) -> pd.DataFrame:
        """Return ``transcriptomics.csv`` transposed."""
        return self._read_table("transcriptomics.csv").T

    def read_genomics(self) -> pd.DataFrame:
        """Return ``genomics.csv`` transposed."""
        return self._read_table("genomics.csv").T

    def read_essentiality(self, only_context_fitness_genes=True) -> pd.DataFrame:
        """Return ``crisprcas9.csv`` scaled and transposed.

        CSV columns containing any missing value are dropped before the
        table is passed to ``PhenPred.Utils.scale`` and transposed.

        Args:
            only_context_fitness_genes: When true, keep only the columns of
                the transposed table (presumably genes - confirm against the
                CSV layout) whose value is below -0.5 in at least 10 rows
                but in fewer than half of all rows.
        """
        df = self._read_table("crisprcas9.csv").dropna(axis=1)
        # NOTE(review): `scale` is a project helper whose exact
        # normalisation is not visible here; the -0.5 cut-off below is
        # applied to its output.
        df = scale(df)
        df = df.T
        if only_context_fitness_genes:
            # Per column: number of rows falling below the -0.5 threshold.
            hits = (df < -0.5).sum()
            hits = hits.loc[(hits < df.shape[0] * 0.5) & (hits >= 10)]
            df = df[hits.index]
        return df