[793d90]: / lib / gcforest / data_load.py

Download this file

58 lines (43 with data), 1.8 kB

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import os
def obesity_data():
    """Load the obesity abundance table and its integer-encoded labels.

    The tab-separated file is read without a header row, using its first
    column as the index, and is then transposed so that the entries of that
    first column become the column labels. The file path is relative, so it
    is resolved against the current working directory.

    Returns:
        A ``(features, labels)`` tuple. ``features`` is a DataFrame indexed
        by ``sampleID`` that holds only the columns whose label contains
        ``'k__'``, cast to float. ``labels`` is the ``disease`` column
        encoded as integers by ``LabelEncoder``.
    """
    table_path = '../lib/gcforest/data/obesity/abundance_obesity.txt'
    marker = 'k__'

    table = pd.read_csv(table_path, sep='\t', header=None, index_col=0).T

    # Keep only the rows that have at least one entry different from 0.
    has_nonzero = (table != 0).any(axis=1)
    table = table.loc[has_nonzero].set_index('sampleID')

    encoder = LabelEncoder()
    labels = encoder.fit_transform(table['disease'].values)

    # The marker may hold several ':'-separated tokens; a column is kept
    # when its label contains at least one of them.
    tokens = marker.split(':')
    selected = [col for col in table.columns if any(tok in col for tok in tokens)]

    features = table.loc[:, selected].astype('float')
    return features, labels
def cirrhosis_data():
    """Load the cirrhosis strain abundance table and its encoded labels.

    The tab-separated file is read without a header row, using its first
    column as the index, and is then transposed so that the entries of that
    first column become the column labels. The file path is relative, so it
    is resolved against the current working directory.

    Returns:
        A ``(features, labels)`` tuple. ``features`` is a DataFrame indexed
        by ``sampleID`` that holds only the columns whose label contains
        ``'k__'``, cast to float. ``labels`` is the ``disease`` column
        encoded as integers by ``LabelEncoder``.
    """
    abundance = '../lib/gcforest/data/cirrhosis/abundance_cirrhosis_strain.txt'
    f = pd.read_csv(abundance, sep='\t', header=None, index_col=0)
    f = f.T
    # Keep only the rows that have at least one entry different from 0.
    f = f.loc[(f != 0).any(axis=1)]
    f = f.set_index('sampleID')

    disease = f['disease'].values
    integer_encoded = LabelEncoder().fit_transform(disease)

    # The identifier may hold several ':'-separated tokens; a column is kept
    # when its label contains at least one of them.
    feature_identifier = 'k__'
    tokens = feature_identifier.split(':')
    feat = [s for s in f.columns if any(tok in s for tok in tokens)]

    f = f.loc[:, feat].astype('float')
    return f, integer_encoded
def t2d_data():
    """Load the type 2 diabetes abundance table and its encoded labels.

    The tab-separated file is read without a header row, using its first
    column as the index, and is then transposed so that the entries of that
    first column become the column labels. The file path is relative, so it
    is resolved against the current working directory.

    Returns:
        A ``(features, labels)`` tuple. ``features`` is a DataFrame indexed
        by ``sampleID`` that holds only the columns whose label contains
        ``'k__'``, cast to float. ``labels`` is the ``disease`` column
        encoded as integers by ``LabelEncoder``.
    """
    abundance = '../lib/gcforest/data/t2d/abundance_t2d_long-t2d_short.txt'
    f = pd.read_csv(abundance, sep='\t', header=None, index_col=0)
    f = f.T
    # Keep only the rows that have at least one entry different from 0.
    f = f.loc[(f != 0).any(axis=1)]
    f = f.set_index('sampleID')

    # Hand the encoder a plain array, as the other loaders in this file do.
    disease = f['disease'].values
    integer_encoded = LabelEncoder().fit_transform(disease)

    # The identifier may hold several ':'-separated tokens; a column is kept
    # when its label contains at least one of them.
    feature_identifier = 'k__'
    tokens = feature_identifier.split(':')
    feat = [s for s in f.columns if any(tok in s for tok in tokens)]

    f = f.loc[:, feat].astype('float')
    return f, integer_encoded