Diff of /data_integration.py [000000] .. [2d53aa]

Switch to unified view

a b/data_integration.py
1
# DNA methylation
2
3
import numpy as np
4
import pandas as pd
5
6
# Select both samples
# Sample IDs are read one per line as fixed-width (32-character) unicode strings.
# NOTE(review): "both" presumably means samples that have both methylation and
# expression data -- confirm against whatever produced this file.
both_ids = np.loadtxt('data/PANCAN/GDC-PANCAN_both_samples.tsv', delimiter='\t', dtype='U32')
# The probe-ID column has to be requested as well so it can become the index.
both_ids_index = np.insert(both_ids, 0, 'Composite Element REF')

file_path = 'data/PANCAN/GDC-PANCAN_methylation450_preprocessed.tsv'
# DNA methylation: 392761 rows × 8764 columns
input_df = pd.read_csv(
    file_path,
    sep='\t',
    header=0,
    index_col=0,
    usecols=both_ids_index,
)
# read_csv keeps the file's column order regardless of usecols order, so the
# columns are re-selected explicitly to follow the order of both_ids.
input_df = input_df[both_ids]
13
14
# Select specific chr
# Probe IDs present in the sample-filtered matrix; intersected with each
# chromosome's probe set further down.
all_index_set = set(input_df.index)

# Probe annotation table indexed by its first column; its 'chrom' column gives
# the chromosome label of each probe.
mapping = pd.read_csv('data/illuminaMethyl450_hg38_GDC', sep='\t', header=0, index_col=0)

chrs = mapping['chrom'].unique()
# NOTE(review): this drops whatever label sits at position 17 of the
# first-appearance order returned by unique(). Which label that is depends on
# the row order of the mapping file -- confirm it is the intended one (an
# explicit label-based filter would be more robust).
chrs = np.delete(chrs, 17)

# Store the number of probes for each chromosome
# The zero-filled columns are sized from the actual number of chromosome labels
# instead of a hard-coded 24, so building the table does not fail when the
# mapping file yields a different number of labels. Counts stay float64, as
# before, so the written TSV keeps the same number format.
n_chrs = len(chrs)
chrs_number_dict = {
    'chrs': list(chrs),
    'in_mapping': list(np.zeros(n_chrs)),
    'in_data': list(np.zeros(n_chrs)),
}
chrs_number_df = pd.DataFrame(chrs_number_dict)
chrs_number_df.set_index(['chrs'], inplace=True)
26
27
# For every chromosome: record how many probes the mapping assigns to it, how
# many of those exist in the data matrix, and write that chromosome's rows of
# the matrix to its own TSV file.
for chrom in chrs:
    # Probes annotated to this chromosome in the mapping table.
    chr_index_set = set(mapping[mapping['chrom'] == chrom].index)
    chrs_number_df.loc[chrom, 'in_mapping'] = len(chr_index_set)
    # Of those, the probes that are actually present in the data matrix.
    chr_index_exi_set = all_index_set & chr_index_set
    chrs_number_df.loc[chrom, 'in_data'] = len(chr_index_exi_set)

    # Select rows with a boolean mask instead of converting the set to an
    # array of labels: iteration order of a set of strings changes between
    # interpreter runs (hash randomization), which made the row order of the
    # written files non-reproducible. The mask keeps input_df's own row order
    # and is also well-defined when no probe matches.
    chr_row_mask = input_df.index.isin(chr_index_exi_set)
    chr_df = input_df.loc[chr_row_mask]

    output_path = 'data/PANCAN/GDC-PANCAN_methylation450_preprocessed_both_' + chrom + '.tsv'
    chr_df.to_csv(output_path, sep='\t')
37
38
# Write the per-chromosome probe counts first, then the sample-filtered
# methylation matrix, both as tab-separated files.
for frame, destination in (
    (chrs_number_df, 'data/PANCAN/GDC-PANCAN_methylation450_preprocessed_chr_number.tsv'),
    (input_df, 'data/PANCAN/GDC-PANCAN_methylation450_preprocessed_both.tsv'),
):
    frame.to_csv(destination, sep='\t')
40
41
42
# Combine methy and expr data to a single file

# Common prefix of every input/output file used in this stage.
input_path = 'data/PANCAN/GDC-PANCAN_'

# Sample IDs, one per line; these are the column names of both matrices.
sample_id = np.loadtxt(f'{input_path}both_samples.tsv', delimiter='\t', dtype='str')

# Per-data-type file prefixes; the 'preprocessed_both.tsv' suffix is appended
# where the files are read.
expr_path = f'{input_path}htseq_fpkm_'
methy_path = f'{input_path}methylation450_'

# Set the dtype to f32 for memory saving purpose
# (one entry per sample column, all mapped to float32).
all_cols_f32 = dict.fromkeys(sample_id, np.float32)
54
55
# Both matrices are read with identical parser settings: tab-separated, first
# row as header, first column as index, sample columns as float32.
read_options = dict(sep='\t', header=0, index_col=0, dtype=all_cols_f32)

print('Loading gene expression data...')
expr_df = pd.read_csv(expr_path + 'preprocessed_both.tsv', **read_options)

print('Loading DNA methylation data...')
methy_df = pd.read_csv(methy_path + 'preprocessed_both.tsv', **read_options)

# Stack the two matrices row-wise, methylation rows first and expression rows
# after them, and write the combined table as a single TSV.
out_path = input_path + 'preprocessed_both.tsv'
multi_df = pd.concat([methy_df, expr_df])
multi_df.to_csv(out_path, sep='\t')