|
a |
|
b/data_integration.py |
|
|
1 |
# DNA methylation |
|
|
2 |
|
|
|
3 |
import numpy as np |
|
|
4 |
import pandas as pd |
|
|
5 |
|
|
|
6 |
# Select both samples
# Restrict the methylation matrix to the sample IDs listed in the
# "both samples" file. 'Composite Element REF' is prepended so that the
# probe-ID column is also read and can serve as the row index.
both_ids = np.loadtxt('data/PANCAN/GDC-PANCAN_both_samples.tsv', delimiter='\t', dtype='U32')
both_ids_index = np.insert(both_ids, 0, 'Composite Element REF')

file_path = 'data/PANCAN/GDC-PANCAN_methylation450_preprocessed.tsv'
# DNA methylation: 392761 rows × 8764 columns
# Indexing with [both_ids] puts the columns in the order given by the sample file.
input_df = pd.read_csv(file_path, sep='\t', header=0, index_col=0, usecols=both_ids_index)[both_ids]

# Select specific chr
all_index_set = set(input_df.index)

mapping = pd.read_csv('data/illuminaMethyl450_hg38_GDC', sep='\t', header=0, index_col=0)

chrs = mapping['chrom'].unique()
# NOTE(review): this removes the 18th distinct 'chrom' value by position, which
# depends on the row order of the mapping file. Presumably it is the placeholder
# for probes without a chromosome -- confirm, and prefer filtering by value.
chrs = np.delete(chrs, 17)

# Store the number of probes for each chromosome
# Size the counters from `chrs` itself so the table stays consistent if the
# mapping file ever yields a different number of chromosomes.
chrs_number_dict = {
    'chrs': list(chrs),
    'in_mapping': list(np.zeros(len(chrs))),
    'in_data': list(np.zeros(len(chrs))),
}
chrs_number_df = pd.DataFrame(chrs_number_dict)
chrs_number_df.set_index(['chrs'], inplace=True)

for chrom in chrs:
    # Probe IDs that the mapping file assigns to this chromosome.
    chr_index_set = set(mapping[mapping['chrom'] == chrom].index)
    chrs_number_df.loc[chrom, 'in_mapping'] = len(chr_index_set)
    # Of those, the probe IDs that are actually present in the data.
    chr_index_exi_set = all_index_set & chr_index_set
    chrs_number_df.loc[chrom, 'in_data'] = len(chr_index_exi_set)

    # Select rows with a boolean mask instead of iterating the set: set
    # iteration order for strings is not stable across interpreter runs, so
    # the mask keeps the rows in input_df order and makes the output
    # reproducible.
    chr_row_mask = input_df.index.isin(chr_index_exi_set)
    chr_df = input_df.loc[chr_row_mask]
    output_path = 'data/PANCAN/GDC-PANCAN_methylation450_preprocessed_both_' + chrom + '.tsv'
    chr_df.to_csv(output_path, sep='\t')

# Per-chromosome probe counts, then the full sample-filtered matrix.
chrs_number_df.to_csv('data/PANCAN/GDC-PANCAN_methylation450_preprocessed_chr_number.tsv', sep='\t')
input_df.to_csv('data/PANCAN/GDC-PANCAN_methylation450_preprocessed_both.tsv', sep='\t')
|
|
40 |
|
|
|
41 |
|
|
|
42 |
# Combine methy and expr data to a single file

# Path prefixes shared by every input and output of this step
input_path = 'data/PANCAN/GDC-PANCAN_'
expr_path = input_path + 'htseq_fpkm_'
methy_path = input_path + 'methylation450_'
out_path = input_path + 'preprocessed_both.tsv'

sample_id = np.loadtxt(input_path + 'both_samples.tsv', delimiter='\t', dtype='str')

# Set the dtype to f32 for memory saving purpose
all_cols_f32 = dict.fromkeys(sample_id, np.float32)

# Both matrices are parsed with the same options: tab-separated, first row as
# header, first column as row index, sample columns read as float32.
read_options = dict(sep='\t', header=0, index_col=0, dtype=all_cols_f32)

print('Loading gene expression data...')
expr_df = pd.read_csv(expr_path + 'preprocessed_both.tsv', **read_options)

print('Loading DNA methylation data...')
methy_df = pd.read_csv(methy_path + 'preprocessed_both.tsv', **read_options)

# Stack row-wise: methylation rows first, expression rows after.
multi_df = pd.concat([methy_df, expr_df])
multi_df.to_csv(out_path, sep='\t')