--- a +++ b/data_integration.py @@ -0,0 +1,63 @@ +# DNA methylation + +import numpy as np +import pandas as pd + +# Select both samples +both_ids = np.loadtxt('data/PANCAN/GDC-PANCAN_both_samples.tsv', delimiter='\t', dtype='U32') +both_ids_index = np.insert(both_ids, 0, 'Composite Element REF') + +file_path = 'data/PANCAN/GDC-PANCAN_methylation450_preprocessed.tsv' +# DNA methylation: 392761 rows × 8764 columns +input_df = pd.read_csv(file_path, sep='\t', header=0, index_col=0, usecols=both_ids_index)[both_ids] + +# Select specific chr +all_index_set = set(input_df.index) + +mapping = pd.read_csv('data/illuminaMethyl450_hg38_GDC', sep='\t', header=0, index_col=0) + +chrs = mapping['chrom'].unique() +chrs = np.delete(chrs, 17) + +# Store the number of probes for each chromosome +chrs_number_dict = {'chrs':list(chrs), 'in_mapping':list(np.zeros(24)), 'in_data':list(np.zeros(24))} +chrs_number_df = pd.DataFrame(chrs_number_dict) +chrs_number_df.set_index(['chrs'], inplace=True) + +for chrom in chrs: + chr_index_set = set(mapping[mapping['chrom'] == chrom].index) + chrs_number_df.loc[chrom, 'in_mapping'] = len(chr_index_set) + chr_index_exi_set = all_index_set & chr_index_set + chrs_number_df.loc[chrom, 'in_data'] = len(chr_index_exi_set) + chr_index_exi_array = np.array(list(chr_index_exi_set)) + + chr_df = input_df.loc[chr_index_exi_array] + output_path = 'data/PANCAN/GDC-PANCAN_methylation450_preprocessed_both_' + chrom + '.tsv' + chr_df.to_csv(output_path, sep='\t') + +chrs_number_df.to_csv('data/PANCAN/GDC-PANCAN_methylation450_preprocessed_chr_number.tsv', sep='\t') +input_df.to_csv('data/PANCAN/GDC-PANCAN_methylation450_preprocessed_both.tsv', sep='\t') + + +# Combine methy and expr data to a single file + +# P +input_path = 'data/PANCAN/GDC-PANCAN_' + +sample_id = np.loadtxt(input_path + 'both_samples.tsv', delimiter='\t', dtype='str') + +expr_path = input_path + 'htseq_fpkm_' +methy_path = input_path + 'methylation450_' + +# Set the dtype to f32 for memory saving purpose +all_cols_f32 = {col: np.float32 for col in sample_id} + +print('Loading gene expression data...') +expr_df = pd.read_csv(expr_path + 'preprocessed_both.tsv', sep='\t', header=0, index_col=0, dtype=all_cols_f32) + +print('Loading DNA methylation data...') +methy_df = pd.read_csv(methy_path + 'preprocessed_both.tsv', sep='\t', header=0, index_col=0, dtype=all_cols_f32) + +multi_df = pd.concat([methy_df, expr_df]) +out_path = input_path + 'preprocessed_both.tsv' +multi_df.to_csv(out_path, sep='\t')