OmiVAE / Git / Diff of /data

Models:
AlyssaS/
OmiVAE
Downloads: 1
Diff of /data_integration.py [000000] .. [2d53aa]
Switch to side-by-side view

--- a
+++ b/data_integration.py
@@ -0,0 +1,63 @@
+# DNA methylation
+
+import numpy as np
+import pandas as pd
+
+# Select both samples
+both_ids = np.loadtxt('data/PANCAN/GDC-PANCAN_both_samples.tsv', delimiter='\t', dtype='U32')
+both_ids_index = np.insert(both_ids, 0, 'Composite Element REF')
+
+file_path = 'data/PANCAN/GDC-PANCAN_methylation450_preprocessed.tsv'
+# DNA methylation: 392761 rows × 8764 columns
+input_df = pd.read_csv(file_path, sep='\t', header=0, index_col=0, usecols=both_ids_index)[both_ids]
+
+# Select specific chr
+all_index_set = set(input_df.index)
+
+mapping = pd.read_csv('data/illuminaMethyl450_hg38_GDC', sep='\t', header=0, index_col=0)
+
+chrs = mapping['chrom'].unique()
+chrs = np.delete(chrs, 17)
+
+# Store the number of probes for each chromosome
+chrs_number_dict = {'chrs':list(chrs), 'in_mapping':list(np.zeros(24)), 'in_data':list(np.zeros(24))}
+chrs_number_df = pd.DataFrame(chrs_number_dict)
+chrs_number_df.set_index(['chrs'], inplace=True)
+
+for chrom in chrs:
+    chr_index_set = set(mapping[mapping['chrom'] == chrom].index)
+    chrs_number_df.loc[chrom, 'in_mapping'] = len(chr_index_set)
+    chr_index_exi_set = all_index_set & chr_index_set
+    chrs_number_df.loc[chrom, 'in_data'] = len(chr_index_exi_set)
+    chr_index_exi_array = np.array(list(chr_index_exi_set))
+
+    chr_df = input_df.loc[chr_index_exi_array]
+    output_path = 'data/PANCAN/GDC-PANCAN_methylation450_preprocessed_both_' + chrom + '.tsv'
+    chr_df.to_csv(output_path, sep='\t')
+
+chrs_number_df.to_csv('data/PANCAN/GDC-PANCAN_methylation450_preprocessed_chr_number.tsv', sep='\t')
+input_df.to_csv('data/PANCAN/GDC-PANCAN_methylation450_preprocessed_both.tsv', sep='\t')
+
+
+# Combine methy and expr data to a single file
+
+# P
+input_path = 'data/PANCAN/GDC-PANCAN_'
+
+sample_id = np.loadtxt(input_path + 'both_samples.tsv', delimiter='\t', dtype='str')
+
+expr_path = input_path + 'htseq_fpkm_'
+methy_path = input_path + 'methylation450_'
+
+# Set the dtype to f32 for memory saving purpose
+all_cols_f32 = {col: np.float32 for col in sample_id}
+
+print('Loading gene expression data...')
+expr_df = pd.read_csv(expr_path + 'preprocessed_both.tsv', sep='\t', header=0, index_col=0, dtype=all_cols_f32)
+
+print('Loading DNA methylation data...')
+methy_df = pd.read_csv(methy_path + 'preprocessed_both.tsv', sep='\t', header=0, index_col=0, dtype=all_cols_f32)
+
+multi_df = pd.concat([methy_df, expr_df])
+out_path = input_path + 'preprocessed_both.tsv'
+multi_df.to_csv(out_path, sep='\t')