a b/util/preprocess.py
1
"""
2
Contain some omics data preprocess functions
3
"""
4
import pandas as pd
5
6
7
def separate_B(B_df_single, anno_path='./anno/B_anno.csv'):
    """
    Separate the DNA methylation dataframe into subsets according to their targeting chromosomes

    Parameters:
        B_df_single(DataFrame) -- a dataframe that contains the single DNA methylation matrix;
            its index labels must all be present in the index of the annotation file
        anno_path(str) -- path of the annotation CSV; its first column is used as the index
            and it must provide a 'CHR' column (read as strings).
            Defaults to './anno/B_anno.csv', which is resolved against the current
            working directory

    Return:
        B_df_list(list) -- a list with 23 subset dataframes, ordered as chromosomes
            '1'..'22' followed by 'X'
        B_dim_list(list) -- the number of rows of each subset, in the same order

    Raises:
        KeyError -- if a row label of B_df_single is missing from the annotation index
    """
    anno = pd.read_csv(anno_path, dtype={'CHR': str}, index_col=0)
    # Only the chromosome label of each input row is needed. Label-based lookup
    # keeps the row order of B_df_single and fails loudly on unannotated rows.
    chr_of_row = anno.loc[B_df_single.index, 'CHR']
    print('Separating B.tsv according the targeting chromosome...')

    # CHR is read as str, so compare against string labels built once up front.
    chromosomes = [str(number) for number in range(1, 23)] + ['X']

    B_df_list = []
    for chromosome in chromosomes:
        row_labels = chr_of_row.index[chr_of_row == chromosome]
        B_df_list.append(B_df_single.loc[row_labels, :])
    B_dim_list = [len(subset) for subset in B_df_list]

    return B_df_list, B_dim_list