|
a |
|
b/util/preprocess.py |
|
|
1 |
""" |
|
|
2 |
Contains some omics data preprocessing functions |
|
|
3 |
""" |
|
|
4 |
import pandas as pd |
|
|
5 |
|
|
|
6 |
|
|
|
7 |
def separate_B(B_df_single, anno_path='./anno/B_anno.csv'):
    """
    Separate the DNA methylation dataframe into subsets according to their targeting chromosomes

    Parameters:
        B_df_single(DataFrame) -- a dataframe that contains the single DNA methylation matrix;
                                  its index labels are looked up in the annotation file
        anno_path(str)         -- path of the annotation CSV file; its first column is used as the
                                  index and it must provide a 'CHR' column
                                  (default: './anno/B_anno.csv', relative to the working directory)

    Return:
        B_df_list(list)  -- a list with 23 subset dataframes, ordered chromosome 1 to 22 followed by X
        B_dim_list(list) -- the number of rows of each subset, in the same order as B_df_list
    """
    # Read CHR as str so the numeric chromosome ids and 'X' can be compared uniformly
    anno = pd.read_csv(anno_path, dtype={'CHR': str}, index_col=0)
    # Annotation rows aligned with the rows of the input matrix
    # NOTE(review): recent pandas versions raise KeyError here when an input label is
    # missing from the annotation file -- confirm this is the desired behavior
    anno_contain = anno.loc[B_df_single.index, :]
    print('Separating B.tsv according the targeting chromosome...')

    # Chromosomes 1..22 plus X; rows annotated with any other CHR value end up in no subset
    ch_names = [str(ch) for ch in range(1, 23)] + ['X']
    chr_of_row = anno_contain['CHR']

    B_df_list = [
        B_df_single.loc[anno_contain.index[chr_of_row == ch_name], :]
        for ch_name in ch_names
    ]
    B_dim_list = [len(ch_df) for ch_df in B_df_list]

    return B_df_list, B_dim_list