b/preprocessing_scr/mapper.py
from __future__ import print_function

import sys
import os

import pandas as pd
import numpy as np

# from scipy.stats import zscore


def expand(df, column, sep="|"):
    '''Expand rows containing the separator.'''
    ndx = 0
    expanded_df = {}
    for row in df.iterrows():
        if isinstance(row[1][column], str):
            # split the cell and emit one row per value
            for x in row[1][column].split(sep):
                row_ = row[1].copy()
                row_[column] = x
                expanded_df[ndx] = row_
                ndx += 1
        else:
            expanded_df[ndx] = row[1]
            ndx += 1
    return pd.DataFrame.from_dict(expanded_df).T
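

# A minimal usage sketch for expand(), assuming a toy table with an NCBI-style
# "Synonyms" column (the helper and the toy values are illustrative, not part
# of the original pipeline):
def _example_expand():
    toy = pd.DataFrame({"Symbol": ["TP53", "BRCA1"],
                        "Synonyms": ["P53|LFS1", "RNF53"]})
    # the TP53 row is split into two rows (Synonyms="P53" and "LFS1"),
    # while the BRCA1 row is kept as-is -> 3 rows in total
    return expand(toy, "Synonyms")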


def parse_mapping_table(dataframe, query_id, target_id):
    '''Takes a dataframe with gene ID mappings and the column names of the query
    and target IDs.
    Returns a dictionary with one-to-one, one-to-many and many-to-one mappings,
    and a list of unmapped identifiers (one-to-none).
    Tested with Homo_sapiens.gene_info from NCBI.'''

    mapper = {"one-to-one": {}, "one-to-many": {}, "many-to-one": {}, "one-to-none": []}

    df = dataframe.loc[:, [query_id, target_id]]
    # exclude rows of all NAs
    df_size = df.shape[0]
    df.dropna(how="all", inplace=True)
    if df_size - df.shape[0] > 0:
        print(df_size - df.shape[0], "rows with both", query_id, "and", target_id, "empty", file=sys.stderr)
    else:
        print("Ok: no empty rows detected", file=sys.stderr)
    # duplicated pairs
    df_size = df.shape[0]
    df.drop_duplicates(inplace=True)
    if df_size - df.shape[0] > 0:
        print(df_size - df.shape[0], "duplicated pairs dropped", file=sys.stderr)
    else:
        print("Ok: no duplicated pairs detected", file=sys.stderr)
    # exclude NA query IDs
    df_size = df.shape[0]
    df.dropna(subset=[query_id], axis=0, inplace=True)
    if df_size - df.shape[0] > 0:
        print(df_size - df.shape[0], "rows with empty", query_id, "were excluded", file=sys.stderr)
    else:
        print("Ok: All", query_id, "rows are not empty.", file=sys.stderr)

    # recognized query ids mapped to no target ids
    found_not_mapped = list(set(df.loc[df[target_id].isnull(), query_id].values))
    if len(found_not_mapped) > 0:
        df = df.loc[~df[query_id].isin(found_not_mapped), :].copy()
        print(len(found_not_mapped), query_id, "ids mapped to no", target_id, file=sys.stderr)
    else:
        print("Ok: All", query_id, "are mapped to", target_id, file=sys.stderr)

    # uniqueness of query ids; one-to-many is not acceptable
    query_dups = list(set(df.loc[df.duplicated(subset=[query_id], keep=False), :][query_id].values))
    if len(query_dups) > 0:
        print(len(query_dups), query_id, "mapped to multiple", target_id, file=sys.stderr)
        df_one_to_many = df.loc[df[query_id].isin(query_dups), :].copy()
        df = df.loc[~df[query_id].isin(query_dups), :].copy()
        df_one_to_many = df_one_to_many.groupby(query_id).agg({target_id: list})
        mapper["one-to-many"] = df_one_to_many.to_dict()[target_id]
    else:
        print("Ok: All", query_id, "are unique", file=sys.stderr)

    # uniqueness of target ids; many-to-one is ok for synonyms, but not for the primary id
    query_ambiguous = list(set(df.loc[df.duplicated(subset=[target_id], keep=False), :][query_id].values))
    if len(query_ambiguous) > 0:
        print(len(query_ambiguous), "different", query_id,
              "mapped to the same", target_id, file=sys.stderr)
        df_many_to_one = df.loc[df[query_id].isin(query_ambiguous), :].copy()
        df = df.loc[~df[query_id].isin(query_ambiguous), :].copy()
        df_many_to_one.set_index(query_id, inplace=True, drop=True)
        mapper["many-to-one"] = df_many_to_one.to_dict()[target_id]
    else:
        print("Ok: All", target_id, "are unique", file=sys.stderr)

    if len(query_dups) == 0 and len(query_ambiguous) == 0:
        print("Ok: one-to-one mapping between", query_id, "and", target_id, file=sys.stderr)
        print(df.shape[0], query_id, "can be mapped directly to", target_id, file=sys.stderr)

    # one-to-one
    df.set_index(query_id, inplace=True, drop=True)
    mapper["one-to-one"] = df.to_dict()[target_id]

    # query_id without target_id
    mapper["one-to-none"] = found_not_mapped
    return mapper
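

# A minimal usage sketch for parse_mapping_table(), with a hypothetical
# two-column mapping table (query "Symbol" -> target "GeneID"); the toy values
# below are assumptions, not from the original data:
def _example_parse_mapping_table():
    toy = pd.DataFrame({"Symbol": ["A", "B", "B", "C", "D", "E"],
                        "GeneID": [1, 2, 3, 4, 4, np.nan]})
    # "A" is one-to-one, "B" is one-to-many (2 and 3), "C"/"D" are many-to-one
    # (both map to 4), and "E" lands in one-to-none because its GeneID is empty;
    # note the NaN makes pandas store GeneID as floats
    return parse_mapping_table(toy, "Symbol", "GeneID")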


def apply_mappers(df, main_mapper, alt_mapper, verbose=True, handle_duplicates="keep"):
    '''Converts IDs in DF indices.

    handle_duplicates - how to deal with duplicated IDs in the resulting DF:
        sum - group by index and sum
        average - group by index and keep the average
        drop - drop duplicates
        keep - do nothing.'''
    ID_list = list(df.index.values)

    # main mapper, e.g. NCBI symbol -> Entrez Gene ID
    symbols_mapped_directly = {}
    recognized_not_mapped = []  # found in the mapping table but mapped to no target ID
    symbol_one2many = []  # not mapped because of ambiguity
    symbol_many2one = []  # not mapped because of ambiguity
    # alternative mapper,
    # applied in case the main mapper failed: e.g. NCBI synonym -> NCBI symbol -> Entrez Gene ID
    via_alt_symbol = {}
    via_nonuniq_alt_symbol = {}
    alt_symbol_one2many = []
    synonym_match_current_symbol = []  # these synonyms are not used in mapping because they match an ID in the main mapper
    not_found_at_all = []
    loc = {}
    loc_not_found = []
    # store all valid target IDs
    valid_target_ids = (list(main_mapper["one-to-one"].values()) + list(main_mapper["many-to-one"].values())
                        + list(alt_mapper["one-to-one"].values()) + list(alt_mapper["many-to-one"].values()))
    for l in list(main_mapper["one-to-many"].values()) + list(alt_mapper["one-to-many"].values()):
        valid_target_ids += l

    for symbol in ID_list:
        if symbol in main_mapper["one-to-one"]:
            symbols_mapped_directly[symbol] = main_mapper["one-to-one"][symbol]
        elif symbol in main_mapper["one-to-none"]:
            recognized_not_mapped.append(symbol)
        elif symbol in main_mapper["one-to-many"]:
            symbol_one2many.append(symbol)
        elif symbol in main_mapper["many-to-one"]:
            symbol_many2one.append(symbol)
        # alternative mapper
        elif symbol in alt_mapper["one-to-one"]:
            via_alt_symbol[symbol] = alt_mapper["one-to-one"][symbol]
        elif symbol in alt_mapper["one-to-many"]:
            alt_symbol_one2many.append(symbol)
        elif symbol in alt_mapper["many-to-one"]:  # it is Ok if many synonyms match
            via_nonuniq_alt_symbol[symbol] = alt_mapper["many-to-one"][symbol]
        elif symbol.startswith("LOC"):
            LOC_id = int(symbol[3:])
            if LOC_id in valid_target_ids:
                loc[symbol] = LOC_id
            else:
                loc_not_found.append(symbol)
        else:
            not_found_at_all.append(symbol)

    query2target = {}
    for mapped in [symbols_mapped_directly, via_alt_symbol, via_nonuniq_alt_symbol, loc]:
        query2target.update(mapped)
    not_mapped = (recognized_not_mapped + symbol_one2many + alt_symbol_one2many
                  + loc_not_found + not_found_at_all + symbol_many2one)

    if verbose:
        print("Mapped:", len(query2target.keys()),
              "\n\tdirectly via main_mapper", len(symbols_mapped_directly.keys()),
              "\n\tvia alternative mapper", len(via_alt_symbol.keys()),
              "\n\tvia one of multiple synonyms in alternative mapper", len(via_nonuniq_alt_symbol.keys()),
              "\n\tLOC", len(loc.keys()),
              "\nUnmapped:", len(not_mapped),
              "\n\trecognized symbols without Entrez ID", len(recognized_not_mapped),
              "\n\tmultiple query_ids map to the same target_id", len(symbol_many2one),
              "\n\tquery_ids map to multiple target_ids in the main mapper", len(symbol_one2many),
              "\n\tquery_ids map to multiple target_ids in the alternative mapper", len(alt_symbol_one2many),
              "\n\tLOC not found in Entrez", len(loc_not_found),
              "\n\tnot found at all:", len(not_found_at_all))

    # find query IDs whose target IDs are duplicated
    mapped_symbols = pd.Series(query2target)
    dups = mapped_symbols[mapped_symbols.duplicated(keep=False)].index.values
    if len(dups) > 0:
        print("Warning: query IDs mapping to duplicated target IDs in the mapping table:", len(dups))
        # if verbose:
        #     print("IDs mapped to multiple target IDs:\n", dups, file=sys.stderr)

    # exclude unmapped query IDs and map the rest
    df_size_dif = df.shape[0]
    df = df.loc[~df.index.isin(not_mapped), :].copy()
    df_size_dif = df_size_dif - df.shape[0]
    if df_size_dif > 0:
        print("Warning: query IDs not mapped to any target IDs excluded:", df_size_dif)
    df.rename(mapper=query2target, axis='index', inplace=True)

    # handle duplicated genes (rows sharing the same Entrez ID after mapping)
    if handle_duplicates == "keep":
        if verbose:
            dups = df.groupby(df.index).size()
            dups = list(set(dups[dups > 1].index.values))
            print("IDs mapped to multiple target IDs are kept:\n", dups, file=sys.stderr)
    elif handle_duplicates == "sum":
        df = df.groupby(df.index).sum()
    elif handle_duplicates == "average":
        df = df.groupby(df.index).mean()
    elif handle_duplicates == "drop":
        df = df.loc[~df.index.duplicated(keep=False), :].copy()
    else:
        print("'handle_duplicates' must be keep, sum, average or drop.", file=sys.stderr)
        return None
    df.sort_index(inplace=True)
    return (df, query2target, not_mapped)
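

# A minimal end-to-end sketch chaining both functions on toy data (all names
# and values here are hypothetical; the original was tested with
# Homo_sapiens.gene_info from NCBI):
def _example_apply_mappers():
    expr = pd.DataFrame({"sample1": [1.0, 2.0, 3.0]},
                        index=["A", "B", "Z"])
    main = parse_mapping_table(
        pd.DataFrame({"Symbol": ["A", "B"], "GeneID": [1, 2]}),
        "Symbol", "GeneID")
    alt = parse_mapping_table(
        pd.DataFrame({"Synonym": ["Z"], "GeneID": [3]}),
        "Synonym", "GeneID")
    # "A" and "B" map via the main mapper, "Z" via the alternative mapper;
    # the returned frame is re-indexed by target ID: [1, 2, 3]
    mapped, query2target, not_mapped = apply_mappers(expr, main, alt)
    return mapped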