# File: preprocessing_scr/mapper.py
1
from __future__ import print_function
2
import pandas as pd 
3
import sys,os
4
import numpy as np 
5
#from scipy.stats import zscore
6
7
8
def expand(df, column, sep="|"):
    '''Expand rows whose value in `column` contains the separator.

    Every row whose `column` value is a string is replaced by one row per
    item of ``value.split(sep)``; all other fields are copied unchanged.
    Rows whose `column` value is not a string (e.g. NaN or a number) are
    kept as they are.

    Parameters:
        df: input pandas DataFrame; it is not modified.
        column: label of the column holding the separated values.
        sep: separator string passed to ``str.split``.

    Returns:
        A new DataFrame with the same columns and a fresh 0..n-1 index.
        The result is assembled from row Series and transposed, so column
        dtypes are not guaranteed to match those of `df`.
    '''
    expanded_rows = {}
    ndx = 0
    for _, row in df.iterrows():
        value = row[column]
        # isinstance (rather than an exact type comparison) also accepts
        # str subclasses.
        if isinstance(value, str):
            for item in value.split(sep):
                new_row = row.copy()
                new_row[column] = item
                expanded_rows[ndx] = new_row
                ndx += 1
        else:
            expanded_rows[ndx] = row
            ndx += 1
    if not expanded_rows:
        # No rows at all: keep the column labels instead of returning a
        # completely shapeless frame.
        return pd.DataFrame(columns=df.columns)
    # Keys of the dict become columns, hence the transpose.
    return pd.DataFrame.from_dict(expanded_rows).T
23
24
25
def parse_mapping_table(dataframe, query_id, target_id):
    '''Classify the query ID -> target ID relations found in a mapping table.

    Takes a dataframe with gene ID mappings and the column names of the
    query and target IDs. Progress and diagnostics are printed to stderr.
    The original docstring notes it was tested with Homo_sapiens.gene_info
    from NCBI.

    Processing order (each step works on what the previous one left):
      1. rows where both IDs are missing are dropped;
      2. duplicated (query, target) pairs are dropped;
      3. rows with a missing query ID are dropped;
      4. query IDs that have at least one row with a missing target ID are
         removed entirely and reported as "one-to-none";
      5. query IDs occurring in more than one remaining row go to
         "one-to-many";
      6. of the rest, query IDs sharing a target ID with another query ID
         go to "many-to-one";
      7. whatever is left is "one-to-one".

    Parameters:
        dataframe: pandas DataFrame containing both columns; not modified.
        query_id: name of the column with the IDs to translate from.
        target_id: name of the column with the IDs to translate to.

    Returns:
        dict with keys
          "one-to-one":  {query ID: target ID},
          "one-to-many": {query ID: list of target IDs},
          "many-to-one": {query ID: target ID},
          "one-to-none": list of query IDs without a target ID.
    '''
    mapper = {"one-to-one": {}, "one-to-many": {}, "many-to-one": {}, "one-to-none": []}

    # Explicit copy: the steps below must neither touch the caller's frame
    # nor trigger chained-assignment warnings.
    df = dataframe.loc[:, [query_id, target_id]].copy()

    # exclude rows of all NAs
    df_size = df.shape[0]
    df = df.dropna(how="all")
    n_removed = df_size - df.shape[0]
    if n_removed > 0:
        print(n_removed,"rows with both",query_id,"and",target_id,"empty",file=sys.stderr)
    else:
        print("Ok: no empty rows detected",file=sys.stderr)

    # duplicated pairs
    df_size = df.shape[0]
    df = df.drop_duplicates()
    n_removed = df_size - df.shape[0]
    if n_removed > 0:
        print(n_removed,"duplicated pairs dropped",file=sys.stderr)
    else:
        print("Ok: no duplicated pairs detected",file=sys.stderr)

    # exclude NA query IDs
    df_size = df.shape[0]
    df = df.dropna(subset=[query_id], axis=0)
    n_removed = df_size - df.shape[0]
    if n_removed > 0:
        print(n_removed,"rows with empty",query_id,"were excluded",file=sys.stderr)
    else:
        print("Ok: All",query_id,"rows are not empty.",file=sys.stderr)

    # recognized query ids mapped to no target ids
    found_not_mapped = list(set(df.loc[df[target_id].isnull(), query_id].values))
    if len(found_not_mapped) > 0:
        # Removes every row of such a query ID, including rows where the
        # same query ID does have a target.
        df = df.loc[~df[query_id].isin(found_not_mapped), :].copy()
        print(len(found_not_mapped),query_id,"ids mapped to no",target_id,file= sys.stderr)
    else:
        print("Ok: All",query_id,"are mapped to",target_id,file= sys.stderr)

    # uniqueness of query ids; one-to-many is not acceptable
    is_dup_query = df.duplicated(subset=[query_id], keep=False)
    query_dups = list(set(df.loc[is_dup_query, :][query_id].values))
    if len(query_dups) > 0:
        print(len(query_dups),query_id,"mapped to multiple",target_id,file= sys.stderr)
        in_query_dups = df[query_id].isin(query_dups)
        df_one_to_many = df.loc[in_query_dups, :].copy()
        df = df.loc[~in_query_dups, :].copy()
        # Collect all targets of each ambiguous query ID into one list.
        df_one_to_many = df_one_to_many.groupby(query_id).agg({target_id: list})
        mapper["one-to-many"] = df_one_to_many.to_dict()[target_id]
    else:
        print("Ok: All",query_id,"are unique",file= sys.stderr)

    # uniqueness of target ids; many-to-one is ok for synonyms, but not for primary id
    is_dup_target = df.duplicated(subset=[target_id], keep=False)
    query_ambiguous = list(set(df.loc[is_dup_target, :][query_id].values))
    if len(query_ambiguous) > 0:
        print(len(query_ambiguous),"different",query_id,
              "mapped to the same",target_id,file= sys.stderr)
        in_ambiguous = df[query_id].isin(query_ambiguous)
        df_many_to_one = df.loc[in_ambiguous, :].copy()
        df = df.loc[~in_ambiguous, :].copy()
        df_many_to_one = df_many_to_one.set_index(query_id, drop=True)
        mapper["many-to-one"] = df_many_to_one.to_dict()[target_id]
    else:
        print("Ok: All",target_id,"are unique",file= sys.stderr)

    if len(query_dups) == 0 and len(query_ambiguous) == 0:
        print("Ok: One-to-one mapping between",query_id,"and",target_id,file= sys.stderr)
    print(df.shape[0],query_id,"can be mapped directly to",target_id,file= sys.stderr)

    # one-to-one
    df = df.set_index(query_id, drop=True)
    mapper["one-to-one"] = df.to_dict()[target_id]

    # query_id without target_id
    mapper["one-to-none"] = found_not_mapped
    return mapper
97
98
99
def apply_mappers(df, main_mapper, alt_mapper, verbose = True,handle_duplicates = "keep"):
    '''Convert the IDs in the index of `df` using two mapping dictionaries.

    Each index label is looked up in this order; the first match wins:
      1. main_mapper "one-to-one"   -> mapped;
      2. main_mapper "one-to-none"  -> unmapped (recognized, no target);
      3. main_mapper "one-to-many"  -> unmapped (ambiguous);
      4. main_mapper "many-to-one"  -> unmapped (ambiguous);
      5. alt_mapper  "one-to-one"   -> mapped;
      6. alt_mapper  "one-to-many"  -> unmapped (ambiguous);
      7. alt_mapper  "many-to-one"  -> mapped (shared synonyms are accepted);
      8. labels of the form "LOC<number>" -> mapped to <number> if that
         number is a target ID known to either mapper, otherwise unmapped;
      9. anything else -> unmapped (not found at all).
    A label matched by one of the main-mapper categories is therefore never
    passed on to the alternative mapper.

    Parameters:
        df: pandas DataFrame whose index holds the query IDs; not modified.
        main_mapper: dict with the keys "one-to-one", "one-to-many",
            "many-to-one" (dicts) and "one-to-none" (list), i.e. the
            structure returned by parse_mapping_table.
        alt_mapper: dict of the same structure, consulted when the main
            mapper does not know the label (its "one-to-none" is not used).
        verbose: if True, print a mapping summary to stdout and, in "keep"
            mode, the duplicated target IDs to stderr.
        handle_duplicates: how to treat target IDs occurring more than once
            in the converted index:
              "sum"     - group by index and sum each column;
              "average" - group by index and average each column;
              "drop"    - remove every row whose target ID is duplicated;
              "keep"    - do nothing.

    Returns:
        Tuple (converted DataFrame sorted by index,
               dict {query ID: target ID} of all applied mappings,
               list of query IDs that could not be mapped),
        or None when `handle_duplicates` is not one of the values above.
    '''
    ID_list = list(df.index.values)

    # main mapper, e.g. NCBI symbol -> Entrez Gene ID
    symbols_mapped_directly = {}
    recognized_not_mapped = []  # known to the main mapper but without a target ID
    symbol_one2many = []  # not mapped because of ambiguity
    symbol_many2one = []  # not mapped because of ambiguity
    # Alternative mapper
    # applied in case the main mapper failed: e.g. NCBI synonym -> NCBI symbol -> Entrez Gene ID
    via_alt_symbol = {}
    via_nonuniq_alt_symbol = {}
    alt_symbol_one2many = []
    not_found_at_all = []
    loc = {}
    loc_not_found = []

    # Store all valid target IDs. A set gives constant-time membership tests
    # and avoids concatenating dict views, which fails on Python 3.
    valid_target_ids = set()
    for id_mapper in (main_mapper, alt_mapper):
        valid_target_ids.update(id_mapper["one-to-one"].values())
        valid_target_ids.update(id_mapper["many-to-one"].values())
        for targets in id_mapper["one-to-many"].values():
            valid_target_ids.update(targets)
    # Hoisted out of the loop: "one-to-none" is a list in the mapper.
    main_one_to_none = set(main_mapper["one-to-none"])

    for symbol in ID_list:
        if symbol in main_mapper["one-to-one"]:
            symbols_mapped_directly[symbol] = main_mapper["one-to-one"][symbol]
        elif symbol in main_one_to_none:
            recognized_not_mapped.append(symbol)
        elif symbol in main_mapper["one-to-many"]:
            symbol_one2many.append(symbol)
        elif symbol in main_mapper["many-to-one"]:
            symbol_many2one.append(symbol)
        # alternative mapper
        elif symbol in alt_mapper["one-to-one"]:
            via_alt_symbol[symbol] = alt_mapper["one-to-one"][symbol]
        elif symbol in alt_mapper["one-to-many"]:
            alt_symbol_one2many.append(symbol)
        elif symbol in alt_mapper["many-to-one"]:  # it is Ok if many synonyms match
            via_nonuniq_alt_symbol[symbol] = alt_mapper["many-to-one"][symbol]
        elif isinstance(symbol, str) and symbol.startswith("LOC"):
            try:
                LOC_id = int(symbol[3:])
            except ValueError:
                # "LOC" followed by something that is not a number.
                LOC_id = None
            if LOC_id is not None and LOC_id in valid_target_ids:
                loc[symbol] = LOC_id
            else:
                loc_not_found.append(symbol)
        else:
            not_found_at_all.append(symbol)

    query2target = {}
    for partial_mapping in [symbols_mapped_directly, via_alt_symbol, via_nonuniq_alt_symbol, loc]:
        query2target.update(partial_mapping)
    not_mapped = recognized_not_mapped +symbol_one2many+ alt_symbol_one2many + loc_not_found + not_found_at_all+ symbol_many2one

    if verbose:
        print("Mapped:",len(query2target.keys()),
      "\n\tdirectly via main_mapper",len(symbols_mapped_directly.keys()),
     "\n\tvia alternative mapper",len(via_alt_symbol.keys()),
      "\n\tvia one of multiple synonyms in alternative mapper",len(via_nonuniq_alt_symbol.keys()),
      "\n\tLOC",len(loc.keys()),
      "\nUnmapped:",len(not_mapped),
      "\n\trecognized symbols without Entrez ID",len(recognized_not_mapped),
      "\n\tmultiple query_ids map to the same target_id",len(symbol_many2one),
      "\n\tquery_ids map to multiple target_ids in the main mapper",len(symbol_one2many),
      "\n\tquery_ids map to multiple target_ids in the alternative mapper",len(alt_symbol_one2many),
      "\n\tLOC not found in Entrez",len(loc_not_found),
     "\n\tNot found at all:",len( not_found_at_all))

    # find query IDs that end up on the same target ID
    mapped_symbols = pd.Series(query2target, dtype=object)
    dups = mapped_symbols[mapped_symbols.duplicated(keep=False)].index.values
    if len(dups) >0:
        print("Warning: query IDs mapping to duplicated target IDs in mapping table:", len(dups))

    # exclude not mapped query IDs and map
    df_size_dif = df.shape[0]
    df = df.loc[~df.index.isin(not_mapped ),:].copy()
    df_size_dif = df_size_dif - df.shape[0]
    if df_size_dif > 0:
        print("Warning: query IDs not mapped to any target IDs excluded:", df_size_dif)
    df = df.rename(index=query2target)

    # resolve duplicated target IDs in the converted index
    if handle_duplicates == "keep":
        if verbose:
            dups = df.groupby(df.index).size()
            dups = list(set(dups[dups>1].index.values))
            print("IDs mapped to multiple target IDs are kept:\n", dups, file=sys.stderr)
    elif handle_duplicates == "sum":
        # Column-wise sum per target ID.
        df = df.groupby(df.index).sum()
    elif handle_duplicates == "average":
        # Column-wise mean per target ID (np.average on a whole group would
        # collapse all columns into a single number).
        df = df.groupby(df.index).mean()
    elif handle_duplicates == "drop":
        # Remove every row whose target ID occurs more than once.
        df = df.loc[~df.index.duplicated(keep=False),:].copy()
    else:
        print("'handle_duplicates' must be keep, sum, average or drop.", file =sys.stderr)
        return None
    df = df.sort_index()
    return (df,query2target,not_mapped)
206