import pandas as pd
import utils
from constants import variable_type,clinical_source,NO_UNITS,column_names

# Plain-Python NaN. numpy.nan is this same float value; using the builtin
# avoids the ``pd.np`` alias, which newer pandas releases no longer expose.
_NAN = float('nan')


class data_dictionary(object):
    """Workbook-backed data dictionary.

    Every sheet of the Excel workbook is loaded as a DataFrame (first column
    used as the index) and exposed as ``self.tables.<sheet_name>``. The
    methods below read and write the sheets named ``definitions``,
    ``panels``, ``lists`` and ``categories``.

    Attributes set by ``load``:
        xls_fname   -- path the workbook was loaded from
        tables      -- utils.Bunch mapping sheet name -> DataFrame
        table_names -- utils.Bunch mapping sheet name -> sheet name
        components  -- utils.Bunch mapping UPPER_SNAKE key -> component name
    """

    def __init__(self, xls_fname):
        """Load the data dictionary from the workbook at ``xls_fname``."""
        self.load(xls_fname)

    def load(self, xls_fname):
        """(Re)load every sheet of ``xls_fname`` into ``self.tables``."""
        df_tables = {}
        df_names = {}
        # Context manager releases the workbook's file handle once parsed.
        with pd.ExcelFile(xls_fname) as xls:
            for sheet_name in xls.sheet_names:
                df_tables[sheet_name] = xls.parse(sheet_name, index_col=0)
                df_names[sheet_name] = sheet_name

        self.xls_fname = xls_fname
        self.tables = utils.Bunch(**df_tables)
        self.table_names = utils.Bunch(**df_names)
        self.__refresh_components()

    def __refresh_components(self):
        """Rebuild ``self.components`` from the definitions table.

        Keys are the component names upper-cased with spaces replaced by
        underscores; values are the component names as strings.
        """
        # Materialised lists (not lazy ``map`` objects): the names are used
        # twice, once to derive the keys and once as the values.
        names = [str(c) for c in self.tables.definitions.component.unique().tolist()]
        keys = [name.replace(' ', '_').upper() for name in names]
        self.components = utils.Bunch(**dict(zip(keys, names)))

    def save(self, xls_fname=None):
        """Write every table to an Excel workbook, one sheet per table.

        xls_fname -- destination path; defaults to the path last loaded.
        """
        if xls_fname is None:
            xls_fname = self.xls_fname
        # The ``with`` block saves and closes the writer, including on error.
        with pd.ExcelWriter(xls_fname, engine='xlsxwriter') as writer:
            for table_name, table in self.tables.__dict__.items():
                table.to_excel(writer, sheet_name=table_name)

    def add_definition(self, component, units=NO_UNITS,
                       variable_type=variable_type.QUANTITATIVE,
                       clinical_source=clinical_source.OBSERVATION,
                       lower_limit=_NAN,
                       upper_limit=_NAN,
                       list_id=_NAN):
        """Append a row to the definitions table and return its new id.

        The values are written positionally, so the definitions sheet is
        assumed to have exactly these seven columns in this order:
        component, units, variable_type, clinical_source, lower_limit,
        upper_limit, list_id -- TODO confirm against the workbook.
        """
        new_id = _next_id(self.tables.definitions)
        self.tables.definitions.loc[new_id] = [
            component, units, variable_type, clinical_source,
            lower_limit, upper_limit, list_id,
        ]
        self.__refresh_components()
        return new_id

    def add_panel(self, panel_name, panel_map):
        """Create a panel and populate its list; return the new panel id.

        panel_map -- {table_name: [ids]} of the rows the panel refers to.
        """
        new_panel_id = _next_id(self.tables.panels)
        new_list_id = _next_id(self.tables.lists)
        self.tables.panels.loc[new_panel_id] = [panel_name, new_list_id]
        for ref_table, ref_ids in panel_map.items():
            for ref_id in ref_ids:
                self.add_item_to_panel(new_panel_id, ref_table, ref_id)
        return new_panel_id

    def add_item_to_panel(self, panel_id, ref_table, ref_id):
        """Append a (ref_table, ref_id) reference to the list of ``panel_id``."""
        list_id = self.tables.panels.loc[panel_id, 'list_id']
        return self.__add_list_item(list_id, ref_table, ref_id, _NAN)

    def __add_list_item(self, list_id, ref_table, ref_id, seq_num):
        """Append one row to the lists table; return its positional row id.

        The lists table is indexed by list id, which repeats for every item
        of a list, so the row is appended against a temporary unique integer
        index and the original index is restored afterwards.
        """
        orig_index_name = self.tables.lists.index.name
        list_df = self.tables.lists.reset_index(drop=False)
        new_id = _next_id(list_df)
        list_df.loc[new_id] = [list_id, ref_table, ref_id, seq_num]
        list_df.set_index(orig_index_name, inplace=True)
        self.tables.lists = list_df
        return new_id

    def add_category(self, val_numeric, val_text):
        """Append a category row (val_numeric, val_text); return its new id."""
        new_id = _next_id(self.tables.categories)
        self.tables.categories.loc[new_id] = [val_numeric, val_text]
        return new_id

    def add_category_list(self, categories, is_ordered=False):
        """Create a new list referencing the given category ids.

        categories -- iterable of ids from the categories table
        is_ordered -- when true, each item's seq_num is its position in
                      ``categories``; otherwise seq_num is NaN.
        Returns the new list id.
        """
        new_list_id = _next_id(self.tables.lists)
        for i, category_id in enumerate(categories):
            self.__add_list_item(new_list_id,
                                 self.table_names.categories,
                                 category_id,
                                 i if is_ordered else _NAN)
        return new_list_id

    def get_panel_defintions(self, panel_id):
        """Return the concatenated rows referenced by a panel.

        Items whose table is 'panels' are expanded recursively; every other
        item contributes the single row with that id from the named table.
        """
        list_id = self.tables.panels.loc[panel_id, 'list_id']
        # A list-valued selector always yields a DataFrame. A scalar label
        # would return a Series (no ``iterrows``) when the list has a
        # single item.
        items = self.tables.lists.loc[[list_id]]
        def_list = []
        for _, row in items.iterrows():
            table = row['table']
            id_ = row['id']
            if table == 'panels':
                defs = self.get_panel_defintions(id_)
            else:
                defs = self.tables.__dict__[table].loc[[id_]]
            def_list.append(defs)

        return pd.concat(def_list)

    # Correctly spelled alias; the original (misspelled) name is kept for
    # existing callers.
    get_panel_definitions = get_panel_defintions

    def get_categories(self, component):
        """Return the categories attached to ``component``, or None.

        The result is indexed by seq_num (sorted) with columns val_numeric
        and val_text. None is returned when no category rows match.
        """
        joined = self.tables.definitions.merge(
            self.tables.lists, left_on='list_id', right_index=True)
        joined = joined.merge(
            self.tables.categories, left_on='id', right_index=True)
        filtered = joined.loc[joined.component == component]
        if filtered.shape[0] == 0:
            return None
        out_df = filtered[['seq_num', 'val_numeric', 'val_text']]
        return out_df.set_index('seq_num').sort_index()

    def defs_for_component(self, component):
        """Return the definition rows selected by the given component."""
        return self.get_defs({column_names.COMPONENT: component})

    def get_clinical_source(self, component):
        """Return clinical_source of the first definition of ``component``."""
        return self.defs_for_component(component).loc[:, 'clinical_source'].iloc[0]

    def get_variable_type(self, component):
        """Return variable_type of the first definition of ``component``."""
        return self.defs_for_component(component).loc[:, 'variable_type'].iloc[0]

    def get_defs(self, data_specs=None, operator='or'):
        """Return the definition rows selected by ``data_specs``.

        ``data_specs`` and ``operator`` are forwarded to
        utils.complex_row_mask; omitting ``data_specs`` passes an empty list.
        """
        if data_specs is None:
            data_specs = []
        return _filter_defs(self.tables.definitions, data_specs, operator)

    def get_components(self, specs=None, panel_id=None, operator='or'):
        """Return the unique component names selected by ``specs``.

        The search is restricted to the definitions of ``panel_id`` when it
        is given; otherwise the whole definitions table is used.
        """
        if specs is None:
            specs = []
        if panel_id is not None:
            defs = self.get_panel_defintions(panel_id)
        else:
            defs = self.tables.definitions
        return _filter_defs(defs, specs, operator).component.unique().tolist()


def _filter_defs(defs, specs, operator='or'):
    """Return the rows of ``defs`` selected by utils.complex_row_mask."""
    return defs.loc[utils.complex_row_mask(defs, specs, operator)]


def _next_id(df):
    """Return one more than the largest index label of ``df``.

    Raises ValueError when ``df`` has no rows.
    """
    return max(df.index.tolist()) + 1