Diff of /src/dash/data.py [000000] .. [bd22c4]

Switch to side-by-side view

--- a
+++ b/src/dash/data.py
@@ -0,0 +1,356 @@
+
+from sqlalchemy import create_engine, MetaData, Table, select, join
+import pandas as pd
+import re
+import numpy as np
+
+# SQLite path
+db_path = 'sqlite:///../../data/SQLite Database/20200901/Covid-19 Study DB.sqlite'
+
+omics_id_dict = {
+        "proteomics":1,
+        "lipidomics":2,
+        "metabolomics":3,
+        "transcriptomics":5
+    }
+
+def get_omics_data(with_metadata=False, dataset="proteomics"):
+
+    omics_id = omics_id_dict[dataset]
+
+    # Create an engine that connects to the Covid-19 Study DB.sqlite file: engine
+    engine = create_engine(db_path)
+
+    # Establish connection
+    connection = engine.connect()
+
+    # pull table into df
+    query = "SELECT * from {}_measurements".format(dataset)
+    omics_measurements_df = pd.read_sql_query(query, connection)
+
+    # pull table into df
+    query = "SELECT * from {}_runs".format(dataset)
+    omics_runs_df = pd.read_sql_query(query, connection)
+
+    # pull table into df
+    if dataset == "metabolomics":
+        query = "SELECT * from rawfiles WHERE ome_id=3 OR ome_id=4 AND sample_ID<>-1 AND keep=1".format(omics_id)
+
+    else:
+        query = "SELECT * from rawfiles WHERE ome_id={} AND sample_ID<>-1 and keep=1".format(omics_id)
+
+    rawfiles_df = pd.read_sql_query(query, connection)
+    ## NOTE: For some reason, SQL filter not working for keep on raw files
+    rawfiles_df = rawfiles_df[rawfiles_df['keep']==1]
+
+    # pull table into df, drop sample 54
+    query = "SELECT * from deidentified_patient_metadata WHERE sample_id<>54"
+    deidentified_patient_metadata_df = pd.read_sql_query(query, connection)
+
+    # make sure the merge by columns are all the same type -> pandas seems sensitive to this
+    omics_measurements_df = omics_measurements_df.astype({'replicate_id': 'int32'})
+    omics_runs_df = omics_runs_df.astype({'replicate_id': 'int32', 'rawfile_id': 'int32'})
+    rawfiles_df = rawfiles_df.astype({'rawfile_id': 'int32', 'sample_id': 'int32'})
+    deidentified_patient_metadata_df = deidentified_patient_metadata_df.astype({'sample_id': 'int32'})
+
+    joined_df = omics_measurements_df\
+                .join(omics_runs_df.set_index('replicate_id'), on='replicate_id')\
+                .join(rawfiles_df.set_index('rawfile_id'), on='rawfile_id')\
+                .join(deidentified_patient_metadata_df.set_index('sample_id'), on='sample_id')
+
+    # drop samples that are missing COVID or ICU status
+    joined_df.dropna(subset=['ICU_1','COVID'], inplace=True)
+
+    # pivot to wide format
+    wide_df = joined_df.pivot_table(index='sample_id', columns='biomolecule_id', values='normalized_abundance')
+    wide_df.columns = [str(col) for col in wide_df.columns]
+
+    if dataset == "metabolomics":
+        query = "SELECT * from biomolecules WHERE omics_id=3 OR omics_id=4".format(omics_id)
+    else:
+        query = "SELECT * from biomolecules WHERE omics_id={}".format(omics_id)
+
+    # get biomolecule names
+    biomolecules_df = pd.read_sql_query(query, connection)
+
+    # close DB connection
+    connection.close()
+
+    # build biomolecule name dict and drop list
+    biomolecule_name_dict = {}
+    biomolecule_drop_list = []
+    for index, row in biomolecules_df.iterrows():
+        biomolecule_id = str(row['biomolecule_id'])
+        standardized_name = row['standardized_name']
+        biomolecule_name_dict[biomolecule_id] = standardized_name
+
+        keep = row['keep']
+        if keep!="1":
+            biomolecule_drop_list.append(biomolecule_id)
+
+    # drop biomolecules
+    wide_df.drop(biomolecule_drop_list, axis=1, inplace=True)
+
+    # replace wide_df column names
+    #new_col_names = []
+    #for col in wide_df.columns:
+    #    new_col_names.append(biomolecule_name_dict[str(col)])
+    #wide_df.columns = new_col_names
+
+    # record quant value range
+    quant_value_range = wide_df.shape[1]
+
+    # optional return matrix with clinical metadata
+    if with_metadata:
+
+        combined_df = wide_df.join(deidentified_patient_metadata_df.set_index('sample_id'), on='sample_id')#.dropna()
+        return combined_df, quant_value_range
+
+    return wide_df, quant_value_range
+
+def get_biomolecule_names(dataset='proteomics'):
+
+    dataset_abr_prefix = "[{}] ".format(dataset[0].upper())
+
+    print("Getting biomolecule names for dataset: {}".format(dataset))
+    omics_id = omics_id_dict[dataset]
+
+    # Create an engine that connects to the Covid-19 Study DB.sqlite file: engine
+    engine = create_engine(db_path)
+
+    # Establish connection
+    connection = engine.connect()
+
+    if dataset == "metabolomics":
+        query = "SELECT * from biomolecules WHERE omics_id=3 OR omics_id=4 and KEEP=1".format(omics_id)
+    else:
+        query = "SELECT * from biomolecules WHERE omics_id={} and KEEP=1".format(omics_id)
+
+    # get biomolecule names
+    biomolecules_df = pd.read_sql_query(query, connection)
+
+    # build biomolecule name dict and drop list
+    biomolecule_name_dict = {}
+    for index, row in biomolecules_df.iterrows():
+        biomolecule_id = str(row['biomolecule_id'])
+        standardized_name = row['standardized_name']
+        biomolecule_name_dict[biomolecule_id] = dataset_abr_prefix + standardized_name
+
+    # return dictionary with biomolecule ids and standard names
+
+    if not dataset=="proteomics":
+
+        # close DB connection
+        connection.close()
+
+        return biomolecule_name_dict
+
+    # for proteomics data, return fasta headers instead
+
+    query = "SELECT * from metadata"
+    # get biomolecule names
+    metadata_df = pd.read_sql_query(query, connection)
+
+    fasta_header_df = metadata_df[metadata_df['metadata_type'] == 'fasta_header']
+    fasta_header_dict = {}
+    for index, row in fasta_header_df.iterrows():
+        biomolecule_id = str(int(row['biomolecule_id'])) # string was getting truncated by rstrip(".0")
+        fasta_header = str(row['metadata_value'])
+        fasta_header_dict[biomolecule_id] = fasta_header
+
+    for biomolecule_id in biomolecule_name_dict:
+
+        fasta_header = fasta_header_dict[biomolecule_id]
+        fasta_header = re.search("\s(.*?)\sO[SX]=", fasta_header).group(1)
+        biomolecule_name_dict[biomolecule_id] = dataset_abr_prefix + fasta_header
+
+    # close DB connection
+    connection.close()
+
+    return biomolecule_name_dict
+
+def get_combined_data(df_dict, quant_range_dict, with_transcripts=False):
+
+    # load metabolomics data matrix
+    metabolomics_df, metabolomics_quant_range = df_dict['metabolomics'], quant_range_dict['metabolomics']
+    lipidomics_df, lipidomics_quant_range = df_dict['lipidomics'], quant_range_dict['lipidomics']
+    proteomics_df, proteomics_quant_range = df_dict['proteomics'], quant_range_dict['proteomics']
+    if with_transcripts:
+        #transcriptomics_df, transcriptomics_quant_range = get_omics_data(dataset='transcriptomics', with_metadata=True)
+        transcriptomics_df, transcriptomics_quant_range = df_dict['transcriptomics'], quant_range_dict['transcriptomics']
+
+    # get quant columns
+    lipidomics_quant_columns = lipidomics_df.columns[:lipidomics_quant_range]
+    lipidomics_quant_df = lipidomics_df[lipidomics_quant_columns]
+
+    metabolomics_quant_columns = metabolomics_df.columns[:metabolomics_quant_range]
+    metabolomics_quant_df = metabolomics_df[metabolomics_quant_columns]
+
+    proteomics_quant_columns = proteomics_df.columns[:proteomics_quant_range]
+    proteomics_quant_df = proteomics_df[proteomics_quant_columns]
+
+    if with_transcripts:
+        transcriptomics_quant_columns = transcriptomics_df.columns[:transcriptomics_quant_range]
+        transcriptomics_quant_df = transcriptomics_df[transcriptomics_quant_columns]
+
+    # get clinical_metadata_df
+    clinical_metadata_columns = proteomics_df.columns[proteomics_quant_range:]
+    clinical_metadata_df = proteomics_df[clinical_metadata_columns]
+    clinical_metadata_df
+
+    # join quant values together
+    if with_transcripts:
+        combined_df = proteomics_quant_df.join(lipidomics_quant_df).join(metabolomics_quant_df).join(transcriptomics_quant_df)
+    else:
+        combined_df = proteomics_quant_df.join(lipidomics_quant_df).join(metabolomics_quant_df)
+    combined_quant_range = combined_df.shape[1]
+    combined_quant_columns = combined_df.columns[:combined_quant_range]
+    # now join with clinical metadata
+    combined_df = combined_df.join(clinical_metadata_df)
+
+    # drop any samples with missing values in quant columns
+    combined_df.dropna(subset=combined_quant_columns,inplace=True)
+
+    # also return df_dict with combined_df
+    df_dict['combined'] = combined_df
+
+    # update quant_range_dict
+    quant_range_dict['combined'] = combined_quant_range
+
+    return df_dict, quant_range_dict
+
+def get_p_values():
+
+    # Create an engine that connects to the Covid-19 Study DB.sqlite file: engine
+    engine = create_engine(db_path)
+
+    # Establish connection
+    connection = engine.connect()
+
+    query = "SELECT * from biomolecules WHERE KEEP=1"
+    # get biomolecule names
+    biomolecules_df = pd.read_sql_query(query, connection)
+
+    # build biomolecule name dict and drop list
+    biomolecule_name_dict = {}
+    for index, row in biomolecules_df.iterrows():
+        biomolecule_id = str(row['biomolecule_id'])
+        standardized_name = row['standardized_name']
+        biomolecule_name_dict[biomolecule_id] = standardized_name
+
+    query = "SELECT * from pvalues"
+    # get biomolecule names
+    pvalues_df = pd.read_sql_query(query, connection)
+
+    # drop biomolecule_ids where keep!=1
+    drop_list = []
+    for index, row in pvalues_df.iterrows():
+        biomolecule_id = row['biomolecule_id']
+        if str(biomolecule_id) not in biomolecule_name_dict:
+            drop_list.append(index)
+
+    pvalues_df.drop(drop_list, inplace=True)
+
+    return pvalues_df
+
+def get_volcano_data(pvalues_df, df_dict, quant_value_range,
+    global_names_dict, comparison_column='COVID'):
+
+    group_1_quant_value_dict = {}
+    formula = 'normalized_abundance ~ COVID * ICU_1 + Gender + Age_less_than_90 vs. normalized_abundance ~ ICU_1 + Gender + Age_less_than_90'
+
+    pvalues_df = pvalues_df[(pvalues_df['comparison']=='COVID_vs_NONCOVID') & (pvalues_df['formula']==formula)]
+    print("Volcano data pvalues shape: {}".format(pvalues_df.shape))
+
+    group_1 = 1
+    group_2 = 0
+
+    combined_df = df_dict['combined']
+    quant_value_columns = combined_df.columns[:quant_value_range]
+
+    group_1_quant_value_dict = {}
+
+    for sample_id, row in combined_df[combined_df[comparison_column]==group_1].iterrows():
+
+        for biomolecule_id in quant_value_columns:
+            quant_value = row[biomolecule_id]
+
+            if not biomolecule_id in group_1_quant_value_dict:
+                group_1_quant_value_dict[biomolecule_id] = [quant_value]
+
+            else:
+                group_1_quant_value_dict[biomolecule_id].append(quant_value)
+
+    group_2_quant_value_dict = {}
+
+    for sample_id, row in combined_df[combined_df[comparison_column]==group_2].iterrows():
+
+        for biomolecule_id in quant_value_columns:
+            quant_value = row[biomolecule_id]
+
+            if not biomolecule_id in group_2_quant_value_dict:
+                group_2_quant_value_dict[biomolecule_id] = [quant_value]
+
+            else:
+                group_2_quant_value_dict[biomolecule_id].append(quant_value)
+
+
+    FC_dict = {}
+    for biomolecule_id in quant_value_columns:
+
+        group_1_quant_values = group_1_quant_value_dict[biomolecule_id]
+        group_2_quant_values = group_2_quant_value_dict[biomolecule_id]
+
+        # in log2 space; subtract
+        FC = np.mean(group_1_quant_values) - np.mean(group_2_quant_values)
+
+        FC_dict[biomolecule_id] = FC
+
+    FC_list = []
+    ome_list = []
+    standardized_name_list = []
+    std_list = []
+    for index, row in pvalues_df.iterrows():
+        biomolecule_id = str(row['biomolecule_id'])
+
+        # get standard deviation
+        std = np.std(combined_df[biomolecule_id])
+        std_list.append(std)
+
+        ## NOTE: should create biomolecule_ome_dict
+        if biomolecule_id in df_dict['proteomics'].columns:
+            ome_list.append("proteomics")
+        elif biomolecule_id in df_dict['lipidomics'].columns:
+            ome_list.append("lipidomics")
+        elif biomolecule_id in df_dict['metabolomics'].columns:
+            ome_list.append("metabolomics")
+        elif biomolecule_id in df_dict['transcriptomics'].columns:
+            ome_list.append("transcriptomics")
+        else:
+            #print("Biomolecule {} not mapped to ome!".format(biomolecule_id)) # don't currenly have targeted metabolomics included
+            ome_list.append(np.nan)
+            #break
+
+        try:
+            FC = FC_dict[biomolecule_id]
+        except:
+            # may have been a dropped biomolecule or from a different ome
+            FC = np.nan
+
+        FC_list.append(FC)
+
+        try:
+            standardized_name = global_names_dict['combined'][biomolecule_id]
+        except:
+            # may have been a dropped biomolecule or from a different ome
+            standardized_name = np.nan
+
+        standardized_name_list.append(standardized_name)
+
+    pvalues_df['log2_FC'] = FC_list
+    pvalues_df['ome_type'] = ome_list
+    pvalues_df['standardized_name'] = standardized_name_list
+    pvalues_df['neg_log10_p_value'] = pvalues_df['p_value'].apply(np.log10).apply(np.negative)
+    pvalues_df['std'] = std_list
+
+    return pvalues_df