COVID-19_Multi-Omics / Git / [bd22c4] /src/dash/data.py

Models:
AlyssaS/
COVID-19_Multi-Omics
Downloads: 1
[bd22c4]: / src / dash / data.py
History
Download this file
357 lines (259 with data), 13.3 kB

from sqlalchemy import create_engine, MetaData, Table, select, join
import pandas as pd
import re
import numpy as np

# SQLite path
db_path = 'sqlite:///../../data/SQLite Database/20200901/Covid-19 Study DB.sqlite'

omics_id_dict = {
        "proteomics":1,
        "lipidomics":2,
        "metabolomics":3,
        "transcriptomics":5
    }

def get_omics_data(with_metadata=False, dataset="proteomics"):

    omics_id = omics_id_dict[dataset]

    # Create an engine that connects to the Covid-19 Study DB.sqlite file: engine
    engine = create_engine(db_path)

    # Establish connection
    connection = engine.connect()

    # pull table into df
    query = "SELECT * from {}_measurements".format(dataset)
    omics_measurements_df = pd.read_sql_query(query, connection)

    # pull table into df
    query = "SELECT * from {}_runs".format(dataset)
    omics_runs_df = pd.read_sql_query(query, connection)

    # pull table into df
    if dataset == "metabolomics":
        query = "SELECT * from rawfiles WHERE ome_id=3 OR ome_id=4 AND sample_ID<>-1 AND keep=1".format(omics_id)

    else:
        query = "SELECT * from rawfiles WHERE ome_id={} AND sample_ID<>-1 and keep=1".format(omics_id)

    rawfiles_df = pd.read_sql_query(query, connection)
    ## NOTE: For some reason, SQL filter not working for keep on raw files
    rawfiles_df = rawfiles_df[rawfiles_df['keep']==1]

    # pull table into df, drop sample 54
    query = "SELECT * from deidentified_patient_metadata WHERE sample_id<>54"
    deidentified_patient_metadata_df = pd.read_sql_query(query, connection)

    # make sure the merge by columns are all the same type -> pandas seems sensitive to this
    omics_measurements_df = omics_measurements_df.astype({'replicate_id': 'int32'})
    omics_runs_df = omics_runs_df.astype({'replicate_id': 'int32', 'rawfile_id': 'int32'})
    rawfiles_df = rawfiles_df.astype({'rawfile_id': 'int32', 'sample_id': 'int32'})
    deidentified_patient_metadata_df = deidentified_patient_metadata_df.astype({'sample_id': 'int32'})

    joined_df = omics_measurements_df\
                .join(omics_runs_df.set_index('replicate_id'), on='replicate_id')\
                .join(rawfiles_df.set_index('rawfile_id'), on='rawfile_id')\
                .join(deidentified_patient_metadata_df.set_index('sample_id'), on='sample_id')

    # drop samples that are missing COVID or ICU status
    joined_df.dropna(subset=['ICU_1','COVID'], inplace=True)

    # pivot to wide format
    wide_df = joined_df.pivot_table(index='sample_id', columns='biomolecule_id', values='normalized_abundance')
    wide_df.columns = [str(col) for col in wide_df.columns]

    if dataset == "metabolomics":
        query = "SELECT * from biomolecules WHERE omics_id=3 OR omics_id=4".format(omics_id)
    else:
        query = "SELECT * from biomolecules WHERE omics_id={}".format(omics_id)

    # get biomolecule names
    biomolecules_df = pd.read_sql_query(query, connection)

    # close DB connection
    connection.close()

    # build biomolecule name dict and drop list
    biomolecule_name_dict = {}
    biomolecule_drop_list = []
    for index, row in biomolecules_df.iterrows():
        biomolecule_id = str(row['biomolecule_id'])
        standardized_name = row['standardized_name']
        biomolecule_name_dict[biomolecule_id] = standardized_name

        keep = row['keep']
        if keep!="1":
            biomolecule_drop_list.append(biomolecule_id)

    # drop biomolecules
    wide_df.drop(biomolecule_drop_list, axis=1, inplace=True)

    # replace wide_df column names
    #new_col_names = []
    #for col in wide_df.columns:
    #    new_col_names.append(biomolecule_name_dict[str(col)])
    #wide_df.columns = new_col_names

    # record quant value range
    quant_value_range = wide_df.shape[1]

    # optional return matrix with clinical metadata
    if with_metadata:

        combined_df = wide_df.join(deidentified_patient_metadata_df.set_index('sample_id'), on='sample_id')#.dropna()
        return combined_df, quant_value_range

    return wide_df, quant_value_range

def get_biomolecule_names(dataset='proteomics'):

    dataset_abr_prefix = "[{}] ".format(dataset[0].upper())

    print("Getting biomolecule names for dataset: {}".format(dataset))
    omics_id = omics_id_dict[dataset]

    # Create an engine that connects to the Covid-19 Study DB.sqlite file: engine
    engine = create_engine(db_path)

    # Establish connection
    connection = engine.connect()

    if dataset == "metabolomics":
        query = "SELECT * from biomolecules WHERE omics_id=3 OR omics_id=4 and KEEP=1".format(omics_id)
    else:
        query = "SELECT * from biomolecules WHERE omics_id={} and KEEP=1".format(omics_id)

    # get biomolecule names
    biomolecules_df = pd.read_sql_query(query, connection)

    # build biomolecule name dict and drop list
    biomolecule_name_dict = {}
    for index, row in biomolecules_df.iterrows():
        biomolecule_id = str(row['biomolecule_id'])
        standardized_name = row['standardized_name']
        biomolecule_name_dict[biomolecule_id] = dataset_abr_prefix + standardized_name

    # return dictionary with biomolecule ids and standard names

    if not dataset=="proteomics":

        # close DB connection
        connection.close()

        return biomolecule_name_dict

    # for proteomics data, return fasta headers instead

    query = "SELECT * from metadata"
    # get biomolecule names
    metadata_df = pd.read_sql_query(query, connection)

    fasta_header_df = metadata_df[metadata_df['metadata_type'] == 'fasta_header']
    fasta_header_dict = {}
    for index, row in fasta_header_df.iterrows():
        biomolecule_id = str(int(row['biomolecule_id'])) # string was getting truncated by rstrip(".0")
        fasta_header = str(row['metadata_value'])
        fasta_header_dict[biomolecule_id] = fasta_header

    for biomolecule_id in biomolecule_name_dict:

        fasta_header = fasta_header_dict[biomolecule_id]
        fasta_header = re.search("\s(.*?)\sO[SX]=", fasta_header).group(1)
        biomolecule_name_dict[biomolecule_id] = dataset_abr_prefix + fasta_header

    # close DB connection
    connection.close()

    return biomolecule_name_dict

def get_combined_data(df_dict, quant_range_dict, with_transcripts=False):

    # load metabolomics data matrix
    metabolomics_df, metabolomics_quant_range = df_dict['metabolomics'], quant_range_dict['metabolomics']
    lipidomics_df, lipidomics_quant_range = df_dict['lipidomics'], quant_range_dict['lipidomics']
    proteomics_df, proteomics_quant_range = df_dict['proteomics'], quant_range_dict['proteomics']
    if with_transcripts:
        #transcriptomics_df, transcriptomics_quant_range = get_omics_data(dataset='transcriptomics', with_metadata=True)
        transcriptomics_df, transcriptomics_quant_range = df_dict['transcriptomics'], quant_range_dict['transcriptomics']

    # get quant columns
    lipidomics_quant_columns = lipidomics_df.columns[:lipidomics_quant_range]
    lipidomics_quant_df = lipidomics_df[lipidomics_quant_columns]

    metabolomics_quant_columns = metabolomics_df.columns[:metabolomics_quant_range]
    metabolomics_quant_df = metabolomics_df[metabolomics_quant_columns]

    proteomics_quant_columns = proteomics_df.columns[:proteomics_quant_range]
    proteomics_quant_df = proteomics_df[proteomics_quant_columns]

    if with_transcripts:
        transcriptomics_quant_columns = transcriptomics_df.columns[:transcriptomics_quant_range]
        transcriptomics_quant_df = transcriptomics_df[transcriptomics_quant_columns]

    # get clinical_metadata_df
    clinical_metadata_columns = proteomics_df.columns[proteomics_quant_range:]
    clinical_metadata_df = proteomics_df[clinical_metadata_columns]
    clinical_metadata_df

    # join quant values together
    if with_transcripts:
        combined_df = proteomics_quant_df.join(lipidomics_quant_df).join(metabolomics_quant_df).join(transcriptomics_quant_df)
    else:
        combined_df = proteomics_quant_df.join(lipidomics_quant_df).join(metabolomics_quant_df)
    combined_quant_range = combined_df.shape[1]
    combined_quant_columns = combined_df.columns[:combined_quant_range]
    # now join with clinical metadata
    combined_df = combined_df.join(clinical_metadata_df)

    # drop any samples with missing values in quant columns
    combined_df.dropna(subset=combined_quant_columns,inplace=True)

    # also return df_dict with combined_df
    df_dict['combined'] = combined_df

    # update quant_range_dict
    quant_range_dict['combined'] = combined_quant_range

    return df_dict, quant_range_dict

def get_p_values():

    # Create an engine that connects to the Covid-19 Study DB.sqlite file: engine
    engine = create_engine(db_path)

    # Establish connection
    connection = engine.connect()

    query = "SELECT * from biomolecules WHERE KEEP=1"
    # get biomolecule names
    biomolecules_df = pd.read_sql_query(query, connection)

    # build biomolecule name dict and drop list
    biomolecule_name_dict = {}
    for index, row in biomolecules_df.iterrows():
        biomolecule_id = str(row['biomolecule_id'])
        standardized_name = row['standardized_name']
        biomolecule_name_dict[biomolecule_id] = standardized_name

    query = "SELECT * from pvalues"
    # get biomolecule names
    pvalues_df = pd.read_sql_query(query, connection)

    # drop biomolecule_ids where keep!=1
    drop_list = []
    for index, row in pvalues_df.iterrows():
        biomolecule_id = row['biomolecule_id']
        if str(biomolecule_id) not in biomolecule_name_dict:
            drop_list.append(index)

    pvalues_df.drop(drop_list, inplace=True)

    return pvalues_df

def get_volcano_data(pvalues_df, df_dict, quant_value_range,
    global_names_dict, comparison_column='COVID'):

    group_1_quant_value_dict = {}
    formula = 'normalized_abundance ~ COVID * ICU_1 + Gender + Age_less_than_90 vs. normalized_abundance ~ ICU_1 + Gender + Age_less_than_90'

    pvalues_df = pvalues_df[(pvalues_df['comparison']=='COVID_vs_NONCOVID') & (pvalues_df['formula']==formula)]
    print("Volcano data pvalues shape: {}".format(pvalues_df.shape))

    group_1 = 1
    group_2 = 0

    combined_df = df_dict['combined']
    quant_value_columns = combined_df.columns[:quant_value_range]

    group_1_quant_value_dict = {}

    for sample_id, row in combined_df[combined_df[comparison_column]==group_1].iterrows():

        for biomolecule_id in quant_value_columns:
            quant_value = row[biomolecule_id]

            if not biomolecule_id in group_1_quant_value_dict:
                group_1_quant_value_dict[biomolecule_id] = [quant_value]

            else:
                group_1_quant_value_dict[biomolecule_id].append(quant_value)

    group_2_quant_value_dict = {}

    for sample_id, row in combined_df[combined_df[comparison_column]==group_2].iterrows():

        for biomolecule_id in quant_value_columns:
            quant_value = row[biomolecule_id]

            if not biomolecule_id in group_2_quant_value_dict:
                group_2_quant_value_dict[biomolecule_id] = [quant_value]

            else:
                group_2_quant_value_dict[biomolecule_id].append(quant_value)


    FC_dict = {}
    for biomolecule_id in quant_value_columns:

        group_1_quant_values = group_1_quant_value_dict[biomolecule_id]
        group_2_quant_values = group_2_quant_value_dict[biomolecule_id]

        # in log2 space; subtract
        FC = np.mean(group_1_quant_values) - np.mean(group_2_quant_values)

        FC_dict[biomolecule_id] = FC

    FC_list = []
    ome_list = []
    standardized_name_list = []
    std_list = []
    for index, row in pvalues_df.iterrows():
        biomolecule_id = str(row['biomolecule_id'])

        # get standard deviation
        std = np.std(combined_df[biomolecule_id])
        std_list.append(std)

        ## NOTE: should create biomolecule_ome_dict
        if biomolecule_id in df_dict['proteomics'].columns:
            ome_list.append("proteomics")
        elif biomolecule_id in df_dict['lipidomics'].columns:
            ome_list.append("lipidomics")
        elif biomolecule_id in df_dict['metabolomics'].columns:
            ome_list.append("metabolomics")
        elif biomolecule_id in df_dict['transcriptomics'].columns:
            ome_list.append("transcriptomics")
        else:
            #print("Biomolecule {} not mapped to ome!".format(biomolecule_id)) # don't currenly have targeted metabolomics included
            ome_list.append(np.nan)
            #break

        try:
            FC = FC_dict[biomolecule_id]
        except:
            # may have been a dropped biomolecule or from a different ome
            FC = np.nan

        FC_list.append(FC)

        try:
            standardized_name = global_names_dict['combined'][biomolecule_id]
        except:
            # may have been a dropped biomolecule or from a different ome
            standardized_name = np.nan

        standardized_name_list.append(standardized_name)

    pvalues_df['log2_FC'] = FC_list
    pvalues_df['ome_type'] = ome_list
    pvalues_df['standardized_name'] = standardized_name_list
    pvalues_df['neg_log10_p_value'] = pvalues_df['p_value'].apply(np.log10).apply(np.negative)
    pvalues_df['std'] = std_list

    return pvalues_df