from sqlalchemy import create_engine, MetaData, Table, select, join
import pandas as pd
import re
import numpy as np
# SQLite path
db_path = 'sqlite:///../../data/SQLite Database/20200901/Covid-19 Study DB.sqlite'
omics_id_dict = {
"proteomics":1,
"lipidomics":2,
"metabolomics":3,
"transcriptomics":5
}
def get_omics_data(with_metadata=False, dataset="proteomics"):
omics_id = omics_id_dict[dataset]
# Create an engine that connects to the Covid-19 Study DB.sqlite file: engine
engine = create_engine(db_path)
# Establish connection
connection = engine.connect()
# pull table into df
query = "SELECT * from {}_measurements".format(dataset)
omics_measurements_df = pd.read_sql_query(query, connection)
# pull table into df
query = "SELECT * from {}_runs".format(dataset)
omics_runs_df = pd.read_sql_query(query, connection)
# pull table into df
if dataset == "metabolomics":
query = "SELECT * from rawfiles WHERE ome_id=3 OR ome_id=4 AND sample_ID<>-1 AND keep=1".format(omics_id)
else:
query = "SELECT * from rawfiles WHERE ome_id={} AND sample_ID<>-1 and keep=1".format(omics_id)
rawfiles_df = pd.read_sql_query(query, connection)
## NOTE: For some reason, SQL filter not working for keep on raw files
rawfiles_df = rawfiles_df[rawfiles_df['keep']==1]
# pull table into df, drop sample 54
query = "SELECT * from deidentified_patient_metadata WHERE sample_id<>54"
deidentified_patient_metadata_df = pd.read_sql_query(query, connection)
# make sure the merge by columns are all the same type -> pandas seems sensitive to this
omics_measurements_df = omics_measurements_df.astype({'replicate_id': 'int32'})
omics_runs_df = omics_runs_df.astype({'replicate_id': 'int32', 'rawfile_id': 'int32'})
rawfiles_df = rawfiles_df.astype({'rawfile_id': 'int32', 'sample_id': 'int32'})
deidentified_patient_metadata_df = deidentified_patient_metadata_df.astype({'sample_id': 'int32'})
joined_df = omics_measurements_df\
.join(omics_runs_df.set_index('replicate_id'), on='replicate_id')\
.join(rawfiles_df.set_index('rawfile_id'), on='rawfile_id')\
.join(deidentified_patient_metadata_df.set_index('sample_id'), on='sample_id')
# drop samples that are missing COVID or ICU status
joined_df.dropna(subset=['ICU_1','COVID'], inplace=True)
# pivot to wide format
wide_df = joined_df.pivot_table(index='sample_id', columns='biomolecule_id', values='normalized_abundance')
wide_df.columns = [str(col) for col in wide_df.columns]
if dataset == "metabolomics":
query = "SELECT * from biomolecules WHERE omics_id=3 OR omics_id=4".format(omics_id)
else:
query = "SELECT * from biomolecules WHERE omics_id={}".format(omics_id)
# get biomolecule names
biomolecules_df = pd.read_sql_query(query, connection)
# close DB connection
connection.close()
# build biomolecule name dict and drop list
biomolecule_name_dict = {}
biomolecule_drop_list = []
for index, row in biomolecules_df.iterrows():
biomolecule_id = str(row['biomolecule_id'])
standardized_name = row['standardized_name']
biomolecule_name_dict[biomolecule_id] = standardized_name
keep = row['keep']
if keep!="1":
biomolecule_drop_list.append(biomolecule_id)
# drop biomolecules
wide_df.drop(biomolecule_drop_list, axis=1, inplace=True)
# replace wide_df column names
#new_col_names = []
#for col in wide_df.columns:
# new_col_names.append(biomolecule_name_dict[str(col)])
#wide_df.columns = new_col_names
# record quant value range
quant_value_range = wide_df.shape[1]
# optional return matrix with clinical metadata
if with_metadata:
combined_df = wide_df.join(deidentified_patient_metadata_df.set_index('sample_id'), on='sample_id')#.dropna()
return combined_df, quant_value_range
return wide_df, quant_value_range
def get_biomolecule_names(dataset='proteomics'):
dataset_abr_prefix = "[{}] ".format(dataset[0].upper())
print("Getting biomolecule names for dataset: {}".format(dataset))
omics_id = omics_id_dict[dataset]
# Create an engine that connects to the Covid-19 Study DB.sqlite file: engine
engine = create_engine(db_path)
# Establish connection
connection = engine.connect()
if dataset == "metabolomics":
query = "SELECT * from biomolecules WHERE omics_id=3 OR omics_id=4 and KEEP=1".format(omics_id)
else:
query = "SELECT * from biomolecules WHERE omics_id={} and KEEP=1".format(omics_id)
# get biomolecule names
biomolecules_df = pd.read_sql_query(query, connection)
# build biomolecule name dict and drop list
biomolecule_name_dict = {}
for index, row in biomolecules_df.iterrows():
biomolecule_id = str(row['biomolecule_id'])
standardized_name = row['standardized_name']
biomolecule_name_dict[biomolecule_id] = dataset_abr_prefix + standardized_name
# return dictionary with biomolecule ids and standard names
if not dataset=="proteomics":
# close DB connection
connection.close()
return biomolecule_name_dict
# for proteomics data, return fasta headers instead
query = "SELECT * from metadata"
# get biomolecule names
metadata_df = pd.read_sql_query(query, connection)
fasta_header_df = metadata_df[metadata_df['metadata_type'] == 'fasta_header']
fasta_header_dict = {}
for index, row in fasta_header_df.iterrows():
biomolecule_id = str(int(row['biomolecule_id'])) # string was getting truncated by rstrip(".0")
fasta_header = str(row['metadata_value'])
fasta_header_dict[biomolecule_id] = fasta_header
for biomolecule_id in biomolecule_name_dict:
fasta_header = fasta_header_dict[biomolecule_id]
fasta_header = re.search("\s(.*?)\sO[SX]=", fasta_header).group(1)
biomolecule_name_dict[biomolecule_id] = dataset_abr_prefix + fasta_header
# close DB connection
connection.close()
return biomolecule_name_dict
def get_combined_data(df_dict, quant_range_dict, with_transcripts=False):
# load metabolomics data matrix
metabolomics_df, metabolomics_quant_range = df_dict['metabolomics'], quant_range_dict['metabolomics']
lipidomics_df, lipidomics_quant_range = df_dict['lipidomics'], quant_range_dict['lipidomics']
proteomics_df, proteomics_quant_range = df_dict['proteomics'], quant_range_dict['proteomics']
if with_transcripts:
#transcriptomics_df, transcriptomics_quant_range = get_omics_data(dataset='transcriptomics', with_metadata=True)
transcriptomics_df, transcriptomics_quant_range = df_dict['transcriptomics'], quant_range_dict['transcriptomics']
# get quant columns
lipidomics_quant_columns = lipidomics_df.columns[:lipidomics_quant_range]
lipidomics_quant_df = lipidomics_df[lipidomics_quant_columns]
metabolomics_quant_columns = metabolomics_df.columns[:metabolomics_quant_range]
metabolomics_quant_df = metabolomics_df[metabolomics_quant_columns]
proteomics_quant_columns = proteomics_df.columns[:proteomics_quant_range]
proteomics_quant_df = proteomics_df[proteomics_quant_columns]
if with_transcripts:
transcriptomics_quant_columns = transcriptomics_df.columns[:transcriptomics_quant_range]
transcriptomics_quant_df = transcriptomics_df[transcriptomics_quant_columns]
# get clinical_metadata_df
clinical_metadata_columns = proteomics_df.columns[proteomics_quant_range:]
clinical_metadata_df = proteomics_df[clinical_metadata_columns]
clinical_metadata_df
# join quant values together
if with_transcripts:
combined_df = proteomics_quant_df.join(lipidomics_quant_df).join(metabolomics_quant_df).join(transcriptomics_quant_df)
else:
combined_df = proteomics_quant_df.join(lipidomics_quant_df).join(metabolomics_quant_df)
combined_quant_range = combined_df.shape[1]
combined_quant_columns = combined_df.columns[:combined_quant_range]
# now join with clinical metadata
combined_df = combined_df.join(clinical_metadata_df)
# drop any samples with missing values in quant columns
combined_df.dropna(subset=combined_quant_columns,inplace=True)
# also return df_dict with combined_df
df_dict['combined'] = combined_df
# update quant_range_dict
quant_range_dict['combined'] = combined_quant_range
return df_dict, quant_range_dict
def get_p_values():
# Create an engine that connects to the Covid-19 Study DB.sqlite file: engine
engine = create_engine(db_path)
# Establish connection
connection = engine.connect()
query = "SELECT * from biomolecules WHERE KEEP=1"
# get biomolecule names
biomolecules_df = pd.read_sql_query(query, connection)
# build biomolecule name dict and drop list
biomolecule_name_dict = {}
for index, row in biomolecules_df.iterrows():
biomolecule_id = str(row['biomolecule_id'])
standardized_name = row['standardized_name']
biomolecule_name_dict[biomolecule_id] = standardized_name
query = "SELECT * from pvalues"
# get biomolecule names
pvalues_df = pd.read_sql_query(query, connection)
# drop biomolecule_ids where keep!=1
drop_list = []
for index, row in pvalues_df.iterrows():
biomolecule_id = row['biomolecule_id']
if str(biomolecule_id) not in biomolecule_name_dict:
drop_list.append(index)
pvalues_df.drop(drop_list, inplace=True)
return pvalues_df
def get_volcano_data(pvalues_df, df_dict, quant_value_range,
global_names_dict, comparison_column='COVID'):
group_1_quant_value_dict = {}
formula = 'normalized_abundance ~ COVID * ICU_1 + Gender + Age_less_than_90 vs. normalized_abundance ~ ICU_1 + Gender + Age_less_than_90'
pvalues_df = pvalues_df[(pvalues_df['comparison']=='COVID_vs_NONCOVID') & (pvalues_df['formula']==formula)]
print("Volcano data pvalues shape: {}".format(pvalues_df.shape))
group_1 = 1
group_2 = 0
combined_df = df_dict['combined']
quant_value_columns = combined_df.columns[:quant_value_range]
group_1_quant_value_dict = {}
for sample_id, row in combined_df[combined_df[comparison_column]==group_1].iterrows():
for biomolecule_id in quant_value_columns:
quant_value = row[biomolecule_id]
if not biomolecule_id in group_1_quant_value_dict:
group_1_quant_value_dict[biomolecule_id] = [quant_value]
else:
group_1_quant_value_dict[biomolecule_id].append(quant_value)
group_2_quant_value_dict = {}
for sample_id, row in combined_df[combined_df[comparison_column]==group_2].iterrows():
for biomolecule_id in quant_value_columns:
quant_value = row[biomolecule_id]
if not biomolecule_id in group_2_quant_value_dict:
group_2_quant_value_dict[biomolecule_id] = [quant_value]
else:
group_2_quant_value_dict[biomolecule_id].append(quant_value)
FC_dict = {}
for biomolecule_id in quant_value_columns:
group_1_quant_values = group_1_quant_value_dict[biomolecule_id]
group_2_quant_values = group_2_quant_value_dict[biomolecule_id]
# in log2 space; subtract
FC = np.mean(group_1_quant_values) - np.mean(group_2_quant_values)
FC_dict[biomolecule_id] = FC
FC_list = []
ome_list = []
standardized_name_list = []
std_list = []
for index, row in pvalues_df.iterrows():
biomolecule_id = str(row['biomolecule_id'])
# get standard deviation
std = np.std(combined_df[biomolecule_id])
std_list.append(std)
## NOTE: should create biomolecule_ome_dict
if biomolecule_id in df_dict['proteomics'].columns:
ome_list.append("proteomics")
elif biomolecule_id in df_dict['lipidomics'].columns:
ome_list.append("lipidomics")
elif biomolecule_id in df_dict['metabolomics'].columns:
ome_list.append("metabolomics")
elif biomolecule_id in df_dict['transcriptomics'].columns:
ome_list.append("transcriptomics")
else:
#print("Biomolecule {} not mapped to ome!".format(biomolecule_id)) # don't currenly have targeted metabolomics included
ome_list.append(np.nan)
#break
try:
FC = FC_dict[biomolecule_id]
except:
# may have been a dropped biomolecule or from a different ome
FC = np.nan
FC_list.append(FC)
try:
standardized_name = global_names_dict['combined'][biomolecule_id]
except:
# may have been a dropped biomolecule or from a different ome
standardized_name = np.nan
standardized_name_list.append(standardized_name)
pvalues_df['log2_FC'] = FC_list
pvalues_df['ome_type'] = ome_list
pvalues_df['standardized_name'] = standardized_name_list
pvalues_df['neg_log10_p_value'] = pvalues_df['p_value'].apply(np.log10).apply(np.negative)
pvalues_df['std'] = std_list
return pvalues_df