[bd22c4]: / src / db_queries / db_queries.py

Download this file

135 lines (102 with data), 5.2 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
from sqlalchemy import create_engine, MetaData, Table, select, join
import pandas as pd
from argparse import ArgumentParser
from argparse import ArgumentDefaultsHelpFormatter
from datetime import datetime
#argument parser
parser = ArgumentParser(description="Script to generate matrix from omics dataset.",\
epilog="Ya Welcome!",\
formatter_class=ArgumentDefaultsHelpFormatter)
parser.add_argument('-d', '--dataset', metavar='e.g., "proteomics"',
help='Select dataset', required=True)
args = vars(parser.parse_args())
# SQLite path
db_path = 'sqlite:///../../data/SQLite Database/20200609/Covid-19 Study DB.sqlite'
def get_omics_data(with_metadata=False, dataset="proteomics"):
omics_id_dict = {
"proteomics":1,
"lipidomics":2,
"metabolomics":3,
"transcriptomics":5
}
omics_id = omics_id_dict[dataset]
# Create an engine that connects to the Covid-19 Study DB.sqlite file: engine
engine = create_engine(db_path)
# Establish connection
connection = engine.connect()
# pull table into df
query = "SELECT * from {}_measurements".format(dataset)
omics_measurements_df = pd.read_sql_query(query, connection)
# pull table into df
query = "SELECT * from {}_runs".format(dataset)
omics_runs_df = pd.read_sql_query(query, connection)
# pull table into df
if dataset == "metabolomics":
query = "SELECT * from rawfiles WHERE ome_id=3 OR ome_id=4 AND sample_ID<>-1 AND keep=1".format(omics_id)
else:
query = "SELECT * from rawfiles WHERE ome_id={} AND sample_ID<>-1 and keep=1".format(omics_id)
rawfiles_df = pd.read_sql_query(query, connection)
## NOTE: For some reason, SQL filter not working for keep on raw files
rawfiles_df = rawfiles_df[rawfiles_df['keep']==1]
# pull table into df
deidentified_patient_metadata_df = pd.read_sql_query("SELECT * from deidentified_patient_metadata", connection)
# make sure the merge by columns are all the same type -> pandas seems sensitive to this
omics_measurements_df = omics_measurements_df.astype({'replicate_id': 'int32'})
omics_runs_df = omics_runs_df.astype({'replicate_id': 'int32', 'rawfile_id': 'int32'})
rawfiles_df = rawfiles_df.astype({'rawfile_id': 'int32', 'sample_id': 'int32'})
deidentified_patient_metadata_df = deidentified_patient_metadata_df.astype({'sample_id': 'int32'})
joined_df = omics_measurements_df\
.join(omics_runs_df.set_index('replicate_id'), on='replicate_id')\
.join(rawfiles_df.set_index('rawfile_id'), on='rawfile_id')\
.join(deidentified_patient_metadata_df.set_index('sample_id'), on='sample_id')
# drop samples that are missing COVID or ICU status
joined_df.dropna(subset=['ICU_1','COVID'], inplace=True)
# pivot to wide format
wide_df = joined_df.pivot_table(index='sample_id', columns='biomolecule_id', values='normalized_abundance')
wide_df.columns = [str(col) for col in wide_df.columns]
if dataset == "metabolomics":
query = "SELECT * from biomolecules WHERE omics_id=3 OR omics_id=4".format(omics_id)
else:
query = "SELECT * from biomolecules WHERE omics_id={}".format(omics_id)
# get biomolecule names
biomolecules_df = pd.read_sql_query(query, connection)
# close DB connection
connection.close()
# build biomolecule name dict and drop list
biomolecule_name_dict = {}
biomolecule_drop_list = []
for index, row in biomolecules_df.iterrows():
biomolecule_id = str(row['biomolecule_id'])
standardized_name = row['standardized_name']
biomolecule_name_dict[biomolecule_id] = standardized_name
keep = row['keep']
if keep!="1":
biomolecule_drop_list.append(biomolecule_id)
# drop biomolecules
wide_df.drop(biomolecule_drop_list, axis=1, inplace=True)
# replace wide_df column names
new_col_names = []
for col in wide_df.columns:
new_col_names.append(biomolecule_name_dict[str(col)])
wide_df.columns = new_col_names
# record quant value range
quant_value_range = wide_df.shape[1]
# optional return matrix with clinical metadata
if with_metadata:
combined_df = wide_df.join(deidentified_patient_metadata_df.set_index('sample_id'), on='sample_id')#.dropna()
return combined_df, quant_value_range
return wide_df, quant_value_range
if __name__ == "__main__":
dataset=args['dataset'].lower()
if not dataset in ['proteomics', 'metabolomics', 'lipidomics', 'transcriptomics']:
print("Dataset {} not recognized...".format(dataset))
print("Please select from: {}".format(",".join(dataset)))
omics_df, quant_value_range = get_omics_data(with_metadata=True, dataset=dataset)
quant_columns = omics_df.columns[:quant_value_range]
quant_df = omics_df[quant_columns]
print("{} combined data set has shape: {}".format(dataset, omics_df.shape))
print("{} quant data set has shape: {}".format(dataset, quant_df.shape))
date = datetime.today().strftime('%Y-%m-%d')
outpath = "{}_{}.tsv".format(date, dataset)
print("Writing out data to: {}".format(outpath))
omics_df.to_csv(outpath, sep="\t")