--- a +++ b/src/dash/plot.py @@ -0,0 +1,455 @@ + +import plotly.express as px +import pandas as pd +import numpy as np +from scipy.spatial import distance +from plotly import graph_objects +import statsmodels.api as sm +from statsmodels.stats.outliers_influence import summary_table + +color_dict = { + "COVID_ICU":"#D53E4F", + "COVID_NONICU":"#FDAE61", + "NONCOVID_ICU":"#74ADD1", + "NONCOVID_NONICU":"#66C2A5", + "Male":"#F46D43", + "Female":"#5AAE61", + "Col7":"#8073AC", + "Col8":"#DE77AE", + "proteomics":"#9E0142", + "lipidomics":"#F4A582", + "metabolomics":"#2A4023", + "transcriptomics":"#2C0379", + "selected_biomolecule":"black" + } + +def get_color_list(combined_df): + # from combined_df + + # get colors + color_list = [] + for sample_id, row in combined_df.iterrows(): + + ICU_1 = row['ICU_1'] + COVID = row['COVID'] + + if pd.isnull(ICU_1): + color = "Col12" + + elif ICU_1 == 1 and COVID == 1: + color = color_dict["COVID_ICU"] + color = "COVID_ICU" + + elif ICU_1 == 1 and COVID == 0: + color = color_dict["NONCOVID_ICU"] + color = "NONCOVID_ICU" + + elif ICU_1 == 0 and COVID == 1: + color = color_dict["COVID_NONICU"] + color = 'COVID_NONICU' + + elif ICU_1 == 0 and COVID == 0: + color = color_dict["NONCOVID_NONICU"] + color = "NONCOVID_NONICU" + + color_list.append(color) + + return color_list + +def biomolecule_bar(combined_df, biomolecule_id, biomolecule_names_dict): + + biomolecule_name = biomolecule_names_dict[biomolecule_id] + + # set y-axis label + if "[T] " in biomolecule_name: + y_axis_title = 'log2(Norm. Count) Value' + else: + y_axis_title = 'log2(LFQ) Value' + + # sort the samples by group + color_list = get_color_list(combined_df) + combined_df['color_by'] = color_list + combined_df['sample'] = combined_df.index + combined_df.sort_values(by=['color_by', 'sample'], inplace=True) + + fig = px.bar(combined_df, x=[i for i in range(combined_df.shape[0])], + y=combined_df[biomolecule_id], + color=combined_df['color_by'], + hover_data=['sample'], + color_discrete_map=color_dict) + + fig.update_layout( + title="{}".format(biomolecule_name), + legend_title_text='Group', + xaxis_title='Sample', + yaxis_title=y_axis_title, + font=dict( + family="Helvetica", + size=18, + color="#7f7f7f")) + + return fig + +def boxplot(combined_df, biomolecule_id, biomolecule_names_dict): + + biomolecule_name = biomolecule_names_dict[biomolecule_id] + + # set y-axis label + if "[T] " in biomolecule_name: + y_axis_title = 'log2(Norm. Count) Value' + else: + y_axis_title = 'log2(LFQ) Value' + + color_list = get_color_list(combined_df) + + df = pd.DataFrame({'y':combined_df[biomolecule_id], 'color':color_list, + 'sample':combined_df.index}) + + fig = px.box(df, y="y", color="color", color_discrete_map=color_dict, + points='all', hover_data=['sample']) + + fig.update_traces(quartilemethod="exclusive") # or "inclusive", or "linear" by default + fig.update_layout( + title="{}".format(biomolecule_name), + legend_title_text='Group', + xaxis_title='Group', + yaxis_title=y_axis_title, + showlegend=False, + font=dict( + family="Helvetica", + size=18, + color="#7f7f7f")) + + + return fig + +def pca_scores_plot(combined_df, quant_value_range): + + from sklearn.decomposition import PCA + + quant_columns = combined_df.columns[:quant_value_range] + quant_df = combined_df[quant_columns] + + pca = PCA(n_components = 10) + PCA = pca.fit_transform(quant_df) + + PC1s = [] + PC2s = [] + for PCs in PCA: + PC1 = PCs[0] + PC2 = PCs[1] + PC1s.append(PC1) + PC2s.append(PC2) + + color_list = get_color_list(combined_df) + + df = pd.DataFrame({'x':PC1s, 'y':PC2s, + 'sample_id':combined_df.index.tolist(), + 'COVID':combined_df['COVID']}) + + fig = px.scatter(df, x="x", y="y", hover_data=['sample_id'], + color=color_list, + color_discrete_map=color_dict, + symbol='COVID') + + fig.update_traces(marker=dict(size=15, opacity=0.8)) + + fig.update_layout( + title="Samples (n={})".format(quant_df.shape[0]), + legend_title_text='Group', + xaxis_title='PC1 ({}%)'.format(round(100*pca.explained_variance_ratio_[0],1)), + yaxis_title='PC2 ({}%)'.format(round(100*pca.explained_variance_ratio_[1],1)), + showlegend=False, + font=dict( + family="Helvetica", + size=18, + color="#7f7f7f") + ) + + return fig + +def pca_loadings_plot(combined_df, quant_value_range, dataset_id, biomolecule_names_dict, ome_type_list): + + from sklearn.decomposition import PCA + + quant_columns = combined_df.columns[:quant_value_range] + # # NOTE: For some reason, quant_df here ends up with larger shape... + quant_df = combined_df[quant_columns] + + # NOTE: There appear to be duplicate lipid names + # All lipid features currently set to keep=1 + quant_df = quant_df.loc[:,~quant_df.columns.duplicated()] + + pca = PCA(n_components = 10) + PCA = pca.fit_transform(quant_df) + + loadings = pca.components_.T * np.sqrt(pca.explained_variance_) + + PC1_index = 0 + PC1_loadings = [x[PC1_index] for x in loadings] + PC2_index = 1 + PC2_loadings = [y[PC2_index] for y in loadings] + + df = pd.DataFrame({'x':PC1_loadings, 'y':PC2_loadings, + 'biomolecule_id':quant_df.columns.tolist(), + 'standardized_name':[biomolecule_names_dict[i] for i in quant_df.columns.tolist()], + 'ome_type':ome_type_list}) + + # downsample larger plots + if df.shape[0] > 1000: + keep_list = downsample_scatter_data_by_variance(quant_df) + + df_drop_list = [] + for index,row in df.iterrows(): + if not row['biomolecule_id'] in keep_list: + df_drop_list.append(index) + df = df.drop(df_drop_list) + + fig = px.scatter(df, x="x", y="y", + hover_data=['biomolecule_id', 'standardized_name'], + color="ome_type", + color_discrete_map=color_dict) + + fig.update_traces(marker=dict(size=10, opacity=0.5)) + + fig.update_layout( + title="{} (n={})".format(dataset_id, quant_df.shape[1]), + legend_title_text='Group', + xaxis_title='Loadings on PC1', + yaxis_title='Loadings on PC2', + font=dict( + family="Helvetica", + size=18, + color="#7f7f7f") + ) + + # only show the color legend with combined datasets + #if not dataset_id=="Combined": + # fig.update_layout(showlegend=False) + + return fig + +def downsample_scatter_data(df): + + # df should have, x, y, and ome_type columns + + # filter top n biomolecules on loadings plot, by distance from origin + origin = (0,0) + distance_list = [] + for index, row in df.iterrows(): + x = row['x'] + y = row['y'] + coordinates = (x, y) + d = distance.euclidean(origin, coordinates) + distance_list.append(d) + + df['distance_from_origin'] = distance_list + + distance_std = np.std(distance_list) + downsample_range = distance_std + + drop_index_list = [] + for ome_type in list(set(df['ome_type'])): + # drop 20 % of measurements for each ome, randomly subsample 50% of those + #drop_row_num = round(df[df['ome_type'] == ome_type].shape[0] * 0.2) + #drop_indices = df[df['ome_type'] == ome_type].\ + # sort_values(by='distance_from_origin').\ + # iloc[:drop_row_num].sample(frac=0.5, random_state=1).index.tolist() + + # randomly downsample half of data within one standard deviation from origin + ome_df = df[(df['ome_type'] == ome_type) & (df['distance_from_origin'] < downsample_range)] + drop_indices = ome_df.sample(random_state=1, frac=0.25).index.tolist() + drop_index_list.extend(drop_indices) + + df = df.drop(drop_index_list) + + return df + +def downsample_scatter_data_by_variance(df): + # return top 1000 features by variance + keep_list = df.std(axis=0).sort_values(ascending=False)[:1000] + + return keep_list + +def downsample_volcano_data(df): + + keep_index_list = [] + for ome_type in list(set(df['ome_type'])): + # keep data for each ome with top 1000 features by variance (std) + ome_df = df[df['ome_type'] == ome_type].sort_values(by='std', ascending=False) + #ome_df = df[df['ome_type'] == ome_type].sort_values(by='q_value', ascending=True) + keep_indices = ome_df.index.tolist()[:2000] + keep_index_list.extend(keep_indices) + + df = df.loc[keep_index_list] + + return df + +def volcano_plot(volcano_df): + + #volcano_df.dropna(inplace=True) + volcano_df = volcano_df.dropna() + + df = pd.DataFrame({'x':volcano_df['log2_FC'], + 'y':volcano_df['neg_log10_p_value'], + 'biomolecule_id':volcano_df['biomolecule_id'], + 'standardized_name':volcano_df['standardized_name'], + 'ome_type':volcano_df['ome_type'], + 'p_value':volcano_df['p_value'], + 'q_value':volcano_df['q_value'], + 'std':volcano_df['std']}) + + df = downsample_volcano_data(df) + + fig = px.scatter(df, x="x", y="y", + hover_data=['biomolecule_id', 'standardized_name', 'p_value', 'q_value'], + opacity=0.5, + size='y', + color='ome_type', + color_discrete_map=color_dict) + + #fig.update_traces(marker=dict(size=10, opacity=0.5)) + + #confounders = ", ".join(volcano_df.iloc[0]['confounders'].split(";")) + title = "COVID vs NONCOVID" + + fig.update_layout( + title="{} (n={})".format(title, volcano_df.shape[0]), + legend_title_text='Dataset', + xaxis_title='Effect Size (log2 FC)', + yaxis_title='Significance (-log10(Corrected P-value))', + font=dict( + family="Helvetica", + size=18, + color="#7f7f7f") + ) + + return fig + +def correlation_scatter(combined_df, biomolecule_id, selected_groups, + biomolecule_name, clinical_measurement): + + # set y-axis label + if "[T] " in biomolecule_name: + y_axis_title = '{} \nlog2(Norm. Counts)'.format(biomolecule_name) + + else: + y_axis_title = '{} \nlog2(LFQ)'.format(biomolecule_name) + + # shorten biomolecule name + if len(biomolecule_name) > 15: + biomolecule_name = biomolecule_name[:15] + ".." + + ## NOTE: See https://pythonplot.com/ for confidence interval example + if clinical_measurement == "Gender": + combined_df.replace("M", 0, inplace=True) + combined_df.replace("F", 1, inplace=True) + + # drop samples with missing values for clinical measurement + # strip any whitespace + combined_df = combined_df.apply(lambda x: x.astype(str).str.strip() if x.dtype == "object" else x) + combined_df.replace('', np.nan, inplace=True) + combined_df = combined_df.dropna(subset=[clinical_measurement, 'COVID']) + + color_list = get_color_list(combined_df) + combined_df['group'] = color_list + + # drop by group + for group in ['COVID_ICU', 'COVID_NONICU', 'NONCOVID_ICU', "NONCOVID_NONICU"]: + if not group in selected_groups: + combined_df = combined_df[combined_df['group'] != group] + + # set target and explanatory variables + y_var = biomolecule_id + x_var = clinical_measurement + + ### Run regression with statsmodels ### + x=combined_df[x_var].astype(float) + y=combined_df[y_var].astype(float) + X = sm.add_constant(x) + res = sm.OLS(y, X).fit() + rsquared = round(res.rsquared, 3) + p_val = '%.3E' % res.f_pvalue + + # get regression data for range of HFD values + x_min = round(min(x),1) + x_max = round(max(x), 1) + x_range = np.arange(x_min,x_max + 0.1 ,0.1) + #x_range = np.array([i for i in range(min(combined_df[biomolecule_id]), max(combined_df[biomolecule_id], 0.1))]) + X = sm.add_constant(x_range) + out_of_sample_predictions = res.get_prediction(X) + preds = out_of_sample_predictions.summary_frame(alpha=0.05) + + ### + + df = pd.DataFrame({'x':x, + 'y':y, + 'sample_id':combined_df.index.tolist(), + 'COVID':combined_df['COVID'], + 'group':combined_df['group']}) + + fig = px.scatter(df, x="x", y="y", hover_data=['sample_id'], + color='group', + color_discrete_map=color_dict) + + fig.update_traces(marker=dict(size=15, opacity=0.8)) + + # add regression data + line_of_best_fit = graph_objects.Scatter({ + 'mode' : 'lines', + 'x' : x_range, + 'y' : preds['mean'], + 'name' : 'Trend', + 'opacity' : 0.6, + 'line' : { + 'color' : 'black' + } + }) + + #Add a lower bound for the confidence interval, white + mean_ci_lower = graph_objects.Scatter({ + 'mode' : 'lines', + 'x' : x_range, + 'y' : preds['mean_ci_lower'], + 'name' : 'Lower 95% CI', + 'showlegend' : False, + 'line' : { + 'color' : 'white' + } + }) + # Upper bound for the confidence band, transparent but with fill + mean_ci_upper = graph_objects.Scatter( { + 'type' : 'scatter', + 'mode' : 'lines', + 'x' : x_range, + 'y' : preds['mean_ci_upper'], + 'name' : '95% CI', + 'fill' : 'tonexty', + 'line' : { + 'color' : 'white' + }, + 'fillcolor' : 'rgba(255, 127, 14, 0.3)' + }) + + fig.add_trace(line_of_best_fit) + fig.add_trace(mean_ci_lower) + fig.add_trace(mean_ci_upper) + + if len(x_var) > 10: + x_var = x_var[:10] + ".." + formula = "Biomolecule {} ~ {}".format(biomolecule_id, x_var) + plot_title = "{}, R2: {}, p value: {}, n={}".format(formula, rsquared, p_val, combined_df.shape[0]) + + fig.update_layout( + title=plot_title, + legend_title_text='Group', + xaxis_title='{}'.format(clinical_measurement), + yaxis_title=y_axis_title, + showlegend=True, + font=dict( + family="Helvetica", + size=18, + color="#7f7f7f") + ) + + return fig