--- a
+++ b/src/dash/plot.py
@@ -0,0 +1,455 @@
+
+import plotly.express as px
+import pandas as pd
+import numpy as np
+from scipy.spatial import distance
+from plotly import graph_objects
+import statsmodels.api as sm
+from statsmodels.stats.outliers_influence import summary_table
+
+color_dict = {
+                "COVID_ICU":"#D53E4F",
+                "COVID_NONICU":"#FDAE61",
+                "NONCOVID_ICU":"#74ADD1",
+                "NONCOVID_NONICU":"#66C2A5",
+                "Male":"#F46D43",
+                "Female":"#5AAE61",
+                "Col7":"#8073AC",
+                "Col8":"#DE77AE",
+                "proteomics":"#9E0142",
+                "lipidomics":"#F4A582",
+                "metabolomics":"#2A4023",
+                "transcriptomics":"#2C0379",
+                "selected_biomolecule":"black"
+                }
+
+def get_color_list(combined_df):
+    # from combined_df
+
+    # get colors
+    color_list = []
+    for sample_id, row in combined_df.iterrows():
+
+        ICU_1 = row['ICU_1']
+        COVID = row['COVID']
+
+        if pd.isnull(ICU_1):
+            color = "Col12"
+
+        elif ICU_1 == 1 and COVID == 1:
+            color = color_dict["COVID_ICU"]
+            color = "COVID_ICU"
+
+        elif ICU_1 == 1 and COVID == 0:
+            color = color_dict["NONCOVID_ICU"]
+            color = "NONCOVID_ICU"
+
+        elif ICU_1 == 0 and COVID == 1:
+            color = color_dict["COVID_NONICU"]
+            color = 'COVID_NONICU'
+
+        elif ICU_1 == 0 and COVID == 0:
+            color = color_dict["NONCOVID_NONICU"]
+            color = "NONCOVID_NONICU"
+
+        color_list.append(color)
+
+    return color_list
+
+def biomolecule_bar(combined_df, biomolecule_id, biomolecule_names_dict):
+
+    biomolecule_name = biomolecule_names_dict[biomolecule_id]
+
+    # set y-axis label
+    if "[T] " in biomolecule_name:
+        y_axis_title = 'log2(Norm. Count) Value'
+    else:
+        y_axis_title = 'log2(LFQ) Value'
+
+    # sort the samples by group
+    color_list = get_color_list(combined_df)
+    combined_df['color_by'] = color_list
+    combined_df['sample'] = combined_df.index
+    combined_df.sort_values(by=['color_by', 'sample'], inplace=True)
+
+    fig = px.bar(combined_df, x=[i for i in range(combined_df.shape[0])],
+        y=combined_df[biomolecule_id],
+        color=combined_df['color_by'],
+        hover_data=['sample'],
+        color_discrete_map=color_dict)
+
+    fig.update_layout(
+        title="{}".format(biomolecule_name),
+        legend_title_text='Group',
+        xaxis_title='Sample',
+        yaxis_title=y_axis_title,
+        font=dict(
+            family="Helvetica",
+            size=18,
+            color="#7f7f7f"))
+
+    return fig
+
+def boxplot(combined_df, biomolecule_id, biomolecule_names_dict):
+
+    biomolecule_name = biomolecule_names_dict[biomolecule_id]
+
+    # set y-axis label
+    if "[T] " in biomolecule_name:
+        y_axis_title = 'log2(Norm. Count) Value'
+    else:
+        y_axis_title = 'log2(LFQ) Value'
+
+    color_list = get_color_list(combined_df)
+
+    df = pd.DataFrame({'y':combined_df[biomolecule_id], 'color':color_list,
+                        'sample':combined_df.index})
+
+    fig = px.box(df, y="y", color="color", color_discrete_map=color_dict,
+                points='all', hover_data=['sample'])
+
+    fig.update_traces(quartilemethod="exclusive") # or "inclusive", or "linear" by default
+    fig.update_layout(
+        title="{}".format(biomolecule_name),
+        legend_title_text='Group',
+        xaxis_title='Group',
+        yaxis_title=y_axis_title,
+        showlegend=False,
+        font=dict(
+            family="Helvetica",
+            size=18,
+            color="#7f7f7f"))
+
+
+    return fig
+
+def pca_scores_plot(combined_df, quant_value_range):
+
+    from sklearn.decomposition import PCA
+
+    quant_columns = combined_df.columns[:quant_value_range]
+    quant_df = combined_df[quant_columns]
+
+    pca = PCA(n_components = 10)
+    PCA = pca.fit_transform(quant_df)
+
+    PC1s = []
+    PC2s = []
+    for PCs in PCA:
+        PC1 = PCs[0]
+        PC2 = PCs[1]
+        PC1s.append(PC1)
+        PC2s.append(PC2)
+
+    color_list = get_color_list(combined_df)
+
+    df = pd.DataFrame({'x':PC1s, 'y':PC2s,
+        'sample_id':combined_df.index.tolist(),
+        'COVID':combined_df['COVID']})
+
+    fig = px.scatter(df, x="x", y="y", hover_data=['sample_id'],
+                    color=color_list,
+                    color_discrete_map=color_dict,
+                    symbol='COVID')
+
+    fig.update_traces(marker=dict(size=15, opacity=0.8))
+
+    fig.update_layout(
+        title="Samples (n={})".format(quant_df.shape[0]),
+        legend_title_text='Group',
+        xaxis_title='PC1 ({}%)'.format(round(100*pca.explained_variance_ratio_[0],1)),
+        yaxis_title='PC2 ({}%)'.format(round(100*pca.explained_variance_ratio_[1],1)),
+        showlegend=False,
+        font=dict(
+            family="Helvetica",
+            size=18,
+            color="#7f7f7f")
+        )
+
+    return fig
+
+def pca_loadings_plot(combined_df, quant_value_range, dataset_id, biomolecule_names_dict, ome_type_list):
+
+    from sklearn.decomposition import PCA
+
+    quant_columns = combined_df.columns[:quant_value_range]
+    # # NOTE: For some reason, quant_df here ends up with larger shape...
+    quant_df = combined_df[quant_columns]
+
+    #  NOTE: There appear to be duplicate lipid names
+    # All lipid features currently set to keep=1
+    quant_df = quant_df.loc[:,~quant_df.columns.duplicated()]
+
+    pca = PCA(n_components = 10)
+    PCA = pca.fit_transform(quant_df)
+
+    loadings = pca.components_.T * np.sqrt(pca.explained_variance_)
+
+    PC1_index = 0
+    PC1_loadings = [x[PC1_index] for x in loadings]
+    PC2_index = 1
+    PC2_loadings = [y[PC2_index] for y in loadings]
+
+    df = pd.DataFrame({'x':PC1_loadings, 'y':PC2_loadings,
+        'biomolecule_id':quant_df.columns.tolist(),
+        'standardized_name':[biomolecule_names_dict[i] for i in quant_df.columns.tolist()],
+        'ome_type':ome_type_list})
+
+    # downsample larger plots
+    if df.shape[0] > 1000:
+        keep_list = downsample_scatter_data_by_variance(quant_df)
+
+        df_drop_list = []
+        for index,row in df.iterrows():
+            if not row['biomolecule_id'] in keep_list:
+                df_drop_list.append(index)
+        df = df.drop(df_drop_list)
+
+    fig = px.scatter(df, x="x", y="y",
+        hover_data=['biomolecule_id', 'standardized_name'],
+        color="ome_type",
+        color_discrete_map=color_dict)
+
+    fig.update_traces(marker=dict(size=10, opacity=0.5))
+
+    fig.update_layout(
+        title="{} (n={})".format(dataset_id, quant_df.shape[1]),
+        legend_title_text='Group',
+        xaxis_title='Loadings on PC1',
+        yaxis_title='Loadings on PC2',
+        font=dict(
+            family="Helvetica",
+            size=18,
+            color="#7f7f7f")
+        )
+
+    # only show the color legend with combined datasets
+    #if not dataset_id=="Combined":
+    #    fig.update_layout(showlegend=False)
+
+    return fig
+
+def downsample_scatter_data(df):
+
+    # df should have, x, y, and ome_type columns
+
+    # filter top n biomolecules on loadings plot, by distance from origin
+    origin = (0,0)
+    distance_list = []
+    for index, row in df.iterrows():
+        x = row['x']
+        y = row['y']
+        coordinates = (x, y)
+        d = distance.euclidean(origin, coordinates)
+        distance_list.append(d)
+
+    df['distance_from_origin'] = distance_list
+
+    distance_std = np.std(distance_list)
+    downsample_range = distance_std
+
+    drop_index_list = []
+    for ome_type in list(set(df['ome_type'])):
+        # drop 20 % of measurements for each ome, randomly subsample 50% of those
+        #drop_row_num = round(df[df['ome_type'] == ome_type].shape[0] * 0.2)
+        #drop_indices = df[df['ome_type'] == ome_type].\
+        #    sort_values(by='distance_from_origin').\
+        #    iloc[:drop_row_num].sample(frac=0.5, random_state=1).index.tolist()
+
+        # randomly downsample half of data within one standard deviation from origin
+        ome_df = df[(df['ome_type'] == ome_type) & (df['distance_from_origin'] < downsample_range)]
+        drop_indices = ome_df.sample(random_state=1, frac=0.25).index.tolist()
+        drop_index_list.extend(drop_indices)
+
+    df = df.drop(drop_index_list)
+
+    return df
+
+def downsample_scatter_data_by_variance(df):
+    # return top 1000 features by variance
+    keep_list = df.std(axis=0).sort_values(ascending=False)[:1000]
+
+    return keep_list
+
+def downsample_volcano_data(df):
+
+    keep_index_list = []
+    for ome_type in list(set(df['ome_type'])):
+        # keep data for each ome with top 1000 features by variance (std)
+        ome_df = df[df['ome_type'] == ome_type].sort_values(by='std', ascending=False)
+        #ome_df = df[df['ome_type'] == ome_type].sort_values(by='q_value', ascending=True)
+        keep_indices = ome_df.index.tolist()[:2000]
+        keep_index_list.extend(keep_indices)
+
+    df = df.loc[keep_index_list]
+
+    return df
+
+def volcano_plot(volcano_df):
+
+    #volcano_df.dropna(inplace=True)
+    volcano_df = volcano_df.dropna()
+
+    df = pd.DataFrame({'x':volcano_df['log2_FC'],
+        'y':volcano_df['neg_log10_p_value'],
+        'biomolecule_id':volcano_df['biomolecule_id'],
+        'standardized_name':volcano_df['standardized_name'],
+        'ome_type':volcano_df['ome_type'],
+        'p_value':volcano_df['p_value'],
+        'q_value':volcano_df['q_value'],
+        'std':volcano_df['std']})
+
+    df = downsample_volcano_data(df)
+
+    fig = px.scatter(df, x="x", y="y",
+    hover_data=['biomolecule_id', 'standardized_name', 'p_value', 'q_value'],
+    opacity=0.5,
+    size='y',
+    color='ome_type',
+    color_discrete_map=color_dict)
+
+    #fig.update_traces(marker=dict(size=10, opacity=0.5))
+
+    #confounders = ", ".join(volcano_df.iloc[0]['confounders'].split(";"))
+    title = "COVID vs NONCOVID"
+
+    fig.update_layout(
+        title="{} (n={})".format(title, volcano_df.shape[0]),
+        legend_title_text='Dataset',
+        xaxis_title='Effect Size (log2 FC)',
+        yaxis_title='Significance (-log10(Corrected P-value))',
+        font=dict(
+            family="Helvetica",
+            size=18,
+            color="#7f7f7f")
+        )
+
+    return fig
+
+def correlation_scatter(combined_df, biomolecule_id, selected_groups,
+    biomolecule_name, clinical_measurement):
+
+    # set y-axis label
+    if "[T] " in biomolecule_name:
+        y_axis_title = '{} \nlog2(Norm. Counts)'.format(biomolecule_name)
+
+    else:
+        y_axis_title = '{} \nlog2(LFQ)'.format(biomolecule_name)
+
+    # shorten biomolecule name
+    if len(biomolecule_name) > 15:
+        biomolecule_name = biomolecule_name[:15] + ".."
+
+    ## NOTE: See https://pythonplot.com/ for confidence interval example
+    if clinical_measurement == "Gender":
+        combined_df.replace("M", 0, inplace=True)
+        combined_df.replace("F", 1, inplace=True)
+
+    # drop samples with missing values for clinical measurement
+    # strip any whitespace
+    combined_df = combined_df.apply(lambda x: x.astype(str).str.strip() if x.dtype == "object" else x)
+    combined_df.replace('', np.nan, inplace=True)
+    combined_df = combined_df.dropna(subset=[clinical_measurement, 'COVID'])
+
+    color_list = get_color_list(combined_df)
+    combined_df['group'] = color_list
+
+    # drop by group
+    for group in ['COVID_ICU', 'COVID_NONICU', 'NONCOVID_ICU', "NONCOVID_NONICU"]:
+        if not group in selected_groups:
+            combined_df = combined_df[combined_df['group'] != group]
+
+    # set target and explanatory variables
+    y_var = biomolecule_id
+    x_var = clinical_measurement
+
+    ### Run regression with statsmodels ###
+    x=combined_df[x_var].astype(float)
+    y=combined_df[y_var].astype(float)
+    X = sm.add_constant(x)
+    res = sm.OLS(y, X).fit()
+    rsquared = round(res.rsquared, 3)
+    p_val = '%.3E' % res.f_pvalue
+
+    # get regression data for range of HFD values
+    x_min = round(min(x),1)
+    x_max = round(max(x), 1)
+    x_range = np.arange(x_min,x_max + 0.1 ,0.1)
+    #x_range = np.array([i for i in range(min(combined_df[biomolecule_id]), max(combined_df[biomolecule_id], 0.1))])
+    X = sm.add_constant(x_range)
+    out_of_sample_predictions = res.get_prediction(X)
+    preds = out_of_sample_predictions.summary_frame(alpha=0.05)
+
+    ###
+
+    df = pd.DataFrame({'x':x,
+        'y':y,
+        'sample_id':combined_df.index.tolist(),
+        'COVID':combined_df['COVID'],
+        'group':combined_df['group']})
+
+    fig = px.scatter(df, x="x", y="y", hover_data=['sample_id'],
+                    color='group',
+                    color_discrete_map=color_dict)
+
+    fig.update_traces(marker=dict(size=15, opacity=0.8))
+
+    # add regression data
+    line_of_best_fit = graph_objects.Scatter({
+    'mode' : 'lines',
+    'x' : x_range,
+    'y' : preds['mean'],
+    'name' : 'Trend',
+    'opacity' : 0.6,
+    'line' : {
+        'color' : 'black'
+    }
+    })
+
+    #Add a lower bound for the confidence interval, white
+    mean_ci_lower = graph_objects.Scatter({
+        'mode' : 'lines',
+        'x' : x_range,
+        'y' : preds['mean_ci_lower'],
+        'name' : 'Lower 95% CI',
+        'showlegend' : False,
+        'line' : {
+            'color' : 'white'
+        }
+    })
+    # Upper bound for the confidence band, transparent but with fill
+    mean_ci_upper = graph_objects.Scatter( {
+        'type' : 'scatter',
+        'mode' : 'lines',
+        'x' : x_range,
+        'y' : preds['mean_ci_upper'],
+        'name' : '95% CI',
+        'fill' : 'tonexty',
+        'line' : {
+            'color' : 'white'
+        },
+        'fillcolor' : 'rgba(255, 127, 14, 0.3)'
+    })
+
+    fig.add_trace(line_of_best_fit)
+    fig.add_trace(mean_ci_lower)
+    fig.add_trace(mean_ci_upper)
+
+    if len(x_var) > 10:
+        x_var = x_var[:10] + ".."
+    formula = "Biomolecule {} ~ {}".format(biomolecule_id, x_var)
+    plot_title = "{}, R2: {}, p value: {}, n={}".format(formula, rsquared, p_val, combined_df.shape[0])
+
+    fig.update_layout(
+        title=plot_title,
+        legend_title_text='Group',
+        xaxis_title='{}'.format(clinical_measurement),
+        yaxis_title=y_axis_title,
+        showlegend=True,
+        font=dict(
+            family="Helvetica",
+            size=18,
+            color="#7f7f7f")
+        )
+
+    return fig