In [1]:
import pandas as pd
import numpy as np
import os
import faiss
from sklearn.preprocessing import StandardScaler, MinMaxScaler, QuantileTransformer
from plotly import offline
from sklearn.decomposition import TruncatedSVD
from MulticoreTSNE import MulticoreTSNE as TSNE
import umap

import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot

files = os.listdir("../data")

def fix_array(x):
    x = np.fromstring(
    x.replace('\n','')
    .replace('[','')
    .replace(']','')
    .replace('  ',' '), sep=' ')
    return x.reshape((1, 768))

qa = pd.read_csv("../data/" + files[0])
for file in files[1:]:
    print(file)
    qa = pd.concat([qa, pd.read_csv("../data/" + file)], axis = 0)
    

qa.drop(["answer_bert", "question_bert", "Unnamed: 0"], axis = 1, inplace = True)

qa["Q_FFNN_embeds"] = qa["Q_FFNN_embeds"].apply(fix_array)
qa["A_FFNN_embeds"] = qa["A_FFNN_embeds"].apply(fix_array)


.DS_Store
EHealthQAwithBertAndFFNNEmbeddings.csv
HealthTapFFNNEmbeddings.csv
askDocsFFNNEmbeddings.csv
webMDFFNNEmbeddings.csv


In [None]:
for n_items in range(100, 5000, 500):
    for perplexity in range(1, 40, 5):
        n_iters = 5000
        qa = qa.sample(frac = 1)
        qa.reset_index(inplace = True, drop = True)
        question_bert = np.concatenate(qa["Q_FFNN_embeds"].values, axis=0)
        answer_bert = np.concatenate(qa["A_FFNN_embeds"].values, axis=0)
        question_bert = question_bert.astype('float32')
        answer_bert = answer_bert.astype('float32')

        answer_index = faiss.IndexFlatIP(answer_bert.shape[-1])
        answer_index.add(answer_bert)

        question_index = faiss.IndexFlatIP(question_bert.shape[-1])
        question_index.add(question_bert)

        k = len(question_bert)
        D1, I1 = answer_index.search(question_bert[0:1].astype('float32'), k)
        D2, I2 = question_index.search(question_bert[0:1].astype('float32'), k)
        QT = QuantileTransformer()
        D2 = (QT.fit_transform(-D2.T)**4) #* 20
        D1 = (QT.fit_transform(-D1.T)**4) #* 20
        closest_ind_q = list(I2[0, :n_items]) +list(I2[0, -n_items:])                                          
        closest_ind_a = list(I1[0, :n_items]) +list(I1[0, -n_items:])
        dist_answers = answer_bert[closest_ind_a, :]
        dist_questions = question_bert[closest_ind_q, :]
        D1_answers = D1[closest_ind_a, :]
        D2_questions = D2[closest_ind_q, :]
        reducer = TSNE(n_components = 3, perplexity=perplexity, n_iter = n_iters)
        reduced_dimensions = reducer.fit_transform(np.concatenate([dist_questions, dist_answers, answer_bert[0:1]], axis = 0))
        question_bert_3d_close = reduced_dimensions[:n_items]
        question_bert_3d_far = reduced_dimensions[n_items:n_items*2]
        answer_bert_3d_close = reduced_dimensions[n_items*2:n_items*3]
        answer_bert_3d_far = reduced_dimensions[n_items*3:-1]
        question_bert_dist_close = D2_questions[:n_items]
        question_bert_dist_far = D2_questions[n_items:n_items*2]
        answer_bert_dist_close = D1_answers[n_items*2:n_items*3]
        answer_bert_dist_far = D1_answers[n_items*3:-1]

        init_notebook_mode(connected=True)

        orig_q = go.Scatter3d(
            name = "Original Question",
            x=question_bert_3d_close[0:1,0],
            y=question_bert_3d_close[0:1,1],
            z=question_bert_3d_close[0:1,2],
            mode='markers',
            text = qa["question"].loc[closest_ind_q[:1]],
            marker=dict(
                size=12,
                line=dict(
                    color='rgba(255, 0, 0, 0.14)',
                    width=0.1
                ),
                opacity=1.0
            )
        )
        orig_a = go.Scatter3d(
            name = "Original Answer",
            x=reduced_dimensions[-1:,0],
            y=reduced_dimensions[-1:,1],
            z=reduced_dimensions[-1:,2],
            mode='markers',
            text = qa["answer"][0:1],
            marker=dict(
                size=12,
                line=dict(
                    color='rgba(0, 255, 0, 0.14)',
                    width=0.1
                ),
                opacity=1.0
            )
        )
        recommended_a = go.Scatter3d(
            name = "Recommended Answers",
            x=answer_bert_3d_close[0:5,0],
            y=answer_bert_3d_close[0:5,1],
            z=answer_bert_3d_close[0:5,2],
            mode='markers',
            text = qa["answer"].loc[closest_ind_a[:5]],
            marker=dict(
                size=12,
                line=dict(
                    color='rgba(0, 255, 0, 0.14)',
                    width=0.1
                ),
                opacity=1.0
            )
        )

        close_q = go.Scatter3d(
            name = "Similar Questions",
            x=question_bert_3d_close[:,0],
            y=question_bert_3d_close[:,1],
            z=question_bert_3d_close[:,2],
            mode='markers',
            text = qa["question"].loc[closest_ind_q],
            marker=dict(
                size=question_bert_dist_close*16,
                line=dict(
                    color='rgba(217, 217, 217, 0.14)',
                    width=0.1
                ),
                opacity=0.8
            )
        )

        close_a = go.Scatter3d(
            name = "Similar Answers",
            x=answer_bert_3d_close[5:,0],
            y=answer_bert_3d_close[5:,1],
            z=answer_bert_3d_close[5:,2],
            mode='markers',
            text = qa["answer"].loc[closest_ind_a],
            marker=dict(
                size=answer_bert_dist_close*16,
                line=dict(
                    color='rgba(244, 100, 40, 0.14)',
                    width=0.1
                ),
                opacity=0.8
            )
        )

        far_q = go.Scatter3d(
            name = "Dissimilar Questions",
            x=question_bert_3d_far[:,0],
            y=question_bert_3d_far[:,1],
            z=question_bert_3d_far[:,2],
            mode='markers',
            text = qa["question"].loc[closest_ind_q],
            marker=dict(
                size=question_bert_dist_far,
                line=dict(
                    color='rgba(40, 100, 217, 0.14)',
                    width=0.1
                ),
                opacity=0.8
            )
        )

        far_a = go.Scatter3d(
            name = "Dissimilar Answers",
            x=answer_bert_3d_far[:,0],
            y=answer_bert_3d_far[:,1],
            z=answer_bert_3d_far[:,2],
            mode='markers',
            text = qa["answer"].loc[closest_ind_a],
            marker=dict(
                size=answer_bert_dist_far,
                line=dict(
                    color='rgba(255, 40, 40, 0.14)',
                    width=0.1
                ),
                opacity=0.8
            )
        )

        data = [orig_q, orig_a, close_q, close_a, 
                #far_q, far_a, 
                recommended_a
               ]
        layout = go.Layout(
            margin=dict(
                l=0,
                r=0,
                b=0,
                t=0
            )
        )
        fig = go.Figure(data=data, layout=layout)
        #iplot(fig, filename='simple-3d-scatter')

        offline.plot(fig, filename="./experiments/n_items_" + str(n_items) + "_perplexity_" + str(perplexity) + '.html', auto_open=False) 
        