DocProduct / Git / [51873b] /notebooks/visualization.ipynb

Models:
philipB/
DocProduct
Downloads: 1
[51873b]: / notebooks / visualization.ipynb
History
Download this file
506 lines (505 with data), 20.9 kB

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      ".DS_Store\n",
      "EHealthQAwithBertAndFFNNEmbeddings.csv\n",
      "HealthTapFFNNEmbeddings.csv\n",
      "askDocsFFNNEmbeddings.csv\n",
      "webMDFFNNEmbeddings.csv\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os\n",
    "import faiss\n",
    "from sklearn.preprocessing import StandardScaler, MinMaxScaler, QuantileTransformer\n",
    "from plotly import offline\n",
    "from sklearn.decomposition import TruncatedSVD\n",
    "from MulticoreTSNE import MulticoreTSNE as TSNE\n",
    "import umap\n",
    "\n",
    "import plotly.plotly as py\n",
    "import plotly.graph_objs as go\n",
    "from plotly.offline import init_notebook_mode, iplot\n",
    "\n",
    "files = os.listdir(\"../data\")\n",
    "\n",
    "def fix_array(x):\n",
    "    x = np.fromstring(\n",
    "    x.replace('\\n','')\n",
    "    .replace('[','')\n",
    "    .replace(']','')\n",
    "    .replace('  ',' '), sep=' ')\n",
    "    return x.reshape((1, 768))\n",
    "\n",
    "qa = pd.read_csv(\"../data/\" + files[0])\n",
    "for file in files[1:]:\n",
    "    print(file)\n",
    "    qa = pd.concat([qa, pd.read_csv(\"../data/\" + file)], axis = 0)\n",
    "    \n",
    "\n",
    "qa.drop([\"answer_bert\", \"question_bert\", \"Unnamed: 0\"], axis = 1, inplace = True)\n",
    "\n",
    "qa[\"Q_FFNN_embeds\"] = qa[\"Q_FFNN_embeds\"].apply(fix_array)\n",
    "qa[\"A_FFNN_embeds\"] = qa[\"A_FFNN_embeds\"].apply(fix_array)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<script>requirejs.config({paths: { 'plotly': ['https://cdn.plot.ly/plotly-latest.min']},});if(!window.Plotly) {{require(['plotly'],function(plotly) {window.Plotly=plotly;});}}</script>"
      ],
      "text/vnd.plotly.v1+html": [
       "<script>requirejs.config({paths: { 'plotly': ['https://cdn.plot.ly/plotly-latest.min']},});if(!window.Plotly) {{require(['plotly'],function(plotly) {window.Plotly=plotly;});}}</script>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<script>requirejs.config({paths: { 'plotly': ['https://cdn.plot.ly/plotly-latest.min']},});if(!window.Plotly) {{require(['plotly'],function(plotly) {window.Plotly=plotly;});}}</script>"
      ],
      "text/vnd.plotly.v1+html": [
       "<script>requirejs.config({paths: { 'plotly': ['https://cdn.plot.ly/plotly-latest.min']},});if(!window.Plotly) {{require(['plotly'],function(plotly) {window.Plotly=plotly;});}}</script>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<script>requirejs.config({paths: { 'plotly': ['https://cdn.plot.ly/plotly-latest.min']},});if(!window.Plotly) {{require(['plotly'],function(plotly) {window.Plotly=plotly;});}}</script>"
      ],
      "text/vnd.plotly.v1+html": [
       "<script>requirejs.config({paths: { 'plotly': ['https://cdn.plot.ly/plotly-latest.min']},});if(!window.Plotly) {{require(['plotly'],function(plotly) {window.Plotly=plotly;});}}</script>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<script>requirejs.config({paths: { 'plotly': ['https://cdn.plot.ly/plotly-latest.min']},});if(!window.Plotly) {{require(['plotly'],function(plotly) {window.Plotly=plotly;});}}</script>"
      ],
      "text/vnd.plotly.v1+html": [
       "<script>requirejs.config({paths: { 'plotly': ['https://cdn.plot.ly/plotly-latest.min']},});if(!window.Plotly) {{require(['plotly'],function(plotly) {window.Plotly=plotly;});}}</script>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<script>requirejs.config({paths: { 'plotly': ['https://cdn.plot.ly/plotly-latest.min']},});if(!window.Plotly) {{require(['plotly'],function(plotly) {window.Plotly=plotly;});}}</script>"
      ],
      "text/vnd.plotly.v1+html": [
       "<script>requirejs.config({paths: { 'plotly': ['https://cdn.plot.ly/plotly-latest.min']},});if(!window.Plotly) {{require(['plotly'],function(plotly) {window.Plotly=plotly;});}}</script>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<script>requirejs.config({paths: { 'plotly': ['https://cdn.plot.ly/plotly-latest.min']},});if(!window.Plotly) {{require(['plotly'],function(plotly) {window.Plotly=plotly;});}}</script>"
      ],
      "text/vnd.plotly.v1+html": [
       "<script>requirejs.config({paths: { 'plotly': ['https://cdn.plot.ly/plotly-latest.min']},});if(!window.Plotly) {{require(['plotly'],function(plotly) {window.Plotly=plotly;});}}</script>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<script>requirejs.config({paths: { 'plotly': ['https://cdn.plot.ly/plotly-latest.min']},});if(!window.Plotly) {{require(['plotly'],function(plotly) {window.Plotly=plotly;});}}</script>"
      ],
      "text/vnd.plotly.v1+html": [
       "<script>requirejs.config({paths: { 'plotly': ['https://cdn.plot.ly/plotly-latest.min']},});if(!window.Plotly) {{require(['plotly'],function(plotly) {window.Plotly=plotly;});}}</script>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<script>requirejs.config({paths: { 'plotly': ['https://cdn.plot.ly/plotly-latest.min']},});if(!window.Plotly) {{require(['plotly'],function(plotly) {window.Plotly=plotly;});}}</script>"
      ],
      "text/vnd.plotly.v1+html": [
       "<script>requirejs.config({paths: { 'plotly': ['https://cdn.plot.ly/plotly-latest.min']},});if(!window.Plotly) {{require(['plotly'],function(plotly) {window.Plotly=plotly;});}}</script>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<script>requirejs.config({paths: { 'plotly': ['https://cdn.plot.ly/plotly-latest.min']},});if(!window.Plotly) {{require(['plotly'],function(plotly) {window.Plotly=plotly;});}}</script>"
      ],
      "text/vnd.plotly.v1+html": [
       "<script>requirejs.config({paths: { 'plotly': ['https://cdn.plot.ly/plotly-latest.min']},});if(!window.Plotly) {{require(['plotly'],function(plotly) {window.Plotly=plotly;});}}</script>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<script>requirejs.config({paths: { 'plotly': ['https://cdn.plot.ly/plotly-latest.min']},});if(!window.Plotly) {{require(['plotly'],function(plotly) {window.Plotly=plotly;});}}</script>"
      ],
      "text/vnd.plotly.v1+html": [
       "<script>requirejs.config({paths: { 'plotly': ['https://cdn.plot.ly/plotly-latest.min']},});if(!window.Plotly) {{require(['plotly'],function(plotly) {window.Plotly=plotly;});}}</script>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<script>requirejs.config({paths: { 'plotly': ['https://cdn.plot.ly/plotly-latest.min']},});if(!window.Plotly) {{require(['plotly'],function(plotly) {window.Plotly=plotly;});}}</script>"
      ],
      "text/vnd.plotly.v1+html": [
       "<script>requirejs.config({paths: { 'plotly': ['https://cdn.plot.ly/plotly-latest.min']},});if(!window.Plotly) {{require(['plotly'],function(plotly) {window.Plotly=plotly;});}}</script>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<script>requirejs.config({paths: { 'plotly': ['https://cdn.plot.ly/plotly-latest.min']},});if(!window.Plotly) {{require(['plotly'],function(plotly) {window.Plotly=plotly;});}}</script>"
      ],
      "text/vnd.plotly.v1+html": [
       "<script>requirejs.config({paths: { 'plotly': ['https://cdn.plot.ly/plotly-latest.min']},});if(!window.Plotly) {{require(['plotly'],function(plotly) {window.Plotly=plotly;});}}</script>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<script>requirejs.config({paths: { 'plotly': ['https://cdn.plot.ly/plotly-latest.min']},});if(!window.Plotly) {{require(['plotly'],function(plotly) {window.Plotly=plotly;});}}</script>"
      ],
      "text/vnd.plotly.v1+html": [
       "<script>requirejs.config({paths: { 'plotly': ['https://cdn.plot.ly/plotly-latest.min']},});if(!window.Plotly) {{require(['plotly'],function(plotly) {window.Plotly=plotly;});}}</script>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<script>requirejs.config({paths: { 'plotly': ['https://cdn.plot.ly/plotly-latest.min']},});if(!window.Plotly) {{require(['plotly'],function(plotly) {window.Plotly=plotly;});}}</script>"
      ],
      "text/vnd.plotly.v1+html": [
       "<script>requirejs.config({paths: { 'plotly': ['https://cdn.plot.ly/plotly-latest.min']},});if(!window.Plotly) {{require(['plotly'],function(plotly) {window.Plotly=plotly;});}}</script>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<script>requirejs.config({paths: { 'plotly': ['https://cdn.plot.ly/plotly-latest.min']},});if(!window.Plotly) {{require(['plotly'],function(plotly) {window.Plotly=plotly;});}}</script>"
      ],
      "text/vnd.plotly.v1+html": [
       "<script>requirejs.config({paths: { 'plotly': ['https://cdn.plot.ly/plotly-latest.min']},});if(!window.Plotly) {{require(['plotly'],function(plotly) {window.Plotly=plotly;});}}</script>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<script>requirejs.config({paths: { 'plotly': ['https://cdn.plot.ly/plotly-latest.min']},});if(!window.Plotly) {{require(['plotly'],function(plotly) {window.Plotly=plotly;});}}</script>"
      ],
      "text/vnd.plotly.v1+html": [
       "<script>requirejs.config({paths: { 'plotly': ['https://cdn.plot.ly/plotly-latest.min']},});if(!window.Plotly) {{require(['plotly'],function(plotly) {window.Plotly=plotly;});}}</script>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<script>requirejs.config({paths: { 'plotly': ['https://cdn.plot.ly/plotly-latest.min']},});if(!window.Plotly) {{require(['plotly'],function(plotly) {window.Plotly=plotly;});}}</script>"
      ],
      "text/vnd.plotly.v1+html": [
       "<script>requirejs.config({paths: { 'plotly': ['https://cdn.plot.ly/plotly-latest.min']},});if(!window.Plotly) {{require(['plotly'],function(plotly) {window.Plotly=plotly;});}}</script>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<script>requirejs.config({paths: { 'plotly': ['https://cdn.plot.ly/plotly-latest.min']},});if(!window.Plotly) {{require(['plotly'],function(plotly) {window.Plotly=plotly;});}}</script>"
      ],
      "text/vnd.plotly.v1+html": [
       "<script>requirejs.config({paths: { 'plotly': ['https://cdn.plot.ly/plotly-latest.min']},});if(!window.Plotly) {{require(['plotly'],function(plotly) {window.Plotly=plotly;});}}</script>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<script>requirejs.config({paths: { 'plotly': ['https://cdn.plot.ly/plotly-latest.min']},});if(!window.Plotly) {{require(['plotly'],function(plotly) {window.Plotly=plotly;});}}</script>"
      ],
      "text/vnd.plotly.v1+html": [
       "<script>requirejs.config({paths: { 'plotly': ['https://cdn.plot.ly/plotly-latest.min']},});if(!window.Plotly) {{require(['plotly'],function(plotly) {window.Plotly=plotly;});}}</script>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "for n_items in range(100, 5000, 500):\n",
    "    for perplexity in range(1, 40, 5):\n",
    "        n_iters = 5000\n",
    "        qa = qa.sample(frac = 1)\n",
    "        qa.reset_index(inplace = True, drop = True)\n",
    "        question_bert = np.concatenate(qa[\"Q_FFNN_embeds\"].values, axis=0)\n",
    "        answer_bert = np.concatenate(qa[\"A_FFNN_embeds\"].values, axis=0)\n",
    "        question_bert = question_bert.astype('float32')\n",
    "        answer_bert = answer_bert.astype('float32')\n",
    "\n",
    "        answer_index = faiss.IndexFlatIP(answer_bert.shape[-1])\n",
    "        answer_index.add(answer_bert)\n",
    "\n",
    "        question_index = faiss.IndexFlatIP(question_bert.shape[-1])\n",
    "        question_index.add(question_bert)\n",
    "\n",
    "        k = len(question_bert)\n",
    "        D1, I1 = answer_index.search(question_bert[0:1].astype('float32'), k)\n",
    "        D2, I2 = question_index.search(question_bert[0:1].astype('float32'), k)\n",
    "        QT = QuantileTransformer()\n",
    "        D2 = (QT.fit_transform(-D2.T)**4) #* 20\n",
    "        D1 = (QT.fit_transform(-D1.T)**4) #* 20\n",
    "        closest_ind_q = list(I2[0, :n_items]) +list(I2[0, -n_items:])                                          \n",
    "        closest_ind_a = list(I1[0, :n_items]) +list(I1[0, -n_items:])\n",
    "        dist_answers = answer_bert[closest_ind_a, :]\n",
    "        dist_questions = question_bert[closest_ind_q, :]\n",
    "        D1_answers = D1[closest_ind_a, :]\n",
    "        D2_questions = D2[closest_ind_q, :]\n",
    "        reducer = TSNE(n_components = 3, perplexity=perplexity, n_iter = n_iters)\n",
    "        reduced_dimensions = reducer.fit_transform(np.concatenate([dist_questions, dist_answers, answer_bert[0:1]], axis = 0))\n",
    "        question_bert_3d_close = reduced_dimensions[:n_items]\n",
    "        question_bert_3d_far = reduced_dimensions[n_items:n_items*2]\n",
    "        answer_bert_3d_close = reduced_dimensions[n_items*2:n_items*3]\n",
    "        answer_bert_3d_far = reduced_dimensions[n_items*3:-1]\n",
    "        question_bert_dist_close = D2_questions[:n_items]\n",
    "        question_bert_dist_far = D2_questions[n_items:n_items*2]\n",
    "        answer_bert_dist_close = D1_answers[n_items*2:n_items*3]\n",
    "        answer_bert_dist_far = D1_answers[n_items*3:-1]\n",
    "\n",
    "        init_notebook_mode(connected=True)\n",
    "\n",
    "        orig_q = go.Scatter3d(\n",
    "            name = \"Original Question\",\n",
    "            x=question_bert_3d_close[0:1,0],\n",
    "            y=question_bert_3d_close[0:1,1],\n",
    "            z=question_bert_3d_close[0:1,2],\n",
    "            mode='markers',\n",
    "            text = qa[\"question\"].loc[closest_ind_q[:1]],\n",
    "            marker=dict(\n",
    "                size=12,\n",
    "                line=dict(\n",
    "                    color='rgba(255, 0, 0, 0.14)',\n",
    "                    width=0.1\n",
    "                ),\n",
    "                opacity=1.0\n",
    "            )\n",
    "        )\n",
    "        orig_a = go.Scatter3d(\n",
    "            name = \"Original Answer\",\n",
    "            x=reduced_dimensions[-1:,0],\n",
    "            y=reduced_dimensions[-1:,1],\n",
    "            z=reduced_dimensions[-1:,2],\n",
    "            mode='markers',\n",
    "            text = qa[\"answer\"][0:1],\n",
    "            marker=dict(\n",
    "                size=12,\n",
    "                line=dict(\n",
    "                    color='rgba(0, 255, 0, 0.14)',\n",
    "                    width=0.1\n",
    "                ),\n",
    "                opacity=1.0\n",
    "            )\n",
    "        )\n",
    "        recommended_a = go.Scatter3d(\n",
    "            name = \"Recommended Answers\",\n",
    "            x=answer_bert_3d_close[0:5,0],\n",
    "            y=answer_bert_3d_close[0:5,1],\n",
    "            z=answer_bert_3d_close[0:5,2],\n",
    "            mode='markers',\n",
    "            text = qa[\"answer\"].loc[closest_ind_a[:5]],\n",
    "            marker=dict(\n",
    "                size=12,\n",
    "                line=dict(\n",
    "                    color='rgba(0, 255, 0, 0.14)',\n",
    "                    width=0.1\n",
    "                ),\n",
    "                opacity=1.0\n",
    "            )\n",
    "        )\n",
    "\n",
    "        close_q = go.Scatter3d(\n",
    "            name = \"Similar Questions\",\n",
    "            x=question_bert_3d_close[:,0],\n",
    "            y=question_bert_3d_close[:,1],\n",
    "            z=question_bert_3d_close[:,2],\n",
    "            mode='markers',\n",
    "            text = qa[\"question\"].loc[closest_ind_q],\n",
    "            marker=dict(\n",
    "                size=question_bert_dist_close*16,\n",
    "                line=dict(\n",
    "                    color='rgba(217, 217, 217, 0.14)',\n",
    "                    width=0.1\n",
    "                ),\n",
    "                opacity=0.8\n",
    "            )\n",
    "        )\n",
    "\n",
    "        close_a = go.Scatter3d(\n",
    "            name = \"Similar Answers\",\n",
    "            x=answer_bert_3d_close[5:,0],\n",
    "            y=answer_bert_3d_close[5:,1],\n",
    "            z=answer_bert_3d_close[5:,2],\n",
    "            mode='markers',\n",
    "            text = qa[\"answer\"].loc[closest_ind_a],\n",
    "            marker=dict(\n",
    "                size=answer_bert_dist_close*16,\n",
    "                line=dict(\n",
    "                    color='rgba(244, 100, 40, 0.14)',\n",
    "                    width=0.1\n",
    "                ),\n",
    "                opacity=0.8\n",
    "            )\n",
    "        )\n",
    "\n",
    "        far_q = go.Scatter3d(\n",
    "            name = \"Dissimilar Questions\",\n",
    "            x=question_bert_3d_far[:,0],\n",
    "            y=question_bert_3d_far[:,1],\n",
    "            z=question_bert_3d_far[:,2],\n",
    "            mode='markers',\n",
    "            text = qa[\"question\"].loc[closest_ind_q],\n",
    "            marker=dict(\n",
    "                size=question_bert_dist_far,\n",
    "                line=dict(\n",
    "                    color='rgba(40, 100, 217, 0.14)',\n",
    "                    width=0.1\n",
    "                ),\n",
    "                opacity=0.8\n",
    "            )\n",
    "        )\n",
    "\n",
    "        far_a = go.Scatter3d(\n",
    "            name = \"Dissimilar Answers\",\n",
    "            x=answer_bert_3d_far[:,0],\n",
    "            y=answer_bert_3d_far[:,1],\n",
    "            z=answer_bert_3d_far[:,2],\n",
    "            mode='markers',\n",
    "            text = qa[\"answer\"].loc[closest_ind_a],\n",
    "            marker=dict(\n",
    "                size=answer_bert_dist_far,\n",
    "                line=dict(\n",
    "                    color='rgba(255, 40, 40, 0.14)',\n",
    "                    width=0.1\n",
    "                ),\n",
    "                opacity=0.8\n",
    "            )\n",
    "        )\n",
    "\n",
    "        data = [orig_q, orig_a, close_q, close_a, \n",
    "                #far_q, far_a, \n",
    "                recommended_a\n",
    "               ]\n",
    "        layout = go.Layout(\n",
    "            margin=dict(\n",
    "                l=0,\n",
    "                r=0,\n",
    "                b=0,\n",
    "                t=0\n",
    "            )\n",
    "        )\n",
    "        fig = go.Figure(data=data, layout=layout)\n",
    "        #iplot(fig, filename='simple-3d-scatter')\n",
    "\n",
    "        offline.plot(fig, filename=\"./experiments/n_items_\" + str(n_items) + \"_perplexity_\" + str(perplexity) + '.html', auto_open=False) \n",
    "        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}