[03245f]: / notebooks / IU_Xray_EDA.ipynb

Download this file

986 lines (986 with data), 59.5 kB

{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "collapsed_sections": [],
      "authorship_tag": "ABX9TyPaxWDmXP+eW1nbC2gcFIE5",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/zaaachos/Thesis-Diagnostic-Captioning/blob/main/notebooks/IU_Xray_EDA.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Mount Google Drive at /content/drive (requires the Colab runtime).\n",
        "from google.colab import drive\n",
        "drive.mount('/content/drive')"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "rKTWUMPVugDP",
        "outputId": "46d534c9-e365-44d6-a0c1-cf8bd6552913"
      },
      "execution_count": 1,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Mounted at /content/drive\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "%cd drive/MyDrive"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Gzl3ALvDujPH",
        "outputId": "b60a4dd4-60ef-4d01-a4c7-cb1e65f53b9c"
      },
      "execution_count": 2,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "/content/drive/MyDrive\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# List the current directory to check which data files are available.\n",
        "!ls"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "ywbUILg1uork",
        "outputId": "af2d5233-e968-407e-a3e9-739125551886"
      },
      "execution_count": 3,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            " 46bff9d5-95d4-4362-be98-ef59819ec3af_ImageCLEFmedCaption_2022_concept_detection_valid.csv\n",
            " 72d678c1-c535-491d-bc42-c2ba11f47165_concepts.csv\n",
            " b47c4f80-9432-408c-b69a-956a3382a0da_ImageCLEFmedCaption_2022_concept_detection_train.csv\n",
            " c856ae07-029b-449e-bd06-99c04d3ad1e0_ImageCLEFmedCaption_2022_caption_prediction_train.csv\n",
            " cc3d9c72-6c2b-4bd3-9d10-4e133031be48_ImageCLEFmedCaption_2022_caption_prediction_valid.csv\n",
            "'Colab Notebooks'\n",
            " iu_xray.csv\n",
            " two_captions.json\n",
            " VID20220808185241.mp4\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import pandas as pd\n",
        "import json\n",
        "\n",
        "all_data = 'iu_xray.csv'\n",
        "two_patients = 'two_captions.json'\n",
        "\n",
        "all_df = pd.read_csv(all_data, sep='\\t', names=['ID', 'caption'])\n",
        "all_df.head()"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 206
        },
        "id": "PBQ-fq9tvB50",
        "outputId": "d0392fc8-cca8-4de6-e079-d1dde3ef087a"
      },
      "execution_count": 4,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "                        ID                                            caption\n",
              "0  CXR1_1_IM-0001-3001.png  Normal chest x-XXXX. The cardiac silhouette an...\n",
              "1  CXR1_1_IM-0001-4001.png  Normal chest x-XXXX. The cardiac silhouette an...\n",
              "2   CXR10_IM-0002-1001.png  No acute cardiopulmonary process. The cardiome...\n",
              "3   CXR10_IM-0002-2001.png  No acute cardiopulmonary process. The cardiome...\n",
              "4  CXR100_IM-0002-1001.png  No active disease. Both lungs are clear and ex..."
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-b80fed9e-daf4-486b-9217-301b2b4144d9\">\n",
              "    <div class=\"colab-df-container\">\n",
              "      <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>ID</th>\n",
              "      <th>caption</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>CXR1_1_IM-0001-3001.png</td>\n",
              "      <td>Normal chest x-XXXX. The cardiac silhouette an...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>CXR1_1_IM-0001-4001.png</td>\n",
              "      <td>Normal chest x-XXXX. The cardiac silhouette an...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>CXR10_IM-0002-1001.png</td>\n",
              "      <td>No acute cardiopulmonary process. The cardiome...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>CXR10_IM-0002-2001.png</td>\n",
              "      <td>No acute cardiopulmonary process. The cardiome...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>CXR100_IM-0002-1001.png</td>\n",
              "      <td>No active disease. Both lungs are clear and ex...</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-b80fed9e-daf4-486b-9217-301b2b4144d9')\"\n",
              "              title=\"Convert this dataframe to an interactive table.\"\n",
              "              style=\"display:none;\">\n",
              "        \n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "       width=\"24px\">\n",
              "    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
              "    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
              "  </svg>\n",
              "      </button>\n",
              "      \n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      flex-wrap:wrap;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "      <script>\n",
              "        const buttonEl =\n",
              "          document.querySelector('#df-b80fed9e-daf4-486b-9217-301b2b4144d9 button.colab-df-convert');\n",
              "        buttonEl.style.display =\n",
              "          google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "        async function convertToInteractive(key) {\n",
              "          const element = document.querySelector('#df-b80fed9e-daf4-486b-9217-301b2b4144d9');\n",
              "          const dataTable =\n",
              "            await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                     [key], {});\n",
              "          if (!dataTable) return;\n",
              "\n",
              "          const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "            + ' to learn more about interactive tables.';\n",
              "          element.innerHTML = '';\n",
              "          dataTable['output_type'] = 'display_data';\n",
              "          await google.colab.output.renderOutput(dataTable, element);\n",
              "          const docLink = document.createElement('div');\n",
              "          docLink.innerHTML = docLinkHtml;\n",
              "          element.appendChild(docLink);\n",
              "        }\n",
              "      </script>\n",
              "    </div>\n",
              "  </div>\n",
              "  "
            ]
          },
          "metadata": {},
          "execution_count": 4
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Summary of both columns (count / unique / top / freq).\n",
        "all_df.describe()"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 175
        },
        "id": "5BRxtsirvXt_",
        "outputId": "466c816e-4160-4839-cba8-3fdf4ad424e3"
      },
      "execution_count": 5,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "                             ID  \\\n",
              "count                      7430   \n",
              "unique                     7430   \n",
              "top     CXR1_1_IM-0001-3001.png   \n",
              "freq                          1   \n",
              "\n",
              "                                                  caption  \n",
              "count                                                7430  \n",
              "unique                                               3066  \n",
              "top     No acute disease. The heart is normal in size....  \n",
              "freq                                                   96  "
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-f14d8968-2f34-426c-98f3-5309812599a9\">\n",
              "    <div class=\"colab-df-container\">\n",
              "      <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>ID</th>\n",
              "      <th>caption</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>count</th>\n",
              "      <td>7430</td>\n",
              "      <td>7430</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>unique</th>\n",
              "      <td>7430</td>\n",
              "      <td>3066</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>top</th>\n",
              "      <td>CXR1_1_IM-0001-3001.png</td>\n",
              "      <td>No acute disease. The heart is normal in size....</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>freq</th>\n",
              "      <td>1</td>\n",
              "      <td>96</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-f14d8968-2f34-426c-98f3-5309812599a9')\"\n",
              "              title=\"Convert this dataframe to an interactive table.\"\n",
              "              style=\"display:none;\">\n",
              "        \n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "       width=\"24px\">\n",
              "    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
              "    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
              "  </svg>\n",
              "      </button>\n",
              "      \n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      flex-wrap:wrap;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "      <script>\n",
              "        const buttonEl =\n",
              "          document.querySelector('#df-f14d8968-2f34-426c-98f3-5309812599a9 button.colab-df-convert');\n",
              "        buttonEl.style.display =\n",
              "          google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "        async function convertToInteractive(key) {\n",
              "          const element = document.querySelector('#df-f14d8968-2f34-426c-98f3-5309812599a9');\n",
              "          const dataTable =\n",
              "            await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                     [key], {});\n",
              "          if (!dataTable) return;\n",
              "\n",
              "          const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "            + ' to learn more about interactive tables.';\n",
              "          element.innerHTML = '';\n",
              "          dataTable['output_type'] = 'display_data';\n",
              "          await google.colab.output.renderOutput(dataTable, element);\n",
              "          const docLink = document.createElement('div');\n",
              "          docLink.innerHTML = docLinkHtml;\n",
              "          element.appendChild(docLink);\n",
              "        }\n",
              "      </script>\n",
              "    </div>\n",
              "  </div>\n",
              "  "
            ]
          },
          "metadata": {},
          "execution_count": 5
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import matplotlib.pyplot as plt\n",
        "patient_images = {}\n",
        "for visit in all_df.ID.to_list():\n",
        "    patient = visit[3:].split(\"_\")[0]\n",
        "    if patient in patient_images:\n",
        "        patient_images[patient].append(visit)\n",
        "    else:\n",
        "        patient_images[patient] = [visit]\n",
        "\n",
        "\n",
        "iuxray_ids_img1 = [patient_images[patient][0] for patient in patient_images if len(patient_images[patient])==1]\n",
        "iuxray_ids_img2 = [patient_images[patient][0] for patient in patient_images if len(patient_images[patient])==2]\n",
        "iuxray_ids_img3 = [patient_images[patient][0] for patient in patient_images if len(patient_images[patient])==3]\n",
        "iuxray_ids_img4 = [patient_images[patient][0] for patient in patient_images if len(patient_images[patient])==4]\n",
        "iuxray_ids_img5 = [patient_images[patient][0] for patient in patient_images if len(patient_images[patient])==5]\n",
        "samples = [len(iuxray_ids_img1), len(iuxray_ids_img2), len(iuxray_ids_img3), len(iuxray_ids_img4), len(iuxray_ids_img5)]\n",
        "number = [1, 2, 3, 4, 5]\n",
        "\n",
        "plt.xlabel('Number of images per patient')\n",
        "plt.ylabel('Number of patients')\n",
        "plt.bar(number, samples)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 296
        },
        "id": "nGsgpDYkC3a8",
        "outputId": "2520f321-62cf-4c7b-a1fb-7913d3639592"
      },
      "execution_count": 6,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "<BarContainer object of 5 artists>"
            ]
          },
          "metadata": {},
          "execution_count": 6
        },
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "<Figure size 432x288 with 1 Axes>"
            ],
            "image/png": "\n"
          },
          "metadata": {
            "needs_background": "light"
          }
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import numpy as np\n",
        "\n",
        "with open(two_patients) as f:\n",
        "  data = json.load(f)\n",
        "captions = list(data.values())\n",
        "words_per_sent = list()\n",
        "for caption in captions:\n",
        "  tokenized_caption = caption.split()\n",
        "  words_per_sent.append(len(tokenized_caption))\n",
        "\n",
        "print(f'Minimum word-tokens in a caption: {min(words_per_sent)} (Occurences: {sum(np.array(words_per_sent)==min(words_per_sent))} times)')\n",
        "print(f'Maximum word-tokens in a caption: {max(words_per_sent)} (Occurences: {sum(np.array(words_per_sent)==max(words_per_sent))} times)')\n",
        "print('mean number of words in captions:', round(np.mean(words_per_sent),2))"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "PhdS3Sd7vwI1",
        "outputId": "699e821b-4236-4095-e70a-71a7d439aa2d"
      },
      "execution_count": 7,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Minimum word-tokens in a caption: 3 (Occurences: 10 times)\n",
            "Maximum word-tokens in a caption: 176 (Occurences: 1 times)\n",
            "mean number of words in captions: 37.27\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import matplotlib.pyplot as plt\n",
        "plt.figure(figsize=(30,15))\n",
        "fig, (ax1, ax2) = plt.subplots(1, 2,  figsize=(11, 4))\n",
        "\n",
        "ax1.hist(words_per_sent, bins=50, align='left', edgecolor='black',\n",
        "              linewidth=0.5)\n",
        " \n",
        "# Add axis labels\n",
        "ax1.set(xlabel='Number of words', ylabel='Images')\n",
        "\n",
        " \n",
        "# plt.show()\n",
        "ax2.set(xlabel='Number of words', ylabel='')\n",
        "ax2.set_yticklabels(' ')\n",
        "ax2.boxplot(words_per_sent, vert=False)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 435
        },
        "id": "DfT7ErR8yBTd",
        "outputId": "f5eb1aa1-b6d5-4981-d4e6-5df156074a70"
      },
      "execution_count": 8,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "{'whiskers': [<matplotlib.lines.Line2D at 0x7f12f385d990>,\n",
              "  <matplotlib.lines.Line2D at 0x7f12f385ded0>],\n",
              " 'caps': [<matplotlib.lines.Line2D at 0x7f12f3867450>,\n",
              "  <matplotlib.lines.Line2D at 0x7f12f38548d0>],\n",
              " 'boxes': [<matplotlib.lines.Line2D at 0x7f12f385d3d0>],\n",
              " 'medians': [<matplotlib.lines.Line2D at 0x7f12f3867950>],\n",
              " 'fliers': [<matplotlib.lines.Line2D at 0x7f12f3867cd0>],\n",
              " 'means': []}"
            ]
          },
          "metadata": {},
          "execution_count": 8
        },
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "<Figure size 2160x1080 with 0 Axes>"
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "<Figure size 792x288 with 2 Axes>"
            ],
            "image/png": "\n"
          },
          "metadata": {
            "needs_background": "light"
          }
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import collections\n",
        "counters = collections.Counter(captions)\n",
        "\n",
        "df = pd.DataFrame.from_records(counters.most_common(), columns=['caption','count'])\n",
        "df.head()"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 206
        },
        "id": "quPfpjrsxMH3",
        "outputId": "1f80b41b-ec0d-4028-8368-ed3c04bbd719"
      },
      "execution_count": 9,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "                                             caption  count\n",
              "0  No acute disease. The heart is normal in size....     42\n",
              "1  No active disease. The heart and lungs have XX...     37\n",
              "2  Normal chest Heart size normal. Lungs are clea...     30\n",
              "3  No active disease. Both lungs are clear and ex...     26\n",
              "4  No acute cardiopulmonary abnormality.. The lun...     25"
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-b7772fd9-1b26-4ab9-bc21-87cc3e0a8c0a\">\n",
              "    <div class=\"colab-df-container\">\n",
              "      <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>caption</th>\n",
              "      <th>count</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>No acute disease. The heart is normal in size....</td>\n",
              "      <td>42</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>No active disease. The heart and lungs have XX...</td>\n",
              "      <td>37</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>Normal chest Heart size normal. Lungs are clea...</td>\n",
              "      <td>30</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>No active disease. Both lungs are clear and ex...</td>\n",
              "      <td>26</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>No acute cardiopulmonary abnormality.. The lun...</td>\n",
              "      <td>25</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-b7772fd9-1b26-4ab9-bc21-87cc3e0a8c0a')\"\n",
              "              title=\"Convert this dataframe to an interactive table.\"\n",
              "              style=\"display:none;\">\n",
              "        \n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "       width=\"24px\">\n",
              "    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
              "    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
              "  </svg>\n",
              "      </button>\n",
              "      \n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      flex-wrap:wrap;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "      <script>\n",
              "        const buttonEl =\n",
              "          document.querySelector('#df-b7772fd9-1b26-4ab9-bc21-87cc3e0a8c0a button.colab-df-convert');\n",
              "        buttonEl.style.display =\n",
              "          google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "        async function convertToInteractive(key) {\n",
              "          const element = document.querySelector('#df-b7772fd9-1b26-4ab9-bc21-87cc3e0a8c0a');\n",
              "          const dataTable =\n",
              "            await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                     [key], {});\n",
              "          if (!dataTable) return;\n",
              "\n",
              "          const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "            + ' to learn more about interactive tables.';\n",
              "          element.innerHTML = '';\n",
              "          dataTable['output_type'] = 'display_data';\n",
              "          await google.colab.output.renderOutput(dataTable, element);\n",
              "          const docLink = document.createElement('div');\n",
              "          docLink.innerHTML = docLinkHtml;\n",
              "          element.appendChild(docLink);\n",
              "        }\n",
              "      </script>\n",
              "    </div>\n",
              "  </div>\n",
              "  "
            ]
          },
          "metadata": {},
          "execution_count": 9
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from collections import Counter\n",
        "from pprint import pprint\n",
        "\n",
        "# Map every distinct value of df['count'] to the number of rows carrying it\n",
        "# (presumably: how many captions occur once, twice, ... - confirm against\n",
        "# the cell that builds df).\n",
        "# Converted back to a plain dict so that pprint prints it with sorted keys\n",
        "# and a missing key still raises instead of silently yielding 0.\n",
        "freqs_list = dict(Counter(df['count'].to_list()))\n",
        "pprint(freqs_list)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "b785abzk38q9",
        "outputId": "9283973b-9c6b-4848-c890-ae121f5fa40f"
      },
      "execution_count": 10,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "{1: 2415,\n",
            " 2: 47,\n",
            " 3: 31,\n",
            " 4: 10,\n",
            " 5: 18,\n",
            " 6: 4,\n",
            " 7: 6,\n",
            " 8: 9,\n",
            " 9: 3,\n",
            " 10: 4,\n",
            " 11: 2,\n",
            " 12: 1,\n",
            " 13: 1,\n",
            " 16: 2,\n",
            " 19: 1,\n",
            " 25: 1,\n",
            " 26: 1,\n",
            " 30: 1,\n",
            " 37: 1,\n",
            " 42: 1}\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# freqs_list[1] is the number of df rows whose 'count' equals 1; .get() keeps\n",
        "# the cell working when no such row exists.\n",
        "# The percentage is rounded exactly once: rounding to 2 decimals first and\n",
        "# then to an integer can push a value across the .5 boundary.\n",
        "unique_share = freqs_list.get(1, 0) / len(captions) * 100\n",
        "print(f'About {round(unique_share)}% of the captions are unique.')"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "njcijvUQ4gHn",
        "outputId": "c41a9dba-f82e-4019-9a27-21619500e453"
      },
      "execution_count": 11,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "About 76% of the captions are unique.\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from collections import Counter\n",
        "\n",
        "import tensorflow\n",
        "import keras\n",
        "from keras.preprocessing.text import Tokenizer\n",
        "\n",
        "# Fit a Keras Tokenizer on the captions to obtain its word index and counts.\n",
        "tokenizer = Tokenizer()\n",
        "tokenizer.fit_on_texts(captions)\n",
        "\n",
        "# Reported vocabulary size is the number of indexed words plus one.\n",
        "vocab_size = 1 + len(tokenizer.word_index)\n",
        "print('Vocabulary Size: %d' % vocab_size)\n",
        "\n",
        "# Wrap the per-word counts in a Counter so they can be ranked by frequency.\n",
        "ordered = Counter(tokenizer.word_counts)\n",
        "print('Top10 frequent')\n",
        "ordered.most_common(10)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "rW6fTzDK5JHJ",
        "outputId": "814f77bc-93d3-42f8-cde8-1268d33279a3"
      },
      "execution_count": 12,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Vocabulary Size: 1874\n",
            "Top10 frequent\n"
          ]
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "[('no', 6235),\n",
              " ('the', 5639),\n",
              " ('are', 3819),\n",
              " ('is', 3770),\n",
              " ('normal', 3372),\n",
              " ('xxxx', 2820),\n",
              " ('of', 2737),\n",
              " ('and', 2736),\n",
              " ('acute', 2511),\n",
              " ('pleural', 2330)]"
            ]
          },
          "metadata": {},
          "execution_count": 12
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import nltk\n",
        "from nltk.corpus import stopwords  # English stop word list\n",
        "from nltk.tokenize import wordpunct_tokenize\n",
        "\n",
        "# Fetch the stop word corpus before stopwords.words() reads it.\n",
        "nltk.download('stopwords')\n",
        "stop_words = set(stopwords.words('english'))\n",
        "\n",
        "# Keep only the tokenizer word counts whose word is not a stop word.\n",
        "dictionary_wo_stopwords = {\n",
        "    word: count\n",
        "    for word, count in tokenizer.word_counts.items()\n",
        "    if word not in stop_words\n",
        "}\n",
        "\n",
        "# Rank the remaining words by frequency and show the ten most common.\n",
        "ordered_wo = Counter(dictionary_wo_stopwords)\n",
        "print('Most frequent words w/o stopwords')\n",
        "ordered_wo.most_common(10)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "qRVSyiXT5TpG",
        "outputId": "9f7d2bb9-41f3-445e-88cb-4090439f4d06"
      },
      "execution_count": 13,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Most frequent words w/o stopwords\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
            "[nltk_data]   Unzipping corpora/stopwords.zip.\n"
          ]
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "[('normal', 3372),\n",
              " ('xxxx', 2820),\n",
              " ('acute', 2511),\n",
              " ('pleural', 2330),\n",
              " ('pneumothorax', 2102),\n",
              " ('effusion', 2049),\n",
              " ('heart', 2028),\n",
              " ('lungs', 1983),\n",
              " ('size', 1854),\n",
              " ('clear', 1585)]"
            ]
          },
          "metadata": {},
          "execution_count": 13
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "print(f'There are {sum(np.array(list(ordered.values()))==1)} words with only 1 occurence')"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "LzdwEkDI7cG_",
        "outputId": "c7d1aa3d-4040-4a49-b051-3541e8279543"
      },
      "execution_count": 14,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "There are 529 words with only 1 occurence\n"
          ]
        }
      ]
    }
  ]
}