[03245f]: / notebooks / IU_Xray_EDA.ipynb

Download this file

986 lines (986 with data), 59.5 kB

{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "collapsed_sections": [],
      "authorship_tag": "ABX9TyPaxWDmXP+eW1nbC2gcFIE5",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/zaaachos/Thesis-Diagnostic-Captioning/blob/main/notebooks/IU_Xray_EDA.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from google.colab import drive\n",
        "drive.mount('/content/drive')"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "rKTWUMPVugDP",
        "outputId": "46d534c9-e365-44d6-a0c1-cf8bd6552913"
      },
      "execution_count": 1,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Mounted at /content/drive\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "%cd /content/drive/MyDrive"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Gzl3ALvDujPH",
        "outputId": "b60a4dd4-60ef-4d01-a4c7-cb1e65f53b9c"
      },
      "execution_count": 2,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "/content/drive/MyDrive\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "!ls"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "ywbUILg1uork",
        "outputId": "af2d5233-e968-407e-a3e9-739125551886"
      },
      "execution_count": 3,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            " 46bff9d5-95d4-4362-be98-ef59819ec3af_ImageCLEFmedCaption_2022_concept_detection_valid.csv\n",
            " 72d678c1-c535-491d-bc42-c2ba11f47165_concepts.csv\n",
            " b47c4f80-9432-408c-b69a-956a3382a0da_ImageCLEFmedCaption_2022_concept_detection_train.csv\n",
            " c856ae07-029b-449e-bd06-99c04d3ad1e0_ImageCLEFmedCaption_2022_caption_prediction_train.csv\n",
            " cc3d9c72-6c2b-4bd3-9d10-4e133031be48_ImageCLEFmedCaption_2022_caption_prediction_valid.csv\n",
            "'Colab Notebooks'\n",
            " iu_xray.csv\n",
            " two_captions.json\n",
            " VID20220808185241.mp4\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import json\n",
        "\n",
        "import pandas as pd\n",
        "\n",
        "# Input files, resolved against the current working directory.\n",
        "two_patients = 'two_captions.json'\n",
        "all_data = 'iu_xray.csv'\n",
        "\n",
        "# Column names are passed explicitly, so the first line of the\n",
        "# tab-separated file is read as data rather than as a header.\n",
        "column_names = ['ID', 'caption']\n",
        "all_df = pd.read_csv(all_data, names=column_names, sep='\\t')\n",
        "all_df.head()"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 206
        },
        "id": "PBQ-fq9tvB50",
        "outputId": "d0392fc8-cca8-4de6-e079-d1dde3ef087a"
      },
      "execution_count": 4,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "                        ID                                            caption\n",
              "0  CXR1_1_IM-0001-3001.png  Normal chest x-XXXX. The cardiac silhouette an...\n",
              "1  CXR1_1_IM-0001-4001.png  Normal chest x-XXXX. The cardiac silhouette an...\n",
              "2   CXR10_IM-0002-1001.png  No acute cardiopulmonary process. The cardiome...\n",
              "3   CXR10_IM-0002-2001.png  No acute cardiopulmonary process. The cardiome...\n",
              "4  CXR100_IM-0002-1001.png  No active disease. Both lungs are clear and ex..."
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-b80fed9e-daf4-486b-9217-301b2b4144d9\">\n",
              "    <div class=\"colab-df-container\">\n",
              "      <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>ID</th>\n",
              "      <th>caption</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>CXR1_1_IM-0001-3001.png</td>\n",
              "      <td>Normal chest x-XXXX. The cardiac silhouette an...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>CXR1_1_IM-0001-4001.png</td>\n",
              "      <td>Normal chest x-XXXX. The cardiac silhouette an...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>CXR10_IM-0002-1001.png</td>\n",
              "      <td>No acute cardiopulmonary process. The cardiome...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>CXR10_IM-0002-2001.png</td>\n",
              "      <td>No acute cardiopulmonary process. The cardiome...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>CXR100_IM-0002-1001.png</td>\n",
              "      <td>No active disease. Both lungs are clear and ex...</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-b80fed9e-daf4-486b-9217-301b2b4144d9')\"\n",
              "              title=\"Convert this dataframe to an interactive table.\"\n",
              "              style=\"display:none;\">\n",
              "        \n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "       width=\"24px\">\n",
              "    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
              "    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
              "  </svg>\n",
              "      </button>\n",
              "      \n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      flex-wrap:wrap;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "      <script>\n",
              "        const buttonEl =\n",
              "          document.querySelector('#df-b80fed9e-daf4-486b-9217-301b2b4144d9 button.colab-df-convert');\n",
              "        buttonEl.style.display =\n",
              "          google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "        async function convertToInteractive(key) {\n",
              "          const element = document.querySelector('#df-b80fed9e-daf4-486b-9217-301b2b4144d9');\n",
              "          const dataTable =\n",
              "            await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                     [key], {});\n",
              "          if (!dataTable) return;\n",
              "\n",
              "          const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "            + ' to learn more about interactive tables.';\n",
              "          element.innerHTML = '';\n",
              "          dataTable['output_type'] = 'display_data';\n",
              "          await google.colab.output.renderOutput(dataTable, element);\n",
              "          const docLink = document.createElement('div');\n",
              "          docLink.innerHTML = docLinkHtml;\n",
              "          element.appendChild(docLink);\n",
              "        }\n",
              "      </script>\n",
              "    </div>\n",
              "  </div>\n",
              "  "
            ]
          },
          "metadata": {},
          "execution_count": 4
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "all_df.describe()"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 175
        },
        "id": "5BRxtsirvXt_",
        "outputId": "466c816e-4160-4839-cba8-3fdf4ad424e3"
      },
      "execution_count": 5,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "                             ID  \\\n",
              "count                      7430   \n",
              "unique                     7430   \n",
              "top     CXR1_1_IM-0001-3001.png   \n",
              "freq                          1   \n",
              "\n",
              "                                                  caption  \n",
              "count                                                7430  \n",
              "unique                                               3066  \n",
              "top     No acute disease. The heart is normal in size....  \n",
              "freq                                                   96  "
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-f14d8968-2f34-426c-98f3-5309812599a9\">\n",
              "    <div class=\"colab-df-container\">\n",
              "      <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>ID</th>\n",
              "      <th>caption</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>count</th>\n",
              "      <td>7430</td>\n",
              "      <td>7430</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>unique</th>\n",
              "      <td>7430</td>\n",
              "      <td>3066</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>top</th>\n",
              "      <td>CXR1_1_IM-0001-3001.png</td>\n",
              "      <td>No acute disease. The heart is normal in size....</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>freq</th>\n",
              "      <td>1</td>\n",
              "      <td>96</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-f14d8968-2f34-426c-98f3-5309812599a9')\"\n",
              "              title=\"Convert this dataframe to an interactive table.\"\n",
              "              style=\"display:none;\">\n",
              "        \n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "       width=\"24px\">\n",
              "    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
              "    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
              "  </svg>\n",
              "      </button>\n",
              "      \n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      flex-wrap:wrap;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "      <script>\n",
              "        const buttonEl =\n",
              "          document.querySelector('#df-f14d8968-2f34-426c-98f3-5309812599a9 button.colab-df-convert');\n",
              "        buttonEl.style.display =\n",
              "          google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "        async function convertToInteractive(key) {\n",
              "          const element = document.querySelector('#df-f14d8968-2f34-426c-98f3-5309812599a9');\n",
              "          const dataTable =\n",
              "            await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                     [key], {});\n",
              "          if (!dataTable) return;\n",
              "\n",
              "          const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "            + ' to learn more about interactive tables.';\n",
              "          element.innerHTML = '';\n",
              "          dataTable['output_type'] = 'display_data';\n",
              "          await google.colab.output.renderOutput(dataTable, element);\n",
              "          const docLink = document.createElement('div');\n",
              "          docLink.innerHTML = docLinkHtml;\n",
              "          element.appendChild(docLink);\n",
              "        }\n",
              "      </script>\n",
              "    </div>\n",
              "  </div>\n",
              "  "
            ]
          },
          "metadata": {},
          "execution_count": 5
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import matplotlib.pyplot as plt\n",
        "\n",
        "# Group image file names by patient. An ID such as 'CXR10_IM-0002-1001.png'\n",
        "# yields the key '10': drop the first three characters, cut at the first '_'.\n",
        "patient_images = {}\n",
        "for image_id in all_df.ID.to_list():\n",
        "    patient_key = image_id[3:].split(\"_\")[0]\n",
        "    patient_images.setdefault(patient_key, []).append(image_id)\n",
        "\n",
        "\n",
        "def first_image_ids(n_images):\n",
        "    \"\"\"Return the first image ID of each patient that has exactly n_images images.\"\"\"\n",
        "    return [ids[0] for ids in patient_images.values() if len(ids) == n_images]\n",
        "\n",
        "\n",
        "iuxray_ids_img1 = first_image_ids(1)\n",
        "iuxray_ids_img2 = first_image_ids(2)\n",
        "iuxray_ids_img3 = first_image_ids(3)\n",
        "iuxray_ids_img4 = first_image_ids(4)\n",
        "iuxray_ids_img5 = first_image_ids(5)\n",
        "\n",
        "# NOTE(review): patients with more than five images would not appear in this chart - confirm none exist.\n",
        "number = [1, 2, 3, 4, 5]\n",
        "id_groups = (iuxray_ids_img1, iuxray_ids_img2, iuxray_ids_img3, iuxray_ids_img4, iuxray_ids_img5)\n",
        "samples = [len(group) for group in id_groups]\n",
        "\n",
        "plt.ylabel('Number of patients')\n",
        "plt.xlabel('Number of images per patient')\n",
        "plt.bar(number, samples)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 296
        },
        "id": "nGsgpDYkC3a8",
        "outputId": "2520f321-62cf-4c7b-a1fb-7913d3639592"
      },
      "execution_count": 6,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "<BarContainer object of 5 artists>"
            ]
          },
          "metadata": {},
          "execution_count": 6
        },
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "<Figure size 432x288 with 1 Axes>"
            ],
            "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYsAAAEGCAYAAACUzrmNAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAar0lEQVR4nO3de7RdZXnv8e+PcBUTQkzMCElwR4x6og4D3eVSqEUoIYASpGBJFQJSg6fggaot0cpNQHFYRKmCBskBKpITQSVANKTIpTiAZIdLrqRsMUpyItmWW9DCaeA5f8x34WSz95ozmz3XWpv1+4wxx5rznbdnbhjryXzfd72vIgIzM7N6tmt2AGZm1vqcLMzMrJCThZmZFXKyMDOzQk4WZmZWaPtmB1CF0aNHR0dHR7PDMDMbUpYvX/67iBjT1743ZLLo6Oigq6ur2WGYmQ0pkn7d3z5XQ5mZWSEnCzMzK+RkYWZmhZwszMyskJOFmZkVcrIwM7NCThZmZlbIycLMzAo5WZiZWaE35C+4beA65tzW7BAGzfpLjmp2CGZvGH6zMDOzQk4WZmZWyMnCzMwKOVmYmVkhJwszMyvkZGFmZoWcLMzMrJCThZmZFaosWUjaWdJSSY9IWi3pglQ+SdIDkrol/R9JO6byndJ2d9rfkbvW51P5OkmHVxWzmZn1rco3ixeBQyLi/cBUYLqk/YGvApdFxDuAp4FT0/GnAk+n8svScUiaApwAvAeYDlwhaViFcZuZWS+VJYvIPJ82d0hLAIcAN6bya4Fj0vqMtE3af6gkpfL5EfFiRPwK6Ab2rSpuMzN7rUrbLCQNk/QwsBlYAvwSeCYitqZDNgDj0/p44AmAtP9Z4C358j7Oyd9rtqQuSV09PT1VPI6ZWduqNFlExEsRMRWYQPY28O4K7zU3IjojonPMmDFV3cbMrC01pDdURDwD3AkcAIyUVBvtdgKwMa1vBCYCpP27Af+ZL+/jHDMza4Aqe0ONkTQyre8CHAasJUsax6XDZgE3p/WFaZu0/+cREan8hNRbahIwGVhaVdxmZvZaVc5nMQ64NvVc2g5YEBG3SloDzJd0EfAQcHU6/mrgXyV1A0+R9YAiIlZLWgCsAbYCp0fESxXGbWZmvVSWLCJiBbB3H+WP00dvpoh4ATi+n2tdDFw82DGamVk5/gW3mZkVcrIwM7NCThZmZlbIycLMzAo5WZiZWSEnCzMzK+RkYWZmhZwszMyskJOFmZkVcrIwM7NCThZmZlbIycLMzAo5WZiZWSEnCzMzK+RkYWZmhZwszMyskJOFmZkVcrIwM7NCThZmZlbIycLMzAo5WZiZWSEnCzMzK+RkYWZmhZwszMysUGXJQtJESXdKWiNptaQzU/n5kjZKejgtR+bO+bykbknrJB2eK5+eyrolzakqZjMz69v2FV57K/DZiHhQ0nBguaQlad9lEfHP+YMlTQFOAN4D7AH8m6R3pt3fBg4DNgDLJC2MiDUVxm5mZjmVJYuI2ARsSutbJK0Fxtc5ZQYwPyJeBH4lqRvYN+3rjojHASTNT8c6WZiZNUhD2iwkdQB7Aw+kojMkrZA0T9LuqWw88ETutA2prL/y3veYLalLUldPT88gP4GZWXurPFlIejNwE3BWRDwHXAnsBUwle/O4dDDuExFzI6IzIjrHjBkzGJc0M7OkyjYLJO1Aliiuj4gfAUTEk7n9VwG3ps2NwMTc6RNSGXXKzcysAarsDSXgamBtRHw9Vz4ud9hHgFVpfSFwgqSdJE0CJgNLgWXAZEmTJO1I1gi+sKq4zczstap8szgQOBFYKenhVPYFYKakqUAA64HTACJitaQFZA3XW4HTI+IlAElnAIuBYcC8iFhdYdxmZtZLlb2h7gXUx65Fdc65GLi4j/JF9c4zM7NqbVM1lKTtJI2oKhgzM2tNhclC0g8kjZC0K1n7whpJ/1B9aGZm1irKvF
lMSV1ejwF+Ckwia4swM7M2USZZ7JC6wB4DLIyI/644JjMzazFlksV3yXot7QrcI+ltwLNVBmVmZq2lTLK4JSLGR8SRERHAb4BPVByXmZm1kDLJ4qb8RkoY86sJx8zMWlG/v7OQ9G6y4cJ3k3RsbtcIYOeqAzMzs9ZR70d57wI+BIwEPpwr3wJ8ssqgzMystfSbLCLiZuBmSQdExH0NjMnMzFpMmeE+uiV9AejIHx8RbuQ2M2sTZZLFzcC/A/8GvFRtOGZm1orKJIs3RcTZlUdiZmYtq0zX2VslHVl5JGZm1rLKJIszyRLGC5Kek7RF0nNVB2ZmZq2jsBoqIoY3IhAzM2tdZYYol6SPSzonbU+UtG/1oZmZWasoUw11BXAA8Ddp+3ng25VFZGZmLadMb6j9ImIfSQ8BRMTTknasOC4zM2shZd4s/lvSMCAAJI0BXq40KjMzayllksXlwI+Bt0q6GLgX+HKlUZmZWUsp0xvqeknLgUMBAcdExNrKIzMzs5ZRb4jyERHxnKRRwGbghty+URHxVCMCNDOz5qtXDfWD9Lkc6Motte26UhfbOyWtkbRa0pmpfJSkJZIeS5+7p3JJulxSt6QVkvbJXWtWOv4xSbMG+KxmZjZA9YYo/1D6nDTAa28FPhsRD0oaDiyXtAQ4GbgjIi6RNAeYA5wNHAFMTst+wJXAfunN5jygk6yRfbmkhRHx9ADjMjOzbVTmR3l3lCnrLSI2RcSDaX0LsBYYD8wArk2HXQsck9ZnANdF5n5gpKRxwOHAkoh4KiWIJcD0wiczM7NBU6/NYmfgTcDoVFWktGsE2Zd+aZI6gL2BB4CxEbEp7fotMDatjweeyJ22IZX1V977HrOB2QB77rnntoRnZmYF6vWGOg04C9iDrJ2iliyeA75V9gaS3gzcBJyVGsxf2RcRISm2Nei+RMRcYC5AZ2fnoFzTzMwy/VZDRcQ3U3vF5yLi7RExKS3vj4hSyULSDmSJ4vqI+FEqfjJVL5E+N6fyjcDE3OkTUll/5WZm1iCFbRYR8S+S3ivpo5JOqi1F5yl7hbgaWBsRX8/tWgjUejTNIpuJr1Z+UuoVtT/wbKquWgxMk7R7qg6blsrMzKxBCn+UJ+k84GBgCrCIrNfSvcB1BaceCJwIrJT0cCr7AnAJsEDSqcCvgY+mfYuAI4Fu4A/AKQAR8ZSkC4Fl6bgv+TceZmaNVWYgweOA9wMPRcQpksYC3y86KSLu5Y/tHL0d2sfxAZzez7XmAfNKxGpmZhUoMzbUf0XEy8BWSSPI2hgmFpxjZmZvIGXeLLokjQSuIusV9TxwX6VRmZlZSykzkODfpdXvSPoZMCIiVlQblpmZtZIybxZIOhY4iGy4jXsBJwszszZSZriPK4BPASuBVcBpkjytqplZGynzZnEI8D9SbyUkXQusrjQqMzNrKWV6Q3UD+cGWJqYyMzNrE2XeLIYDayUtJWuz2Jesh9RCgIg4usL4zMysBZRJFudWHoWZmbW0Ml1n725EIGZm1rrKtFmYmVmbc7IwM7NC/SaL2tSpkr7auHDMzKwV1WuzGCfpz4CjJc2n1wiytfm1zczsja9esjgXOIdsZrqv99oXZD/WMzOzNtBvsoiIG4EbJZ0TERc2MCYzM2sxZbrOXijpaOADqeiuiLi12rDMzKyVlBlI8CvAmcCatJwp6ctVB2ZmZq2jzC+4jwKmptnyagMJPkQ2n7aZmbWBsr+zGJlb362KQMzMrHWVebP4CvCQpDvJus9+AJhTaVRmZtZSyjRw3yDpLuBPU9HZEfHbSqMyM7OWUmpa1YjYBCysOBYzM2tRlY0NJWmepM2SVuXKzpe0UdLDaTkyt+/zkrolrZN0eK58eirrluTqLzOzJqhyIMFrgOl9lF8WEVPTsghA0hTgBOA96ZwrJA2TNAz4NnAEMAWYmY41M7MGqpss0hf2owO5cETcAzxV8vAZwPyIeDEifkU2beu+aemOiMcj4v8B89OxZmbWQHWTRU
S8BKyTtGe947bRGZJWpGqq3VPZeOCJ3DEbUll/5WZm1kBlqqF2B1ZLukPSwtoywPtdCewFTAU2AZcO8DqvIWm2pC5JXT09PYN1WTMzo1xvqHMG62YR8WRtXdJVQG2MqY3AxNyhE1IZdcp7X3suMBegs7MzBilkMzOjxJtFmoN7PbBDWl8GDGguC0njcpsfAWo9pRYCJ0jaSdIkYDKwNN1rsqRJknYkawR3F14zswYrfLOQ9ElgNjCKrAppPPAd4NCC824ADgZGS9oAnAccLGkq2XwY64HTACJitaQFZAMVbgVOT+0lSDoDWAwMA+ZFxOptfkozM3tdylRDnU7WK+kBgIh4TNJbi06KiJl9FF9d5/iLgYv7KF8ELCoRp5mZVaRMA/eLqdsqAJK2J3szMDOzNlEmWdwt6QvALpIOA34I3FJtWGZm1krKJIs5QA+wkqyNYRHwxSqDMjOz1lJm1NmX04RHD5BVP62LCFdDmZm1kTK9oY4i6/30S7L5LCZJOi0iflp1cGZm1hrK9Ia6FPhgRHQDSNoLuA1wsjAzaxNl2iy21BJF8jiwpaJ4zMysBfX7ZiHp2LTaJWkRsICszeJ4sl9Wm5lZm6hXDfXh3PqTwF+k9R5gl8oiMjOzltNvsoiIUxoZiJmZta4yvaEmAZ8GOvLHR8TR1YVlZmatpExvqJ+Qjel0C/ByteGYmVkrKpMsXoiIyyuPxMzMWlaZZPFNSecBtwMv1gojYkBzWpiZ2dBTJlm8DzgROIQ/VkNF2jYzszZQJlkcD7w9P0y5mZm1lzK/4F4FjKw6EDMza11l3ixGAo9KWsar2yzcddbMrE2USRbnVR6FmZm1tDLzWdzdiEDMzKx1lfkF9xb+OOf2jsAOwO8jYkSVgZmZWeso82YxvLYuScAMYP8qgzIzs9ZSpjfUKyLzE+DwiuIxM7MWVKYa6tjc5nZAJ/BCZRGZmVnLKfNm8eHccjjZLHkzik6SNE/SZkmrcmWjJC2R9Fj63D2VS9LlkrolrZC0T+6cWen4xyTN2tYHNDOz169Mm8VA57W4BvgWcF2ubA5wR0RcImlO2j4bOAKYnJb9gCuB/SSNIuu620nWyL5c0sKIeHqAMZmZ2QDUm1b13DrnRURcWO/CEXGPpI5exTOAg9P6tcBdZMliBnBdRARwv6SRksalY5dExFMppiXAdOCGevc2M7PBVa8a6vd9LACnkn3BD8TYiNiU1n8LjE3r44EncsdtSGX9lb+GpNmSuiR19fT0DDA8MzPrS71pVS+trUsaDpwJnALMBy7t77yyIiIkRfGRpa83F5gL0NnZOWjXNTOzggbu1CB9EbCCLLHsExFnR8TmAd7vyVS9RPqsXWcjMDF33IRU1l+5mZk1UL/JQtLXgGVkvZ/eFxHnD0LD8kKg1qNpFnBzrvyk1Ctqf+DZVF21GJgmaffUc2paKjMzswaq1xvqs2SjzH4R+Kfsx9sAiKwWqe5wH5JuIGugHi1pA1mvpkuABZJOBX4NfDQdvgg4EugG/kBW3UVEPCXpQrKkBfClWmO3mZk1Tr02i236dXcf58/sZ9ehfRwbwOn9XGceMO/1xGJmZq/P60oIZmbWHpwszMyskJOFmZkVcrIwM7NCThZmZlbIycLMzAo5WZiZWSEnCzMzK+RkYWZmhZwszMyskJOFmZkVcrIwM7NCThZmZlbIycLMzAo5WZiZWSEnCzMzK+RkYWZmhZwszMyskJOFmZkVcrIwM7NCThZmZlbIycLMzAo5WZiZWaGmJAtJ6yWtlPSwpK5UNkrSEkmPpc/dU7kkXS6pW9IKSfs0I2Yzs3bWzDeLD0bE1IjoTNtzgDsiYjJwR9oGOAKYnJbZwJUNj9TMrM21UjXUDODatH4tcEyu/LrI3A+MlDSuGQGambWrZiWLAG6XtFzS7FQ2NiI2pfXfAmPT+njgidy5G1LZq0iaLalLUldPT09VcZuZtaXtm3TfgyJio6S3AkskPZrfGREhKbblghExF5
gL0NnZuU3nmplZfU15s4iIjelzM/BjYF/gyVr1UvrcnA7fCEzMnT4hlZmZWYM0PFlI2lXS8No6MA1YBSwEZqXDZgE3p/WFwEmpV9T+wLO56iozM2uAZlRDjQV+LKl2/x9ExM8kLQMWSDoV+DXw0XT8IuBIoBv4A3BK40M2M2tvDU8WEfE48P4+yv8TOLSP8gBOb0Bor+iYc1sjb1eZ9Zcc1ewQzOwNopW6zpqZWYtysjAzs0JOFmZmVsjJwszMCjlZmJlZIScLMzMr5GRhZmaFnCzMzKyQk4WZmRVysjAzs0JOFmZmVsjJwszMCjlZmJlZIScLMzMr5GRhZmaFmjUHt1nLeaPMYwKey8QGn98szMyskJOFmZkVcrIwM7NCThZmZlbIycLMzAo5WZiZWSEnCzMzK+RkYWZmhYZMspA0XdI6Sd2S5jQ7HjOzdjIkkoWkYcC3gSOAKcBMSVOaG5WZWfsYKsN97At0R8TjAJLmAzOANU2NyuwNxMOdWD2KiGbHUEjSccD0iPjbtH0isF9EnJE7ZjYwO22+C1jX8EC3zWjgd80Ookna+dmhvZ+/nZ8dWv/53xYRY/raMVTeLApFxFxgbrPjKEtSV0R0NjuOZmjnZ4f2fv52fnYY2s8/JNosgI3AxNz2hFRmZmYNMFSSxTJgsqRJknYETgAWNjkmM7O2MSSqoSJiq6QzgMXAMGBeRKxucliv15CpMqtAOz87tPfzt/OzwxB+/iHRwG1mZs01VKqhzMysiZwszMyskJNFg0maJ2mzpFXNjqXRJE2UdKekNZJWSzqz2TE1iqSdJS2V9Eh69guaHVMzSBom6SFJtzY7lkaStF7SSkkPS+pqdjwD4TaLBpP0AeB54LqIeG+z42kkSeOAcRHxoKThwHLgmIh4w/8SX5KAXSPieUk7APcCZ0bE/U0OraEkfQboBEZExIeaHU+jSFoPdEZEK/8gry6/WTRYRNwDPNXsOJohIjZFxINpfQuwFhjf3KgaIzLPp80d0tJW/1KTNAE4Cvhes2OxbedkYU0hqQPYG3iguZE0TqqCeRjYDCyJiLZ59uQbwD8CLzc7kCYI4HZJy9PQREOOk4U1nKQ3AzcBZ0XEc82Op1Ei4qWImEo2AsG+ktqmGlLSh4DNEbG82bE0yUERsQ/ZyNmnp+roIcXJwhoq1dffBFwfET9qdjzNEBHPAHcC05sdSwMdCByd6u7nA4dI+n5zQ2qciNiYPjcDPyYbSXtIcbKwhkmNvFcDayPi682Op5EkjZE0Mq3vAhwGPNrcqBonIj4fERMiooNsuJ6fR8THmxxWQ0jaNXXoQNKuwDRgyPWGdLJoMEk3APcB75K0QdKpzY6pgQ4ETiT7V+XDaTmy2UE1yDjgTkkryMY6WxIRbdV9tI2NBe6V9AiwFLgtIn7W5Ji2mbvOmplZIb9ZmJlZIScLMzMr5GRhZmaFnCzMzKyQk4WZmRVysrBBIykkXZrb/pyk8wfp2tdIOm4wrlVwn+MlrZV0Z6/yPSTdWPX9hypJJ0vaI7f9PUlTBnitgyX92eBFZ4PBycIG04vAsZJGNzuQPEnbMn3wqcAnI+KD+cKI+L8RUXmyaoZt/Pv052TglWQREX/7OkYTPhhwsmgxThY2mLaSzTH897139H4zkPR8+jxY0t2Sbpb0uKRLJH0szf2wUtJeucv8paQuSf+RxhqqDc73NUnLJK2QdFruuv8uaSHwmi8tSTPT9VdJ+moqOxc4CLha0td6Hd9Rm4Mk/Sv6J5KWpHkKzpD0mTRPw/2SRqXjPpniekTSTZLelMr3SsetlHRR7W+R9v1D7lkuSGW7SrotXWeVpL/u43nukvTN9EPHVZL2zZ07L/09H5I0I/cMCyX9HLijj2d9VNL16S3rxlzs56b4Vkmaq8xxZMOOX5/uv0uKpzOdM03SfZIelPRDZWOD1eZ4uCCVr5T0bmUDTH4K+Pt0rT/v/azWJBHhxcugLGTzdIwA1gO7AZ8Dzk/7rgGOyx
+bPg8GniH7hfNOwEbggrTvTOAbufN/RvYPnMnABmBnYDbwxXTMTkAXMCld9/fApD7i3AP4DTAG2B74Odm8GgB3kc070PucDmBVWj8Z6AaGp2s8C3wq7buMbIBEgLfkzr8I+HRavxWYmdY/lftbTCNLtkrPeSvwAeCvgKty19qtj/juqh2TzqnF+mXg42l9JPAfwK7pGTYAo/p51gAOTNvzgM+l9VG54/4V+HBff7faNjAauIdsLg+As4Fz0/r63N/k74DvpfXza/fz0jqL3yxsUEU2iux1wP/ahtOWRTbXxYvAL4HbU/lKsi+umgUR8XJEPAY8Dryb7Av2JGVDfz8AvIUsmQAsjYhf9XG/PwXuioieiNgKXE/2Bbst7oyILRHRQ5Ysbukj5vemt5uVwMeA96TyA4AfpvUf5K45LS0PAQ+m55ucrnmYpK9K+vOIeLafmG6AV+ZMGaFsLKppwJz097mLLMHumY5fEhH9za3yRET8Iq1/n+yNC+CDkh5Iz3RI7pn6sz8wBfhFimEW8Lbc/tpgkst59X9razGDUVdp1ts3yL7s/neubCup2lPSdsCOuX0v5tZfzm2/zKv/H+09Nk2Q/Sv80xGxOL9D0sFkbxZVKRPzNWRvLI9IOpnsbaceAV+JiO++Zoe0D3AkcJGkOyLiS32c39/f568iYl2v6+1H/b/Pa64laWfgCrI3iCeUdV7Yud4DpfsviYiZ/eyv/d1ewt9HLc1vFjbo0r9WF5A1FtesB/4krR9NNlPctjpe0napHePtwDpgMfA/lQ19jqR3KhvZs56lwF9IGi1pGDATuHsA8RQZDmxKsX0sV34/WdUSZCOw1iwGPpGr0x8v6a3Kehn9ISK+D3wN2Kef+/11Ou8g4Nn0BrIY+LQkpX17l4x9T0kHpPW/IZsGtpYYfpdizDf4b0nP29v9wIGS3pHuv6ukdxbcu79rWRM5WVhVLiWrr665iuwL+hGyapiB/Kv/N2Rf9D8layN4gWyKzjXAg6kB+rsU/As1IjYBc8jmlHgEWB4RNw8gniLnkFWN/YJXD0d+FvAZZSPQvoOsGouIuJ2sWuq+VM1zI9mX5vuApaka5zyy9o++vCDpIeA7/DFRX0iWmFdIWp22y1hHNknPWmB34MrI5uG4imx47cVko+fWXAN8p9bAXStM1XQnAzek572PrHqtnluAj7iBu7V41FmzBks9i/4rIkLSCWSN3TNe5zXvImsU7hqE+DqAWyOibWbys2KuIzRrvD8BvpWqhp4BPtHkeMwK+c3CzMwKuc3CzMwKOVmYmVkhJwszMyvkZGFmZoWcLMzMrND/Bzss79YY0ZCDAAAAAElFTkSuQmCC\n"
          },
          "metadata": {
            "needs_background": "light"
          }
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import numpy as np\n",
        "\n",
        "# Load the JSON dictionary and keep only its values (the caption texts).\n",
        "with open(two_patients) as captions_file:\n",
        "    data = json.load(captions_file)\n",
        "captions = list(data.values())\n",
        "\n",
        "# Length of every caption, counted in whitespace-separated tokens.\n",
        "words_per_sent = [len(text.split()) for text in captions]\n",
        "\n",
        "lengths = np.array(words_per_sent)\n",
        "shortest, longest = min(words_per_sent), max(words_per_sent)\n",
        "print(f'Minimum word-tokens in a caption: {shortest} (Occurences: {sum(lengths==shortest)} times)')\n",
        "print(f'Maximum word-tokens in a caption: {longest} (Occurences: {sum(lengths==longest)} times)')\n",
        "print('mean number of words in captions:', round(np.mean(words_per_sent),2))"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "PhdS3Sd7vwI1",
        "outputId": "699e821b-4236-4095-e70a-71a7d439aa2d"
      },
      "execution_count": 7,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Minimum word-tokens in a caption: 3 (Occurences: 10 times)\n",
            "Maximum word-tokens in a caption: 176 (Occurences: 1 times)\n",
            "mean number of words in captions: 37.27\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import matplotlib.pyplot as plt\n",
        "\n",
        "# Caption-length distribution: histogram (left) and horizontal box plot (right).\n",
        "# plt.subplots creates the figure itself; a separate plt.figure() call beforehand\n",
        "# only produced an extra, empty figure in the cell output.\n",
        "fig, (ax1, ax2) = plt.subplots(1, 2,  figsize=(11, 4))\n",
        "\n",
        "ax1.hist(words_per_sent, bins=50, align='left', edgecolor='black',\n",
        "              linewidth=0.5)\n",
        "ax1.set(xlabel='Number of words', ylabel='Images')\n",
        "\n",
        "ax2.set(xlabel='Number of words', ylabel='')\n",
        "# Intended to blank the y-axis tick label of the single box; kept before boxplot() as in the original.\n",
        "ax2.set_yticklabels(' ')\n",
        "ax2.boxplot(words_per_sent, vert=False)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 435
        },
        "id": "DfT7ErR8yBTd",
        "outputId": "f5eb1aa1-b6d5-4981-d4e6-5df156074a70"
      },
      "execution_count": 8,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "{'whiskers': [<matplotlib.lines.Line2D at 0x7f12f385d990>,\n",
              "  <matplotlib.lines.Line2D at 0x7f12f385ded0>],\n",
              " 'caps': [<matplotlib.lines.Line2D at 0x7f12f3867450>,\n",
              "  <matplotlib.lines.Line2D at 0x7f12f38548d0>],\n",
              " 'boxes': [<matplotlib.lines.Line2D at 0x7f12f385d3d0>],\n",
              " 'medians': [<matplotlib.lines.Line2D at 0x7f12f3867950>],\n",
              " 'fliers': [<matplotlib.lines.Line2D at 0x7f12f3867cd0>],\n",
              " 'means': []}"
            ]
          },
          "metadata": {},
          "execution_count": 8
        },
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "<Figure size 2160x1080 with 0 Axes>"
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "<Figure size 792x288 with 2 Axes>"
            ],
            "image/png": "iVBORw0KGgoAAAANSUhEUgAAApwAAAEGCAYAAADIXkUdAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAgAElEQVR4nO3de5RdVZ3g8e8vRZlgAiSRgCEPAkrboZnViGnE7vaBLwIKxB50jAzSmpGG0TQ2SotmLWSYjgPNgKNZGho6juggSNOtxEcjSqfBjItHQEQgRGKAkZiQKCWQYEIev/njngq3kqrkVlL73ltV389aZ9U5++x9zu+eW9n51XntyEwkSZKkUka0OgBJkiQNbSackiRJKsqEU5IkSUWZcEqSJKkoE05JkiQVtV+rA9gXBx98cE6bNq3VYUgaYu67777fZOaEVsdRin2npBJ213cO6oRz2rRpLFu2rNVhSBpiIuLJVsdQkn2npBJ213d6SV2SJElFmXBKkiSpKBNOSZIkFWXCKUmSpKJMOCVJklSUCackSZKKMuGUJElSUSackiRJKsqEU5IkSUUN6pGGBpPzLryYVWu7epQd+cpxLLzi0hZFJEmS1BwmnE2yam0XKybN7Fm4+tbWBCNJktREXlKXJElSUSackiRJKsqEU5IkSUWZcEqSJKkoE05JkiQVZcIpSZKkokw4JUmSVJQJpyRJkooqlnBGxKiIuCcifhYRD0fEf6vKvxoRj0fEA9V0bFUeEfHFiFgZEQ9GxHGlYpMkSVLzlBxpaDPw1szcEBGdwNKI+Ndq3YWZefNO9U8Gjqqm1wMLq5/DSm9DYILDYEqSpMGrWMKZmQlsqBY7qyl30+R04GtVu7siYmxETMzMNaVibEe9DoEJDoMpSZIGraL3cEZER0Q8AKwDfpiZd1er5leXzT8fESOrsknAr+qaP1WV7bzNcyJiWUQsW79+fcnwJUmSNACKJpyZuS0zjwUmA8dHxDHAp4E/BP4EGA98qp/bvCYzZ2TmjAkTJgx4zJIkSRpYTXlKPTN/BywBZmbmmqzZDPxv4Piq2mpgSl2zyVWZJEmSBrGST6lPiIix1fz+wDuARyNiYlUWwCzgoarJYuCD1dPqJwDPDrf7NyVJkoaikk+pTwSui4gOaontTZn53Yj4t4iYAATwAHBuVf/7wCnASuAF4EMFY5MkSVKTlHxK/UHgtb2Uv7WP+gl8tFQ8kiRJag1HGpIkSVJRJpySJEkqyoRTkiRJRZlwSpIkqSgTTkmSJBVlwilJkqSiTDglSZJUlAmnJEmSijLhlCRJUlElh7YcFs678GJWre3qUXbkK8ex8IpLWxSRJElSezHh3Eer1naxYtLMnoWrb21NMJIkSW3IS+qSJEkqyoRTkiRJRZlwSpIkqSgTTkmSJBVlwilJkqSiTDglSZJUlAmnJEmSijLhlCRJUlHFEs6IGBUR90TEzyLi4Yj4b1X5ERFxd0SsjIhvRsTLqvKR1fLKav20UrFJkiSpeUqe4dwMvDUz/xg4FpgZEScAlwOfz8xXA13AnKr+HKCrKv98VU+SJEmDXLGEM2s2VIud1ZTAW4Gbq/LrgFnV/OnVMtX6t0VElIpPkiRJzVH0Hs6I6IiIB4B1wA+BXwK/y8ytVZWngEnV/CTgVwDV+meBV/SyzXMiYllELFu/fn3J8CVJkjQAiiacmbktM48FJgPHA384ANu8JjNnZOaMCRMm7HOMkiRJKqspT6ln5u+AJcAbgLERsV+1ajKwuppfDUwBqNYfBPy2GfFJkiSpnJJPqU+IiLHV/P7AO4Dl1BLPM6pqZwO3VPOLq2Wq9f+WmVkqPkmSJDXHfnuustcmAtdFRAe1xPamzPxuRDwC3BgRfwf8FFhU1V8EfD0iVgLPAO8vGJskSZKapFjCmZkPAq/tpXwVtfs5dy7fBLy3VDySJElqDUcakiRJUlEmnJ
IkSSrKhFOSJElFmXBKkiSpKBNOSZIkFWXCKUmSpKJMOCVJklSUCackSZKKMuGUJElSUSackiRJKsqEU5IkSUWZcEqSJKkoE05JkiQVZcIpSZKkokw4JUmSVJQJpyRJkorar9UBDGcrlj/CSWfN7VH22KrHYVKLApIkSSrAhLOFNkcnKybN7Fn26AJGtigeSZKkEopdUo+IKRGxJCIeiYiHI+L8qvySiFgdEQ9U0yl1bT4dESsjYkVEnFQqNkmSJDVPyTOcW4FPZOb9EXEAcF9E/LBa9/nM/J/1lSPiaOD9wB8BhwE/iog/yMxtBWOUJElSYcXOcGbmmsy8v5p/HljO7u9OPB24MTM3Z+bjwErg+FLxSZIkqTmacg9nREwDXgvcDfwZ8LGI+CCwjNpZ0C5qyehddc2eopcENSLOAc4BmDp1atG42915F17MqrVdPcqOfOU4Fl5xaYsikiRJ2lXxhDMixgD/DHw8M5+LiIXAfwey+nkl8OFGt5eZ1wDXAMyYMSMHPuLBY9Xarl0eOmL1ra0JRpIkqQ9F38MZEZ3Uks3rM/NfADLz6czclpnbgWt56bL5amBKXfPJVZkkSZIGsWJnOCMigEXA8sy8qq58YmauqRbfAzxUzS8GvhERV1F7aOgo4J5S8Q02vrNTkiQNViUvqf8ZcBbw84h4oCr7DDA7Io6ldkn9CeCvADLz4Yi4CXiE2hPuH/UJ9Zf4zk5JkjRYFUs4M3MpEL2s+v5u2swH5peKSZIkSc3nWOqSJEkqyoRTkiRJRZlwSpIkqSgTTkmSJBVlwilJkqSiTDglSZJUlAmnJEmSijLhlCRJUlEmnJKktjV+/Hgiot8Tlxy0V+3qp/Hjx7f640tDRkMJZ0ScHxEHRs2iiLg/It5ZOjhJ0vDW1dVFZvZ7AvaqXf3U1dXV4k8vDR2NDm354cz8QkScBIyjNkb614HbikU2iK1Y/ggnnTW3R9ljqx6HSS0KSJIkqYUaTTi7x0Q/Bfh6Zj4cEb2Nky5gc3SyYtLMnmWPLmBki+KRJElqpUbv4bwvIm6jlnD+ICIOALaXC0uSJElDRaNnOOcAxwKrMvOFiHgF8KFyYUmSJGmoaPQMZwJHA39dLY8GRhWJSJIkSUNKownnl4E3ALOr5eeBLxWJSJIkSUNKo5fUX5+Zx0XETwEysysiXlYwLklSG4mIHa8bUnvzu1I7avQM55aI6KB2aZ2ImIAPDUmSJKkBjSacXwS+BRwSEfOBpcDnikUlSZKkIaOhhDMzrwf+FvgfwBpgVmb+0+7aRMSUiFgSEY9ExMMRcX5VPj4ifhgRj1U/x1XlERFfjIiVEfFgRBy3bx9NkiRJ7aDRoS3HA+uAG4BvAE9HROcemm0FPpGZRwMnAB+NiKOBi4DbM/Mo4PZqGeBk4KhqOgdY2M/PIkmSpDbU6CX1+4H1wC+Ax6r5J6ox1V/XW4PMXJOZ91fzzwPLqQ3ueDpwXVXtOmBWNX868LWsuQsYGxET9+IzSZI0LI0ZMwaoPTg0lKdRo0bR0dHBlClTmDJlCiNGjKCzs3PH+v3222/H/JQpU7jhhht6PV433HADxxxzzC7te2vTXbejo4Njjjmmz202Wre3OnPnzmXUqFE7PuPcuXN72Xrj+hPzvrRpRKMJ5w+BUzLz4Mx8BbWzkd8F/iu1VybtVkRMA14L3A0cmplrqlVrgUOr+UnAr+qaPUUvo49HxDkRsSwilq1fv77B8CVJGtrGjBnDxo0bW7Lvww47rM91HR0du5SNGNEz/ehrtOzp06f3WPfmN7+Z/fffn82bN3PYYYexbds2nn/+ecaMGcOBBx7IQQcdxMtf/nIykwMPPJCLLrqIbdu2cf755/eaQM6bN49Zs2Zx8MEHc+CBB3LooYdy0UUXsXXr1h5tuusuWLCATZs2sWDBAubNm9dnIrmnur3VOffcc/nyl7/M5z73OTZu3MjnPvc5rr766r1OOvsT8760aVSjCecJmfmD7oXMvA14Q3
UmcrdDhEfEGOCfgY9n5nP167L23oZ+vbshM6/JzBmZOWPChAn9aSpJ0pC1ceNGpk2btmP5tNNO2zHfV0K3s/pEsH5b9fN/+qd/uku7NWvWMGLECA444IBd6m3bto3zzjtvR/l5553H9u3be9TJzB7bHTFiBGPGjGHVqlVkJp2dney3337ceeedfO973+O0007jqaee4vrrr2fcuHFs2rSJm2++mXHjxrFlyxauuOIKxo8fz3e+8x2uv/56Ro8ezfz583vEPH/+fBYtWsS3v/1tRo8ezc0338wNN9zAd77zHb7xjW/0aNNd98QTT6Szs5MTTzyRRYsW7bLNRuv2Vuf3v/89hxxyCBdccAEvf/nLueCCC7j88su59tpr+/7CdqM/Me9Lm0ZFI+/qqsZRvx24sSr6T8A7gJnAvZnZ6wM+1X2e3wV+kJlXVWUrgLdk5prqkvm/Z+ZrIuIfqvkbdq7XV1wzZszIZcuWNfhRyzjprLmsmDSzR9nm2xcw8m1zB6ysP3Vfs/pWfvD1Bf3+HJJeEhH3ZeaMVscxkCLiHGr3xzN16tTXPfnkk/1tXyKshuzVOyUvOQgueXaf9tvKzzwQ1q9fT/eJmVtvvZWZM2fuoQX86Ec/4u1vfzsAK1eu5NWvfvUu87/+9a/7PKM5YsSIHclkfb3f/e53jB07dpf5+jo7b7ev94m++OKLPPvss0yYMIEXX3yRkSNHkpm8+OKLjBo1iu3bt7Nx40bGjBlDRLBp0yZGjaoNjrht27Yd2+no6OixbtOmTQCMGjWKTZs2MXLkSCKCbdu27ajb2fnS4ytbtmxh1KhRPbZZv93d1e2tTkQwYsSIHtt74YUXGD169F79G+hPzPvSpt7u+s5Gz3B+AJgMfLuaplZlHcD7+thpAIuA5d3JZmUxcHY1fzZwS135B6un1U8Ant1dsilJatxAXB3KzKZPrdaKz7wvx6r+TOScOXN2zJ988skNfd53vvOdO+a7E8+d588444xd2nUnS6NHj+613qc//ele5+vr1M93b2vkyNpF1O4znBHB0qVLd3y2pUuXcvjhh9PZ2cnSpUuZOnUqnZ2dXH311Rx++OFMnz59R/n06dN7xNy9bvr06UydOpWlS5fuWO7ebneb7rJ63XV31kjd3up0dnZyyCGH9Ci7+uqrdxyD/upPzPvSplGNvhbpN5k5NzNfW00fy8z1mfliZq7so9mfAWcBb42IB6rpFOAy4B0R8Rjw9moZ4PvAKmAlcC21+0MlSVIDRo8ezRNPPLFjefHixTvmG03eu89OAj22VT//k5/8ZJd2EydOZPv27Tz//PO71Ovo6GDhwpdePLNw4cIdl+6760REj+1u376dDRs2cOSRRxIRbNmyha1bt/KmN72Jd73rXSxevJjJkydz5pln0tXVxahRozjjjDPo6uqis7OTCy+8kGeeeYZTTz2VM888k40bNzJv3rweMc+bN485c+Ywa9YsNm7cyBlnnMHs2bM59dRT+cAHPtCjTXfdJUuWsGXLFpYsWcKcOXN22WajdXurs//++7Nu3TquuuoqXnjhBa666io+9alP8ZGPfKTvL2w3+hPzvrRpVENDW1YjC/0t8EfAqO7yzHxrX20ycynQ1/WIt/VSP4GPNhKPJEnqacOGDS17cOjXv/51n+t6uxRbn9hC3wnx8uXLeyzfcccdAIwcOXLHJfgDDjiA1atX09HRwdatW4Fakvvcc89x2WWXMXnyZK688kpmz57dY1vdy/Pnz+c3v/nNjvbdba666qoddbp/zp07l+XLlzN9+nTmz5+/yzYbrdtbnauvvpqf/OQnfOYzn+ETn/gEI0eO5Nxzz2XBgr27Ta4/Me9Lm0Y1Opb69cA3gXcD51K7FO4j4pIktZENGzY4lno/zJ49u+FkaqDr9lZn9uzZe51g7m0cA9GmEY3ew/mKzFwEbMnMOzLzw0CfZzclSZKkbo2e4dxS/VwTEe8Cfg2MLxOSJEmShpJGE86/i4iDgE
8AC4ADgb8pFpUkSZKGjIYSzsz8bjX7LHBiuXAkSZI01DT6lPoRwFxgWn2bzDytrzaSpKHDh1AGD78rtaNGL6l/m9pL3L8DbN9DXUmSJGmHRhPOTZn5xaKRSJIkaUhqNOH8QkR8FrgN2NxdmJn3F4lKkiRJQ0ajCed/oBqmkpcuqSe+i1OSJEl70GjC+V7gyMx8sWQwkiRJGnoaHWnoIWBsyUAkSZI0NDV6hnMs8GhE3EvPezh9LVKbWbH8EU46a26PsiNfOY6FV1zaoogkad9ERL/b5GcP3Kt29caNG7dP7SW9pNGE87NFo9CA2RydrJg0s2fh6ltbE4wk7aN9eadkXjJwcUjaN42ONHRH6UAkSZI0NO024YyI56k9jb7LKiAz88AiUUmSJGnI2G3CmZkHNCsQSZIkDU2NPqUuSZIk7RUTTkmSJBVVLOGMiK9ExLqIeKiu7JKIWB0RD1TTKXXrPh0RKyNiRUScVCouSZIkNVfJM5xfBWb2Uv75zDy2mr4PEBFHA+8H/qhq8+WI6CgYmyRJkpqkWMKZmXcCzzRY/XTgxszcnJmPAyuB40vFJkmSpOZpxT2cH4uIB6tL7t3DOEwCflVX56mqbBcRcU5ELIuIZevXry8dqyRJkvZRsxPOhcCrgGOBNcCV/d1AZl6TmTMyc8aECRMGOj5JkiQNsKYmnJn5dGZuy8ztwLW8dNl8NTClrurkqkySJEmDXFMTzoiYWLf4HqD7CfbFwPsjYmREHAEcBdzTzNgkSZJURkNjqe+NiLgBeAtwcEQ8BXwWeEtEHEttuMwngL8CyMyHI+Im4BFgK/DRzNxWKjZJkiQ1T7GEMzNn91K8aDf15wPzS8UjSZKk1nCkIUmSJBVlwilJkqSiTDglSZJUlAmnJEmSijLhlCRJUlEmnJIkSSrKhFOSJElFFXsPp9rbeRdezKq1XT3KjnzlOBZecWmLIpIkSUOVCecwtWptFysmzexZuPrW1gQjSZKGNC+pS5IkqSgTTkmSJBVlwilJkqSiTDglSZJUlAmnJEmSijLhlCRJUlEmnJIkSSrKhFOSJElFmXBKkiSpqGIjDUXEV4B3A+sy85iqbDzwTWAa8ATwvszsiogAvgCcArwA/GVm3l8qtuFmxfJHOOmsuT3KHlv1OExqUUCSJGlYKXmG86vATmMnchFwe2YeBdxeLQOcDBxVTecACwvGNexsjk5WTJrZY/r9i1tbHZYkSRomiiWcmXkn8MxOxacD11Xz1wGz6sq/ljV3AWMjYmKp2CRJktQ8xS6p9+HQzFxTza8FDq3mJwG/qqv3VFW2hp1ExDnUzoIyderUcpEKgPMuvJhVa7t6lB35ynEsvOLSFkUkSZIGm2YnnDtkZkZE7kW7a4BrAGbMmNHv9uqfVWu7WDFppzsjVt/ammAkSdKg1Oyn1J/uvlRe/VxXla8GptTVm1yVSZIkaZBrdsK5GDi7mj8buKWu/INRcwLwbN2ld0mSJA1iJV+LdAPwFuDgiHgK+CxwGXBTRMwBngTeV1X/PrVXIq2k9lqkD5WKS5IkSc1VLOHMzNl9rHpbL3UT+GipWNQY39cpSZJKaNlDQ2o/3e/r7FH26AJGtigeSZI0NDi0pSRJkooy4ZQkSVJRJpySJEkqyoRTkiRJRZlwSpIkqSifUu+DY4hLkiQNDBPOPjiGuCRJ0sDwkrokSZKKMuGUJElSUSackiRJKsqEU5IkSUX50FA/rFj+CCedNbdH2WOrHodJLQpIkiRpEDDh7IfN0bnLk+ubH13AyBbFI0mSNBh4SV2SJElFmXBKkiSpKBNOSZIkFeU9nBoQDgUqSZL6YsKpAeFQoJIkqS8tSTgj4gngeWAbsDUzZ0TEeOCbwDTgCeB9mdnV1zYkSZI0OLTyHs4TM/PYzJxRLV8E3J6ZRwG3V8uSJEka5NrpoaHTgeuq+euAWS2MRZIkSQOkVfdwJnBbRCTwD5l5DXBoZq6p1q8FDu2tYUScA5wDMHXq1GbEqp
044pIkSeqPViWcf56ZqyPiEOCHEfFo/crMzCoZ3UWVnF4DMGPGjF7rqCxHXJIkSf3RkoQzM1dXP9dFxLeA44GnI2JiZq6JiInAulbEprJ8fZIkScNP0xPOiBgNjMjM56v5dwKXAouBs4HLqp+3NDs2lefrkyRJGn5acYbzUOBbEdG9/29k5q0RcS9wU0TMAZ4E3teC2CRJkjTAmp5wZuYq4I97Kf8t8LZmxyNJkqSy2um1SJIkSRqCHNpSxfj6JEmSBCacKqjR1yf1lpj65LokSUOHCadarrfE1CfXJUkaOryHU5IkSUWZcEqSJKkoL6mrLXlfpyRJQ4cJp9qS93VKkjR0eEldkiRJRQ27M5znXXgxq9Z29Shb8+QvmXj4q3qU+b5ISZKkgTHsEs5Va7t6fTfkcw28L1Ltqbc/IrzfU5Kk9jHsEk4NXr09SAS1s9Hb33hez0Lv95QkqW2YcGrQ6PVBIjwbLUlSu/OhIUmSJBXlGU4NG97rKUlSa5hwatjo7YEx7/WUJKk8L6lLkiSpKM9wakjq7Yn23t6t6hCakiSVZ8KpIam3J9p7e5q9t3orfnTVLklob4MDNFpmAitJGu7aLuGMiJnAF4AO4B8z87IWh6Rhpq9ktbfBARop6+0+UR9gkiQNJ22VcEZEB/Al4B3AU8C9EbE4Mx9pbWTS3uvr8v7OL6vv7cxqf5JQk1hJUrtqq4QTOB5YmZmrACLiRuB0wIRTg9ZAX97vK4ls9Cn8RhPTfak30LcW7Ess+7pvSdK+i8xsdQw7RMQZwMzM/C/V8lnA6zPzY3V1zgHOqRZfA6xocPMHA78ZwHAHQrvF1G7xgDE1ot3igcEf0+GZOaFkMK0UEeuBJ3dTZbB/f83SbjG1WzxgTI1ot3hg72Pqs+9stzOce5SZ1wDX9LddRCzLzBkFQtpr7RZTu8UDxtSIdosHjKnd7SmZbsdjZUx71m7xgDE1ot3igTIxtdt7OFcDU+qWJ1dlkiRJGqTaLeG8FzgqIo6IiJcB7wcWtzgmSZIk7YO2uqSemVsj4mPAD6i9FukrmfnwAG2+35fhm6DdYmq3eMCYGtFu8YAxDXbteKyMac/aLR4wpka0WzxQIKa2emhIkiRJQ0+7XVKXJEnSEGPCKUmSpKKGRcIZETMjYkVErIyIi1qw/ykRsSQiHomIhyPi/Kr8kohYHREPVNMpTY7riYj4ebXvZVXZ+Ij4YUQ8Vv0c16RYXlN3HB6IiOci4uPNPkYR8ZWIWBcRD9WV9XpMouaL1e/VgxFxXBNjuiIiHq32+62IGFuVT4uI39cdr6ubGFOf31VEfLo6Tisi4qQmxfPNulieiIgHqvKmHKPByv6yz7japr+s9m2f2Xg89pd7jqd8f5mZQ3qi9vDRL4EjgZcBPwOObnIME4HjqvkDgF8ARwOXAJ9s4bF5Ajh4p7K/By6q5i8CLm/Rd7YWOLzZxwh4E3Ac8NCejglwCvCvQAAnAHc3MaZ3AvtV85fXxTStvl6Tj1Ov31X1u/4zYCRwRPXvsaN0PDutvxK4uJnHaDBO9pe7jast+8u6780+s+947C/3EM9O64v0l8PhDOeO4TIz80Wge7jMpsnMNZl5fzX/PLAcmNTMGPrhdOC6av46YFYLYngb8MvM3N1IKEVk5p3AMzsV93VMTge+ljV3AWMjYmIzYsrM2zJza7V4F7V31jZNH8epL6cDN2bm5sx8HFhJ7d9lU+KJiADeB9wwkPscouwv+6cd+kuwz9xtPPaXjcdTsr8cDgnnJOBXdctP0cLOKyKmAa8F7q6KPlad5v9KMy/HVBK4LSLui9qQoQCHZuaaan4tcGiTY4La+1frf9lbeYyg72PSLr9bH6Z21qDbERHx04i4IyLe2ORYevuuWn2c3gg8nZmP1ZW18hi1s1Z/Vz3YXzbMPrNx9pe7V6y/HA4JZ9uIiDHAPwMfz8zngIXAq4BjgTXUTmM3059n5nHAycBHI+JN9Suzdj69qe/NitoL/0
8D/qkqavUx6qEVx2R3ImIesBW4vipaA0zNzNcCFwDfiIgDmxROW31XdWbT8z/jVh4jNcj+sjH2mY2zv2xIsf5yOCScbTFcZkR0Uus8r8/MfwHIzKczc1tmbgeuZYBPm+9JZq6ufq4DvlXt/+nuSxzVz3XNjIlaZ35/Zj5dxdbSY1Tp65i09HcrIv4SeDdwZtWpU12G+W01fx+1+3/+oBnx7Oa7atlxioj9gL8AvlkXZ8uO0SBgf9mHNu0vwT6zIfaXe1a6vxwOCWfLh8us7olYBCzPzKvqyuvvXXkP8NDObQvGNDoiDuiep3ZT9UPUjs3ZVbWzgVuaFVOlx19XrTxGdfo6JouBD0bNCcCzdZeRioqImcDfAqdl5gt15RMioqOaPxI4CljVpJj6+q4WA++PiJERcUQV0z3NiAl4O/BoZj5VF2fLjtEgYH/Ze0zt2l+CfeYe2V82rGx/ORBPHrX7RO3JuF9Qy8zntWD/f07tksKDwAPVdArwdeDnVfliYGITYzqS2pNwPwMe7j4uwCuA24HHgB8B45sY02jgt8BBdWVNPUbUOu41wBZq987M6euYUHvS8kvV79XPgRlNjGkltft8un+frq7q/sfq+3wAuB84tYkx9fldAfOq47QCOLkZ8VTlXwXO3aluU47RYJ3sL3uNqe36y2r/9pmNxWN/uYd4qvKi/aVDW0qSJKmo4XBJXZIkSS1kwilJkqSiTDglSZJUlAmnJEmSijLhlCRJUlEmnNpnEZERcWXd8icj4pIB2vZXI+KMgdjWHvbz3ohYHhFLSu+r2t8lEfHJZuxLUnuy79yr/dl3DlImnBoIm4G/iIiDWx1IvWrUhEbNAT6SmScWiCMiwn9rknZm37n7OOw7hxC/SA2ErcA1wN/svGLnv7IjYkP18y0RcUdE3BIRqyLisog4MyLuiYifR8Sr6jbz9ohYFhG/iIh3V+07IuKKiLg3Ih6MiL+q2+6PI2Ix8Egv8cyutv9QRFxelV1M7WXTiyLiip3qfykiTqvmvxURX6nmPxwR86v5C6rtPRQRH6/KpkXEioj4GrURJKZExLzqMywFXlO3j7+OiEeqz3FjP4+9pMHLvtO+c9joz18x0u58CXgwIv6+H23+GJgOPENtqKx/zMzjIxZwOi4AAALHSURBVOJ8YC7w8areNGrjzL4KWBIRrwY+SG1YtD+JiJHA/42I26r6xwHHZObj9TuLiMOAy4HXAV3AbRExKzMvjYi3Ap/MzGU7xfhj4I3URoKYBHQPR/ZG4MaIeB3wIeD11EbRuDsi7qi2fxRwdmbeVdV7P3AstX939wP3Vdu6CDgiMzdHxNh+HD9Jg599p33nsOAZTg2IzHwO+Brw1/1odm9mrsnMzdSG8eru9H5OraPsdlNmbs/Mx6h1rn9IbSzjD0bEA8Dd1IZSO6qqf8/OHWblT4B/z8z1mbkVuB540x5i/DHwxog4mtpf/U9HbQzcNwA/ofbX/bcyc2NmbgD+hVqHCvBkZt5Vzb+xqvdCdazqx6d+ELg+Iv4ztTMekoYJ+077zuHChFMD6X9Ru59ndF3ZVqrfs6jdi/OyunWb6+a31y1vp+fZ953HX01qfxHPzcxjq+mIzOzudDfu06eo31HmamAsMBO4k1on+j5gQ2Y+v4fmjcbxLmpnOY4D7o3+3T8lafCz7+zJvnMIMuHUgMnMZ4CbqHWc3Z6gdhkG4DSgcy82/d6IGFHdm3QksAL4AXBeRHQCRMQfRMTo3W0EuAd4c0QcHBEdwGzgjgb2fxe1S1TdneYnq59UP2dFxMur/b+nbl29O6t6+0fEAcCpVdwjgCmZuQT4FHAQMKaBmCQNEfad9p3DgX8NaKBdCXysbvla4JaI+BlwK3v3F/T/o9bhHQicm5mbIuIfqV06uj8iAlgPzNrdRjJzTURcBCyh9lf+9zLzlgb2/2PgnZm5MiKeBMZXZWTm/RHx1So+qN1L9dOImLbTvu+PiG8CPwPWAfdWqzqA/xMRB1
UxfTEzf9dATJKGFvtO+84hLTJ3PuMuSZIkDRwvqUuSJKkoE05JkiQVZcIpSZKkokw4JUmSVJQJpyRJkooy4ZQkSVJRJpySJEkq6v8D96IYae1r2+MAAAAASUVORK5CYII=\n"
          },
          "metadata": {
            "needs_background": "light"
          }
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import collections\n",
        "counters = collections.Counter(captions)\n",
        "\n",
        "df = pd.DataFrame.from_records(counters.most_common(), columns=['caption','count'])\n",
        "df.head()"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 206
        },
        "id": "quPfpjrsxMH3",
        "outputId": "1f80b41b-ec0d-4028-8368-ed3c04bbd719"
      },
      "execution_count": 9,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "                                             caption  count\n",
              "0  No acute disease. The heart is normal in size....     42\n",
              "1  No active disease. The heart and lungs have XX...     37\n",
              "2  Normal chest Heart size normal. Lungs are clea...     30\n",
              "3  No active disease. Both lungs are clear and ex...     26\n",
              "4  No acute cardiopulmonary abnormality.. The lun...     25"
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-b7772fd9-1b26-4ab9-bc21-87cc3e0a8c0a\">\n",
              "    <div class=\"colab-df-container\">\n",
              "      <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>caption</th>\n",
              "      <th>count</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>No acute disease. The heart is normal in size....</td>\n",
              "      <td>42</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>No active disease. The heart and lungs have XX...</td>\n",
              "      <td>37</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>Normal chest Heart size normal. Lungs are clea...</td>\n",
              "      <td>30</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>No active disease. Both lungs are clear and ex...</td>\n",
              "      <td>26</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>No acute cardiopulmonary abnormality.. The lun...</td>\n",
              "      <td>25</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-b7772fd9-1b26-4ab9-bc21-87cc3e0a8c0a')\"\n",
              "              title=\"Convert this dataframe to an interactive table.\"\n",
              "              style=\"display:none;\">\n",
              "        \n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "       width=\"24px\">\n",
              "    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
              "    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
              "  </svg>\n",
              "      </button>\n",
              "      \n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      flex-wrap:wrap;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "      <script>\n",
              "        const buttonEl =\n",
              "          document.querySelector('#df-b7772fd9-1b26-4ab9-bc21-87cc3e0a8c0a button.colab-df-convert');\n",
              "        buttonEl.style.display =\n",
              "          google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "        async function convertToInteractive(key) {\n",
              "          const element = document.querySelector('#df-b7772fd9-1b26-4ab9-bc21-87cc3e0a8c0a');\n",
              "          const dataTable =\n",
              "            await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                     [key], {});\n",
              "          if (!dataTable) return;\n",
              "\n",
              "          const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "            + ' to learn more about interactive tables.';\n",
              "          element.innerHTML = '';\n",
              "          dataTable['output_type'] = 'display_data';\n",
              "          await google.colab.output.renderOutput(dataTable, element);\n",
              "          const docLink = document.createElement('div');\n",
              "          docLink.innerHTML = docLinkHtml;\n",
              "          element.appendChild(docLink);\n",
              "        }\n",
              "      </script>\n",
              "    </div>\n",
              "  </div>\n",
              "  "
            ]
          },
          "metadata": {},
          "execution_count": 9
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "freqs_list = {}\n",
        "for counts in df['count'].to_list():\n",
        "  if counts in freqs_list:\n",
        "    freqs_list[counts] += 1\n",
        "  else:\n",
        "    freqs_list[counts] = 1\n",
        "from pprint import pprint\n",
        "pprint(freqs_list)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "b785abzk38q9",
        "outputId": "9283973b-9c6b-4848-c890-ae121f5fa40f"
      },
      "execution_count": 10,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "{1: 2415,\n",
            " 2: 47,\n",
            " 3: 31,\n",
            " 4: 10,\n",
            " 5: 18,\n",
            " 6: 4,\n",
            " 7: 6,\n",
            " 8: 9,\n",
            " 9: 3,\n",
            " 10: 4,\n",
            " 11: 2,\n",
            " 12: 1,\n",
            " 13: 1,\n",
            " 16: 2,\n",
            " 19: 1,\n",
            " 25: 1,\n",
            " 26: 1,\n",
            " 30: 1,\n",
            " 37: 1,\n",
            " 42: 1}\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "print(f'About {round(round(freqs_list[1]/len(captions)*100,2))}% of the captions are unique.')"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "njcijvUQ4gHn",
        "outputId": "c41a9dba-f82e-4019-9a27-21619500e453"
      },
      "execution_count": 11,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "About 76% of the captions are unique.\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import tensorflow\n",
        "import keras\n",
        "from keras.preprocessing.text import Tokenizer\n",
        "from collections import Counter\n",
        "tokenizer = Tokenizer()\n",
        "tokenizer.fit_on_texts(captions)\n",
        "vocab_size = len(tokenizer.word_index) + 1\n",
        "print('Vocabulary Size: %d' % vocab_size)\n",
        "ordered = Counter(tokenizer.word_counts)\n",
        "# top 10\n",
        "print('Top10 frequent')\n",
        "ordered.most_common(10)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "rW6fTzDK5JHJ",
        "outputId": "814f77bc-93d3-42f8-cde8-1268d33279a3"
      },
      "execution_count": 12,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Vocabulary Size: 1874\n",
            "Top10 frequent\n"
          ]
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "[('no', 6235),\n",
              " ('the', 5639),\n",
              " ('are', 3819),\n",
              " ('is', 3770),\n",
              " ('normal', 3372),\n",
              " ('xxxx', 2820),\n",
              " ('of', 2737),\n",
              " ('and', 2736),\n",
              " ('acute', 2511),\n",
              " ('pleural', 2330)]"
            ]
          },
          "metadata": {},
          "execution_count": 12
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from nltk.corpus import stopwords # Import the stop word list\n",
        "from nltk.tokenize import wordpunct_tokenize\n",
        "import nltk\n",
        "nltk.download('stopwords')\n",
        "stop_words = set(stopwords.words('english')) \n",
        "dictionary_wo_stopwords = {k:v for k,v in tokenizer.word_counts.items() if k not in stop_words}\n",
        "\n",
        "ordered_wo = Counter(dictionary_wo_stopwords)\n",
        "# top 10\n",
        "print('Most frequent words w/o stopwords')\n",
        "ordered_wo.most_common(10)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "qRVSyiXT5TpG",
        "outputId": "9f7d2bb9-41f3-445e-88cb-4090439f4d06"
      },
      "execution_count": 13,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Most frequent words w/o stopwords\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
            "[nltk_data]   Unzipping corpora/stopwords.zip.\n"
          ]
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "[('normal', 3372),\n",
              " ('xxxx', 2820),\n",
              " ('acute', 2511),\n",
              " ('pleural', 2330),\n",
              " ('pneumothorax', 2102),\n",
              " ('effusion', 2049),\n",
              " ('heart', 2028),\n",
              " ('lungs', 1983),\n",
              " ('size', 1854),\n",
              " ('clear', 1585)]"
            ]
          },
          "metadata": {},
          "execution_count": 13
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "print(f'There are {sum(np.array(list(ordered.values()))==1)} words with only 1 occurence')"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "LzdwEkDI7cG_",
        "outputId": "c7d1aa3d-4040-4a49-b051-3541e8279543"
      },
      "execution_count": 14,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "There are 529 words with only 1 occurence\n"
          ]
        }
      ]
    }
  ]
}