986 lines (986 with data), 59.5 kB
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"collapsed_sections": [],
"authorship_tag": "ABX9TyPaxWDmXP+eW1nbC2gcFIE5",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/github/zaaachos/Thesis-Diagnostic-Captioning/blob/main/notebooks/IU_Xray_EDA.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"source": [
"from google.colab import drive\n",
"drive.mount('/content/drive')"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "rKTWUMPVugDP",
"outputId": "46d534c9-e365-44d6-a0c1-cf8bd6552913"
},
"execution_count": 1,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Mounted at /content/drive\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"%cd drive/MyDrive"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Gzl3ALvDujPH",
"outputId": "b60a4dd4-60ef-4d01-a4c7-cb1e65f53b9c"
},
"execution_count": 2,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"/content/drive/MyDrive\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"!ls"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ywbUILg1uork",
"outputId": "af2d5233-e968-407e-a3e9-739125551886"
},
"execution_count": 3,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
" 46bff9d5-95d4-4362-be98-ef59819ec3af_ImageCLEFmedCaption_2022_concept_detection_valid.csv\n",
" 72d678c1-c535-491d-bc42-c2ba11f47165_concepts.csv\n",
" b47c4f80-9432-408c-b69a-956a3382a0da_ImageCLEFmedCaption_2022_concept_detection_train.csv\n",
" c856ae07-029b-449e-bd06-99c04d3ad1e0_ImageCLEFmedCaption_2022_caption_prediction_train.csv\n",
" cc3d9c72-6c2b-4bd3-9d10-4e133031be48_ImageCLEFmedCaption_2022_caption_prediction_valid.csv\n",
"'Colab Notebooks'\n",
" iu_xray.csv\n",
" two_captions.json\n",
" VID20220808185241.mp4\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import pandas as pd\n",
"import json\n",
"\n",
"all_data = 'iu_xray.csv'\n",
"two_patients = 'two_captions.json'\n",
"\n",
"all_df = pd.read_csv(all_data, sep='\\t', names=['ID', 'caption'])\n",
"all_df.head()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 206
},
"id": "PBQ-fq9tvB50",
"outputId": "d0392fc8-cca8-4de6-e079-d1dde3ef087a"
},
"execution_count": 4,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" ID caption\n",
"0 CXR1_1_IM-0001-3001.png Normal chest x-XXXX. The cardiac silhouette an...\n",
"1 CXR1_1_IM-0001-4001.png Normal chest x-XXXX. The cardiac silhouette an...\n",
"2 CXR10_IM-0002-1001.png No acute cardiopulmonary process. The cardiome...\n",
"3 CXR10_IM-0002-2001.png No acute cardiopulmonary process. The cardiome...\n",
"4 CXR100_IM-0002-1001.png No active disease. Both lungs are clear and ex..."
],
"text/html": [
"\n",
" <div id=\"df-b80fed9e-daf4-486b-9217-301b2b4144d9\">\n",
" <div class=\"colab-df-container\">\n",
" <div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ID</th>\n",
" <th>caption</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>CXR1_1_IM-0001-3001.png</td>\n",
" <td>Normal chest x-XXXX. The cardiac silhouette an...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>CXR1_1_IM-0001-4001.png</td>\n",
" <td>Normal chest x-XXXX. The cardiac silhouette an...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>CXR10_IM-0002-1001.png</td>\n",
" <td>No acute cardiopulmonary process. The cardiome...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>CXR10_IM-0002-2001.png</td>\n",
" <td>No acute cardiopulmonary process. The cardiome...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>CXR100_IM-0002-1001.png</td>\n",
" <td>No active disease. Both lungs are clear and ex...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>\n",
" <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-b80fed9e-daf4-486b-9217-301b2b4144d9')\"\n",
" title=\"Convert this dataframe to an interactive table.\"\n",
" style=\"display:none;\">\n",
" \n",
" <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
" width=\"24px\">\n",
" <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
" <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
" </svg>\n",
" </button>\n",
" \n",
" <style>\n",
" .colab-df-container {\n",
" display:flex;\n",
" flex-wrap:wrap;\n",
" gap: 12px;\n",
" }\n",
"\n",
" .colab-df-convert {\n",
" background-color: #E8F0FE;\n",
" border: none;\n",
" border-radius: 50%;\n",
" cursor: pointer;\n",
" display: none;\n",
" fill: #1967D2;\n",
" height: 32px;\n",
" padding: 0 0 0 0;\n",
" width: 32px;\n",
" }\n",
"\n",
" .colab-df-convert:hover {\n",
" background-color: #E2EBFA;\n",
" box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
" fill: #174EA6;\n",
" }\n",
"\n",
" [theme=dark] .colab-df-convert {\n",
" background-color: #3B4455;\n",
" fill: #D2E3FC;\n",
" }\n",
"\n",
" [theme=dark] .colab-df-convert:hover {\n",
" background-color: #434B5C;\n",
" box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
" filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
" fill: #FFFFFF;\n",
" }\n",
" </style>\n",
"\n",
" <script>\n",
" const buttonEl =\n",
" document.querySelector('#df-b80fed9e-daf4-486b-9217-301b2b4144d9 button.colab-df-convert');\n",
" buttonEl.style.display =\n",
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
"\n",
" async function convertToInteractive(key) {\n",
" const element = document.querySelector('#df-b80fed9e-daf4-486b-9217-301b2b4144d9');\n",
" const dataTable =\n",
" await google.colab.kernel.invokeFunction('convertToInteractive',\n",
" [key], {});\n",
" if (!dataTable) return;\n",
"\n",
" const docLinkHtml = 'Like what you see? Visit the ' +\n",
" '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
" + ' to learn more about interactive tables.';\n",
" element.innerHTML = '';\n",
" dataTable['output_type'] = 'display_data';\n",
" await google.colab.output.renderOutput(dataTable, element);\n",
" const docLink = document.createElement('div');\n",
" docLink.innerHTML = docLinkHtml;\n",
" element.appendChild(docLink);\n",
" }\n",
" </script>\n",
" </div>\n",
" </div>\n",
" "
]
},
"metadata": {},
"execution_count": 4
}
]
},
{
"cell_type": "code",
"source": [
"all_df.describe()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 175
},
"id": "5BRxtsirvXt_",
"outputId": "466c816e-4160-4839-cba8-3fdf4ad424e3"
},
"execution_count": 5,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" ID \\\n",
"count 7430 \n",
"unique 7430 \n",
"top CXR1_1_IM-0001-3001.png \n",
"freq 1 \n",
"\n",
" caption \n",
"count 7430 \n",
"unique 3066 \n",
"top No acute disease. The heart is normal in size.... \n",
"freq 96 "
],
"text/html": [
"\n",
" <div id=\"df-f14d8968-2f34-426c-98f3-5309812599a9\">\n",
" <div class=\"colab-df-container\">\n",
" <div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ID</th>\n",
" <th>caption</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>7430</td>\n",
" <td>7430</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unique</th>\n",
" <td>7430</td>\n",
" <td>3066</td>\n",
" </tr>\n",
" <tr>\n",
" <th>top</th>\n",
" <td>CXR1_1_IM-0001-3001.png</td>\n",
" <td>No acute disease. The heart is normal in size....</td>\n",
" </tr>\n",
" <tr>\n",
" <th>freq</th>\n",
" <td>1</td>\n",
" <td>96</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>\n",
" <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-f14d8968-2f34-426c-98f3-5309812599a9')\"\n",
" title=\"Convert this dataframe to an interactive table.\"\n",
" style=\"display:none;\">\n",
" \n",
" <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
" width=\"24px\">\n",
" <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
" <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
" </svg>\n",
" </button>\n",
" \n",
" <style>\n",
" .colab-df-container {\n",
" display:flex;\n",
" flex-wrap:wrap;\n",
" gap: 12px;\n",
" }\n",
"\n",
" .colab-df-convert {\n",
" background-color: #E8F0FE;\n",
" border: none;\n",
" border-radius: 50%;\n",
" cursor: pointer;\n",
" display: none;\n",
" fill: #1967D2;\n",
" height: 32px;\n",
" padding: 0 0 0 0;\n",
" width: 32px;\n",
" }\n",
"\n",
" .colab-df-convert:hover {\n",
" background-color: #E2EBFA;\n",
" box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
" fill: #174EA6;\n",
" }\n",
"\n",
" [theme=dark] .colab-df-convert {\n",
" background-color: #3B4455;\n",
" fill: #D2E3FC;\n",
" }\n",
"\n",
" [theme=dark] .colab-df-convert:hover {\n",
" background-color: #434B5C;\n",
" box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
" filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
" fill: #FFFFFF;\n",
" }\n",
" </style>\n",
"\n",
" <script>\n",
" const buttonEl =\n",
" document.querySelector('#df-f14d8968-2f34-426c-98f3-5309812599a9 button.colab-df-convert');\n",
" buttonEl.style.display =\n",
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
"\n",
" async function convertToInteractive(key) {\n",
" const element = document.querySelector('#df-f14d8968-2f34-426c-98f3-5309812599a9');\n",
" const dataTable =\n",
" await google.colab.kernel.invokeFunction('convertToInteractive',\n",
" [key], {});\n",
" if (!dataTable) return;\n",
"\n",
" const docLinkHtml = 'Like what you see? Visit the ' +\n",
" '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
" + ' to learn more about interactive tables.';\n",
" element.innerHTML = '';\n",
" dataTable['output_type'] = 'display_data';\n",
" await google.colab.output.renderOutput(dataTable, element);\n",
" const docLink = document.createElement('div');\n",
" docLink.innerHTML = docLinkHtml;\n",
" element.appendChild(docLink);\n",
" }\n",
" </script>\n",
" </div>\n",
" </div>\n",
" "
]
},
"metadata": {},
"execution_count": 5
}
]
},
{
"cell_type": "code",
"source": [
"import matplotlib.pyplot as plt\n",
"patient_images = {}\n",
"for visit in all_df.ID.to_list():\n",
" patient = visit[3:].split(\"_\")[0]\n",
" if patient in patient_images:\n",
" patient_images[patient].append(visit)\n",
" else:\n",
" patient_images[patient] = [visit]\n",
"\n",
"\n",
"iuxray_ids_img1 = [patient_images[patient][0] for patient in patient_images if len(patient_images[patient])==1]\n",
"iuxray_ids_img2 = [patient_images[patient][0] for patient in patient_images if len(patient_images[patient])==2]\n",
"iuxray_ids_img3 = [patient_images[patient][0] for patient in patient_images if len(patient_images[patient])==3]\n",
"iuxray_ids_img4 = [patient_images[patient][0] for patient in patient_images if len(patient_images[patient])==4]\n",
"iuxray_ids_img5 = [patient_images[patient][0] for patient in patient_images if len(patient_images[patient])==5]\n",
"samples = [len(iuxray_ids_img1), len(iuxray_ids_img2), len(iuxray_ids_img3), len(iuxray_ids_img4), len(iuxray_ids_img5)]\n",
"number = [1, 2, 3, 4, 5]\n",
"\n",
"plt.xlabel('Number of images per patient')\n",
"plt.ylabel('Number of patients')\n",
"plt.bar(number, samples)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 296
},
"id": "nGsgpDYkC3a8",
"outputId": "2520f321-62cf-4c7b-a1fb-7913d3639592"
},
"execution_count": 6,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"<BarContainer object of 5 artists>"
]
},
"metadata": {},
"execution_count": 6
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
],
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYsAAAEGCAYAAACUzrmNAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAar0lEQVR4nO3de7RdZXnv8e+PcBUTQkzMCElwR4x6og4D3eVSqEUoIYASpGBJFQJSg6fggaot0cpNQHFYRKmCBskBKpITQSVANKTIpTiAZIdLrqRsMUpyItmWW9DCaeA5f8x34WSz95ozmz3XWpv1+4wxx5rznbdnbhjryXzfd72vIgIzM7N6tmt2AGZm1vqcLMzMrJCThZmZFXKyMDOzQk4WZmZWaPtmB1CF0aNHR0dHR7PDMDMbUpYvX/67iBjT1743ZLLo6Oigq6ur2WGYmQ0pkn7d3z5XQ5mZWSEnCzMzK+RkYWZmhZwszMyskJOFmZkVcrIwM7NCThZmZlbIycLMzAo5WZiZWaE35C+4beA65tzW7BAGzfpLjmp2CGZvGH6zMDOzQk4WZmZWyMnCzMwKOVmYmVkhJwszMyvkZGFmZoWcLMzMrJCThZmZFaosWUjaWdJSSY9IWi3pglQ+SdIDkrol/R9JO6byndJ2d9rfkbvW51P5OkmHVxWzmZn1rco3ixeBQyLi/cBUYLqk/YGvApdFxDuAp4FT0/GnAk+n8svScUiaApwAvAeYDlwhaViFcZuZWS+VJYvIPJ82d0hLAIcAN6bya4Fj0vqMtE3af6gkpfL5EfFiRPwK6Ab2rSpuMzN7rUrbLCQNk/QwsBlYAvwSeCYitqZDNgDj0/p44AmAtP9Z4C358j7Oyd9rtqQuSV09PT1VPI6ZWduqNFlExEsRMRWYQPY28O4K7zU3IjojonPMmDFV3cbMrC01pDdURDwD3AkcAIyUVBvtdgKwMa1vBCYCpP27Af+ZL+/jHDMza4Aqe0ONkTQyre8CHAasJUsax6XDZgE3p/WFaZu0/+cREan8hNRbahIwGVhaVdxmZvZaVc5nMQ64NvVc2g5YEBG3SloDzJd0EfAQcHU6/mrgXyV1A0+R9YAiIlZLWgCsAbYCp0fESxXGbWZmvVSWLCJiBbB3H+WP00dvpoh4ATi+n2tdDFw82DGamVk5/gW3mZkVcrIwM7NCThZmZlbIycLMzAo5WZiZWSEnCzMzK+RkYWZmhZwszMyskJOFmZkVcrIwM7NCThZmZlbIycLMzAo5WZiZWSEnCzMzK+RkYWZmhZwszMyskJOFmZkVcrIwM7NCThZmZlbIycLMzAo5WZiZWSEnCzMzK+RkYWZmhZwszMysUGXJQtJESXdKWiNptaQzU/n5kjZKejgtR+bO+bykbknrJB2eK5+eyrolzakqZjMz69v2FV57K/DZiHhQ0nBguaQlad9lEfHP+YMlTQFOAN4D7AH8m6R3pt3fBg4DNgDLJC2MiDUVxm5mZjmVJYuI2ARsSutbJK0Fxtc5ZQYwPyJeBH4lqRvYN+3rjojHASTNT8c6WZiZNUhD2iwkdQB7Aw+kojMkrZA0T9LuqWw88ETutA2prL/y3veYLalLUldPT88gP4GZWXurPFlIejNwE3BWRDwHXAnsBUwle/O4dDDuExFzI6IzIjrHjBkzGJc0M7OkyjYLJO1Aliiuj4gfAUTEk7n9VwG3ps2NwMTc6RNSGXXKzcysAarsDSXgamBtRHw9Vz4ud9hHgFVpfSFwgqSdJE0CJgNLgWXAZEmTJO1I1gi+sKq4zczstap8szgQOBFYKenhVPYFYKakqUAA64HTACJitaQFZA3XW4HTI+IlAElnAIuBYcC8iFhdYdxmZtZLlb2h7gXUx65Fdc65GLi4j/JF9c4zM7NqbVM1lKTtJI2oKhgzM2tNhclC0g8kjZC0K1n7whpJ/1B9aGZm1irKvFlMSV1ejwF+Ckwia4swM7M2USZZ7JC6wB4DLIyI/644JjMzazFlksV3yXot7QrcI+ltwLNVBmVmZq2lTLK4JSLGR8SRERHAb4BPVByXmZm1kDLJ4qb8RkoY86sJx8zMWlG/v7OQ9G6y4cJ3k3RsbtcIYOeqAzMzs9ZR70d57wI+BIwEPpwr3wJ8ssqgzMystfSbLCLiZuBmSQdExH0NjMnMzFpMmeE+uiV9AejIHx8RbuQ2M2sTZZLFzcC/A/8GvFRtOGZm1orKJIs3RcTZlUdiZmYtq0zX2VslHVl5JGZm1rLKJIszyRLGC5Kek7RF0nNVB2ZmZq2jsBoqIoY3IhAzM2tdZYYol6SPSzonbU+UtG/1oZmZWasoUw11BXAA8Ddp+3ng25VFZGZmLadMb6j9ImIfSQ8BRMTTknasOC4zM2shZd4s/lvSMCAAJI0BXq40KjMzayllksXlwI+Bt0q6GLgX+HKlUZmZWUsp0xvqeknLgUMBAcdExNrKIzMzs5ZRb4jyERHxnKRRwGbghty+URHxVCMCNDOz5qtXDfWD9Lkc6Motte26UhfbOyWtkbRa0pmpfJSkJZIeS5+7p3JJulxSt6QVkvbJXWtWOv4xSbMG+KxmZjZA9YYo/1D6nDTAa28FPhsRD0oaDiyXtAQ4GbgjIi6RNAeYA5wNHAFMTst+wJXAfunN5jygk6yRfbmkhRHx9ADjMjOzbVTmR3l3lCnrLSI2RcSDaX0LsBYYD8wArk2HXQsck9ZnANdF5n5gpKRxwOHAkoh4KiWIJcD0wiczM7NBU6/NYmfgTcDoVFWktGsE2Zd+aZI6gL2BB4CxEbEp7fotMDatjweeyJ22IZX1V977HrOB2QB77rnntoRnZmYF6vWGOg04C9iDrJ2iliyeA75V9gaS3gzcBJyVGsxf2RcRISm2Nei+RMRcYC5AZ2fnoFzTzMwy/VZDRcQ3U3vF5yLi7RExKS3vj4hSyULSDmSJ4vqI+FEqfjJVL5E+N6fyjcDE3OkTUll/5WZm1iCFbRYR8S+S3ivpo5JOqi1F5yl7hbgaWBsRX8/tWgjUejTNIpuJr1Z+UuoVtT/wbKquWgxMk7R7qg6blsrMzKxBCn+UJ+k84GBgCrCIrNfSvcB1BaceCJwIrJT0cCr7AnAJsEDSqcCvgY+mfYuAI4Fu4A/AKQAR8ZSkC4Fl6bgv+TceZmaNVWYgweOA9wMPRcQpksYC3y86KSLu5Y/tHL0d2sfxAZzez7XmAfNKxGpmZhUoMzbUf0XEy8BWSSPI2hgmFpxjZmZvIGXeLLokjQSuIusV9TxwX6VRmZlZSykzkODfpdXvSPoZMCIiVlQblpmZtZIybxZIOhY4iGy4jXsBJwszszZSZriPK4BPASuBVcBpkjytqplZGynzZnEI8D9SbyUkXQusrjQqMzNrKWV6Q3UD+cGWJqYyMzNrE2XeLIYDayUtJWuz2Jesh9RCgIg4usL4zMysBZRJFudWHoWZmbW0Ml1n725EIGZm1rrKtFmYmVmbc7IwM7NC/SaL2tSpkr7auHDMzKwV1WuzGCfpz4CjJc2n1wiytfm1zczsja9esjgXOIdsZrqv99oXZD/WMzOzNtBvsoiIG4EbJZ0TERc2MCYzM2sxZbrOXijpaOADqeiuiLi12rDMzKyVlBlI8CvAmcCatJwp6ctVB2ZmZq2jzC+4jwKmptnyagMJPkQ2n7aZmbWBsr+zGJlb362KQMzMrHWVebP4CvCQpDvJus9+AJhTaVRmZtZSyjRw3yDpLuBPU9HZEfHbSqMyM7OWUmpa1YjYBCysOBYzM2tRlY0NJWmepM2SVuXKzpe0UdLDaTkyt+/zkrolrZN0eK58eirrluTqLzOzJqhyIMFrgOl9lF8WEVPTsghA0hTgBOA96ZwrJA2TNAz4NnAEMAWYmY41M7MGqpss0hf2owO5cETcAzxV8vAZwPyIeDEifkU2beu+aemOiMcj4v8B89OxZmbWQHWTRUS8BKyTtGe947bRGZJWpGqq3VPZeOCJ3DEbUll/5WZm1kBlqqF2B1ZLukPSwtoywPtdCewFTAU2AZcO8DqvIWm2pC5JXT09PYN1WTMzo1xvqHMG62YR8WRtXdJVQG2MqY3AxNyhE1IZdcp7X3suMBegs7MzBilkMzOjxJtFmoN7PbBDWl8GDGguC0njcpsfAWo9pRYCJ0jaSdIkYDKwNN1rsqRJknYkawR3F14zswYrfLOQ9ElgNjCKrAppPPAd4NCC824ADgZGS9oAnAccLGkq2XwY64HTACJitaQFZAMVbgVOT+0lSDoDWAwMA+ZFxOptfkozM3tdylRDnU7WK+kBgIh4TNJbi06KiJl9FF9d5/iLgYv7KF8ELCoRp5mZVaRMA/eLqdsqAJK2J3szMDOzNlEmWdwt6QvALpIOA34I3FJtWGZm1krKJIs5QA+wkqyNYRHwxSqDMjOz1lJm1NmX04RHD5BVP62LCFdDmZm1kTK9oY4i6/30S7L5LCZJOi0iflp1cGZm1hrK9Ia6FPhgRHQDSNoLuA1wsjAzaxNl2iy21BJF8jiwpaJ4zMysBfX7ZiHp2LTaJWkRsICszeJ4sl9Wm5lZm6hXDfXh3PqTwF+k9R5gl8oiMjOzltNvsoiIUxoZiJmZta4yvaEmAZ8GOvLHR8TR1YVlZmatpExvqJ+Qjel0C/ByteGYmVkrKpMsXoiIyyuPxMzMWlaZZPFNSecBtwMv1gojYkBzWpiZ2dBTJlm8DzgROIQ/VkNF2jYzszZQJlkcD7w9P0y5mZm1lzK/4F4FjKw6EDMza11l3ixGAo9KWsar2yzcddbMrE2USRbnVR6FmZm1tDLzWdzdiEDMzKx1lfkF9xb+OOf2jsAOwO8jYkSVgZmZWeso82YxvLYuScAMYP8qgzIzs9ZSpjfUKyLzE+DwiuIxM7MWVKYa6tjc5nZAJ/BCZRGZmVnLKfNm8eHccjjZLHkzik6SNE/SZkmrcmWjJC2R9Fj63D2VS9LlkrolrZC0T+6cWen4xyTN2tYHNDOz169Mm8VA57W4BvgWcF2ubA5wR0RcImlO2j4bOAKYnJb9gCuB/SSNIuu620nWyL5c0sKIeHqAMZmZ2QDUm1b13DrnRURcWO/CEXGPpI5exTOAg9P6tcBdZMliBnBdRARwv6SRksalY5dExFMppiXAdOCGevc2M7PBVa8a6vd9LACnkn3BD8TYiNiU1n8LjE3r44EncsdtSGX9lb+GpNmSuiR19fT0DDA8MzPrS71pVS+trUsaDpwJnALMBy7t77yyIiIkRfGRpa83F5gL0NnZOWjXNTOzggbu1CB9EbCCLLHsExFnR8TmAd7vyVS9RPqsXWcjMDF33IRU1l+5mZk1UL/JQtLXgGVkvZ/eFxHnD0LD8kKg1qNpFnBzrvyk1Ctqf+DZVF21GJgmaffUc2paKjMzswaq1xvqs2SjzH4R+Kfsx9sAiKwWqe5wH5JuIGugHi1pA1mvpkuABZJOBX4NfDQdvgg4EugG/kBW3UVEPCXpQrKkBfClWmO3mZk1Tr02i236dXcf58/sZ9ehfRwbwOn9XGceMO/1xGJmZq/P60oIZmbWHpwszMyskJOFmZkVcrIwM7NCThZmZlbIycLMzAo5WZiZWSEnCzMzK+RkYWZmhZwszMyskJOFmZkVcrIwM7NCThZmZlbIycLMzAo5WZiZWSEnCzMzK+RkYWZmhZwszMyskJOFmZkVcrIwM7NCThZmZlbIycLMzAo5WZiZWaGmJAtJ6yWtlPSwpK5UNkrSEkmPpc/dU7kkXS6pW9IKSfs0I2Yzs3bWzDeLD0bE1IjoTNtzgDsiYjJwR9oGOAKYnJbZwJUNj9TMrM21UjXUDODatH4tcEyu/LrI3A+MlDSuGQGambWrZiWLAG6XtFzS7FQ2NiI2pfXfAmPT+njgidy5G1LZq0iaLalLUldPT09VcZuZtaXtm3TfgyJio6S3AkskPZrfGREhKbblghExF5gL0NnZuU3nmplZfU15s4iIjelzM/BjYF/gyVr1UvrcnA7fCEzMnT4hlZmZWYM0PFlI2lXS8No6MA1YBSwEZqXDZgE3p/WFwEmpV9T+wLO56iozM2uAZlRDjQV+LKl2/x9ExM8kLQMWSDoV+DXw0XT8IuBIoBv4A3BK40M2M2tvDU8WEfE48P4+yv8TOLSP8gBOb0Bor+iYc1sjb1eZ9Zcc1ewQzOwNopW6zpqZWYtysjAzs0JOFmZmVsjJwszMCjlZmJlZIScLMzMr5GRhZmaFnCzMzKyQk4WZmRVysjAzs0JOFmZmVsjJwszMCjlZmJlZIScLMzMr5GRhZmaFmjUHt1nLeaPMYwKey8QGn98szMyskJOFmZkVcrIwM7NCThZmZlbIycLMzAo5WZiZWSEnCzMzK+RkYWZmhYZMspA0XdI6Sd2S5jQ7HjOzdjIkkoWkYcC3gSOAKcBMSVOaG5WZWfsYKsN97At0R8TjAJLmAzOANU2NyuwNxMOdWD2KiGbHUEjSccD0iPjbtH0isF9EnJE7ZjYwO22+C1jX8EC3zWjgd80Ookna+dmhvZ+/nZ8dWv/53xYRY/raMVTeLApFxFxgbrPjKEtSV0R0NjuOZmjnZ4f2fv52fnYY2s8/JNosgI3AxNz2hFRmZmYNMFSSxTJgsqRJknYETgAWNjkmM7O2MSSqoSJiq6QzgMXAMGBeRKxucliv15CpMqtAOz87tPfzt/OzwxB+/iHRwG1mZs01VKqhzMysiZwszMyskJNFg0maJ2mzpFXNjqXRJE2UdKekNZJWSzqz2TE1iqSdJS2V9Eh69guaHVMzSBom6SFJtzY7lkaStF7SSkkPS+pqdjwD4TaLBpP0AeB54LqIeG+z42kkSeOAcRHxoKThwHLgmIh4w/8SX5KAXSPieUk7APcCZ0bE/U0OraEkfQboBEZExIeaHU+jSFoPdEZEK/8gry6/WTRYRNwDPNXsOJohIjZFxINpfQuwFhjf3KgaIzLPp80d0tJW/1KTNAE4Cvhes2OxbedkYU0hqQPYG3iguZE0TqqCeRjYDCyJiLZ59uQbwD8CLzc7kCYI4HZJy9PQREOOk4U1nKQ3AzcBZ0XEc82Op1Ei4qWImEo2AsG+ktqmGlLSh4DNEbG82bE0yUERsQ/ZyNmnp+roIcXJwhoq1dffBFwfET9qdjzNEBHPAHcC05sdSwMdCByd6u7nA4dI+n5zQ2qciNiYPjcDPyYbSXtIcbKwhkmNvFcDayPi682Op5EkjZE0Mq3vAhwGPNrcqBonIj4fERMiooNsuJ6fR8THmxxWQ0jaNXXoQNKuwDRgyPWGdLJoMEk3APcB75K0QdKpzY6pgQ4ETiT7V+XDaTmy2UE1yDjgTkkryMY6WxIRbdV9tI2NBe6V9AiwFLgtIn7W5Ji2mbvOmplZIb9ZmJlZIScLMzMr5GRhZmaFnCzMzKyQk4WZmRVysrBBIykkXZrb/pyk8wfp2tdIOm4wrlVwn+MlrZV0Z6/yPSTdWPX9hypJJ0vaI7f9PUlTBnitgyX92eBFZ4PBycIG04vAsZJGNzuQPEnbMn3wqcAnI+KD+cKI+L8RUXmyaoZt/Pv052TglWQREX/7OkYTPhhwsmgxThY2mLaSzTH897139H4zkPR8+jxY0t2Sbpb0uKRLJH0szf2wUtJeucv8paQuSf+RxhqqDc73NUnLJK2QdFruuv8uaSHwmi8tSTPT9VdJ+moqOxc4CLha0td6Hd9Rm4Mk/Sv6J5KWpHkKzpD0mTRPw/2SRqXjPpniekTSTZLelMr3SsetlHRR7W+R9v1D7lkuSGW7SrotXWeVpL/u43nukvTN9EPHVZL2zZ07L/09H5I0I/cMCyX9HLijj2d9VNL16S3rxlzs56b4Vkmaq8xxZMOOX5/uv0uKpzOdM03SfZIelPRDZWOD1eZ4uCCVr5T0bmUDTH4K+Pt0rT/v/azWJBHhxcugLGTzdIwA1gO7AZ8Dzk/7rgGOyx+bPg8GniH7hfNOwEbggrTvTOAbufN/RvYPnMnABmBnYDbwxXTMTkAXMCld9/fApD7i3AP4DTAG2B74Odm8GgB3kc070PucDmBVWj8Z6AaGp2s8C3wq7buMbIBEgLfkzr8I+HRavxWYmdY/lftbTCNLtkrPeSvwAeCvgKty19qtj/juqh2TzqnF+mXg42l9JPAfwK7pGTYAo/p51gAOTNvzgM+l9VG54/4V+HBff7faNjAauIdsLg+As4Fz0/r63N/k74DvpfXza/fz0jqL3yxsUEU2iux1wP/ahtOWRTbXxYvAL4HbU/lKsi+umgUR8XJEPAY8Dryb7Av2JGVDfz8AvIUsmQAsjYhf9XG/PwXuioieiNgKXE/2Bbst7oyILRHRQ5Ysbukj5vemt5uVwMeA96TyA4AfpvUf5K45LS0PAQ+m55ucrnmYpK9K+vOIeLafmG6AV+ZMGaFsLKppwJz097mLLMHumY5fEhH9za3yRET8Iq1/n+yNC+CDkh5Iz3RI7pn6sz8wBfhFimEW8Lbc/tpgkst59X9razGDUVdp1ts3yL7s/neubCup2lPSdsCOuX0v5tZfzm2/zKv/H+09Nk2Q/Sv80xGxOL9D0sFkbxZVKRPzNWRvLI9IOpnsbaceAV+JiO++Zoe0D3AkcJGkOyLiS32c39/f568iYl2v6+1H/b/Pa64laWfgCrI3iCeUdV7Yud4DpfsviYiZ/eyv/d1ewt9HLc1vFjbo0r9WF5A1FtesB/4krR9NNlPctjpe0napHePtwDpgMfA/lQ19jqR3KhvZs56lwF9IGi1pGDATuHsA8RQZDmxKsX0sV34/WdUSZCOw1iwGPpGr0x8v6a3Kehn9ISK+D3wN2Kef+/11Ou8g4Nn0BrIY+LQkpX17l4x9T0kHpPW/IZsGtpYYfpdizDf4b0nP29v9wIGS3pHuv6ukdxbcu79rWRM5WVhVLiWrr665iuwL+hGyapiB/Kv/N2Rf9D8layN4gWyKzjXAg6kB+rsU/As1IjYBc8jmlHgEWB4RNw8gniLnkFWN/YJXD0d+FvAZZSPQvoOsGouIuJ2sWuq+VM1zI9mX5vuApaka5zyy9o++vCDpIeA7/DFRX0iWmFdIWp22y1hHNknPWmB34MrI5uG4imx47cVko+fWXAN8p9bAXStM1XQnAzek572PrHqtnluAj7iBu7V41FmzBks9i/4rIkLSCWSN3TNe5zXvImsU7hqE+DqAWyOibWbys2KuIzRrvD8BvpWqhp4BPtHkeMwK+c3CzMwKuc3CzMwKOVmYmVkhJwszMyvkZGFmZoWcLMzMrND/Bzss79YY0ZCDAAAAAElFTkSuQmCC\n"
},
"metadata": {
"needs_background": "light"
}
}
]
},
{
"cell_type": "code",
"source": [
"import numpy as np\n",
"\n",
"with open(two_patients) as f:\n",
" data = json.load(f)\n",
"captions = list(data.values())\n",
"words_per_sent = list()\n",
"for caption in captions:\n",
" tokenized_caption = caption.split()\n",
" words_per_sent.append(len(tokenized_caption))\n",
"\n",
"print(f'Minimum word-tokens in a caption: {min(words_per_sent)} (Occurences: {sum(np.array(words_per_sent)==min(words_per_sent))} times)')\n",
"print(f'Maximum word-tokens in a caption: {max(words_per_sent)} (Occurences: {sum(np.array(words_per_sent)==max(words_per_sent))} times)')\n",
"print('mean number of words in captions:', round(np.mean(words_per_sent),2))"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "PhdS3Sd7vwI1",
"outputId": "699e821b-4236-4095-e70a-71a7d439aa2d"
},
"execution_count": 7,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Minimum word-tokens in a caption: 3 (Occurences: 10 times)\n",
"Maximum word-tokens in a caption: 176 (Occurences: 1 times)\n",
"mean number of words in captions: 37.27\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import matplotlib.pyplot as plt\n",
"plt.figure(figsize=(30,15))\n",
"fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(11, 4))\n",
"\n",
"ax1.hist(words_per_sent, bins=50, align='left', edgecolor='black',\n",
" linewidth=0.5)\n",
" \n",
"# Add axis labels\n",
"ax1.set(xlabel='Number of words', ylabel='Images')\n",
"\n",
" \n",
"# plt.show()\n",
"ax2.set(xlabel='Number of words', ylabel='')\n",
"ax2.set_yticklabels(' ')\n",
"ax2.boxplot(words_per_sent, vert=False)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 435
},
"id": "DfT7ErR8yBTd",
"outputId": "f5eb1aa1-b6d5-4981-d4e6-5df156074a70"
},
"execution_count": 8,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'whiskers': [<matplotlib.lines.Line2D at 0x7f12f385d990>,\n",
" <matplotlib.lines.Line2D at 0x7f12f385ded0>],\n",
" 'caps': [<matplotlib.lines.Line2D at 0x7f12f3867450>,\n",
" <matplotlib.lines.Line2D at 0x7f12f38548d0>],\n",
" 'boxes': [<matplotlib.lines.Line2D at 0x7f12f385d3d0>],\n",
" 'medians': [<matplotlib.lines.Line2D at 0x7f12f3867950>],\n",
" 'fliers': [<matplotlib.lines.Line2D at 0x7f12f3867cd0>],\n",
" 'means': []}"
]
},
"metadata": {},
"execution_count": 8
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"<Figure size 2160x1080 with 0 Axes>"
]
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"<Figure size 792x288 with 2 Axes>"
],
"image/png": "iVBORw0KGgoAAAANSUhEUgAAApwAAAEGCAYAAADIXkUdAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAgAElEQVR4nO3de5RdVZ3g8e8vRZlgAiSRgCEPAkrboZnViGnE7vaBLwIKxB50jAzSmpGG0TQ2SotmLWSYjgPNgKNZGho6juggSNOtxEcjSqfBjItHQEQgRGKAkZiQKCWQYEIev/njngq3kqrkVlL73ltV389aZ9U5++x9zu+eW9n51XntyEwkSZKkUka0OgBJkiQNbSackiRJKsqEU5IkSUWZcEqSJKkoE05JkiQVtV+rA9gXBx98cE6bNq3VYUgaYu67777fZOaEVsdRin2npBJ213cO6oRz2rRpLFu2rNVhSBpiIuLJVsdQkn2npBJ213d6SV2SJElFmXBKkiSpKBNOSZIkFWXCKUmSpKJMOCVJklSUCackSZKKMuGUJElSUSackiRJKsqEU5IkSUUN6pGGBpPzLryYVWu7epQd+cpxLLzi0hZFJEmS1BwmnE2yam0XKybN7Fm4+tbWBCNJktREXlKXJElSUSackiRJKsqEU5IkSUWZcEqSJKkoE05JkiQVZcIpSZKkokw4JUmSVJQJpyRJkooqlnBGxKiIuCcifhYRD0fEf6vKvxoRj0fEA9V0bFUeEfHFiFgZEQ9GxHGlYpMkSVLzlBxpaDPw1szcEBGdwNKI+Ndq3YWZefNO9U8Gjqqm1wMLq5/DSm9DYILDYEqSpMGrWMKZmQlsqBY7qyl30+R04GtVu7siYmxETMzMNaVibEe9DoEJDoMpSZIGraL3cEZER0Q8AKwDfpiZd1er5leXzT8fESOrsknAr+qaP1WV7bzNcyJiWUQsW79+fcnwJUmSNACKJpyZuS0zjwUmA8dHxDHAp4E/BP4EGA98qp/bvCYzZ2TmjAkTJgx4zJIkSRpYTXlKPTN/BywBZmbmmqzZDPxv4Piq2mpgSl2zyVWZJEmSBrGST6lPiIix1fz+wDuARyNiYlUWwCzgoarJYuCD1dPqJwDPDrf7NyVJkoaikk+pTwSui4gOaontTZn53Yj4t4iYAATwAHBuVf/7wCnASuAF4EMFY5MkSVKTlHxK/UHgtb2Uv7WP+gl8tFQ8kiRJag1HGpIkSVJRJpySJEkqyoRTkiRJRZlwSpIkqSgTTkmSJBVlwilJkqSiTDglSZJUlAmnJEmSijLhlCRJUlElh7YcFs678GJWre3qUXbkK8ex8IpLWxSRJElSezHh3Eer1naxYtLMnoWrb21NMJIkSW3IS+qSJEkqyoRTkiRJRZlwSpIkqSgTTkmSJBVlwilJkqSiTDglSZJUlAmnJEmSijLhlCRJUlHFEs6IGBUR90TEzyLi4Yj4b1X5ERFxd0SsjIhvRsTLqvKR1fLKav20UrFJkiSpeUqe4dwMvDUz/xg4FpgZEScAlwOfz8xXA13AnKr+HKCrKv98VU+SJEmDXLGEM2s2VIud1ZTAW4Gbq/LrgFnV/OnVMtX6t0VElIpPkiRJzVH0Hs6I6IiIB4B1wA+BXwK/y8ytVZWngEnV/CTgVwDV+meBV/SyzXMiYllELFu/fn3J8CVJkjQAiiacmbktM48FJgPHA384ANu8JjNnZOaMCRMm7HOMkiRJKqspT6ln5u+AJcAbgLERsV+1ajKwuppfDUwBqNYfBPy2GfFJkiSpnJJPqU+IiLHV/P7AO4Dl1BLPM6pqZwO3VPOLq2Wq9f+WmVkqPkmSJDXHfnuustcmAtdFRAe1xPamzPxuRDwC3BgRfwf8FFhU1V8EfD0iVgLPAO8vGJskSZKapFjCmZkPAq/tpXwVtfs5dy7fBLy3VDySJElqDUcakiRJUlEmnJIkSSrKhFOSJElFmXBKkiSpKBNOSZIkFWXCKUmSpKJMOCVJklSUCackSZKKMuGUJElSUSackiRJKsqEU5IkSUWZcEqSJKkoE05JkiQVZcIpSZKkokw4JUmSVJQJpyRJkorar9UBDGcrlj/CSWfN7VH22KrHYVKLApIkSSrAhLOFNkcnKybN7Fn26AJGtigeSZKkEopdUo+IKRGxJCIeiYiHI+L8qvySiFgdEQ9U0yl1bT4dESsjYkVEnFQqNkmSJDVPyTOcW4FPZOb9EXEAcF9E/LBa9/nM/J/1lSPiaOD9wB8BhwE/iog/yMxtBWOUJElSYcXOcGbmmsy8v5p/HljO7u9OPB24MTM3Z+bjwErg+FLxSZIkqTmacg9nREwDXgvcDfwZ8LGI+CCwjNpZ0C5qyehddc2eopcENSLOAc4BmDp1atG42915F17MqrVdPcqOfOU4Fl5xaYsikiRJ2lXxhDMixgD/DHw8M5+LiIXAfwey+nkl8OFGt5eZ1wDXAMyYMSMHPuLBY9Xarl0eOmL1ra0JRpIkqQ9F38MZEZ3Uks3rM/NfADLz6czclpnbgWt56bL5amBKXfPJVZkkSZIGsWJnOCMigEXA8sy8qq58YmauqRbfAzxUzS8GvhERV1F7aOgo4J5S8Q02vrNTkiQNViUvqf8ZcBbw84h4oCr7DDA7Io6ldkn9CeCvADLz4Yi4CXiE2hPuH/UJ9Zf4zk5JkjRYFUs4M3MpEL2s+v5u2swH5peKSZIkSc3nWOqSJEkqyoRTkiRJRZlwSpIkqSgTTkmSJBVlwilJkqSiTDglSZJUlAmnJEmSijLhlCRJUlEmnJKktjV+/Hgiot8Tlxy0V+3qp/Hjx7f640tDRkMJZ0ScHxEHRs2iiLg/It5ZOjhJ0vDW1dVFZvZ7AvaqXf3U1dXV4k8vDR2NDm354cz8QkScBIyjNkb614HbikU2iK1Y/ggnnTW3R9ljqx6HSS0KSJIkqYUaTTi7x0Q/Bfh6Zj4cEb2Nky5gc3SyYtLMnmWPLmBki+KRJElqpUbv4bwvIm6jlnD+ICIOALaXC0uSJElDRaNnOOcAxwKrMvOFiHgF8KFyYUmSJGmoaPQMZwJHA39dLY8GRhWJSJIkSUNKownnl4E3ALOr5eeBLxWJSJIkSUNKo5fUX5+Zx0XETwEysysiXlYwLklSG4mIHa8bUnvzu1I7avQM55aI6KB2aZ2ImIAPDUmSJKkBjSacXwS+BRwSEfOBpcDnikUlSZKkIaOhhDMzrwf+FvgfwBpgVmb+0+7aRMSUiFgSEY9ExMMRcX5VPj4ifhgRj1U/x1XlERFfjIiVEfFgRBy3bx9NkiRJ7aDRoS3HA+uAG4BvAE9HROcemm0FPpGZRwMnAB+NiKOBi4DbM/Mo4PZqGeBk4KhqOgdY2M/PIkmSpDbU6CX1+4H1wC+Ax6r5J6ox1V/XW4PMXJOZ91fzzwPLqQ3ueDpwXVXtOmBWNX868LWsuQsYGxET9+IzSZI0LI0ZMwaoPTg0lKdRo0bR0dHBlClTmDJlCiNGjKCzs3PH+v3222/H/JQpU7jhhht6PV433HADxxxzzC7te2vTXbejo4Njjjmmz202Wre3OnPnzmXUqFE7PuPcuXN72Xrj+hPzvrRpRKMJ5w+BUzLz4Mx8BbWzkd8F/iu1VybtVkRMA14L3A0cmplrqlVrgUOr+UnAr+qaPUUvo49HxDkRsSwilq1fv77B8CVJGtrGjBnDxo0bW7Lvww47rM91HR0du5SNGNEz/ehrtOzp06f3WPfmN7+Z/fffn82bN3PYYYexbds2nn/+ecaMGcOBBx7IQQcdxMtf/nIykwMPPJCLLrqIbdu2cf755/eaQM6bN49Zs2Zx8MEHc+CBB3LooYdy0UUXsXXr1h5tuusuWLCATZs2sWDBAubNm9dnIrmnur3VOffcc/nyl7/M5z73OTZu3MjnPvc5rr766r1OOvsT8760aVSjCecJmfmD7oXMvA14Q3UmcrdDhEfEGOCfgY9n5nP167L23oZ+vbshM6/JzBmZOWPChAn9aSpJ0pC1ceNGpk2btmP5tNNO2zHfV0K3s/pEsH5b9fN/+qd/uku7NWvWMGLECA444IBd6m3bto3zzjtvR/l5553H9u3be9TJzB7bHTFiBGPGjGHVqlVkJp2dney3337ceeedfO973+O0007jqaee4vrrr2fcuHFs2rSJm2++mXHjxrFlyxauuOIKxo8fz3e+8x2uv/56Ro8ezfz583vEPH/+fBYtWsS3v/1tRo8ezc0338wNN9zAd77zHb7xjW/0aNNd98QTT6Szs5MTTzyRRYsW7bLNRuv2Vuf3v/89hxxyCBdccAEvf/nLueCCC7j88su59tpr+/7CdqM/Me9Lm0ZFI+/qqsZRvx24sSr6T8A7gJnAvZnZ6wM+1X2e3wV+kJlXVWUrgLdk5prqkvm/Z+ZrIuIfqvkbdq7XV1wzZszIZcuWNfhRyzjprLmsmDSzR9nm2xcw8m1zB6ysP3Vfs/pWfvD1Bf3+HJJeEhH3ZeaMVscxkCLiHGr3xzN16tTXPfnkk/1tXyKshuzVOyUvOQgueXaf9tvKzzwQ1q9fT/eJmVtvvZWZM2fuoQX86Ec/4u1vfzsAK1eu5NWvfvUu87/+9a/7PKM5YsSIHclkfb3f/e53jB07dpf5+jo7b7ev94m++OKLPPvss0yYMIEXX3yRkSNHkpm8+OKLjBo1iu3bt7Nx40bGjBlDRLBp0yZGjaoNjrht27Yd2+no6OixbtOmTQCMGjWKTZs2MXLkSCKCbdu27ajb2fnS4ytbtmxh1KhRPbZZv93d1e2tTkQwYsSIHtt74YUXGD169F79G+hPzPvSpt7u+s5Gz3B+AJgMfLuaplZlHcD7+thpAIuA5d3JZmUxcHY1fzZwS135B6un1U8Ant1dsilJatxAXB3KzKZPrdaKz7wvx6r+TOScOXN2zJ988skNfd53vvOdO+a7E8+d588444xd2nUnS6NHj+613qc//ele5+vr1M93b2vkyNpF1O4znBHB0qVLd3y2pUuXcvjhh9PZ2cnSpUuZOnUqnZ2dXH311Rx++OFMnz59R/n06dN7xNy9bvr06UydOpWlS5fuWO7ebneb7rJ63XV31kjd3up0dnZyyCGH9Ci7+uqrdxyD/upPzPvSplGNvhbpN5k5NzNfW00fy8z1mfliZq7so9mfAWcBb42IB6rpFOAy4B0R8Rjw9moZ4PvAKmAlcC21+0MlSVIDRo8ezRNPPLFjefHixTvmG03eu89OAj22VT//k5/8ZJd2EydOZPv27Tz//PO71Ovo6GDhwpdePLNw4cIdl+6760REj+1u376dDRs2cOSRRxIRbNmyha1bt/KmN72Jd73rXSxevJjJkydz5pln0tXVxahRozjjjDPo6uqis7OTCy+8kGeeeYZTTz2VM888k40bNzJv3rweMc+bN485c+Ywa9YsNm7cyBlnnMHs2bM59dRT+cAHPtCjTXfdJUuWsGXLFpYsWcKcOXN22WajdXurs//++7Nu3TquuuoqXnjhBa666io+9alP8ZGPfKTvL2w3+hPzvrRpVENDW1YjC/0t8EfAqO7yzHxrX20ycynQ1/WIt/VSP4GPNhKPJEnqacOGDS17cOjXv/51n+t6uxRbn9hC3wnx8uXLeyzfcccdAIwcOXLHJfgDDjiA1atX09HRwdatW4Fakvvcc89x2WWXMXnyZK688kpmz57dY1vdy/Pnz+c3v/nNjvbdba666qoddbp/zp07l+XLlzN9+nTmz5+/yzYbrdtbnauvvpqf/OQnfOYzn+ETn/gEI0eO5Nxzz2XBgr27Ta4/Me9Lm0Y1Opb69cA3gXcD51K7FO4j4pIktZENGzY4lno/zJ49u+FkaqDr9lZn9uzZe51g7m0cA9GmEY3ew/mKzFwEbMnMOzLzw0CfZzclSZKkbo2e4dxS/VwTEe8Cfg2MLxOSJEmShpJGE86/i4iDgE8AC4ADgb8pFpUkSZKGjIYSzsz8bjX7LHBiuXAkSZI01DT6lPoRwFxgWn2bzDytrzaSpKHDh1AGD78rtaNGL6l/m9pL3L8DbN9DXUmSJGmHRhPOTZn5xaKRSJIkaUhqNOH8QkR8FrgN2NxdmJn3F4lKkiRJQ0ajCed/oBqmkpcuqSe+i1OSJEl70GjC+V7gyMx8sWQwkiRJGnoaHWnoIWBsyUAkSZI0NDV6hnMs8GhE3EvPezh9LVKbWbH8EU46a26PsiNfOY6FV1zaoogkad9ERL/b5GcP3Kt29caNG7dP7SW9pNGE87NFo9CA2RydrJg0s2fh6ltbE4wk7aN9eadkXjJwcUjaN42ONHRH6UAkSZI0NO024YyI56k9jb7LKiAz88AiUUmSJGnI2G3CmZkHNCsQSZIkDU2NPqUuSZIk7RUTTkmSJBVVLOGMiK9ExLqIeKiu7JKIWB0RD1TTKXXrPh0RKyNiRUScVCouSZIkNVfJM5xfBWb2Uv75zDy2mr4PEBFHA+8H/qhq8+WI6CgYmyRJkpqkWMKZmXcCzzRY/XTgxszcnJmPAyuB40vFJkmSpOZpxT2cH4uIB6tL7t3DOEwCflVX56mqbBcRcU5ELIuIZevXry8dqyRJkvZRsxPOhcCrgGOBNcCV/d1AZl6TmTMyc8aECRMGOj5JkiQNsKYmnJn5dGZuy8ztwLW8dNl8NTClrurkqkySJEmDXFMTzoiYWLf4HqD7CfbFwPsjYmREHAEcBdzTzNgkSZJURkNjqe+NiLgBeAtwcEQ8BXwWeEtEHEttuMwngL8CyMyHI+Im4BFgK/DRzNxWKjZJkiQ1T7GEMzNn91K8aDf15wPzS8UjSZKk1nCkIUmSJBVlwilJkqSiTDglSZJUlAmnJEmSijLhlCRJUlEmnJIkSSrKhFOSJElFFXsPp9rbeRdezKq1XT3KjnzlOBZecWmLIpIkSUOVCecwtWptFysmzexZuPrW1gQjSZKGNC+pS5IkqSgTTkmSJBVlwilJkqSiTDglSZJUlAmnJEmSijLhlCRJUlEmnJIkSSrKhFOSJElFmXBKkiSpqGIjDUXEV4B3A+sy85iqbDzwTWAa8ATwvszsiogAvgCcArwA/GVm3l8qtuFmxfJHOOmsuT3KHlv1OExqUUCSJGlYKXmG86vATmMnchFwe2YeBdxeLQOcDBxVTecACwvGNexsjk5WTJrZY/r9i1tbHZYkSRomiiWcmXkn8MxOxacD11Xz1wGz6sq/ljV3AWMjYmKp2CRJktQ8xS6p9+HQzFxTza8FDq3mJwG/qqv3VFW2hp1ExDnUzoIyderUcpEKgPMuvJhVa7t6lB35ynEsvOLSFkUkSZIGm2YnnDtkZkZE7kW7a4BrAGbMmNHv9uqfVWu7WDFppzsjVt/ammAkSdKg1Oyn1J/uvlRe/VxXla8GptTVm1yVSZIkaZBrdsK5GDi7mj8buKWu/INRcwLwbN2ld0mSJA1iJV+LdAPwFuDgiHgK+CxwGXBTRMwBngTeV1X/PrVXIq2k9lqkD5WKS5IkSc1VLOHMzNl9rHpbL3UT+GipWNQY39cpSZJKaNlDQ2o/3e/r7FH26AJGtigeSZI0NDi0pSRJkooy4ZQkSVJRJpySJEkqyoRTkiRJRZlwSpIkqSifUu+DY4hLkiQNDBPOPjiGuCRJ0sDwkrokSZKKMuGUJElSUSackiRJKsqEU5IkSUX50FA/rFj+CCedNbdH2WOrHodJLQpIkiRpEDDh7IfN0bnLk+ubH13AyBbFI0mSNBh4SV2SJElFmXBKkiSpKBNOSZIkFeU9nBoQDgUqSZL6YsKpAeFQoJIkqS8tSTgj4gngeWAbsDUzZ0TEeOCbwDTgCeB9mdnV1zYkSZI0OLTyHs4TM/PYzJxRLV8E3J6ZRwG3V8uSJEka5NrpoaHTgeuq+euAWS2MRZIkSQOkVfdwJnBbRCTwD5l5DXBoZq6p1q8FDu2tYUScA5wDMHXq1GbEqp044pIkSeqPViWcf56ZqyPiEOCHEfFo/crMzCoZ3UWVnF4DMGPGjF7rqCxHXJIkSf3RkoQzM1dXP9dFxLeA44GnI2JiZq6JiInAulbEprJ8fZIkScNP0xPOiBgNjMjM56v5dwKXAouBs4HLqp+3NDs2lefrkyRJGn5acYbzUOBbEdG9/29k5q0RcS9wU0TMAZ4E3teC2CRJkjTAmp5wZuYq4I97Kf8t8LZmxyNJkqSy2um1SJIkSRqCHNpSxfj6JEmSBCacKqjR1yf1lpj65LokSUOHCadarrfE1CfXJUkaOryHU5IkSUWZcEqSJKkoL6mrLXlfpyRJQ4cJp9qS93VKkjR0eEldkiRJRQ27M5znXXgxq9Z29Shb8+QvmXj4q3qU+b5ISZKkgTHsEs5Va7t6fTfkcw28L1Ltqbc/IrzfU5Kk9jHsEk4NXr09SAS1s9Hb33hez0Lv95QkqW2YcGrQ6PVBIjwbLUlSu/OhIUmSJBXlGU4NG97rKUlSa5hwatjo7YEx7/WUJKk8L6lLkiSpKM9wakjq7Yn23t6t6hCakiSVZ8KpIam3J9p7e5q9t3orfnTVLklob4MDNFpmAitJGu7aLuGMiJnAF4AO4B8z87IWh6Rhpq9ktbfBARop6+0+UR9gkiQNJ22VcEZEB/Al4B3AU8C9EbE4Mx9pbWTS3uvr8v7OL6vv7cxqf5JQk1hJUrtqq4QTOB5YmZmrACLiRuB0wIRTg9ZAX97vK4ls9Cn8RhPTfak30LcW7Ess+7pvSdK+i8xsdQw7RMQZwMzM/C/V8lnA6zPzY3V1zgHOqRZfA6xocPMHA78ZwHAHQrvF1G7xgDE1ot3igcEf0+GZOaFkMK0UEeuBJ3dTZbB/f83SbjG1WzxgTI1ot3hg72Pqs+9stzOce5SZ1wDX9LddRCzLzBkFQtpr7RZTu8UDxtSIdosHjKnd7SmZbsdjZUx71m7xgDE1ot3igTIxtdt7OFcDU+qWJ1dlkiRJGqTaLeG8FzgqIo6IiJcB7wcWtzgmSZIk7YO2uqSemVsj4mPAD6i9FukrmfnwAG2+35fhm6DdYmq3eMCYGtFu8YAxDXbteKyMac/aLR4wpka0WzxQIKa2emhIkiRJQ0+7XVKXJEnSEGPCKUmSpKKGRcIZETMjYkVErIyIi1qw/ykRsSQiHomIhyPi/Kr8kohYHREPVNMpTY7riYj4ebXvZVXZ+Ij4YUQ8Vv0c16RYXlN3HB6IiOci4uPNPkYR8ZWIWBcRD9WV9XpMouaL1e/VgxFxXBNjuiIiHq32+62IGFuVT4uI39cdr6ubGFOf31VEfLo6Tisi4qQmxfPNulieiIgHqvKmHKPByv6yz7japr+s9m2f2Xg89pd7jqd8f5mZQ3qi9vDRL4EjgZcBPwOObnIME4HjqvkDgF8ARwOXAJ9s4bF5Ajh4p7K/By6q5i8CLm/Rd7YWOLzZxwh4E3Ac8NCejglwCvCvQAAnAHc3MaZ3AvtV85fXxTStvl6Tj1Ov31X1u/4zYCRwRPXvsaN0PDutvxK4uJnHaDBO9pe7jast+8u6780+s+947C/3EM9O64v0l8PhDOeO4TIz80Wge7jMpsnMNZl5fzX/PLAcmNTMGPrhdOC6av46YFYLYngb8MvM3N1IKEVk5p3AMzsV93VMTge+ljV3AWMjYmIzYsrM2zJza7V4F7V31jZNH8epL6cDN2bm5sx8HFhJ7d9lU+KJiADeB9wwkPscouwv+6cd+kuwz9xtPPaXjcdTsr8cDgnnJOBXdctP0cLOKyKmAa8F7q6KPlad5v9KMy/HVBK4LSLui9qQoQCHZuaaan4tcGiTY4La+1frf9lbeYyg72PSLr9bH6Z21qDbERHx04i4IyLe2ORYevuuWn2c3gg8nZmP1ZW18hi1s1Z/Vz3YXzbMPrNx9pe7V6y/HA4JZ9uIiDHAPwMfz8zngIXAq4BjgTXUTmM3059n5nHAycBHI+JN9Suzdj69qe/NitoL/08D/qkqavUx6qEVx2R3ImIesBW4vipaA0zNzNcCFwDfiIgDmxROW31XdWbT8z/jVh4jNcj+sjH2mY2zv2xIsf5yOCScbTFcZkR0Uus8r8/MfwHIzKczc1tmbgeuZYBPm+9JZq6ufq4DvlXt/+nuSxzVz3XNjIlaZ35/Zj5dxdbSY1Tp65i09HcrIv4SeDdwZtWpU12G+W01fx+1+3/+oBnx7Oa7atlxioj9gL8AvlkXZ8uO0SBgf9mHNu0vwT6zIfaXe1a6vxwOCWfLh8us7olYBCzPzKvqyuvvXXkP8NDObQvGNDoiDuiep3ZT9UPUjs3ZVbWzgVuaFVOlx19XrTxGdfo6JouBD0bNCcCzdZeRioqImcDfAqdl5gt15RMioqOaPxI4CljVpJj6+q4WA++PiJERcUQV0z3NiAl4O/BoZj5VF2fLjtEgYH/Ze0zt2l+CfeYe2V82rGx/ORBPHrX7RO3JuF9Qy8zntWD/f07tksKDwAPVdArwdeDnVfliYGITYzqS2pNwPwMe7j4uwCuA24HHgB8B45sY02jgt8BBdWVNPUbUOu41wBZq987M6euYUHvS8kvV79XPgRlNjGkltft8un+frq7q/sfq+3wAuB84tYkx9fldAfOq47QCOLkZ8VTlXwXO3aluU47RYJ3sL3uNqe36y2r/9pmNxWN/uYd4qvKi/aVDW0qSJKmo4XBJXZIkSS1kwilJkqSiTDglSZJUlAmnJEmSijLhlCRJUlEmnNpnEZERcWXd8icj4pIB2vZXI+KMgdjWHvbz3ohYHhFLSu+r2t8lEfHJZuxLUnuy79yr/dl3DlImnBoIm4G/iIiDWx1IvWrUhEbNAT6SmScWiCMiwn9rknZm37n7OOw7hxC/SA2ErcA1wN/svGLnv7IjYkP18y0RcUdE3BIRqyLisog4MyLuiYifR8Sr6jbz9ohYFhG/iIh3V+07IuKKiLg3Ih6MiL+q2+6PI2Ix8Egv8cyutv9QRFxelV1M7WXTiyLiip3qfykiTqvmvxURX6nmPxwR86v5C6rtPRQRH6/KpkXEioj4GrURJKZExLzqMywFXlO3j7+OiEeqz3FjP4+9pMHLvtO+c9joz18x0u58CXgwIv6+H23+GJgOPENtqKx/zMzjIxZwOi4AAALHSURBVOJ8YC7w8areNGrjzL4KWBIRrwY+SG1YtD+JiJHA/42I26r6xwHHZObj9TuLiMOAy4HXAV3AbRExKzMvjYi3Ap/MzGU7xfhj4I3URoKYBHQPR/ZG4MaIeB3wIeD11EbRuDsi7qi2fxRwdmbeVdV7P3AstX939wP3Vdu6CDgiMzdHxNh+HD9Jg599p33nsOAZTg2IzHwO+Brw1/1odm9mrsnMzdSG8eru9H5OraPsdlNmbs/Mx6h1rn9IbSzjD0bEA8Dd1IZSO6qqf8/OHWblT4B/z8z1mbkVuB540x5i/DHwxog4mtpf/U9HbQzcNwA/ofbX/bcyc2NmbgD+hVqHCvBkZt5Vzb+xqvdCdazqx6d+ELg+Iv4ztTMekoYJ+077zuHChFMD6X9Ru59ndF3ZVqrfs6jdi/OyunWb6+a31y1vp+fZ953HX01qfxHPzcxjq+mIzOzudDfu06eo31HmamAsMBO4k1on+j5gQ2Y+v4fmjcbxLmpnOY4D7o3+3T8lafCz7+zJvnMIMuHUgMnMZ4CbqHWc3Z6gdhkG4DSgcy82/d6IGFHdm3QksAL4AXBeRHQCRMQfRMTo3W0EuAd4c0QcHBEdwGzgjgb2fxe1S1TdneYnq59UP2dFxMur/b+nbl29O6t6+0fEAcCpVdwjgCmZuQT4FHAQMKaBmCQNEfad9p3DgX8NaKBdCXysbvla4JaI+BlwK3v3F/T/o9bhHQicm5mbIuIfqV06uj8iAlgPzNrdRjJzTURcBCyh9lf+9zLzlgb2/2PgnZm5MiKeBMZXZWTm/RHx1So+qN1L9dOImLbTvu+PiG8CPwPWAfdWqzqA/xMRB1UxfTEzf9dATJKGFvtO+84hLTJ3PuMuSZIkDRwvqUuSJKkoE05JkiQVZcIpSZKkokw4JUmSVJQJpyRJkooy4ZQkSVJRJpySJEkq6v8D96IYae1r2+MAAAAASUVORK5CYII=\n"
},
"metadata": {
"needs_background": "light"
}
}
]
},
{
"cell_type": "code",
"source": [
"import collections\n",
"counters = collections.Counter(captions)\n",
"\n",
"df = pd.DataFrame.from_records(counters.most_common(), columns=['caption','count'])\n",
"df.head()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 206
},
"id": "quPfpjrsxMH3",
"outputId": "1f80b41b-ec0d-4028-8368-ed3c04bbd719"
},
"execution_count": 9,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" caption count\n",
"0 No acute disease. The heart is normal in size.... 42\n",
"1 No active disease. The heart and lungs have XX... 37\n",
"2 Normal chest Heart size normal. Lungs are clea... 30\n",
"3 No active disease. Both lungs are clear and ex... 26\n",
"4 No acute cardiopulmonary abnormality.. The lun... 25"
],
"text/html": [
"\n",
" <div id=\"df-b7772fd9-1b26-4ab9-bc21-87cc3e0a8c0a\">\n",
" <div class=\"colab-df-container\">\n",
" <div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>caption</th>\n",
" <th>count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>No acute disease. The heart is normal in size....</td>\n",
" <td>42</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>No active disease. The heart and lungs have XX...</td>\n",
" <td>37</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Normal chest Heart size normal. Lungs are clea...</td>\n",
" <td>30</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>No active disease. Both lungs are clear and ex...</td>\n",
" <td>26</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>No acute cardiopulmonary abnormality.. The lun...</td>\n",
" <td>25</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>\n",
" <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-b7772fd9-1b26-4ab9-bc21-87cc3e0a8c0a')\"\n",
" title=\"Convert this dataframe to an interactive table.\"\n",
" style=\"display:none;\">\n",
" \n",
" <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
" width=\"24px\">\n",
" <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
" <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
" </svg>\n",
" </button>\n",
" \n",
" <style>\n",
" .colab-df-container {\n",
" display:flex;\n",
" flex-wrap:wrap;\n",
" gap: 12px;\n",
" }\n",
"\n",
" .colab-df-convert {\n",
" background-color: #E8F0FE;\n",
" border: none;\n",
" border-radius: 50%;\n",
" cursor: pointer;\n",
" display: none;\n",
" fill: #1967D2;\n",
" height: 32px;\n",
" padding: 0 0 0 0;\n",
" width: 32px;\n",
" }\n",
"\n",
" .colab-df-convert:hover {\n",
" background-color: #E2EBFA;\n",
" box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
" fill: #174EA6;\n",
" }\n",
"\n",
" [theme=dark] .colab-df-convert {\n",
" background-color: #3B4455;\n",
" fill: #D2E3FC;\n",
" }\n",
"\n",
" [theme=dark] .colab-df-convert:hover {\n",
" background-color: #434B5C;\n",
" box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
" filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
" fill: #FFFFFF;\n",
" }\n",
" </style>\n",
"\n",
" <script>\n",
" const buttonEl =\n",
" document.querySelector('#df-b7772fd9-1b26-4ab9-bc21-87cc3e0a8c0a button.colab-df-convert');\n",
" buttonEl.style.display =\n",
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
"\n",
" async function convertToInteractive(key) {\n",
" const element = document.querySelector('#df-b7772fd9-1b26-4ab9-bc21-87cc3e0a8c0a');\n",
" const dataTable =\n",
" await google.colab.kernel.invokeFunction('convertToInteractive',\n",
" [key], {});\n",
" if (!dataTable) return;\n",
"\n",
" const docLinkHtml = 'Like what you see? Visit the ' +\n",
" '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
" + ' to learn more about interactive tables.';\n",
" element.innerHTML = '';\n",
" dataTable['output_type'] = 'display_data';\n",
" await google.colab.output.renderOutput(dataTable, element);\n",
" const docLink = document.createElement('div');\n",
" docLink.innerHTML = docLinkHtml;\n",
" element.appendChild(docLink);\n",
" }\n",
" </script>\n",
" </div>\n",
" </div>\n",
" "
]
},
"metadata": {},
"execution_count": 9
}
]
},
{
"cell_type": "code",
"source": [
"freqs_list = {}\n",
"for counts in df['count'].to_list():\n",
" if counts in freqs_list:\n",
" freqs_list[counts] += 1\n",
" else:\n",
" freqs_list[counts] = 1\n",
"from pprint import pprint\n",
"pprint(freqs_list)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "b785abzk38q9",
"outputId": "9283973b-9c6b-4848-c890-ae121f5fa40f"
},
"execution_count": 10,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"{1: 2415,\n",
" 2: 47,\n",
" 3: 31,\n",
" 4: 10,\n",
" 5: 18,\n",
" 6: 4,\n",
" 7: 6,\n",
" 8: 9,\n",
" 9: 3,\n",
" 10: 4,\n",
" 11: 2,\n",
" 12: 1,\n",
" 13: 1,\n",
" 16: 2,\n",
" 19: 1,\n",
" 25: 1,\n",
" 26: 1,\n",
" 30: 1,\n",
" 37: 1,\n",
" 42: 1}\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"print(f'About {round(round(freqs_list[1]/len(captions)*100,2))}% of the captions are unique.')"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "njcijvUQ4gHn",
"outputId": "c41a9dba-f82e-4019-9a27-21619500e453"
},
"execution_count": 11,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"About 76% of the captions are unique.\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import tensorflow\n",
"import keras\n",
"from keras.preprocessing.text import Tokenizer\n",
"from collections import Counter\n",
"tokenizer = Tokenizer()\n",
"tokenizer.fit_on_texts(captions)\n",
"vocab_size = len(tokenizer.word_index) + 1\n",
"print('Vocabulary Size: %d' % vocab_size)\n",
"ordered = Counter(tokenizer.word_counts)\n",
"# top 10\n",
"print('Top10 frequent')\n",
"ordered.most_common(10)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "rW6fTzDK5JHJ",
"outputId": "814f77bc-93d3-42f8-cde8-1268d33279a3"
},
"execution_count": 12,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Vocabulary Size: 1874\n",
"Top10 frequent\n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"[('no', 6235),\n",
" ('the', 5639),\n",
" ('are', 3819),\n",
" ('is', 3770),\n",
" ('normal', 3372),\n",
" ('xxxx', 2820),\n",
" ('of', 2737),\n",
" ('and', 2736),\n",
" ('acute', 2511),\n",
" ('pleural', 2330)]"
]
},
"metadata": {},
"execution_count": 12
}
]
},
{
"cell_type": "code",
"source": [
"from nltk.corpus import stopwords # Import the stop word list\n",
"from nltk.tokenize import wordpunct_tokenize\n",
"import nltk\n",
"nltk.download('stopwords')\n",
"stop_words = set(stopwords.words('english')) \n",
"dictionary_wo_stopwords = {k:v for k,v in tokenizer.word_counts.items() if k not in stop_words}\n",
"\n",
"ordered_wo = Counter(dictionary_wo_stopwords)\n",
"# top 10\n",
"print('Most frequent words w/o stopwords')\n",
"ordered_wo.most_common(10)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "qRVSyiXT5TpG",
"outputId": "9f7d2bb9-41f3-445e-88cb-4090439f4d06"
},
"execution_count": 13,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Most frequent words w/o stopwords\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
"[nltk_data] Unzipping corpora/stopwords.zip.\n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"[('normal', 3372),\n",
" ('xxxx', 2820),\n",
" ('acute', 2511),\n",
" ('pleural', 2330),\n",
" ('pneumothorax', 2102),\n",
" ('effusion', 2049),\n",
" ('heart', 2028),\n",
" ('lungs', 1983),\n",
" ('size', 1854),\n",
" ('clear', 1585)]"
]
},
"metadata": {},
"execution_count": 13
}
]
},
{
"cell_type": "code",
"source": [
"print(f'There are {sum(np.array(list(ordered.values()))==1)} words with only 1 occurence')"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "LzdwEkDI7cG_",
"outputId": "c7d1aa3d-4040-4a49-b051-3541e8279543"
},
"execution_count": 14,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"There are 529 words with only 1 occurence\n"
]
}
]
}
]
}