3322 lines (3322 with data), 251.0 kB
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"gpuType": "T4"
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU",
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"e41925970dc94c2aa90b4da8acac85cf": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_c1f0f99a4a4341d18e0039a71b305111",
"IPY_MODEL_d54d01c91e694b3eb180170cddd1211c",
"IPY_MODEL_3c707baf63cf4cbf9732e39ddeb84c7c"
],
"layout": "IPY_MODEL_08955c157996426eaa089d878b8d820f"
}
},
"c1f0f99a4a4341d18e0039a71b305111": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_da9c8ff09bed4c6989b0246c7bdc8fd7",
"placeholder": "",
"style": "IPY_MODEL_b78266bbcc0e4ed0bcaab9009b30d98e",
"value": "tokenizer_config.json: 100%"
}
},
"d54d01c91e694b3eb180170cddd1211c": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_2fd8c28ebf1b47f78e2e0093608407cf",
"max": 28,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_8c728d06a0814cc8b83931b24f0115d9",
"value": 28
}
},
"3c707baf63cf4cbf9732e39ddeb84c7c": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_da4b0713ce9f47208a39ec89515c30e4",
"placeholder": "",
"style": "IPY_MODEL_b2fd795e6a534319a87213f05e824315",
"value": " 28.0/28.0 [00:00<00:00, 1.45kB/s]"
}
},
"08955c157996426eaa089d878b8d820f": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"da9c8ff09bed4c6989b0246c7bdc8fd7": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"b78266bbcc0e4ed0bcaab9009b30d98e": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"2fd8c28ebf1b47f78e2e0093608407cf": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"8c728d06a0814cc8b83931b24f0115d9": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"da4b0713ce9f47208a39ec89515c30e4": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"b2fd795e6a534319a87213f05e824315": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"0722a8ceb94345a79e5bdc3b84f71d7f": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_7c04cff87d1041b1878488f5014f9b7c",
"IPY_MODEL_0ef4539514a1462caacf58e5c3fa516b",
"IPY_MODEL_4c61a2b37c4c4ffbb3a10d270140724f"
],
"layout": "IPY_MODEL_d1288d4268e74bd384064c4f2cfbfa7b"
}
},
"7c04cff87d1041b1878488f5014f9b7c": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_1c84ce5cb56a4c7286c0f5981b8fbd5a",
"placeholder": "",
"style": "IPY_MODEL_61a01f78b9714e4aa0d869d8e6563e81",
"value": "config.json: 100%"
}
},
"0ef4539514a1462caacf58e5c3fa516b": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_180753d93be944e98c47bba836397ba0",
"max": 385,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_db137743dfc24b6b84c6c15bcd5b5766",
"value": 385
}
},
"4c61a2b37c4c4ffbb3a10d270140724f": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_16f88feb88fc43b5b381521d91b02150",
"placeholder": "",
"style": "IPY_MODEL_9b2a4006744a4940aa64fd9e41d62d4e",
"value": " 385/385 [00:00<00:00, 21.0kB/s]"
}
},
"d1288d4268e74bd384064c4f2cfbfa7b": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"1c84ce5cb56a4c7286c0f5981b8fbd5a": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"61a01f78b9714e4aa0d869d8e6563e81": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"180753d93be944e98c47bba836397ba0": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"db137743dfc24b6b84c6c15bcd5b5766": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"16f88feb88fc43b5b381521d91b02150": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"9b2a4006744a4940aa64fd9e41d62d4e": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"d55048b2772b47eab27637e9aa42d4dc": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_a70574b040b14832abb25d2eb040fb0a",
"IPY_MODEL_e5c3bf5ade9a405094a1673bc2beb795",
"IPY_MODEL_3d7c0e5ba179442eb417435ff72ae85a"
],
"layout": "IPY_MODEL_b8aea249ccf04a619c1e926e7b6b6bbe"
}
},
"a70574b040b14832abb25d2eb040fb0a": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_73554830b9524f1b8b8714ff33c3b8e8",
"placeholder": "",
"style": "IPY_MODEL_15bf509be6d946e187e14fd6854950a9",
"value": "vocab.txt: 100%"
}
},
"e5c3bf5ade9a405094a1673bc2beb795": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_65855073adcd4434b8e9af65ecc59915",
"max": 226150,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_f3485f93a7374df49cc200d18e9ac49e",
"value": 226150
}
},
"3d7c0e5ba179442eb417435ff72ae85a": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_53d6422031bf48dab34a3b47e237da93",
"placeholder": "",
"style": "IPY_MODEL_1ebb572142e8490199aa8ffce39df341",
"value": " 226k/226k [00:00<00:00, 3.21MB/s]"
}
},
"b8aea249ccf04a619c1e926e7b6b6bbe": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"73554830b9524f1b8b8714ff33c3b8e8": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"15bf509be6d946e187e14fd6854950a9": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"65855073adcd4434b8e9af65ecc59915": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"f3485f93a7374df49cc200d18e9ac49e": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"53d6422031bf48dab34a3b47e237da93": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"1ebb572142e8490199aa8ffce39df341": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"efe460065f0448acbf75200543187bb0": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_0e6e8bc125874564bb46431ac85524dc",
"IPY_MODEL_30d5690beb7341dc8b37de5a1e0812ab",
"IPY_MODEL_e8c76754ea784709bae0c59843c61279"
],
"layout": "IPY_MODEL_67690c975154491b96ed9091db152a0f"
}
},
"0e6e8bc125874564bb46431ac85524dc": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_74bbdf59962d41059f018e6eab297968",
"placeholder": "",
"style": "IPY_MODEL_2c20eb99653d4026ace81ccbe858b479",
"value": "pytorch_model.bin: 100%"
}
},
"30d5690beb7341dc8b37de5a1e0812ab": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_183aee869ce34eaca26f1cf533559caa",
"max": 440474434,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_d4622e6a3301483c91a66836d5724e0e",
"value": 440474434
}
},
"e8c76754ea784709bae0c59843c61279": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_d9c5d0256f534e4cbd82797e1e9f3ccd",
"placeholder": "",
"style": "IPY_MODEL_0dc7017260764c9dbdcb9167e7a40b15",
"value": " 440M/440M [00:04<00:00, 108MB/s]"
}
},
"67690c975154491b96ed9091db152a0f": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"74bbdf59962d41059f018e6eab297968": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"2c20eb99653d4026ace81ccbe858b479": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"183aee869ce34eaca26f1cf533559caa": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"d4622e6a3301483c91a66836d5724e0e": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"d9c5d0256f534e4cbd82797e1e9f3ccd": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"0dc7017260764c9dbdcb9167e7a40b15": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
}
}
}
},
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "yBhXMsR3OE85",
"outputId": "f586f5e1-50a3-446a-8258-0dde1c57e636"
},
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"<ipython-input-1-8174d4ba0823>:8: DtypeWarning: Columns (0,18,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299,300,301,302,303,304,305,306,307,308,309,310,311,312,313,314,315,316,317,318,319,320,321,322,323,324,325,326,327,328,329,330,331,332,333,334,335,336,337,338,339,340,341,342,343,344,345,346,347,348,349,350,351,352,353,354,355,356,357,358,359,360,361,362,363,364,365,366,367,368,369,370,371,372,373,374,375,376,377,378,379,380,381,382,383,384,385,386,387,388,389,390,391,392,393,394,395,396,397,398,399,400,401,402,403,404,405,406,407,408,409,410,411,412,413,414,415,416,417,418,419,420,421,422,423,424,425,426,427,428,429,430,431,432,433,434,435,436,437,438,439,440,441,442,443,444,445,446,447,448,449,450,451,452,453,454,455,456,457,458,459,460,461,462,463,464,465,466,467,468,469,470,471,472,473,474,475,476,477,478,479,480,481,482,483,484,485,486,487,488,489,490,491,492,493,494,495,496,497,498,499,500,501,502,503,504,505,506,507,508,509,510,511,512,513,514,515,516,517,518,519,520,521,522,523,524,525,526,527,
528,529,530,531,532,533,534,535,536,537,538,539,540,541,542,543,544,545,546,547,548,549,550,551,552,553,554,555,556,557,558,559,560,561,562,563,564,565,566,567,568,569,570,571,572,573,574,575,576,577,578,579,580,581,582,583,584,585,586,587,588,589,590,591,592,593,594,595,596,597,598,599,600,601,602,603,604,605,606,607,608,609,610,611,612,613,614,615,616,617,618,619,620,621,622,623,624,625,626,627,628,629,630,631,632,633,634,635,636,637,638,639,640,641,642,643,644,645,646,647,648,649,650,651,652,653,654,655,656,657,658,659,660,661,662,663,664,665,666,667,668,669,670,671,672,673,674,675,676,677,678,679,680,681,682,683,684,685,686,687,688,689,690,691,692,693,694,695,696,697,698,699,700,701,702,703,704,705,706,707,708,709,710,711,712,713,714,715,716,717,718,719,720,721,722,723,724,725,726,727,728,729,730,731,732,733,734,735,736,737,738,739,740,741,742,743,744,745,746,747,748,749,750,751,752,753,754,755,756,757,758,759,760,761,762,763,765,767,768,769,771,772,773,775,776) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" df = pd.read_csv(\"/content/usecase_2_.csv\", quoting=csv.QUOTE_MINIMAL, escapechar='\\\\', on_bad_lines='skip')\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Successfully read the CSV with skipped lines.\n",
"Check the problematic rows (if any) by inspecting the original CSV file around row 785.\n"
]
}
],
"source": [
"import pandas as pd\n",
"import csv\n",
"\n",
"# Use csv.QUOTE_MINIMAL to only quote where necessary\n",
"# Try to read the file with error_bad_lines=False to skip problematic lines\n",
"# and see if you can identify the issue in the skipped rows\n",
"try:\n",
" df = pd.read_csv(\"/content/usecase_2_.csv\", quoting=csv.QUOTE_MINIMAL, escapechar='\\\\', on_bad_lines='skip')\n",
" print(\"Successfully read the CSV with skipped lines.\")\n",
" print(\"Check the problematic rows (if any) by inspecting the original CSV file around row 785.\")\n",
"except pd.errors.ParserError as e:\n",
" print(f\"Error: {e}\")\n",
" print(\"The on_bad_lines='skip' approach also failed. This likely indicates a more severe issue within the CSV file structure.\")\n",
" print(\"Possible solutions:\")\n",
" print(\" 1. Manually inspect row 785 and the surrounding rows in your CSV file for unescaped quotes or incorrect line endings.\")\n",
" print(\" 2. If you have control over the CSV generation process, ensure proper escaping of quotes and consistent line endings.\")\n",
" print(\" 3. Try using a different text editor to open the CSV file and check for any hidden characters or encoding issues.\")\n",
" print(\" 4. If the file is large, consider processing it in smaller chunks to identify the specific area causing the problem.\")\n",
"# escapechar is used to escape special characters (like quotes) within the fields.\n",
"# Using backslash here is a common choice."
]
},
{
"cell_type": "code",
"source": [
"df = df.loc[:, ~df.columns.str.contains('^Unnamed')]"
],
"metadata": {
"id": "ONTOsvE2P2hT"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"print(df.head())"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "0ICHZF9HP4vp",
"outputId": "4fdf71c5-22f2-44c7-b559-8f40b5d8b75b"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
" NCT Number Study Title \\\n",
"0 NCT04841499 Effects of a Seven-day BASIS™ Supplementation ... \n",
"1 NCT03020641 Peritoneal Damage in Laparoscopic Surgery \n",
"2 NCT03727620 Doxycycline in the Treatment of Aggressive Per... \n",
"3 NCT03162926 A Safety and Tolerability Study of VC-02™ Comb... \n",
"4 NCT04434313 Treatment of Hemiparetic Gait Impairments Usin... \n",
"\n",
" Study URL Acronym Study Status \\\n",
"0 https://clinicaltrials.gov/study/NCT04841499 NaN COMPLETED \n",
"1 https://clinicaltrials.gov/study/NCT03020641 NaN COMPLETED \n",
"2 https://clinicaltrials.gov/study/NCT03727620 NaN COMPLETED \n",
"3 https://clinicaltrials.gov/study/NCT03162926 NaN COMPLETED \n",
"4 https://clinicaltrials.gov/study/NCT04434313 NaN COMPLETED \n",
"\n",
" Brief Summary Study Results \\\n",
"0 The purpose of this study is to determine whet... NO \n",
"1 The investigators hypothesized that applying a... YES \n",
"2 The aim of the study was to compare the clinic... NO \n",
"3 The purpose of this trial is to test if VC-02™... NO \n",
"4 The objective of this research is to investiga... NO \n",
"\n",
" Conditions \\\n",
"0 Menopause \n",
"1 Peritoneal Damage \n",
"2 Aggressive Periodontitis \n",
"3 Type 1 Diabetes Mellitus \n",
"4 Telemedicine|Gait, Hemiplegic|Gait Disorders, ... \n",
"\n",
" Interventions \\\n",
"0 DRUG: BASIS™ (Crystalline Nicotinamide Ribosid... \n",
"1 PROCEDURE: Low pressure pneumoperitoneum|PROCE... \n",
"2 DRUG: amoxicillin plus metronidazole|DRUG: Dox... \n",
"3 COMBINATION_PRODUCT: VC-02 Combination Product... \n",
"4 DEVICE: Delivery of iStride™ device gait treat... \n",
"\n",
" Primary Outcome Measures ... Age \\\n",
"0 Production of Estradiol, To determine whether ... ... ADULT, OLDER_ADULT \n",
"1 Inflammatory Peritoneal Markers, logaritmic le... ... ADULT, OLDER_ADULT \n",
"2 Decrease of periodontal pockets ≥ 4mm, • Probi... ... CHILD, ADULT \n",
"3 Incidence of all adverse events reported for s... ... ADULT, OLDER_ADULT \n",
"4 Feasibility of safely implementing the treatme... ... ADULT, OLDER_ADULT \n",
"\n",
" Phases Enrollment Funder Type Study Type \\\n",
"0 NaN 40 OTHER INTERVENTIONAL \n",
"1 NaN 100 OTHER INTERVENTIONAL \n",
"2 PHASE1|PHASE2 24 OTHER INTERVENTIONAL \n",
"3 PHASE1 3 INDUSTRY INTERVENTIONAL \n",
"4 NaN 6 INDUSTRY INTERVENTIONAL \n",
"\n",
" Study Design Other IDs \\\n",
"0 Allocation: NA|Intervention Model: SINGLE_GROU... USAH-EH301 \n",
"1 Allocation: RANDOMIZED|Intervention Model: PAR... A-CGyD-2017 \n",
"2 Allocation: NON_RANDOMIZED|Intervention Model:... DOXYAPG18 \n",
"3 Allocation: NA|Intervention Model: SINGLE_GROU... VC02-102 \n",
"4 Allocation: NA|Intervention Model: SINGLE_GROU... MOT-TELE-2020-04-00 \n",
"\n",
" Locations \\\n",
"0 University of South Alabama, Mobile, Alabama, ... \n",
"1 Ramon y Cajal Hospital, Madrid, 28034, Spain \n",
"2 BENRACHADI Latifa, Rabat, Morocco \n",
"3 University of Alberta, Edmonton, Alberta, Canada \n",
"4 Moterum Technologies, Inc. (study location: ho... \n",
"\n",
" Study Documents Time taken for Enrollment \n",
"0 NaN 3 \n",
"1 Study Protocol and Statistical Analysis Plan, ... 27 \n",
"2 NaN 5 \n",
"3 NaN 7 \n",
"4 NaN 8 \n",
"\n",
"[5 rows x 25 columns]\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"df = df.drop(df.columns[0], axis=1)"
],
"metadata": {
"id": "k-Mb2MQbP60s"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"print(df.dtypes)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "FKeOAM3tP9-M",
"outputId": "eca2d10b-a590-4ecf-e8a6-17816e8eace6"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Study Title object\n",
"Study URL object\n",
"Acronym object\n",
"Study Status object\n",
"Brief Summary object\n",
"Study Results object\n",
"Conditions object\n",
"Interventions object\n",
"Primary Outcome Measures object\n",
"Secondary Outcome Measures object\n",
"Other Outcome Measures object\n",
"Sponsor object\n",
"Collaborators object\n",
"Sex object\n",
"Age object\n",
"Phases object\n",
"Enrollment object\n",
"Funder Type object\n",
"Study Type object\n",
"Study Design object\n",
"Other IDs object\n",
"Locations object\n",
"Study Documents object\n",
"Time taken for Enrollment object\n",
"dtype: object\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import re\n",
"from nltk.corpus import stopwords\n",
"from nltk.tokenize import word_tokenize\n",
"import nltk\n",
"\n",
"# Download necessary NLTK data\n",
"nltk.download('stopwords')\n",
"nltk.download('punkt')\n",
"nltk.download('punkt_tab') # Download punkt_tab data\n",
"\n",
"# Define the columns to process\n",
"text_columns = [\n",
" 'Study Title', 'Study Status', 'Brief Summary', 'Study Results',\n",
" 'Conditions', 'Interventions', 'Primary Outcome Measures',\n",
" 'Funder Type', 'Study Type', 'Study Design', 'Locations'\n",
"]\n",
"\n",
"# Define a function to preprocess text (remove prepositions and stopwords)\n",
"def preprocess_text(text):\n",
" if pd.isnull(text):\n",
" return text # Skip processing for NaN values\n",
" # Tokenize and remove punctuation\n",
" words = word_tokenize(re.sub(r'[^\\w\\s]', '', text.lower()))\n",
" # Remove stopwords (including prepositions)\n",
" filtered_words = [word for word in words if word not in stopwords.words('english')]\n",
" return ' '.join(filtered_words)\n",
"\n",
"# Apply preprocessing to specified columns\n",
"for column in text_columns:\n",
" if column in df.columns:\n",
" df[column] = df[column].apply(preprocess_text)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "EK48M-dFQBJW",
"outputId": "7f1601b1-da77-474b-efc1-9ea5b28607dc"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
"[nltk_data] Unzipping corpora/stopwords.zip.\n",
"[nltk_data] Downloading package punkt to /root/nltk_data...\n",
"[nltk_data] Unzipping tokenizers/punkt.zip.\n",
"[nltk_data] Downloading package punkt_tab to /root/nltk_data...\n",
"[nltk_data] Unzipping tokenizers/punkt_tab.zip.\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"print(df.head)"
],
"metadata": {
"id": "oizcbyiPQE-Q",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "52cd4dd2-688f-42f9-d44a-41c064365da0"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"<bound method NDFrame.head of Study Title \\\n",
"0 effects sevenday basis supplementation menopau... \n",
"1 peritoneal damage laparoscopic surgery \n",
"2 doxycycline treatment aggressive periodontitis \n",
"3 safety tolerability study vc02 combination pro... \n",
"4 treatment hemiparetic gait impairments using t... \n",
"... ... \n",
"3098 comparative effectiveness costeffectiveness ch... \n",
"3099 impact intestinal microbiota treatment ceftria... \n",
"3100 mental balance study \n",
"3101 glymphatic kinetics healthy adult volunteers \n",
"3102 utilizing mychart assess effectiveness interve... \n",
"\n",
" Study URL Acronym Study Status \\\n",
"0 https://clinicaltrials.gov/study/NCT04841499 NaN completed \n",
"1 https://clinicaltrials.gov/study/NCT03020641 NaN completed \n",
"2 https://clinicaltrials.gov/study/NCT03727620 NaN completed \n",
"3 https://clinicaltrials.gov/study/NCT03162926 NaN completed \n",
"4 https://clinicaltrials.gov/study/NCT04434313 NaN completed \n",
"... ... ... ... \n",
"3098 https://clinicaltrials.gov/study/NCT03294785 NaN completed \n",
"3099 https://clinicaltrials.gov/study/NCT03179384 CEFIMPACT completed \n",
"3100 https://clinicaltrials.gov/study/NCT05757050 NaN completed \n",
"3101 https://clinicaltrials.gov/study/NCT03218111 NaN completed \n",
"3102 https://clinicaltrials.gov/study/NCT05222464 NaN completed \n",
"\n",
" Brief Summary Study Results \\\n",
"0 purpose study determine whether short suppleme... \n",
"1 investigators hypothesized applying low intrap... yes \n",
"2 aim study compare clinical effects systemic us... \n",
"3 purpose trial test vc02 combination product im... \n",
"4 objective research investigate feasibility del... \n",
"... ... ... \n",
"3098 multicenter randomized controlled trial assess... \n",
"3099 acute pyelonephritis apn corresponds infection... \n",
"3100 proposed design randomised doubleblind control... \n",
"3101 study done order see gadoliniumbased mri contr... \n",
"3102 vasomotor symptoms vms common consequence syst... \n",
"\n",
" Conditions \\\n",
"0 menopause \n",
"1 peritoneal damage \n",
"2 aggressive periodontitis \n",
"3 type 1 diabetes mellitus \n",
"4 telemedicinegait hemiplegicgait disorders neur... \n",
"... ... \n",
"3098 chronic neck pain \n",
"3099 pyelonephritis acute \n",
"3100 mental health wellness 1work related stress \n",
"3101 healthy \n",
"3102 breast cancer \n",
"\n",
" Interventions \\\n",
"0 drug basis crystalline nicotinamide riboside 2... \n",
"1 procedure low pressure pneumoperitoneumprocedu... \n",
"2 drug amoxicillin plus metronidazoledrug doxycy... \n",
"3 combination_product vc02 combination product a... \n",
"4 device delivery istride device gait treatment ... \n",
"... ... \n",
"3098 procedure chuna manual therapydrug conventiona... \n",
"3099 drug ceftriaxone \n",
"3100 dietary_supplement refocus verum tabletsdietar... \n",
"3101 mr imagingother ctguidance \n",
"3102 standard care treatments \n",
"\n",
" Primary Outcome Measures \\\n",
"0 production estradiol determine whether short s... \n",
"1 inflammatory peritoneal markers logaritmic lev... \n",
"2 decrease periodontal pockets 4mm probing pocke... \n",
"3 incidence adverse events reported subjects thr... \n",
"4 feasibility safely implementing treatment prot... \n",
"... ... \n",
"3098 difference visual analogue scale vas neck pain... \n",
"3099 emergence ceftriaxoneresistant enterobacteriac... \n",
"3100 change cognitive function cognitive domain fac... \n",
"3101 drug distribution time drug distribution gadol... \n",
"3102 patient engagement mychart accessibility user ... \n",
"\n",
" Secondary Outcome Measures ... \\\n",
"0 NaN ... \n",
"1 NaN ... \n",
"2 Plaque index decrease, Plaque index was assess... ... \n",
"3 NaN ... \n",
"4 Feasibility of screening criteria, To enroll p... ... \n",
"... ... ... \n",
"3098 Difference between visual analogue scale (VAS)... ... \n",
"3099 NaN ... \n",
"3100 Profile of Mood States (POMS), 35-item measure... ... \n",
"3101 NaN ... \n",
"3102 Hot Flash Severity (MyChart Feasibility), Hot ... ... \n",
"\n",
" Age Phases Enrollment Funder Type \\\n",
"0 ADULT, OLDER_ADULT NaN 40 \n",
"1 ADULT, OLDER_ADULT NaN 100 \n",
"2 CHILD, ADULT PHASE1|PHASE2 24 \n",
"3 ADULT, OLDER_ADULT PHASE1 3 industry \n",
"4 ADULT, OLDER_ADULT NaN 6 industry \n",
"... ... ... ... ... \n",
"3098 ADULT NaN 108 \n",
"3099 ADULT, OLDER_ADULT PHASE4 9 \n",
"3100 ADULT, OLDER_ADULT NaN 36 industry \n",
"3101 ADULT, OLDER_ADULT NaN 19 \n",
"3102 ADULT, OLDER_ADULT PHASE4 56 \n",
"\n",
" Study Type Study Design \\\n",
"0 interventional allocation naintervention model single_groupma... \n",
"1 interventional allocation randomizedintervention model parall... \n",
"2 interventional allocation non_randomizedintervention model pa... \n",
"3 interventional allocation naintervention model single_groupma... \n",
"4 interventional allocation naintervention model single_groupma... \n",
"... ... ... \n",
"3098 interventional allocation randomizedintervention model parall... \n",
"3099 interventional allocation naintervention model single_groupma... \n",
"3100 interventional allocation randomizedintervention model crosso... \n",
"3101 interventional allocation naintervention model single_groupma... \n",
"3102 interventional allocation naintervention model single_groupma... \n",
"\n",
" Other IDs \\\n",
"0 USAH-EH301 \n",
"1 A-CGyD-2017 \n",
"2 DOXYAPG18 \n",
"3 VC02-102 \n",
"4 MOT-TELE-2020-04-00 \n",
"... ... \n",
"3098 JS-CT-2016-14 \n",
"3099 16-AOI-02 \n",
"3100 5'000'750-1 \n",
"3101 1609017536 \n",
"3102 REaCT-Hot Flashes Pilot \n",
"\n",
" Locations \\\n",
"0 university south alabama mobile alabama 36604 ... \n",
"1 ramon cajal hospital madrid 28034 spain \n",
"2 benrachadi latifa rabat morocco \n",
"3 university alberta edmonton alberta canada \n",
"4 moterum technologies inc study location homes ... \n",
"... ... \n",
"3098 bucheon jaseng hospital korean medicine bucheo... \n",
"3099 chu de nice nice 06003 france \n",
"3100 northumbria university newcastle upon tyne tyn... \n",
"3101 weill cornell medical college new york new yor... \n",
"3102 ottawa hospital cancer centre ottawa ontario c... \n",
"\n",
" Study Documents \\\n",
"0 NaN \n",
"1 Study Protocol and Statistical Analysis Plan, ... \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN \n",
"... ... \n",
"3098 NaN \n",
"3099 NaN \n",
"3100 NaN \n",
"3101 NaN \n",
"3102 NaN \n",
"\n",
" Time taken for Enrollment \n",
"0 3 \n",
"1 27 \n",
"2 5 \n",
"3 7 \n",
"4 8 \n",
"... ... \n",
"3098 10 \n",
"3099 20 \n",
"3100 6 \n",
"3101 12 \n",
"3102 5 \n",
"\n",
"[3103 rows x 24 columns]>\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import nltk\n",
"nltk.download('wordnet') # Download the wordnet dataset\n",
"\n",
"from nltk.corpus import wordnet\n",
"\n",
"# ... (Rest of your code)\n",
"\n",
"# Function to expand synonyms for a given word\n",
"def get_synonyms(word):\n",
" synonyms = set()\n",
" for syn in wordnet.synsets(word):\n",
" for lemma in syn.lemmas():\n",
" synonyms.add(lemma.name().replace('_', ' '))\n",
" return list(synonyms)\n",
"\n",
"# Function to expand synonyms in a text\n",
"def synonym_expansion(text):\n",
" if pd.isnull(text):\n",
" return text # Skip processing for NaN values\n",
" words = text.split() # Split the cleaned text into words\n",
" expanded_text = []\n",
" for word in words:\n",
" expanded_text.append(word) # Add the original word\n",
" expanded_text.extend(get_synonyms(word)) # Add synonyms\n",
" return ' '.join(set(expanded_text)) # Deduplicate and join back to a string\n",
"\n",
"# Apply synonym expansion to the same text columns\n",
"for column in text_columns:\n",
" if column in df.columns:\n",
" df[column] = df[column].apply(synonym_expansion)\n",
"\n",
"\n"
],
"metadata": {
"id": "VxCT0bvfQJF6",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "8997c388-c278-40cb-9d6a-557b1fcea3a6"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"[nltk_data] Downloading package wordnet to /root/nltk_data...\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"from sklearn.preprocessing import LabelEncoder\n",
"\n",
"# List of columns to label encode\n",
"label_columns = ['Sex', 'Age', 'Phases', 'Enrollment']\n",
"\n",
"# Initialize a dictionary to store encoders for each column (optional: for inverse transformations later)\n",
"label_encoders = {}\n",
"\n",
"# Apply label encoding to each specified column\n",
"for column in label_columns:\n",
" if column in df.columns:\n",
" le = LabelEncoder()\n",
" df[column] = le.fit_transform(df[column].astype(str)) # Ensure all data is treated as string\n",
" label_encoders[column] = le"
],
"metadata": {
"id": "tBlxiVGRQMwy"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"columns_to_check = [\n",
" 'Sex', 'Age', 'Phases', 'Enrollment', 'Study Title', 'Study Status', 'Brief Summary',\n",
" 'Study Results', 'Conditions', 'Interventions', 'Primary Outcome Measures',\n",
" 'Funder Type', 'Study Type', 'Study Design', 'Locations', 'Time taken for Enrollment'\n",
"]\n",
"\n",
"# Filter rows where all specified columns have non-null values\n",
"filtered_df = df.dropna(subset=columns_to_check)\n",
"\n",
"# Select the top 1000 rows\n",
"top_1000_rows = filtered_df.head(1000)"
],
"metadata": {
"id": "LFnloowZQPqa"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from transformers import AutoTokenizer, AutoModel\n",
"import torch\n",
"\n",
"# Load the PubMedBERT model and tokenizer\n",
"model_name = \"microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext\"\n",
"tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
"model = AutoModel.from_pretrained(model_name)\n",
"\n",
"# Function to generate embeddings\n",
"def generate_embedding(text, tokenizer, model):\n",
" if pd.isna(text) or text.strip() == \"\": # Handle empty or NaN values\n",
" return torch.zeros(768).tolist() # Return a zero vector if the text is empty\n",
" inputs = tokenizer(text, return_tensors=\"pt\", truncation=True, max_length=512, padding=True)\n",
" with torch.no_grad():\n",
" outputs = model(**inputs)\n",
" return outputs.last_hidden_state.mean(dim=1).squeeze().tolist() # Mean pooling of embeddings\n",
"\n",
"# Columns to process\n",
"text_columns = [\n",
" 'Study Title', 'Study Status', 'Brief Summary', 'Study Results',\n",
" 'Conditions', 'Interventions', 'Primary Outcome Measures',\n",
" 'Funder Type', 'Study Type', 'Study Design', 'Locations'\n",
"]\n",
"\n",
"# Generate embeddings for each column using PubMedBERT\n",
"for col in text_columns:\n",
" print(f\"Generating embeddings for column: {col} using PubMedBERT\")\n",
" top_1000_rows[f\"{col}_embedding\"] = top_1000_rows[col].apply(\n",
" lambda x: generate_embedding(x, tokenizer, model)\n",
" )\n"
],
"metadata": {
"id": "WB6AGY7uQSIR",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000,
"referenced_widgets": [
"e41925970dc94c2aa90b4da8acac85cf",
"c1f0f99a4a4341d18e0039a71b305111",
"d54d01c91e694b3eb180170cddd1211c",
"3c707baf63cf4cbf9732e39ddeb84c7c",
"08955c157996426eaa089d878b8d820f",
"da9c8ff09bed4c6989b0246c7bdc8fd7",
"b78266bbcc0e4ed0bcaab9009b30d98e",
"2fd8c28ebf1b47f78e2e0093608407cf",
"8c728d06a0814cc8b83931b24f0115d9",
"da4b0713ce9f47208a39ec89515c30e4",
"b2fd795e6a534319a87213f05e824315",
"0722a8ceb94345a79e5bdc3b84f71d7f",
"7c04cff87d1041b1878488f5014f9b7c",
"0ef4539514a1462caacf58e5c3fa516b",
"4c61a2b37c4c4ffbb3a10d270140724f",
"d1288d4268e74bd384064c4f2cfbfa7b",
"1c84ce5cb56a4c7286c0f5981b8fbd5a",
"61a01f78b9714e4aa0d869d8e6563e81",
"180753d93be944e98c47bba836397ba0",
"db137743dfc24b6b84c6c15bcd5b5766",
"16f88feb88fc43b5b381521d91b02150",
"9b2a4006744a4940aa64fd9e41d62d4e",
"d55048b2772b47eab27637e9aa42d4dc",
"a70574b040b14832abb25d2eb040fb0a",
"e5c3bf5ade9a405094a1673bc2beb795",
"3d7c0e5ba179442eb417435ff72ae85a",
"b8aea249ccf04a619c1e926e7b6b6bbe",
"73554830b9524f1b8b8714ff33c3b8e8",
"15bf509be6d946e187e14fd6854950a9",
"65855073adcd4434b8e9af65ecc59915",
"f3485f93a7374df49cc200d18e9ac49e",
"53d6422031bf48dab34a3b47e237da93",
"1ebb572142e8490199aa8ffce39df341",
"efe460065f0448acbf75200543187bb0",
"0e6e8bc125874564bb46431ac85524dc",
"30d5690beb7341dc8b37de5a1e0812ab",
"e8c76754ea784709bae0c59843c61279",
"67690c975154491b96ed9091db152a0f",
"74bbdf59962d41059f018e6eab297968",
"2c20eb99653d4026ace81ccbe858b479",
"183aee869ce34eaca26f1cf533559caa",
"d4622e6a3301483c91a66836d5724e0e",
"d9c5d0256f534e4cbd82797e1e9f3ccd",
"0dc7017260764c9dbdcb9167e7a40b15"
]
},
"outputId": "4ca695b1-5a35-4c3d-d856-5e6450e388a2"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning: \n",
"The secret `HF_TOKEN` does not exist in your Colab secrets.\n",
"To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n",
"You will be able to reuse this secret in all of your notebooks.\n",
"Please note that authentication is recommended but still optional to access public models or datasets.\n",
" warnings.warn(\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"tokenizer_config.json: 0%| | 0.00/28.0 [00:00<?, ?B/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "e41925970dc94c2aa90b4da8acac85cf"
}
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"config.json: 0%| | 0.00/385 [00:00<?, ?B/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "0722a8ceb94345a79e5bdc3b84f71d7f"
}
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"vocab.txt: 0%| | 0.00/226k [00:00<?, ?B/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "d55048b2772b47eab27637e9aa42d4dc"
}
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"pytorch_model.bin: 0%| | 0.00/440M [00:00<?, ?B/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "efe460065f0448acbf75200543187bb0"
}
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Generating embeddings for column: Study Title using PubMedBERT\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"<ipython-input-11-7092d7125584>:28: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" top_1000_rows[f\"{col}_embedding\"] = top_1000_rows[col].apply(\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Generating embeddings for column: Study Status using PubMedBERT\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"<ipython-input-11-7092d7125584>:28: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" top_1000_rows[f\"{col}_embedding\"] = top_1000_rows[col].apply(\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Generating embeddings for column: Brief Summary using PubMedBERT\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"<ipython-input-11-7092d7125584>:28: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" top_1000_rows[f\"{col}_embedding\"] = top_1000_rows[col].apply(\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Generating embeddings for column: Study Results using PubMedBERT\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"<ipython-input-11-7092d7125584>:28: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" top_1000_rows[f\"{col}_embedding\"] = top_1000_rows[col].apply(\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Generating embeddings for column: Conditions using PubMedBERT\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"<ipython-input-11-7092d7125584>:28: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" top_1000_rows[f\"{col}_embedding\"] = top_1000_rows[col].apply(\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Generating embeddings for column: Interventions using PubMedBERT\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"<ipython-input-11-7092d7125584>:28: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" top_1000_rows[f\"{col}_embedding\"] = top_1000_rows[col].apply(\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Generating embeddings for column: Primary Outcome Measures using PubMedBERT\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"<ipython-input-11-7092d7125584>:28: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" top_1000_rows[f\"{col}_embedding\"] = top_1000_rows[col].apply(\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Generating embeddings for column: Funder Type using PubMedBERT\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"<ipython-input-11-7092d7125584>:28: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" top_1000_rows[f\"{col}_embedding\"] = top_1000_rows[col].apply(\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Generating embeddings for column: Study Type using PubMedBERT\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"<ipython-input-11-7092d7125584>:28: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" top_1000_rows[f\"{col}_embedding\"] = top_1000_rows[col].apply(\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Generating embeddings for column: Study Design using PubMedBERT\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"<ipython-input-11-7092d7125584>:28: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" top_1000_rows[f\"{col}_embedding\"] = top_1000_rows[col].apply(\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Generating embeddings for column: Locations using PubMedBERT\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"<ipython-input-11-7092d7125584>:28: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" top_1000_rows[f\"{col}_embedding\"] = top_1000_rows[col].apply(\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Colab-only dependency. Pinned to the version this notebook was last run with;\n",
"# %pip (rather than !pip) installs into the environment of the running kernel.\n",
"%pip install -q umap-learn==0.5.7"
],
"metadata": {
"id": "gfPr4CcMQWMx",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "094e9427-18ef-40b5-a216-afd5416b4b69"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Collecting umap-learn\n",
" Downloading umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)\n",
"Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from umap-learn) (1.26.4)\n",
"Requirement already satisfied: scipy>=1.3.1 in /usr/local/lib/python3.10/dist-packages (from umap-learn) (1.13.1)\n",
"Requirement already satisfied: scikit-learn>=0.22 in /usr/local/lib/python3.10/dist-packages (from umap-learn) (1.6.0)\n",
"Requirement already satisfied: numba>=0.51.2 in /usr/local/lib/python3.10/dist-packages (from umap-learn) (0.60.0)\n",
"Collecting pynndescent>=0.5 (from umap-learn)\n",
" Downloading pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from umap-learn) (4.67.1)\n",
"Requirement already satisfied: llvmlite<0.44,>=0.43.0dev0 in /usr/local/lib/python3.10/dist-packages (from numba>=0.51.2->umap-learn) (0.43.0)\n",
"Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.10/dist-packages (from pynndescent>=0.5->umap-learn) (1.4.2)\n",
"Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.22->umap-learn) (3.5.0)\n",
"Downloading umap_learn-0.5.7-py3-none-any.whl (88 kB)\n",
"\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/88.8 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m88.8/88.8 kB\u001b[0m \u001b[31m3.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading pynndescent-0.5.13-py3-none-any.whl (56 kB)\n",
"\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/56.9 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.9/56.9 kB\u001b[0m \u001b[31m4.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hInstalling collected packages: pynndescent, umap-learn\n",
"Successfully installed pynndescent-0.5.13 umap-learn-0.5.7\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"from sklearn.manifold import TSNE\n",
"import umap\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.metrics import pairwise_distances\n",
"import numpy as np\n",
"\n",
"# Per-field embedding columns (\"<field>_embedding\") that are merged into one\n",
"# feature vector per row.\n",
"embedding_columns = [f\"{col}_embedding\" for col in [\n",
"    'Study Title', 'Study Status', 'Brief Summary', 'Study Results',\n",
"    'Conditions', 'Interventions', 'Primary Outcome Measures',\n",
"    'Funder Type', 'Study Type', 'Study Design', 'Locations'\n",
"]]\n",
"\n",
"# Concatenate every field's embedding end-to-end for each row, then stack the\n",
"# per-row vectors into a single 2-D array.\n",
"# NOTE(review): assumes each *_embedding cell holds a 1-D array - confirm\n",
"# against the cell that generates the embeddings.\n",
"embedding_data = np.vstack(\n",
"    top_1000_rows[embedding_columns].apply(\n",
"        lambda row: np.concatenate(row.values), axis=1\n",
"    )\n",
")\n",
"\n",
"# Euclidean distance between every pair of rows.\n",
"# NOTE(review): nothing below in this cell reads `distances`; it is kept in\n",
"# case a later cell depends on it.\n",
"distances = pairwise_distances(embedding_data, metric='euclidean')\n",
"\n",
"# Sweep the UMAP output dimensionality and record, for each setting, the\n",
"# variance of the reduced data summed over its components.\n",
"inertia = []\n",
"dimensions = range(2, 51)  # Test dimensions 2 through 50 (range end is exclusive)\n",
"for dim in dimensions:\n",
"    umap_reducer = umap.UMAP(n_components=dim, random_state=42)\n",
"    transformed = umap_reducer.fit_transform(embedding_data)\n",
"    inertia.append(np.sum(np.var(transformed, axis=0)))  # Variance in reduced dimensions\n",
"\n",
"# Plot the elbow curve\n",
"fig, ax = plt.subplots(figsize=(8, 5))\n",
"ax.plot(dimensions, inertia, marker='o')\n",
"ax.set_title(\"Elbow Method for UMAP Dimension Reduction\")\n",
"ax.set_xlabel(\"Number of Components\")\n",
"ax.set_ylabel(\"Sum of Variance\")\n",
"ax.grid()\n",
"plt.show()"
],
"metadata": {
"id": "g6Jg9xf1Vq8B",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
},
"outputId": "1409e037-88f0-4038-d687-3ead90037e6a"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"/usr/local/lib/python3.10/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n",
" warn(\n",
"/usr/local/lib/python3.10/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n",
" warn(\n",
"/usr/local/lib/python3.10/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n",
" warn(\n",
"/usr/local/lib/python3.10/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n",
" warn(\n",
"/usr/local/lib/python3.10/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n",
" warn(\n",
"/usr/local/lib/python3.10/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n",
" warn(\n",
"/usr/local/lib/python3.10/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n",
" warn(\n",
"/usr/local/lib/python3.10/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n",
" warn(\n",
"/usr/local/lib/python3.10/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n",
" warn(\n",
"/usr/local/lib/python3.10/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n",
" warn(\n",
"/usr/local/lib/python3.10/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n",
" warn(\n",
"/usr/local/lib/python3.10/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n",
" warn(\n",
"/usr/local/lib/python3.10/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n",
" warn(\n",
"/usr/local/lib/python3.10/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n",
" warn(\n",
"/usr/local/lib/python3.10/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n",
" warn(\n",
"/usr/local/lib/python3.10/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n",
" warn(\n",
"/usr/local/lib/python3.10/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n",
" warn(\n",
"/usr/local/lib/python3.10/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n",
" warn(\n",
"/usr/local/lib/python3.10/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n",
" warn(\n",
"/usr/local/lib/python3.10/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n",
" warn(\n",
"/usr/local/lib/python3.10/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n",
" warn(\n",
"/usr/local/lib/python3.10/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n",
" warn(\n",
"/usr/local/lib/python3.10/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n",
" warn(\n",
"/usr/local/lib/python3.10/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n",
" warn(\n",
"/usr/local/lib/python3.10/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n",
" warn(\n",
"/usr/local/lib/python3.10/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n",
" warn(\n",
"/usr/local/lib/python3.10/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n",
" warn(\n",
"/usr/local/lib/python3.10/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n",
" warn(\n",
"/usr/local/lib/python3.10/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n",
" warn(\n",
"/usr/local/lib/python3.10/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n",
" warn(\n",
"/usr/local/lib/python3.10/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n",
" warn(\n",
"/usr/local/lib/python3.10/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n",
" warn(\n",
"/usr/local/lib/python3.10/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n",
" warn(\n",
"/usr/local/lib/python3.10/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n",
" warn(\n",
"/usr/local/lib/python3.10/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n",
" warn(\n",
"/usr/local/lib/python3.10/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n",
" warn(\n",
"/usr/local/lib/python3.10/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n",
" warn(\n",
"/usr/local/lib/python3.10/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n",
" warn(\n",
"/usr/local/lib/python3.10/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n",
" warn(\n",
"/usr/local/lib/python3.10/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n",
" warn(\n",
"/usr/local/lib/python3.10/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n",
" warn(\n",
"/usr/local/lib/python3.10/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n",
" warn(\n",
"/usr/local/lib/python3.10/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n",
" warn(\n",
"/usr/local/lib/python3.10/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n",
" warn(\n",
"/usr/local/lib/python3.10/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n",
" warn(\n",
"/usr/local/lib/python3.10/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n",
" warn(\n",
"/usr/local/lib/python3.10/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n",
" warn(\n",
"/usr/local/lib/python3.10/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n",
" warn(\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"<Figure size 800x500 with 1 Axes>"
],
"image/png": "\n"
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"source": [
"# Number of UMAP components to keep. Example value: replace it with the elbow\n",
"# read off the plot produced by the previous cell.\n",
"optimal_dims = 40\n",
"\n",
"# Fit UMAP once at the chosen dimensionality (seeded via random_state).\n",
"umap_reducer = umap.UMAP(n_components=optimal_dims, random_state=42)\n",
"reduced_embeddings = umap_reducer.fit_transform(embedding_data)\n",
"\n",
"# Attach UMAP_1 .. UMAP_<optimal_dims> in one step. `assign` returns a new\n",
"# DataFrame, so rebinding the name avoids the SettingWithCopyWarning that\n",
"# column-by-column assignment raised on this sliced frame, and re-running the\n",
"# cell simply overwrites the same columns.\n",
"top_1000_rows = top_1000_rows.assign(**{\n",
"    f\"UMAP_{i + 1}\": reduced_embeddings[:, i] for i in range(optimal_dims)\n",
"})"
],
"metadata": {
"id": "XZbYj6Eqp9Hf",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "54e88763-5ff2-401e-e6b1-e6d3f6d1705c"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"/usr/local/lib/python3.10/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n",
" warn(\n",
"<ipython-input-14-de94449a2449>:9: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" top_1000_rows[f\"UMAP_{i+1}\"] = reduced_embeddings[:, i]\n",
"<ipython-input-14-de94449a2449>:9: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" top_1000_rows[f\"UMAP_{i+1}\"] = reduced_embeddings[:, i]\n",
"<ipython-input-14-de94449a2449>:9: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" top_1000_rows[f\"UMAP_{i+1}\"] = reduced_embeddings[:, i]\n",
"<ipython-input-14-de94449a2449>:9: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" top_1000_rows[f\"UMAP_{i+1}\"] = reduced_embeddings[:, i]\n",
"<ipython-input-14-de94449a2449>:9: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" top_1000_rows[f\"UMAP_{i+1}\"] = reduced_embeddings[:, i]\n",
"<ipython-input-14-de94449a2449>:9: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" top_1000_rows[f\"UMAP_{i+1}\"] = reduced_embeddings[:, i]\n",
"<ipython-input-14-de94449a2449>:9: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" top_1000_rows[f\"UMAP_{i+1}\"] = reduced_embeddings[:, i]\n",
"<ipython-input-14-de94449a2449>:9: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" top_1000_rows[f\"UMAP_{i+1}\"] = reduced_embeddings[:, i]\n",
"<ipython-input-14-de94449a2449>:9: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" top_1000_rows[f\"UMAP_{i+1}\"] = reduced_embeddings[:, i]\n",
"<ipython-input-14-de94449a2449>:9: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" top_1000_rows[f\"UMAP_{i+1}\"] = reduced_embeddings[:, i]\n",
"<ipython-input-14-de94449a2449>:9: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" top_1000_rows[f\"UMAP_{i+1}\"] = reduced_embeddings[:, i]\n",
"<ipython-input-14-de94449a2449>:9: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" top_1000_rows[f\"UMAP_{i+1}\"] = reduced_embeddings[:, i]\n",
"<ipython-input-14-de94449a2449>:9: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" top_1000_rows[f\"UMAP_{i+1}\"] = reduced_embeddings[:, i]\n",
"<ipython-input-14-de94449a2449>:9: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" top_1000_rows[f\"UMAP_{i+1}\"] = reduced_embeddings[:, i]\n",
"<ipython-input-14-de94449a2449>:9: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" top_1000_rows[f\"UMAP_{i+1}\"] = reduced_embeddings[:, i]\n",
"<ipython-input-14-de94449a2449>:9: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" top_1000_rows[f\"UMAP_{i+1}\"] = reduced_embeddings[:, i]\n",
"<ipython-input-14-de94449a2449>:9: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" top_1000_rows[f\"UMAP_{i+1}\"] = reduced_embeddings[:, i]\n",
"<ipython-input-14-de94449a2449>:9: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" top_1000_rows[f\"UMAP_{i+1}\"] = reduced_embeddings[:, i]\n",
"<ipython-input-14-de94449a2449>:9: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" top_1000_rows[f\"UMAP_{i+1}\"] = reduced_embeddings[:, i]\n",
"<ipython-input-14-de94449a2449>:9: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" top_1000_rows[f\"UMAP_{i+1}\"] = reduced_embeddings[:, i]\n",
"<ipython-input-14-de94449a2449>:9: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" top_1000_rows[f\"UMAP_{i+1}\"] = reduced_embeddings[:, i]\n",
"<ipython-input-14-de94449a2449>:9: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" top_1000_rows[f\"UMAP_{i+1}\"] = reduced_embeddings[:, i]\n",
"<ipython-input-14-de94449a2449>:9: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" top_1000_rows[f\"UMAP_{i+1}\"] = reduced_embeddings[:, i]\n",
"<ipython-input-14-de94449a2449>:9: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" top_1000_rows[f\"UMAP_{i+1}\"] = reduced_embeddings[:, i]\n",
"<ipython-input-14-de94449a2449>:9: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" top_1000_rows[f\"UMAP_{i+1}\"] = reduced_embeddings[:, i]\n",
"<ipython-input-14-de94449a2449>:9: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" top_1000_rows[f\"UMAP_{i+1}\"] = reduced_embeddings[:, i]\n",
"<ipython-input-14-de94449a2449>:9: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" top_1000_rows[f\"UMAP_{i+1}\"] = reduced_embeddings[:, i]\n",
"<ipython-input-14-de94449a2449>:9: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" top_1000_rows[f\"UMAP_{i+1}\"] = reduced_embeddings[:, i]\n",
"<ipython-input-14-de94449a2449>:9: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" top_1000_rows[f\"UMAP_{i+1}\"] = reduced_embeddings[:, i]\n",
"<ipython-input-14-de94449a2449>:9: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" top_1000_rows[f\"UMAP_{i+1}\"] = reduced_embeddings[:, i]\n",
"<ipython-input-14-de94449a2449>:9: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" top_1000_rows[f\"UMAP_{i+1}\"] = reduced_embeddings[:, i]\n",
"<ipython-input-14-de94449a2449>:9: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" top_1000_rows[f\"UMAP_{i+1}\"] = reduced_embeddings[:, i]\n",
"<ipython-input-14-de94449a2449>:9: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" top_1000_rows[f\"UMAP_{i+1}\"] = reduced_embeddings[:, i]\n",
"<ipython-input-14-de94449a2449>:9: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" top_1000_rows[f\"UMAP_{i+1}\"] = reduced_embeddings[:, i]\n",
"<ipython-input-14-de94449a2449>:9: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" top_1000_rows[f\"UMAP_{i+1}\"] = reduced_embeddings[:, i]\n",
"<ipython-input-14-de94449a2449>:9: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" top_1000_rows[f\"UMAP_{i+1}\"] = reduced_embeddings[:, i]\n",
"<ipython-input-14-de94449a2449>:9: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" top_1000_rows[f\"UMAP_{i+1}\"] = reduced_embeddings[:, i]\n",
"<ipython-input-14-de94449a2449>:9: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" top_1000_rows[f\"UMAP_{i+1}\"] = reduced_embeddings[:, i]\n",
"<ipython-input-14-de94449a2449>:9: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" top_1000_rows[f\"UMAP_{i+1}\"] = reduced_embeddings[:, i]\n",
"<ipython-input-14-de94449a2449>:9: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" top_1000_rows[f\"UMAP_{i+1}\"] = reduced_embeddings[:, i]\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
"from sklearn.metrics import mean_squared_error\n",
"import tensorflow as tf\n",
"from tensorflow.keras.models import Sequential\n",
"from tensorflow.keras.layers import SimpleRNN, Dense, Dropout\n",
"from tensorflow.keras.optimizers import Adam\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.pipeline import Pipeline\n",
"\n",
"# Step 1: Prepare features and target\n",
"# Feature columns read from top_1000_rows.\n",
"input_columns = [\n",
"    'Sex', 'Age', 'Phases', 'Enrollment',\n",
"    'Study Title', 'Study Status', 'Brief Summary', 'Study Results',\n",
"    'Conditions', 'Interventions', 'Primary Outcome Measures',\n",
"    'Funder Type', 'Study Type', 'Study Design'\n",
"]\n",
"\n",
"target_column = 'Time taken for Enrollment'\n",
"\n",
"# Take an explicit copy: the numeric coercion below assigns columns on input_data,\n",
"# and doing that on a bare column selection of top_1000_rows raises pandas'\n",
"# SettingWithCopyWarning.\n",
"input_data = top_1000_rows[input_columns].copy()\n",
"target_data = top_1000_rows[target_column]\n",
"# NOTE(review): input_columns lists no UMAP_<n> columns, so embedding columns added to\n",
"# top_1000_rows are not part of X -- confirm this is intended.\n",
"\n",
"# Step 2: Preprocess data\n",
"# Columns that are one-hot encoded.\n",
"# NOTE(review): 'Enrollment' looks like a count rather than a category -- confirm that\n",
"# one-hot encoding it is intended.\n",
"categorical_columns = ['Sex', 'Age', 'Phases', 'Enrollment']\n",
"\n",
"# Scaler applied to the numerical columns.\n",
"scaler = StandardScaler()\n",
"\n",
"# Numerical columns are whatever is neither categorical nor text.\n",
"# text_columns is not defined in this cell; it must already exist in the kernel.\n",
"numerical_columns = [col for col in input_columns if col not in categorical_columns and col not in text_columns]\n",
"\n",
"# Coerce the model columns to numeric; values that cannot be parsed become NaN.\n",
"# NOTE(review): if 'Sex', 'Age' or 'Phases' hold text labels (TODO confirm), every value\n",
"# turns into NaN here and 0 after fillna, leaving a single constant category to encode.\n",
"for col in categorical_columns + numerical_columns:\n",
"    input_data[col] = pd.to_numeric(input_data[col], errors='coerce')\n",
"\n",
"# Replace missing values with 0 (a different imputation strategy can be used if needed).\n",
"input_data = input_data.fillna(0)\n",
"\n",
"# One-hot encode the categorical columns and scale the numerical ones; columns not listed\n",
"# in either group are dropped by ColumnTransformer's default remainder setting.\n",
"preprocessor = ColumnTransformer(\n",
"    transformers=[\n",
"        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_columns),\n",
"        ('num', scaler, numerical_columns)\n",
"    ])\n",
"\n",
"# NOTE(review): the preprocessor is fitted on all rows before the train/test split, so the\n",
"# test rows contribute to the encoder categories and scaler statistics.\n",
"X = preprocessor.fit_transform(input_data)\n",
"\n",
"# Target as a numeric array: unparseable entries become NaN, then NaN is replaced with 0.\n",
"y = pd.to_numeric(target_data.values, errors='coerce')\n",
"y = np.nan_to_num(y)\n",
"\n",
"# Step 3: Split data into training and testing sets (80:20)\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)"
],
"metadata": {
"id": "mm6YOeG_p-I0",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "eda3e0ca-7be4-454f-c7cf-3705c158c2dc"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"<ipython-input-26-207c951d05e0>:42: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" input_data[col] = pd.to_numeric(input_data[col], errors='coerce')\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"from tensorflow.keras.layers import Bidirectional, LSTM\n",
"\n",
"# Step 4: Build the BiLSTM model\n",
"bi_lstm_model = Sequential()\n",
"\n",
"# Declare the input with an explicit Input object as the first layer. Passing\n",
"# input_shape to the Reshape layer instead makes Keras emit a UserWarning.\n",
"n_features = X_train.shape[1]\n",
"bi_lstm_model.add(tf.keras.Input(shape=(n_features,)))\n",
"\n",
"# The recurrent layer expects 3D input (samples, time steps, features), so each\n",
"# feature row is reshaped into a sequence with a single time step.\n",
"bi_lstm_model.add(tf.keras.layers.Reshape((1, n_features)))\n",
"\n",
"# BiLSTM layer with 64 units per direction\n",
"bi_lstm_model.add(Bidirectional(LSTM(64, activation='relu', return_sequences=False)))\n",
"\n",
"# Dropout layer to prevent overfitting\n",
"bi_lstm_model.add(Dropout(0.2))\n",
"\n",
"# Output layer: predicting a continuous value (Time taken for Enrollment)\n",
"bi_lstm_model.add(Dense(1))\n",
"\n",
"# Step 5: Compile the BiLSTM model\n",
"bi_lstm_model.compile(optimizer=Adam(), loss='mean_squared_error')\n",
"\n",
"# Step 6: Train the BiLSTM model\n",
"# NOTE(review): validation_data is the same test split that is scored below, so val_loss\n",
"# is not independent of the reported RMSE.\n",
"bi_lstm_history = bi_lstm_model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test))\n",
"\n",
"# Step 7: Make predictions with the BiLSTM model and evaluate the RMSE\n",
"y_pred_bilstm = bi_lstm_model.predict(X_test)\n",
"\n",
"# Calculate RMSE (Root Mean Squared Error) for BiLSTM model\n",
"rmse_bilstm = np.sqrt(mean_squared_error(y_test, y_pred_bilstm))\n",
"print(f\"RMSE for BiLSTM: {rmse_bilstm}\")\n"
],
"metadata": {
"id": "0MpM2NkTqGYe",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "ed87b58e-5718-4f35-da94-a8a36b3648f0"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"/usr/local/lib/python3.10/dist-packages/keras/src/layers/reshaping/reshape.py:39: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.\n",
" super().__init__(**kwargs)\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Epoch 1/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 36ms/step - loss: 667.7067 - val_loss: 609.3492\n",
"Epoch 2/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 11ms/step - loss: 617.5209 - val_loss: 591.2498\n",
"Epoch 3/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 9ms/step - loss: 561.6722 - val_loss: 557.8972\n",
"Epoch 4/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 10ms/step - loss: 528.3793 - val_loss: 507.4259\n",
"Epoch 5/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 15ms/step - loss: 455.1757 - val_loss: 443.5311\n",
"Epoch 6/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 17ms/step - loss: 386.5001 - val_loss: 378.3217\n",
"Epoch 7/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 22ms/step - loss: 335.8933 - val_loss: 329.0683\n",
"Epoch 8/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 16ms/step - loss: 278.2339 - val_loss: 300.2128\n",
"Epoch 9/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 21ms/step - loss: 305.0649 - val_loss: 287.5830\n",
"Epoch 10/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 20ms/step - loss: 273.1591 - val_loss: 280.5639\n",
"Epoch 11/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 16ms/step - loss: 234.1577 - val_loss: 276.5512\n",
"Epoch 12/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 22ms/step - loss: 235.5674 - val_loss: 273.2498\n",
"Epoch 13/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 14ms/step - loss: 216.2296 - val_loss: 270.5114\n",
"Epoch 14/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 16ms/step - loss: 216.4830 - val_loss: 268.1384\n",
"Epoch 15/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 16ms/step - loss: 241.6802 - val_loss: 266.0169\n",
"Epoch 16/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 19ms/step - loss: 197.7046 - val_loss: 264.7904\n",
"Epoch 17/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 19ms/step - loss: 254.8440 - val_loss: 263.2259\n",
"Epoch 18/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 22ms/step - loss: 217.4115 - val_loss: 262.4696\n",
"Epoch 19/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 10ms/step - loss: 220.9622 - val_loss: 261.7163\n",
"Epoch 20/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 19ms/step - loss: 203.3700 - val_loss: 261.1432\n",
"Epoch 21/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 9ms/step - loss: 204.3532 - val_loss: 260.6401\n",
"Epoch 22/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 11ms/step - loss: 206.5609 - val_loss: 260.3468\n",
"Epoch 23/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 10ms/step - loss: 200.0678 - val_loss: 259.9577\n",
"Epoch 24/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 10ms/step - loss: 205.6685 - val_loss: 259.8051\n",
"Epoch 25/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 13ms/step - loss: 200.1108 - val_loss: 259.6185\n",
"Epoch 26/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 12ms/step - loss: 190.1116 - val_loss: 259.4397\n",
"Epoch 27/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 11ms/step - loss: 188.8570 - val_loss: 259.6147\n",
"Epoch 28/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 12ms/step - loss: 192.8143 - val_loss: 260.2402\n",
"Epoch 29/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 173.7915 - val_loss: 260.1749\n",
"Epoch 30/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 12ms/step - loss: 184.3625 - val_loss: 260.5374\n",
"Epoch 31/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 11ms/step - loss: 171.2388 - val_loss: 260.4700\n",
"Epoch 32/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 14ms/step - loss: 178.1833 - val_loss: 261.1990\n",
"Epoch 33/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 13ms/step - loss: 178.0006 - val_loss: 261.4209\n",
"Epoch 34/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 13ms/step - loss: 187.6684 - val_loss: 262.2680\n",
"Epoch 35/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 16ms/step - loss: 159.2341 - val_loss: 262.3969\n",
"Epoch 36/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 13ms/step - loss: 160.0466 - val_loss: 262.5146\n",
"Epoch 37/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 14ms/step - loss: 176.0917 - val_loss: 263.1417\n",
"Epoch 38/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 16ms/step - loss: 151.0455 - val_loss: 263.6153\n",
"Epoch 39/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 24ms/step - loss: 146.4944 - val_loss: 264.1271\n",
"Epoch 40/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 19ms/step - loss: 165.9863 - val_loss: 264.9947\n",
"Epoch 41/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 17ms/step - loss: 153.2215 - val_loss: 265.0475\n",
"Epoch 42/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 18ms/step - loss: 154.0866 - val_loss: 265.6365\n",
"Epoch 43/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 26ms/step - loss: 147.2036 - val_loss: 266.3539\n",
"Epoch 44/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 18ms/step - loss: 175.0763 - val_loss: 266.5692\n",
"Epoch 45/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 16ms/step - loss: 175.0344 - val_loss: 267.5154\n",
"Epoch 46/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 16ms/step - loss: 154.6448 - val_loss: 267.3886\n",
"Epoch 47/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 19ms/step - loss: 145.1259 - val_loss: 267.1888\n",
"Epoch 48/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 20ms/step - loss: 160.2450 - val_loss: 267.4206\n",
"Epoch 49/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 15ms/step - loss: 144.1925 - val_loss: 267.6102\n",
"Epoch 50/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 13ms/step - loss: 144.6609 - val_loss: 268.0196\n",
"\u001b[1m7/7\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 51ms/step\n",
"RMSE for BiLSTM: 16.37130237010102\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Step 4: Build the SimpleRNN model\n",
"model = Sequential()\n",
"\n",
"# Declare the input with an explicit Input object as the first layer. Passing\n",
"# input_shape to the Reshape layer instead makes Keras emit a UserWarning.\n",
"n_features = X_train.shape[1]\n",
"model.add(tf.keras.Input(shape=(n_features,)))\n",
"\n",
"# The RNN layer expects 3D input (samples, time steps, features), so each feature\n",
"# row is reshaped into a sequence with a single time step.\n",
"model.add(tf.keras.layers.Reshape((1, n_features)))\n",
"\n",
"# RNN layer with 64 units\n",
"model.add(SimpleRNN(64, activation='relu', return_sequences=False))\n",
"\n",
"# Dropout layer to prevent overfitting\n",
"model.add(Dropout(0.3))\n",
"\n",
"# Output layer: predicting a continuous value (Time taken for Enrollment)\n",
"model.add(Dense(1))\n",
"\n",
"# Step 5: Compile the model\n",
"model.compile(optimizer=Adam(), loss='mean_squared_error')\n",
"\n",
"# Step 6: Train the model\n",
"# NOTE(review): validation_data is the same test split that is scored below, so val_loss\n",
"# is not independent of the reported RMSE.\n",
"history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test))\n",
"\n",
"# Step 7: Make predictions and evaluate the model\n",
"y_pred = model.predict(X_test)\n",
"\n",
"# Calculate RMSE (Root Mean Squared Error)\n",
"rmse = np.sqrt(mean_squared_error(y_test, y_pred))\n",
"print(f\"RMSE: {rmse}\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "A6JHsy5sBjRs",
"outputId": "997929af-bf12-4569-90ba-779b22ada812"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"/usr/local/lib/python3.10/dist-packages/keras/src/layers/reshaping/reshape.py:39: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.\n",
" super().__init__(**kwargs)\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Epoch 1/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 26ms/step - loss: 579.7545 - val_loss: 598.7670\n",
"Epoch 2/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 5ms/step - loss: 541.1518 - val_loss: 571.0222\n",
"Epoch 3/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 6ms/step - loss: 532.0207 - val_loss: 529.2407\n",
"Epoch 4/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 6ms/step - loss: 454.2565 - val_loss: 477.3809\n",
"Epoch 5/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 5ms/step - loss: 415.7916 - val_loss: 424.5004\n",
"Epoch 6/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 6ms/step - loss: 452.6029 - val_loss: 376.9656\n",
"Epoch 7/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 5ms/step - loss: 358.6606 - val_loss: 340.9701\n",
"Epoch 8/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 6ms/step - loss: 310.2562 - val_loss: 314.1212\n",
"Epoch 9/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 6ms/step - loss: 305.6849 - val_loss: 297.5801\n",
"Epoch 10/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 9ms/step - loss: 249.7126 - val_loss: 288.5488\n",
"Epoch 11/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 14ms/step - loss: 230.8337 - val_loss: 281.9182\n",
"Epoch 12/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 7ms/step - loss: 264.7994 - val_loss: 277.4290\n",
"Epoch 13/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 12ms/step - loss: 230.6227 - val_loss: 274.1347\n",
"Epoch 14/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 6ms/step - loss: 224.2143 - val_loss: 271.3199\n",
"Epoch 15/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 6ms/step - loss: 218.0156 - val_loss: 269.0135\n",
"Epoch 16/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 5ms/step - loss: 235.3152 - val_loss: 266.9712\n",
"Epoch 17/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 7ms/step - loss: 251.2175 - val_loss: 265.0502\n",
"Epoch 18/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 5ms/step - loss: 251.1464 - val_loss: 263.4379\n",
"Epoch 19/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 6ms/step - loss: 220.6263 - val_loss: 261.9560\n",
"Epoch 20/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 5ms/step - loss: 232.6349 - val_loss: 260.8833\n",
"Epoch 21/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 7ms/step - loss: 230.8253 - val_loss: 259.8926\n",
"Epoch 22/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 7ms/step - loss: 226.6449 - val_loss: 258.7480\n",
"Epoch 23/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 5ms/step - loss: 211.2643 - val_loss: 258.2128\n",
"Epoch 24/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 5ms/step - loss: 204.0192 - val_loss: 257.8778\n",
"Epoch 25/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 7ms/step - loss: 246.8270 - val_loss: 257.0885\n",
"Epoch 26/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 7ms/step - loss: 221.6270 - val_loss: 256.7341\n",
"Epoch 27/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 6ms/step - loss: 190.9916 - val_loss: 256.3143\n",
"Epoch 28/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 5ms/step - loss: 205.6945 - val_loss: 256.1562\n",
"Epoch 29/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 7ms/step - loss: 238.1927 - val_loss: 255.8900\n",
"Epoch 30/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 7ms/step - loss: 199.9858 - val_loss: 255.9863\n",
"Epoch 31/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 5ms/step - loss: 218.4140 - val_loss: 256.0137\n",
"Epoch 32/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 8ms/step - loss: 233.9201 - val_loss: 255.7076\n",
"Epoch 33/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 7ms/step - loss: 216.9118 - val_loss: 256.2441\n",
"Epoch 34/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 11ms/step - loss: 214.1853 - val_loss: 255.9830\n",
"Epoch 35/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 9ms/step - loss: 216.3486 - val_loss: 256.2111\n",
"Epoch 36/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 10ms/step - loss: 175.9630 - val_loss: 256.3767\n",
"Epoch 37/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 9ms/step - loss: 204.1494 - val_loss: 256.2661\n",
"Epoch 38/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 6ms/step - loss: 205.1333 - val_loss: 256.2394\n",
"Epoch 39/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 5ms/step - loss: 194.3962 - val_loss: 256.8536\n",
"Epoch 40/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 8ms/step - loss: 192.2582 - val_loss: 257.1403\n",
"Epoch 41/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 6ms/step - loss: 200.7191 - val_loss: 257.0217\n",
"Epoch 42/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 5ms/step - loss: 179.5457 - val_loss: 257.5062\n",
"Epoch 43/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 6ms/step - loss: 182.3336 - val_loss: 257.6974\n",
"Epoch 44/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 6ms/step - loss: 186.9329 - val_loss: 258.0083\n",
"Epoch 45/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 6ms/step - loss: 175.0525 - val_loss: 257.9814\n",
"Epoch 46/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 6ms/step - loss: 208.5019 - val_loss: 258.1638\n",
"Epoch 47/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 7ms/step - loss: 204.7254 - val_loss: 258.5562\n",
"Epoch 48/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 6ms/step - loss: 185.3362 - val_loss: 258.8849\n",
"Epoch 49/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 5ms/step - loss: 185.8382 - val_loss: 259.3331\n",
"Epoch 50/50\n",
"\u001b[1m25/25\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 4ms/step - loss: 179.8962 - val_loss: 259.4713\n",
"\u001b[1m7/7\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 24ms/step\n",
"RMSE: 16.108112997784122\n"
]
}
]
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "cB3wFHSgCFjN"
},
"execution_count": null,
"outputs": []
}
]
}