[dee452]: / Data_processing / criteria_embeddings_BERT.ipynb

Download this file

1 lines (1 with data), 57.5 kB

{"cells":[{"cell_type":"markdown","source":["## Imports"],"metadata":{"id":"4da2Sb7ct74V"}},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":36396,"status":"ok","timestamp":1651325792492,"user":{"displayName":"Thomas Smits","userId":"13349275365625627797"},"user_tz":240},"id":"avEwtXgX61wn","outputId":"3daca637-9de2-4c0c-85e4-53dbdf5cc608"},"outputs":[{"output_type":"stream","name":"stdout","text":["Mounted at /content/gdrive\n"]}],"source":["from google.colab import drive\n","drive.mount('/content/gdrive')"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"xiC9awOs8aiv"},"outputs":[],"source":["import pandas as pd\n","import os"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"lDKYr8LWYqDO"},"outputs":[],"source":["# Directory to the project folder\n","deep_learning_dir = '/content/gdrive/My Drive/BMI 707 Project' "]},{"cell_type":"code","source":["train = pd.read_csv(deep_learning_dir + '/data_formatting/training_data.tsv', sep = '\\t')\n","test = pd.read_csv(deep_learning_dir + '/data_formatting/testing_data.tsv', sep = '\\t')\n","val = pd.read_csv(deep_learning_dir + '/data_formatting/validation_data.tsv', sep = '\\t')"],"metadata":{"id":"Kh9N5P-inJBX"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["merged_df = train\n","merged_df = merged_df.append(test, ignore_index = True)\n","merged_df = merged_df.append(val, ignore_index = True)\n","\n","merged_df[merged_df['criteria'].apply(lambda x: not isinstance(x, float))]"],"metadata":{"id":"HjVq_iRmsVsU"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["## Generating embeddings"],"metadata":{"id":"W9IbvFfyt_Uw"}},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":7619,"status":"ok","timestamp":1651325838933,"user":{"displayName":"Thomas Smits","userId":"13349275365625627797"},"user_tz":240},"id":"cebB6_Pvxc2l","outputId":"b874bf7e-0ce0-4127-e753-51cbbcb23b47"},"outputs":[{"output_type":"stream","name":"stdout","text":["Collecting transformers\n","  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)\n","\u001b[K     |████████████████████████████████| 4.0 MB 7.1 MB/s \n","\u001b[?25hRequirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.7/dist-packages (from transformers) (4.64.0)\n","Collecting pyyaml>=5.1\n","  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)\n","\u001b[K     |████████████████████████████████| 596 kB 59.5 MB/s \n","\u001b[?25hRequirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.7/dist-packages (from transformers) (21.3)\n","Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from transformers) (3.6.0)\n","Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from transformers) (4.11.3)\n","Collecting sacremoses\n","  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)\n","\u001b[K     |████████████████████████████████| 895 kB 77.3 MB/s \n","\u001b[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1\n","  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)\n","\u001b[K     |████████████████████████████████| 6.6 MB 68.8 MB/s \n","\u001b[?25hRequirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (2019.12.20)\n","Collecting huggingface-hub<1.0,>=0.1.0\n","  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)\n","\u001b[K     |████████████████████████████████| 77 kB 6.9 MB/s \n","\u001b[?25hRequirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (1.21.6)\n","Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from transformers) (2.23.0)\n","Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0,>=0.1.0->transformers) (4.2.0)\n","Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging>=20.0->transformers) (3.0.8)\n","Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->transformers) (3.8.0)\n","Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (3.0.4)\n","Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (2021.10.8)\n","Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (1.24.3)\n","Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (2.10)\n","Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (1.15.0)\n","Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (1.1.0)\n","Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (7.1.2)\n","Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers\n","  Attempting uninstall: pyyaml\n","    Found existing installation: PyYAML 3.13\n","    Uninstalling PyYAML-3.13:\n","      Successfully uninstalled PyYAML-3.13\n","Successfully installed huggingface-hub-0.5.1 pyyaml-6.0 sacremoses-0.0.49 tokenizers-0.12.1 transformers-4.18.0\n"]}],"source":["!pip install transformers"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"K0LQHhptn7TX"},"outputs":[],"source":["from transformers import BertModel, BertTokenizer\n","import torch\n","import pickle \n","from tqdm import tqdm\n","\n","device = torch.device(\"cuda:0\")"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"FTHgKQ0wmOls"},"outputs":[],"source":["class ClinBiobertEmbedding(object):\n","  '''\n","  Embedding class adapted from BiobertEmbedding class. \n","  Incorporates ability to initialise with own tokenizer and model, and adapted sentence encoding. \n","\n","  Example usage: \n","    clinBiobertEmbedding = ClinBiobertEmbedding(myTokenizer, myModel)\n","    embedding = ClinBiobertEmbedding.sentence_encoding(myTextToEmbed)\n","\n","  '''\n","  def __init__(self, tokenizer, model):\n","    '''\n","    Initialiser. Sets tokenizer and model of object.\n","    '''\n","    self.tokenizer = tokenizer\n","    self.model = model.to(device)\n","\n","  def process_text(self, text):\n","    '''\n","    Process text with tokenizer. \n","    Generally not directly used by user.\n","    '''\n","    marked_text = \"[CLS] \" + text + \" [SEP]\"\n","    tokenized_text = self.tokenizer.tokenize(marked_text)\n","    return tokenized_text\n","\n","  def sentence_encoding(self,text):\n","    '''\n","    Encodes sentence using tokenizer and model.\n","\n","    Input: \n","      text (str): sentence (without tokenizer marks) to encode. \n","\n","    Output: \n","      sentence_embedding (tensor): tensor with embedding of sentence.\n","    '''\n","    marked_text = \"[CLS] \" + text + \" [SEP]\"\n","    tokenized_text = self.tokenizer.tokenize(marked_text)\n","\n","    # Mark each of the tokens as belonging to sentence \"1\".\n","    segments_ids = [1] * len(tokenized_text)\n","    # Map the token strings to their vocabulary indeces.\n","    indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)\n","\n","    # Convert inputs to PyTorch tensors\n","    tokens_tensor = torch.tensor([indexed_tokens]).to(device)\n","    segments_tensors = torch.tensor([segments_ids]).to(device)\n","\n","    # Predict hidden states features for each layer\n","    bert_output = self.model(tokens_tensor, segments_tensors)\n","    token_vecs = bert_output['last_hidden_state'][0]\n","\n","    # Calculate the average of token vectors.\n","    sentence_embedding = torch.mean(token_vecs, dim=0)\n","\n","    return sentence_embedding.detach().cpu().numpy() #this returns the embedding as a copy in cpu memory as np array"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"LrPJcKg7ffWu"},"outputs":[],"source":["def get_all_protocols():\n","\t'''\n","\tMethod for getting all protocols\n","\t'''\n","\tprotocols = merged_df['criteria'].tolist()\n","\treturn protocols\n","\n","def clean_protocol(protocol):\n","\t'''\n","\tMethod for cleaning protocols, to be used in split_protocol\n","\t'''\n","\tif isinstance(protocol, float):\n","\t\tprotocol = \"No criteria\"\n","\tprotocol = protocol.lower()\n","\tprotocol_split = protocol.split('\\n')\n","\tfilter_out_empty_fn = lambda x: len(x.strip())>0\n","\tstrip_fn = lambda x:x.strip()\n","\tprotocol_split = list(filter(filter_out_empty_fn, protocol_split))\t\n","\tprotocol_split = list(map(strip_fn, protocol_split))\n","\treturn protocol_split \n","\n","def split_protocol(protocol):\n","\t'''\n","\tMethod for splitting protocols\n","\t'''\n","\tprotocol_split = clean_protocol(protocol)\n","\tinclusion_idx, exclusion_idx = len(protocol_split), len(protocol_split)\t\n","\tfor idx, sentence in enumerate(protocol_split):\n","\t\tif \"inclusion\" in sentence:\n","\t\t\tinclusion_idx = idx\n","\t\t\tbreak\n","\tfor idx, sentence in enumerate(protocol_split):\n","\t\tif \"exclusion\" in sentence:\n","\t\t\texclusion_idx = idx \n","\t\t\tbreak \t\t\n","\tif inclusion_idx + 1 < exclusion_idx + 1 < len(protocol_split):\n","\t\tinclusion_criteria = protocol_split[inclusion_idx:exclusion_idx]\n","\t\texclusion_criteria = protocol_split[exclusion_idx:]\n","\t\tif not (len(inclusion_criteria) > 0 and len(exclusion_criteria) > 0):\n","\t\t\tprint(len(inclusion_criteria), len(exclusion_criteria), len(protocol_split))\n","\t\t\texit()\n","\t\treturn inclusion_criteria, exclusion_criteria ## list, list \n","\telse:\n","\t\treturn protocol_split"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"olaEi4R8OdME"},"outputs":[],"source":["def get_embedder():\n","  '''\n","  Retrieving the clinical bio BERT embedder\n","  '''\n","  clinbert_tokenizer = BertTokenizer.from_pretrained(\"emilyalsentzer/Bio_ClinicalBERT\")\n","  clinbert_model = BertModel.from_pretrained(\"emilyalsentzer/Bio_ClinicalBERT\")\n","  clinbert_model.trainable = False\n","  clinbert_embedder = ClinBiobertEmbedding(clinbert_tokenizer, clinbert_model)\n","  return clinbert_embedder"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"gRH70NmP1Z7K"},"outputs":[],"source":["def collect_cleaned_sentence_set():\n","\t'''\n","\tObtaining a set of all possible sentences used in the data.\n","\t'''\n","\tprotocol_lst = get_all_protocols() \n","\tcleaned_sentence_lst = []\n","\tfor protocol in protocol_lst:\n","\t\tresult = split_protocol(protocol)\n","\t\tcleaned_sentence_lst.extend(result[0])\n","\t\tif len(result)==2:\n","\t\t\tcleaned_sentence_lst.extend(result[1])\n","\treturn list(set(cleaned_sentence_lst))"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":185,"referenced_widgets":["c5cba86e6a7a4011ad126f2eebfe0501","61b2df015d284874964a796fdca62d0f","0072f3db2b7847b88a3f179e474078d8","a2be369d3bff4d46878809f13496bb6e","3349eb1ef9cf45d98cdbcf3038583df2","d11faeac9e13444aaa0ff56f70543d25","9970aaadad7b405d95cbf86939ff6008","74e08bed16f54abfbe84488d2cf4b59e","555140b8bd4e4ae59e07b320dc9b4352","d6bf961e5672496c8a811b4fad30b104","0db31230db0d42238be1088a461ea7ad","f6ac67000379440e8160d4851f6b8029","dc3b43c0f3db4ee58dff7cf1ab589b76","3bd397b8c83549c3880248bb56320574","a5895c864c304d7e981fd5e40c7a0d5d","9f02ce5f4f534ad28a4a87bf44fcd1c6","bcc8ab6b913c49cfac9eb7b705bdaa20","802f8dd088c14872b85c9d790f578adf","5e24caa80ee14f81991f8f72772630b1","0ff0e159844e4127a4d0643b396f47f3","e7ab1a81065a4072a75c91d1ed3fd457","54eebb12b501447e8e37740fb222b2bc","9a2759f2bc8548d4855433565f98861f","592fec34a5dd4d30b2707b50c817aaf9","cb0055d346c4439ca49268eba023484c","bef2b49f50bc427e9f4a6175c288b532","daec42aae3eb45949974aa301668bf09","2239afe1792e4390b220437d8e3d5e57","0e3231b76431433db86417d6a18043cd","66a7d10a2e7c4eefaa7135918b69287d","5372ec2363314de48570199e9c76de21","cd4b834c21524142996719c4304d36b1","c11089c73f2a49a788f8fd267ab798e2"]},"executionInfo":{"elapsed":28413,"status":"ok","timestamp":1651326429969,"user":{"displayName":"Thomas Smits","userId":"13349275365625627797"},"user_tz":240},"id":"7fJWCHOrZmvi","outputId":"5da27693-13cf-4deb-c93f-92a3c96670f6"},"outputs":[{"output_type":"display_data","data":{"text/plain":["Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"c5cba86e6a7a4011ad126f2eebfe0501"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Downloading:   0%|          | 0.00/385 [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"f6ac67000379440e8160d4851f6b8029"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"9a2759f2bc8548d4855433565f98861f"}},"metadata":{}},{"output_type":"stream","name":"stderr","text":["Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']\n","- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n","- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"]}],"source":["cleaned_sentence_set = collect_cleaned_sentence_set() \n","clinbert_embedder = get_embedder()"]},{"cell_type":"markdown","source":["## Splitting into chunks"],"metadata":{"id":"KficpAe5uW21"}},{"cell_type":"code","execution_count":null,"metadata":{"id":"OIUOeM8lafbg"},"outputs":[],"source":["chunks_dir = deep_learning_dir + \"/embedding/criteria_sentence_embeddings/chunks\""]},{"cell_type":"code","execution_count":null,"metadata":{"id":"W44ZEnXAZlGN"},"outputs":[],"source":["def save_sentence_bert_dict_pkl_chunk(clinbert_embedder, sentence_iterator, i, size):\n","  protocol_sentence_2_embedding = dict()\n","  for sentence in tqdm(sentence_iterator,desc=f\"generating for {i}\"):\n","    protocol_sentence_2_embedding[sentence] = clinbert_embedder.sentence_encoding(sentence)\n","  pickle.dump(protocol_sentence_2_embedding, open(f'{chunks_dir}/sentence2embedding_{i}.pkl', 'wb'))\n","  return protocol_sentence_2_embedding"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":4038360,"status":"ok","timestamp":1650839009346,"user":{"displayName":"Thomas Smits","userId":"13349275365625627797"},"user_tz":240},"id":"uF6iABvM2qt9","outputId":"e03ff7dd-feb6-4c62-e6dc-203c971a8003"},"outputs":[{"name":"stderr","output_type":"stream","text":["generating for 0: 100%|██████████| 10000/10000 [02:15<00:00, 73.77it/s]\n","generating for 1: 100%|██████████| 10000/10000 [02:15<00:00, 73.54it/s]\n","generating for 2: 100%|██████████| 10000/10000 [02:16<00:00, 73.44it/s]\n","generating for 3: 100%|██████████| 10000/10000 [02:15<00:00, 73.85it/s]\n","generating for 4: 100%|██████████| 10000/10000 [02:15<00:00, 73.60it/s]\n","generating for 5: 100%|██████████| 10000/10000 [02:15<00:00, 73.74it/s]\n","generating for 6: 100%|██████████| 10000/10000 [02:15<00:00, 73.92it/s]\n","generating for 7: 100%|██████████| 10000/10000 [02:15<00:00, 73.64it/s]\n","generating for 8: 100%|██████████| 10000/10000 [02:15<00:00, 73.87it/s]\n","generating for 9: 100%|██████████| 10000/10000 [02:15<00:00, 73.74it/s]\n","generating for 10: 100%|██████████| 10000/10000 [02:15<00:00, 73.81it/s]\n","generating for 11: 100%|██████████| 10000/10000 [02:15<00:00, 73.56it/s]\n","generating for 12: 100%|██████████| 10000/10000 [02:15<00:00, 73.63it/s]\n","generating for 13: 100%|██████████| 10000/10000 [02:15<00:00, 73.62it/s]\n","generating for 14: 100%|██████████| 10000/10000 [02:16<00:00, 73.32it/s]\n","generating for 15: 100%|██████████| 10000/10000 [02:16<00:00, 73.18it/s]\n","generating for 16: 100%|██████████| 10000/10000 [02:15<00:00, 73.68it/s]\n","generating for 17: 100%|██████████| 10000/10000 [02:15<00:00, 73.67it/s]\n","generating for 18: 100%|██████████| 10000/10000 [02:15<00:00, 73.88it/s]\n","generating for 19: 100%|██████████| 10000/10000 [02:15<00:00, 73.56it/s]\n","generating for 20: 100%|██████████| 10000/10000 [02:16<00:00, 73.35it/s]\n","generating for 21: 100%|██████████| 10000/10000 [02:16<00:00, 73.24it/s]\n","generating for 22: 100%|██████████| 10000/10000 [02:16<00:00, 73.47it/s]\n","generating for 23: 100%|██████████| 10000/10000 [02:16<00:00, 73.26it/s]\n","generating for 24: 100%|██████████| 10000/10000 [02:16<00:00, 73.21it/s]\n","generating for 25: 100%|██████████| 10000/10000 [02:16<00:00, 73.51it/s]\n","generating for 26: 100%|██████████| 10000/10000 [02:16<00:00, 73.40it/s]\n","generating for 27: 100%|██████████| 10000/10000 [02:15<00:00, 73.56it/s]\n","generating for 28: 100%|██████████| 10000/10000 [02:16<00:00, 73.49it/s]\n","generating for 29: 100%|██████████| 6346/6346 [01:26<00:00, 72.98it/s]\n"]}],"source":["import os\n","import gc\n","def generate_datasets(size):\n","    return (cleaned_sentence_set[i:i+size] for i in range(0, len(cleaned_sentence_set), size))\n","\n","SIZE = 10000\n","dirpath = chunks_dir\n","if not os.path.exists(dirpath):\n","  os.makedirs(dirpath)\n","total_len = len(cleaned_sentence_set)\n","for i, dataset in enumerate(generate_datasets(10000)):\n","  save_sentence_bert_dict_pkl_chunk(clinbert_embedder, dataset,i,SIZE)\n","  gc.collect()"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"zfyfsYwetSs7"},"outputs":[],"source":["#combining all files\n","full_pickle_dict = {}\n","for i in range(29):\n","  with open(f\"{chunks_dir}/sentence2embedding_{i}.pkl\",\"rb\") as f:\n","    pickle_dict = pickle.load(f)\n","    full_pickle_dict.update(pickle_dict)\n","  \n","pickle.dump(full_pickle_dict, open(f'{deep_learning_dir}/embeddings/criteria_sentence_embeddings/sentence2embedding_full.pkl', 'wb'))"]},{"cell_type":"markdown","source":["## Loading any embedding\n","\n","I have now generated a dictionary for each possible sentence to embedding. The reason for this is that this will speed up time (rather than calculating the embeddings on the fly). \n","\n","With load_sentence_2_vec I can now load up that dictionary. This is then a dictionary of 'some split sentence' = (np version of) tensor.\n","\n","Then, I can take a 'raw' protocol in protocol2feature, split it in incl and excl, and for each sentence in incl/excl can get the sentence embedding. Then, I create the full embedding of the criteria from the sentences. I do this by truncating or padding. The number of sentences I allow is 32, this is based on the distribution of lengths of the training set."],"metadata":{"id":"Az_oQYMUt149"}},{"cell_type":"code","execution_count":null,"metadata":{"id":"hmjqWhG42k-o"},"outputs":[],"source":["def load_sentence_2_vec():\n","\t'''\n","\tLoading the sentence2vec dictionary.\n","\t'''\n","\tsentence2vec = pickle.load(open(f'{deep_learning_dir}/embeddings/criteria_sentence_embeddings/sentence2embedding_full.pkl', 'rb'))\n","\treturn sentence2vec "]},{"cell_type":"code","source":["def reshape_features(feature, size):\n","  '''\n","  Reshaping feature size\n","\n","  Input: \n","    feature (tensor): tensor of torch.Size([x])\n","    size (int): maximum length\n","\n","  Output: \n","    feature (tensor): tensor of torch.Size([size])\n","  '''\n","  if feature.shape[0] > size:\n","    feature = feature[0:size]\n","  else: \n","    b = torch.zeros(size - feature.shape[0])\n","    feature = torch.cat((feature, b))\n","  return feature"],"metadata":{"id":"jWxxpB_fEmkN"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["def protocol2feature(protocol, sentence2vec, size):\n","  '''\n","  Extracting inclusion and exclusion feature from a protocol.\n","\n","  Input: \n","    protocol (str): criteria text\n","    sentence2vec (dict): sentence to embedding dictionary\n","    size (int): size of output tensor (to pad or truncate concatenated sentence embeddings with)\n","\n","  Output: \n","    inclusion_feature (tensor): tensor of torch.Size([size]) with inclusion criteria embedding\n","    exclusion_feature (tensor): tensor of torch.Size([size]) with exclusion criteria embedding\n","  '''\n","  # same lengths (24576)\n","  result = split_protocol(protocol)\n","  inclusion_criteria = result[0]\n","  if len(result) > 1:\n","    exclusion_criteria = result[1]\n","  else: \n","    exclusion_criteria = []\n","\n","  inclusion_feature = [torch.from_numpy(sentence2vec[sentence]) for sentence in inclusion_criteria if sentence in sentence2vec]\n","  exclusion_feature = [torch.from_numpy(sentence2vec[sentence]) for sentence in exclusion_criteria if sentence in sentence2vec]\n","\n","  if inclusion_feature == []:\n","    inclusion_feature = torch.zeros(size)\n","  else:\n","    inclusion_feature = torch.cat(inclusion_feature, 0)\n","    inclusion_feature = reshape_features(inclusion_feature, size)\n","\n","  if exclusion_feature == []:\n","    exclusion_feature = torch.zeros(size)\n","  else:\n","    exclusion_feature = torch.cat(exclusion_feature, 0)\n","    exclusion_feature = reshape_features(exclusion_feature, size)\n","\n","  return inclusion_feature, exclusion_feature "],"metadata":{"id":"NELPUNCBCPkb"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["max_length = 24576 #32 * 768"],"metadata":{"id":"oWU67urHA9yM"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["sentence2vec = load_sentence_2_vec()"],"metadata":{"id":"mDF-QzAs3ksk"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["df_temp = merged_df[['nctid', 'criteria']]\n","dict_crit = df_temp.set_index('nctid').T.to_dict('list')\n","\n","nctid2incl_criteria = dict()\n","nctid2excl_criteria = dict()\n","\n","max_size = 24576\n","\n","for key, value in dict_crit.items(): \n","  incl, excl = protocol2feature(value[0], sentence2vec, max_size)\n","  nctid2incl_criteria[key] = incl\n","  nctid2excl_criteria[key] = excl"],"metadata":{"id":"Yeo9pccA8Jv0"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["pickle.dump(nctid2incl_criteria, open(f'{deep_learning_dir}/embeddings/nctid2incl_criteria.pkl', 'wb'))\n","pickle.dump(nctid2excl_criteria, open(f'{deep_learning_dir}/embeddings/nctid2excl_criteria.pkl', 'wb'))"],"metadata":{"id":"P8Ae_DAr9ymt"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["## Determination of max length"],"metadata":{"id":"017siSYIGEWA"}},{"cell_type":"code","source":["train_df_temp = train[['nctid', 'criteria']]\n","train_dict_crit = train_df_temp.set_index('nctid').T.to_dict('list')\n","\n","train_nctid2incl_criteria = dict()\n","train_nctid2excl_criteria = dict()\n","\n","\n","for key, value in train_dict_crit.items(): \n","  incl, excl = protocol2feature(value[0], sentence2vec)\n","  train_nctid2incl_criteria[key] = incl\n","  train_nctid2excl_criteria[key] = excl"],"metadata":{"id":"8m_rJhDj-1rk"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["import matplotlib.pyplot as plt\n","\n","leng = []\n","\n","for key, value in train_nctid2excl_criteria.items(): \n","  leng_temp = value.shape[0]\n","  leng.append(leng_temp)\n","\n","plt.hist(leng, bins = 30)\n","plt.show()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":278},"id":"HC7eBY0LAgW7","executionInfo":{"status":"ok","timestamp":1651332488586,"user_tz":240,"elapsed":537,"user":{"displayName":"Thomas Smits","userId":"13349275365625627797"}},"outputId":"43db2d0c-547c-4e0b-be50-b40780f93b17"},"execution_count":null,"outputs":[{"output_type":"display_data","data":{"text/plain":["<Figure size 432x288 with 1 Axes>"],"image/png":"iVBORw0KGgoAAAANSUhEUgAAAX0AAAEFCAYAAAAPCDf9AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAATMUlEQVR4nO3df6zd9X3f8eerdpKuCQ1QHIvYrPYypy2ZGoc6QJWoJbUKhv4wmVhm1gTCkNytpkq1rit03eiaIiVqU7ooCZ0bXJyWhqEkDC/zAh5NG3UVP0zrGmxKuAMS7BF8G1PaJCqT6Xt/nI+lk5t7fc+9Pj6O83k+pKPzPe/v5/s9n48svc73fn98nKpCktSHbzvZHZAkTY6hL0kdMfQlqSOGviR1xNCXpI4sPdkdOJazzjqrVq1adbK7IUmnlIcffvivqmrZbOu+qUN/1apV7N69+2R3Q5JOKUm+MNc6T+9IUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHvqmfyJW+ma26/n+M1O7p9/3YCe6JNDqP9CWpI4a+JHVk3tBP8u1JHkzyF0n2JflPrb46yQNJppL81yQvb/VXtM9Tbf2qoX3d0OqPJ7nkRA1KkjS7UY70XwR+pKreCKwFNiS5EHg/cHNV/WPgeeDa1v5a4PlWv7m1I8m5wCbgDcAG4CNJloxzMJKkY5s39GvgK+3jy9qrgB8BPtHq24HL2/LG9pm2fn2StPodVfViVT0FTAHnj2UUkqSRjHROP8mSJHuAQ8Au4P8Af11VR1qTA8CKtrwCeAagrX8B+K7h+izbDH/X5iS7k+yenp5e+IgkSXMaKfSr6qWqWgusZHB0/r0nqkNVtbWq1lXVumXLZv2PXyRJi7Sgu3eq6q+BzwI/CJye5Oh9/iuBg235IHAOQFv/auDLw/VZtpEkTcAod+8sS3J6W/4HwI8CjzEI/ytas6uBu9vyjvaZtv4Pq6pafVO7u2c1sAZ4cFwDkSTNb5Qncs8Gtrc7bb4NuLOqPp1kP3BHkl8D/hy4tbW/Ffi9JFPAYQZ37FBV+5LcCewHjgBbquql8Q5HknQs84Z+Ve0F3jRL/Ulmufumqv4O+Gdz7Osm4KaFd1OSNA4+kStJHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SerIvKGf5Jwkn02yP8m+JO9p9V9JcjDJnva6bGibG5JMJXk8ySVD9Q2tNpXk+hMzJEnSXJaO0OYI8PNV9WdJTgMeTrKrrbu5qn5juHGSc4FNwBuA1wL/K8nr2+oPAz8KHAAeSrKjqvaPYyCSpPnNG/pV9SzwbFv+2ySPASuOsclG4I6qehF4KskUcH5bN1VVTwIkuaO1NfQlaUIWdE4/ySrgTcADrXRdkr1JtiU5o9VWAM8MbXag1eaqz/yOzUl2J9k9PT29kO5JkuYxcugneRXwSeDnqupvgFuA1wFrGfwl8IFxdKiqtlbVuqpat2zZsnHsUpLUjHJOnyQvYxD4t1fVpwCq6rmh9b8DfLp9PAicM7T5ylbjGHVJ0gSMcvdOgFuBx6rqN4fqZw81ezvwaFveAWxK8ookq4E1wIPAQ8CaJKuTvJzBxd4d4xmGJGkUoxzpvwV4F/BIkj2t9kvAlUnWAgU8Dfw0QFXtS3Ingwu0R4AtVfUSQJLrgHuAJcC2qto3xrFIkuYxyt07fwJkllU7j7HNTcBNs9R3Hms7SdKJ5RO5ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktSReUM/yTlJPptkf5J9Sd7T6mcm2ZXkifZ+RqsnyQeTTCXZm+S8oX1d3do/keTqEzcsSdJsRjnSPwL8fFWdC1wIbElyLnA9cF9VrQHua58BLgXWtNdm4BYY/EgANwIXAOcDNx79oZAkTca8oV9Vz1bVn7XlvwUeA1YAG4Htrdl24PK2vBH4WA3cD5ye5GzgEmBXVR2uqueBXcCGsY5GknRMCzqnn2QV8CbgAWB5VT3bVn0JWN6WVwDPDG12oNXmqs/8js1JdifZPT09vZDuSZLmMXLoJ3kV8Eng56rqb4bXVVUBNY4OVdXWqlpXVeuWLVs2jl1KkpqRQj/JyxgE/u1V9alWfq6dtqG9H2r1g8A5Q5uvbLW56pKkCRnl7p0AtwKPVdVvDq3aARy9A+dq4O6h+lXtLp4LgRfaaaB7gIuTnNEu4F7capKkCVk6Qpu3AO8CHkmyp9V+CXgfcGeSa4EvAO9o63YClwFTwNeAawCq6nCS9wIPtXa/WlWHxzIKSdJI5g39qvoTIHOsXj9L+wK2zLGvbcC2hXRQkjQ+PpErSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqyLyhn2RbkkNJHh2q/UqSg0n2tNdlQ+tuSDKV5PEklwzVN7TaVJLrxz8USdJ8RjnSvw3YMEv95qpa2147AZKcC2wC3tC2+UiSJUmWAB8GLgXOBa5sbSVJE7R0vgZV9bkkq0bc30bgjqp6EXgqyRRwfls3VVVPAiS5o7Xdv+AeS5IW7XjO6V+XZG87/XNGq60Anhlqc6DV5qp/gySbk+xOsnt6evo4uidJmmmxoX8L8DpgLfAs8IFxdaiqtlbVuqpat2zZsnHtVpLECKd3ZlNVzx1dTvI7wKfbx4PAOUNNV7Yax6hLkiZkUUf6Sc4e+vh24OidPTuATUlekWQ1sAZ4EHgIWJNkdZKXM7jYu2Px3ZYkLca8R/pJPg5cBJyV5ABwI3BRkrVAAU8DPw1QVfuS3MngAu0RYEtVvdT2cx1wD7AE2FZV+8Y+GknSMY1y986Vs5RvPUb7m4CbZqnvBHYuqHeSpLHyiVxJ6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6si8oZ9kW5JDSR4dqp2ZZFeSJ9r7Ga2eJB9MMpVkb5Lzhra5urV/IsnVJ2Y4kqRjGeVI/zZgw4za9cB9VbUGuK99BrgUWNNem4FbYPAjAdwIXACcD9x49IdCkjQ584Z+VX0OODyjvBHY3pa3A5cP1T9WA/cDpyc5G7gE2FVVh6vqeWAX3/hDIkk6wRZ7Tn95VT3blr8ELG/LK4BnhtodaLW56t8gyeYku5Psnp6eXmT3JEmzOe4LuVVVQI2hL0f3t7Wq1lXVumXLlo1rt5IkFh/6z7XTNrT3Q61+EDhnqN3KVpurLkmaoMWG/g7g6B04VwN3D9WvanfxXAi80E4D3QNcnOSMdgH34laTJE3Q0vkaJPk4cBFwVpIDDO7CeR9wZ5JrgS8A72jNdwKXAVPA14BrAKrqcJL3Ag+1dr9aVTMvDkuSTrB5Q7+qrpxj1fpZ2hawZY79bAO2Lah3kqSx8olcSeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSR44r9JM8neSRJHuS7G61M5PsSvJEez+j1ZPkg0mmkuxNct44BiBJGt04jvTfVlVrq2pd+3w9cF9VrQHua58BLgXWtNdm4JYxfLckaQFOxOmdjcD2trwduHyo/rEauB84PcnZJ+D7JUlzON7QL+DeJA8n2dxqy6vq2bb8JWB5W14BPDO07YFW+zpJNifZnWT39PT0cXZPkjRs6XFu/9aqOpjkNcCuJH85vLKqKkktZIdVtRXYCrBu3boFbStJOrbjOtKvqoPt/RBwF3A+8NzR0zbt/VBrfhA4Z2jzla0mSZqQRYd+klcmOe3oMnAx8CiwA7i6NbsauLst7wCuanfxXAi8MHQaSJI0Acdzemc5cFeSo/v5g6r6TJKHgDuTXAt8AXhHa78TuAyYAr4GXHMc3y1JWoRFh35VPQm8cZb6l4H1s9QL2LLY75MkHT+fyJWkjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjkw89JNsSPJ4kqkk10/6+yWpZxMN/SRLgA8DlwLnAlcmOXeSfZCknk36SP98YKqqnqyq/wfcAWyccB8kqVtLJ/x9K4Bnhj4fAC4YbpBkM7C5ffxKkscn1LdxOgv4q5PdiQlzzHPI+yfQk8nx3/nU8N1zrZh06M+rqrYCW092P45Hkt1Vte5k92OSHHMfHPOpb9Kndw4C5wx9XtlqkqQJmHToPwSsSbI6ycuBTcCOCfdBkro10dM7VXUkyXXAPcASYFtV7ZtkHybklD49tUiOuQ+O+RSXqjrZfZAkTYhP5EpSRwx9SeqIoX8ckjyd5JEke5LsnqPNRW39viR/POk+jtt8Y07y6iT/PclftDFfczL6OU5JTk/yiSR/meSxJD84Y32SfLBNLbI3yXknq6/jMMJ4f6qN85Ekf5rkjSerr+My35iH2r05yZEkV0y6j+PyTXef/inobVU164MbSU4HPgJsqKovJnnNZLt2wsw5ZmALsL+qfiLJMuDxJLe3J7BPVf8Z+ExVXdHuOvuOGesvBda01wXALcx46PAUM994nwJ+uKqeT3Ipgwudp/J4Yf4xH51G5v3AvZPu3DgZ+ifWvwA+VVVfBKiqQye5P5NQwGlJArwKOAwcObldWrwkrwZ+CHg3QPvxmvkDthH4WA3uiri/HTWeXVXPTrSzYzDKeKvqT4c+3s/geZtT1oj/xgA/C3wSePPEOncCeHrn+BRwb5KH2/QRM70eOCPJH7U2V024fyfCfGP+EPB9wP8FHgHeU1V/P8kOjtlqYBr43SR/nuSjSV45o81s04usmFQHx2yU8Q67Fvifk+naCTPvmJOsAN7O4K+4U5qhf3zeWlXnMfjzfkuSH5qxfinwA8CPAZcA/yHJ6yfcx3Gbb8yXAHuA1wJrgQ8l+c4J93GclgLnAbdU1ZuArwLfylOCjzzeJG9jEPq/OLnunRCjjPm3gF88xQ9gAEP/uFTVwfZ+CLiLwSyiww4A91TVV9s58M8Bp/RFrxHGfA2DU1pVVVMMzv9+72R7OVYHgANV9UD7/AkGATHsW2l6kVHGS5LvBz4KbKyqL0+wfyfCKGNeB9yR5GngCuAjSS6fXBfHx9BfpCSvTHLa0WXgYuDRGc3uBt6aZGmS72BwseuxyfZ0fEYc8xeB9a3NcuB7gCcn2c9xqqovAc8k+Z5WWg/sn9FsB3BVu4vnQuCFU/F8Pow23iT/EPgU8K6q+vyEuzh2o4y5qlZX1aqqWsXgR+Fnquq/Tban4+GF3MVbDtw1uF7JUuAPquozSf4VQFX9dlU9luQzwF7g74GPVtXMkDyVzDtm4L3AbUkeAcLgT+JTbVramX4WuL3d1fEkcM2MMe8ELgOmgK8x+GvnVDbfeP8j8F0MjnYBjnwLzEI535i/ZTgNgyR1xNM7ktQRQ1+SOmLoS1JHDH1J6oihL0kLkOTX28Rse5Pc1ebYmqvtkvaU76eHarcleapNWrgnydpW/4Wh2qNJXkpy5oh92pFkpDsDDX1JmkMGs+TeNqO8C/gnVfX9wOeBG46xi/cw+7M5v1BVa9trD0BV/frRWtvnH1fV4RH6+E+Br4wwHMDQl6QFqap7q+roJIJzTjiXZCWDKVg+uoivuRL4+NC+3pnkwfZXwH9pM36S5FXAvwF+bdQdG/qStHj/krknnPst4N8xeDBzppva6aGbk7xieEV7en8Dgxk9SfJ9wD8H3tL+CngJ+KnW/L3ABxg8FDgSQ1+SZkjyQJI9DI7Sf3LoXPslQ23+PYNpw2+fZfsfBw5V1cOz7P4GBvNRvRk4k2+csO4ngP89dGpnPYOJGx9qfVoP/KN2LeB1VXXXQsbmNAySNENVXQCDc/rAu6vq3cPrk7wb+HFgfc0+rcFbGPxYXAZ8O/CdSX6/qt45NC/Ti0l+F/i3M7bdxNCpHQbTmWyvqq+7dpDkXwPr2iRwS4HXJPmjqrroWGPzSF+SFiDJBganbX6yqmY9rVJVN1TVyjZB2ybgD6vqnW37s9t7gMsZmrSw/YcuP8xgssaj7gOuSPuf95KcmeS7q+qWqnpt+463Ap+fL/DB0JekhfoQcBqwq53y+W2AJK9NsnOE7W9vExI+ApzF11+EfTtwb1V99WihqvYDv8zgPy/ay+DuobMX23knXJOkjnikL0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSR/4/AR6aGJ+08R0AAAAASUVORK5CYII=\n"},"metadata":{"needs_background":"light"}}]},{"cell_type":"markdown","source":["### Lengths of criterias"],"metadata":{"id":"60kQRagDKEut"}},{"cell_type":"code","source":["df_temp = merged_df[['nctid', 'criteria']]\n","dict_crit = df_temp.set_index('nctid').T.to_dict('list')\n","\n","len_nctid2incl_criteria = dict()\n","len_nctid2excl_criteria = dict()\n","\n","max_size = 24576\n","\n","for key, value in dict_crit.items(): \n","\n","  result = split_protocol(value[0])\n","  inclusion_criteria = result[0]\n","  if len(result) > 1:\n","    exclusion_criteria = result[1]\n","  else: \n","    exclusion_criteria = []\n","\n","  leng_incl = len(inclusion_criteria)\n","  leng_excl = len(exclusion_criteria)\n","\n","\n","  len_nctid2incl_criteria[key] = leng_incl\n","  len_nctid2excl_criteria[key] = leng_excl"],"metadata":{"id":"DTAD41GvIf-V"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["pickle.dump(len_nctid2incl_criteria, open(f'{deep_learning_dir}/embeddings/length_nctid2incl_criteria.pkl', 'wb'))\n","pickle.dump(len_nctid2excl_criteria, open(f'{deep_learning_dir}/embeddings/length_nctid2excl_criteria.pkl', 'wb'))"],"metadata":{"id":"Uo_CLL-IJsPm"},"execution_count":null,"outputs":[]}],"metadata":{"accelerator":"GPU","colab":{"background_execution":"on","collapsed_sections":[],"machine_shape":"hm","name":"criteria_embeddings_BERT.ipynb","provenance":[]},"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"name":"python"},"widgets":{"application/vnd.jupyter.widget-state+json":{"c5cba86e6a7a4011ad126f2eebfe0501":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_61b2df015d284874964a796fdca62d0f","IPY_MODEL_0072f3db2b7847b88a3f179e474078d8","IPY_MODEL_a2be369d3bff4d46878809f13496bb6e"],"layout":"IPY_MODEL_3349eb1ef9cf45d98cdbcf3038583df2"}},"61b2df015d284874964a796fdca62d0f":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_d11faeac9e13444aaa0ff56f70543d25","placeholder":"​","style":"IPY_MODEL_9970aaadad7b405d95cbf86939ff6008","value":"Downloading: 100%"}},"0072f3db2b7847b88a3f179e474078d8":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_74e08bed16f54abfbe84488d2cf4b59e","max":213450,"min":0,"orientation":"horizontal","style":"IPY_MODEL_555140b8bd4e4ae59e07b320dc9b4352","value":213450}},"a2be369d3bff4d46878809f13496bb6e":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_d6bf961e5672496c8a811b4fad30b104","placeholder":"​","style":"IPY_MODEL_0db31230db0d42238be1088a461ea7ad","value":" 208k/208k [00:00&lt;00:00, 810kB/s]"}},"3349eb1ef9cf45d98cdbcf3038583df2":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"d11faeac9e13444aaa0ff56f70543d25":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"9970aaadad7b405d95cbf86939ff6008":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"74e08bed16f54abfbe84488d2cf4b59e":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"555140b8bd4e4ae59e07b320dc9b4352":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"d6bf961e5672496c8a811b4fad30b104":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"0db31230db0d42238be1088a461ea7ad":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"f6ac67000379440e8160d4851f6b8029":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_dc3b43c0f3db4ee58dff7cf1ab589b76","IPY_MODEL_3bd397b8c83549c3880248bb56320574","IPY_MODEL_a5895c864c304d7e981fd5e40c7a0d5d"],"layout":"IPY_MODEL_9f02ce5f4f534ad28a4a87bf44fcd1c6"}},"dc3b43c0f3db4ee58dff7cf1ab589b76":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_bcc8ab6b913c49cfac9eb7b705bdaa20","placeholder":"​","style":"IPY_MODEL_802f8dd088c14872b85c9d790f578adf","value":"Downloading: 100%"}},"3bd397b8c83549c3880248bb56320574":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_5e24caa80ee14f81991f8f72772630b1","max":385,"min":0,"orientation":"horizontal","style":"IPY_MODEL_0ff0e159844e4127a4d0643b396f47f3","value":385}},"a5895c864c304d7e981fd5e40c7a0d5d":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_e7ab1a81065a4072a75c91d1ed3fd457","placeholder":"​","style":"IPY_MODEL_54eebb12b501447e8e37740fb222b2bc","value":" 385/385 [00:00&lt;00:00, 14.2kB/s]"}},"9f02ce5f4f534ad28a4a87bf44fcd1c6":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"bcc8ab6b913c49cfac9eb7b705bdaa20":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"802f8dd088c14872b85c9d790f578adf":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"5e24caa80ee14f81991f8f72772630b1":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"0ff0e159844e4127a4d0643b396f47f3":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"e7ab1a81065a4072a75c91d1ed3fd457":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"54eebb12b501447e8e37740fb222b2bc":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"9a2759f2bc8548d4855433565f98861f":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_592fec34a5dd4d30b2707b50c817aaf9","IPY_MODEL_cb0055d346c4439ca49268eba023484c","IPY_MODEL_bef2b49f50bc427e9f4a6175c288b532"],"layout":"IPY_MODEL_daec42aae3eb45949974aa301668bf09"}},"592fec34a5dd4d30b2707b50c817aaf9":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_2239afe1792e4390b220437d8e3d5e57","placeholder":"​","style":"IPY_MODEL_0e3231b76431433db86417d6a18043cd","value":"Downloading: 100%"}},"cb0055d346c4439ca49268eba023484c":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_66a7d10a2e7c4eefaa7135918b69287d","max":435778770,"min":0,"orientation":"horizontal","style":"IPY_MODEL_5372ec2363314de48570199e9c76de21","value":435778770}},"bef2b49f50bc427e9f4a6175c288b532":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_cd4b834c21524142996719c4304d36b1","placeholder":"​","style":"IPY_MODEL_c11089c73f2a49a788f8fd267ab798e2","value":" 416M/416M [00:06&lt;00:00, 65.0MB/s]"}},"daec42aae3eb45949974aa301668bf09":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"2239afe1792e4390b220437d8e3d5e57":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"0e3231b76431433db86417d6a18043cd":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"66a7d10a2e7c4eefaa7135918b69287d":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"5372ec2363314de48570199e9c76de21":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"cd4b834c21524142996719c4304d36b1":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"c11089c73f2a49a788f8fd267ab798e2":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}}}}},"nbformat":4,"nbformat_minor":0}