--- a
+++ b/EvaluationBaseline/baseline_results_analysis.ipynb
@@ -0,0 +1,1315 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Evaluation of baseline results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 60,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import re"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 98,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[\"        full evaluation metrix: {'overall': {'acc': 0.7999, 'strict': {'precision': 0.5863842662632375, 'recall': 0.6398151205018158, 'f_score': 0.6119355857278181}, 'relax': {'precision': 0.7310136157337368, 'recall': 0.7976229778804886, 'f_score': 0.7628670666245659}}, 'category': {'strict': {'condition': {'precision': 0.652555910543131, 'recall': 0.7502295684113865, 'f_score': 0.6979923109782143}, 'procedure': {'precision': 0.4088888888888889, 'recall': 0.5993485342019544, 'f_score': 0.4861294583883752}, 'temporal': {'precision': 0.48656716417910445, 'recall': 0.5361842105263158, 'f_score': 0.5101721439749609}, 'measurement': {'precision': 0.5589225589225589, 'recall': 0.6287878787878788, 'f_score': 0.5918003565062389}, 'value': {'precision': 0.6620111731843575, 'recall': 0.7247706422018348, 'f_score': 0.691970802919708}, 'person': {'precision': 0.773972602739726, 'recall': 0.8129496402877698, 'f_score': 0.7929824561403509}, 'drug': {'precision': 0.5916870415647921, 'recall': 0.7289156626506024, 'f_score': 0.6531713900134952}, 'pregnancy_considerations': {'precision': 0.0, 'recall': 0.0, 'f_score': 0.0}, 'observation': {'precision': 0.5161290322580645, 'recall': 0.09937888198757763, 'f_score': 0.16666666666666666}}, 'relax': {'condition': {'precision': 0.7931309904153354, 'recall': 0.9118457300275482, 'f_score': 0.8483554036736437}, 'procedure': {'precision': 0.5555555555555556, 'recall': 0.8143322475570033, 'f_score': 0.6605019815059445}, 'temporal': {'precision': 0.6716417910447762, 'recall': 0.7401315789473685, 'f_score': 0.7042253521126761}, 'measurement': {'precision': 0.734006734006734, 'recall': 0.8257575757575758, 'f_score': 0.7771836007130125}, 'value': {'precision': 0.8044692737430168, 'recall': 0.8807339449541285, 'f_score': 0.8408759124087591}, 'person': {'precision': 0.7808219178082192, 'recall': 0.8201438848920863, 'f_score': 0.7999999999999999}, 'drug': {'precision': 0.7310513447432763, 'recall': 0.9006024096385542, 'f_score': 
0.8070175438596492}, 'pregnancy_considerations': {'precision': 0.2962962962962963, 'recall': 0.5714285714285714, 'f_score': 0.3902439024390244}, 'observation': {'precision': 0.6774193548387096, 'recall': 0.13043478260869565, 'f_score': 0.21875}}}}\",\n",
+       " \"        full evaluation metrix: {'overall': {'acc': 0.8083, 'strict': {'precision': 0.6034431582071832, 'recall': 0.6711786068009244, 'f_score': 0.6355110972178806}, 'relax': {'precision': 0.7441377263282873, 'recall': 0.8276658963354242, 'f_score': 0.7836824007502344}}, 'category': {'strict': {'condition': {'precision': 0.6642394822006472, 'recall': 0.7539026629935721, 'f_score': 0.7062365591397849}, 'temporal': {'precision': 0.504, 'recall': 0.6217105263157895, 'f_score': 0.5567010309278352}, 'procedure': {'precision': 0.5079787234042553, 'recall': 0.6221498371335505, 'f_score': 0.5592972181551976}, 'measurement': {'precision': 0.6063829787234043, 'recall': 0.6477272727272727, 'f_score': 0.6263736263736264}, 'value': {'precision': 0.6912181303116147, 'recall': 0.746177370030581, 'f_score': 0.7176470588235294}, 'observation': {'precision': 0.44, 'recall': 0.2732919254658385, 'f_score': 0.3371647509578544}, 'pregnancy_considerations': {'precision': 0.0, 'recall': 0.0, 'f_score': 0.0}, 'person': {'precision': 0.7531645569620253, 'recall': 0.8561151079136691, 'f_score': 0.8013468013468013}, 'drug': {'precision': 0.6275, 'recall': 0.7560240963855421, 'f_score': 0.6857923497267758}, 'device': {'precision': 0.0, 'recall': 0.0, 'f_score': 0.0}, 'mood': {'precision': 0.375, 'recall': 0.046153846153846156, 'f_score': 0.08219178082191782}}, 'relax': {'condition': {'precision': 0.8074433656957929, 'recall': 0.9164370982552801, 'f_score': 0.858494623655914}, 'temporal': {'precision': 0.664, 'recall': 0.819078947368421, 'f_score': 0.7334315169366716}, 'procedure': {'precision': 0.6648936170212766, 'recall': 0.8143322475570033, 'f_score': 0.7320644216691069}, 'measurement': {'precision': 0.7801418439716312, 'recall': 0.8333333333333334, 'f_score': 0.8058608058608059}, 'value': {'precision': 0.8186968838526912, 'recall': 0.8837920489296636, 'f_score': 0.85}, 'observation': {'precision': 0.57, 'recall': 0.35403726708074534, 'f_score': 0.43678160919540227}, 
'pregnancy_considerations': {'precision': 0.16666666666666666, 'recall': 0.8571428571428571, 'f_score': 0.27906976744186046}, 'person': {'precision': 0.759493670886076, 'recall': 0.8633093525179856, 'f_score': 0.8080808080808081}, 'drug': {'precision': 0.7475, 'recall': 0.9006024096385542, 'f_score': 0.8169398907103825}, 'device': {'precision': 1.0, 'recall': 0.3333333333333333, 'f_score': 0.5}, 'mood': {'precision': 0.5, 'recall': 0.06153846153846154, 'f_score': 0.10958904109589042}}}}\",\n",
+       " \"        full evaluation metrix: {'overall': {'acc': 0.8188, 'strict': {'precision': 0.623020706455542, 'recall': 0.6754704522944867, 'f_score': 0.6481862822746713}, 'relax': {'precision': 0.7633982947624848, 'recall': 0.8276658963354242, 'f_score': 0.7942341200696975}}, 'category': {'strict': {'condition': {'precision': 0.6878661087866109, 'recall': 0.7548209366391184, 'f_score': 0.7197898423817863}, 'procedure': {'precision': 0.5170603674540682, 'recall': 0.6416938110749185, 'f_score': 0.5726744186046512}, 'temporal': {'precision': 0.5373134328358209, 'recall': 0.5921052631578947, 'f_score': 0.5633802816901408}, 'measurement': {'precision': 0.6227758007117438, 'recall': 0.6628787878787878, 'f_score': 0.6422018348623854}, 'value': {'precision': 0.7025495750708215, 'recall': 0.7584097859327217, 'f_score': 0.7294117647058823}, 'observation': {'precision': 0.4, 'recall': 0.2484472049689441, 'f_score': 0.3065134099616858}, 'person': {'precision': 0.7828947368421053, 'recall': 0.8561151079136691, 'f_score': 0.8178694158075602}, 'drug': {'precision': 0.661498708010336, 'recall': 0.7710843373493976, 'f_score': 0.7121001390820584}, 'pregnancy_considerations': {'precision': 0.0, 'recall': 0.0, 'f_score': 0.0}, 'device': {'precision': 0.15384615384615385, 'recall': 0.07407407407407407, 'f_score': 0.1}, 'mood': {'precision': 0.3888888888888889, 'recall': 0.1076923076923077, 'f_score': 0.16867469879518074}}, 'relax': {'condition': {'precision': 0.8384937238493724, 'recall': 0.9201101928374655, 'f_score': 0.8774080560420315}, 'procedure': {'precision': 0.6456692913385826, 'recall': 0.8013029315960912, 'f_score': 0.7151162790697674}, 'temporal': {'precision': 0.7104477611940299, 'recall': 0.7828947368421053, 'f_score': 0.7449139280125195}, 'measurement': {'precision': 0.8113879003558719, 'recall': 0.8636363636363636, 'f_score': 0.8366972477064221}, 'value': {'precision': 0.8243626062322946, 'recall': 0.8899082568807339, 'f_score': 0.8558823529411764}, 'observation': 
{'precision': 0.52, 'recall': 0.32298136645962733, 'f_score': 0.3984674329501916}, 'person': {'precision': 0.7894736842105263, 'recall': 0.8633093525179856, 'f_score': 0.8247422680412372}, 'drug': {'precision': 0.7777777777777778, 'recall': 0.9066265060240963, 'f_score': 0.8372739916550764}, 'pregnancy_considerations': {'precision': 0.13043478260869565, 'recall': 0.6428571428571429, 'f_score': 0.21686746987951808}, 'device': {'precision': 0.8461538461538461, 'recall': 0.4074074074074074, 'f_score': 0.5499999999999999}, 'mood': {'precision': 0.5, 'recall': 0.13846153846153847, 'f_score': 0.2168674698795181}}}}\",\n",
+       " \"        full evaluation metrix: {'overall': {'acc': 0.7836, 'strict': {'precision': 0.5396489104116223, 'recall': 0.5886431165401123, 'f_score': 0.5630822674877625}, 'relax': {'precision': 0.7091404358353511, 'recall': 0.7735226147243315, 'f_score': 0.739933680720038}}, 'category': {'strict': {'condition': {'precision': 0.6074544012688342, 'recall': 0.7033976124885216, 'f_score': 0.6519148936170213}, 'procedure': {'precision': 0.3376865671641791, 'recall': 0.5895765472312704, 'f_score': 0.4294187425860024}, 'temporal': {'precision': 0.4316109422492401, 'recall': 0.46710526315789475, 'f_score': 0.4486571879936809}, 'measurement': {'precision': 0.5102739726027398, 'recall': 0.5643939393939394, 'f_score': 0.5359712230215827}, 'value': {'precision': 0.6069364161849711, 'recall': 0.6422018348623854, 'f_score': 0.6240713224368498}, 'person': {'precision': 0.781021897810219, 'recall': 0.7697841726618705, 'f_score': 0.7753623188405797}, 'drug': {'precision': 0.5905511811023622, 'recall': 0.677710843373494, 'f_score': 0.6311360448807855}, 'pregnancy_considerations': {'precision': 0.0, 'recall': 0.0, 'f_score': 0.0}, 'observation': {'precision': 0.42857142857142855, 'recall': 0.018633540372670808, 'f_score': 0.03571428571428571}}, 'relax': {'condition': {'precision': 0.7787470261697066, 'recall': 0.9017447199265382, 'f_score': 0.8357446808510639}, 'procedure': {'precision': 0.48134328358208955, 'recall': 0.8403908794788274, 'f_score': 0.6120996441281139}, 'temporal': {'precision': 0.6534954407294833, 'recall': 0.7072368421052632, 'f_score': 0.679304897314376}, 'measurement': {'precision': 0.7397260273972602, 'recall': 0.8181818181818182, 'f_score': 0.776978417266187}, 'value': {'precision': 0.7947976878612717, 'recall': 0.8409785932721713, 'f_score': 0.8172362555720654}, 'person': {'precision': 0.7883211678832117, 'recall': 0.7769784172661871, 'f_score': 0.782608695652174}, 'drug': {'precision': 0.7322834645669292, 'recall': 0.8403614457831325, 'f_score': 
0.782608695652174}, 'pregnancy_considerations': {'precision': 0.4, 'recall': 0.42857142857142855, 'f_score': 0.4137931034482759}, 'observation': {'precision': 0.5714285714285714, 'recall': 0.024844720496894408, 'f_score': 0.047619047619047616}}}}\",\n",
+       " \"        full evaluation metrix: {'overall': {'acc': 0.8075, 'strict': {'precision': 0.6046163069544365, 'recall': 0.6658963354242324, 'f_score': 0.6337784760408484}, 'relax': {'precision': 0.7470023980815348, 'recall': 0.8227137669197755, 'f_score': 0.7830322073841319}}, 'category': {'strict': {'condition': {'precision': 0.6650124069478908, 'recall': 0.7382920110192838, 'f_score': 0.6997389033942558}, 'temporal': {'precision': 0.4962025316455696, 'recall': 0.6447368421052632, 'f_score': 0.5608011444921315}, 'procedure': {'precision': 0.5041322314049587, 'recall': 0.5960912052117264, 'f_score': 0.5462686567164179}, 'measurement': {'precision': 0.6057347670250897, 'recall': 0.6401515151515151, 'f_score': 0.6224677716390423}, 'value': {'precision': 0.7046783625730995, 'recall': 0.7370030581039755, 'f_score': 0.7204783258594918}, 'observation': {'precision': 0.4423076923076923, 'recall': 0.2857142857142857, 'f_score': 0.3471698113207547}, 'person': {'precision': 0.7692307692307693, 'recall': 0.8633093525179856, 'f_score': 0.8135593220338984}, 'drug': {'precision': 0.6292682926829268, 'recall': 0.7771084337349398, 'f_score': 0.6954177897574124}, 'pregnancy_considerations': {'precision': 0.0, 'recall': 0.0, 'f_score': 0.0}, 'device': {'precision': 0.0, 'recall': 0.0, 'f_score': 0.0}, 'mood': {'precision': 0.0, 'recall': 0.0, 'f_score': 0.0}}, 'relax': {'condition': {'precision': 0.8221670802315963, 'recall': 0.9127640036730946, 'f_score': 0.8651000870322019}, 'temporal': {'precision': 0.640506329113924, 'recall': 0.8322368421052632, 'f_score': 0.7238912732474964}, 'procedure': {'precision': 0.6721763085399449, 'recall': 0.7947882736156352, 'f_score': 0.7283582089552239}, 'measurement': {'precision': 0.7813620071684588, 'recall': 0.8257575757575758, 'f_score': 0.8029465930018418}, 'value': {'precision': 0.827485380116959, 'recall': 0.8654434250764526, 'f_score': 0.8460388639760836}, 'observation': {'precision': 0.5576923076923077, 'recall': 
0.36024844720496896, 'f_score': 0.4377358490566038}, 'person': {'precision': 0.7756410256410257, 'recall': 0.8705035971223022, 'f_score': 0.8203389830508475}, 'drug': {'precision': 0.7414634146341463, 'recall': 0.9156626506024096, 'f_score': 0.8194070080862533}, 'pregnancy_considerations': {'precision': 0.1323529411764706, 'recall': 0.6428571428571429, 'f_score': 0.21951219512195122}, 'device': {'precision': 0.8888888888888888, 'recall': 0.2962962962962963, 'f_score': 0.4444444444444444}, 'mood': {'precision': 0.0, 'recall': 0.0, 'f_score': 0.0}}}}\",\n",
+       " \"        full evaluation metrix: {'overall': {'acc': 0.8123, 'strict': {'precision': 0.6153166421207659, 'recall': 0.6896665566193463, 'f_score': 0.650373599003736}, 'relax': {'precision': 0.7508100147275405, 'recall': 0.8415318586992406, 'f_score': 0.7935865504358655}}, 'category': {'strict': {'condition': {'precision': 0.6830065359477124, 'recall': 0.7676767676767676, 'f_score': 0.7228707306528318}, 'procedure': {'precision': 0.5201005025125628, 'recall': 0.6742671009771987, 'f_score': 0.5872340425531914}, 'temporal': {'precision': 0.5565749235474006, 'recall': 0.5986842105263158, 'f_score': 0.5768621236133122}, 'measurement': {'precision': 0.6330935251798561, 'recall': 0.6666666666666666, 'f_score': 0.6494464944649446}, 'value': {'precision': 0.7151162790697675, 'recall': 0.7522935779816514, 'f_score': 0.7332339791356185}, 'observation': {'precision': 0.3223684210526316, 'recall': 0.30434782608695654, 'f_score': 0.3130990415335463}, 'person': {'precision': 0.8650793650793651, 'recall': 0.7841726618705036, 'f_score': 0.8226415094339623}, 'drug': {'precision': 0.673469387755102, 'recall': 0.7951807228915663, 'f_score': 0.729281767955801}, 'pregnancy_considerations': {'precision': 0.0, 'recall': 0.0, 'f_score': 0.0}, 'device': {'precision': 0.2962962962962963, 'recall': 0.2962962962962963, 'f_score': 0.2962962962962963}, 'mood': {'precision': 0.34285714285714286, 'recall': 0.18461538461538463, 'f_score': 0.24}}, 'relax': {'condition': {'precision': 0.8202614379084967, 'recall': 0.9219467401285583, 'f_score': 0.8681366191093817}, 'procedure': {'precision': 0.6457286432160804, 'recall': 0.8371335504885994, 'f_score': 0.7290780141843972}, 'temporal': {'precision': 0.7400611620795107, 'recall': 0.7960526315789473, 'f_score': 0.7670364500792394}, 'measurement': {'precision': 0.8129496402877698, 'recall': 0.8560606060606061, 'f_score': 0.8339483394833948}, 'value': {'precision': 0.8401162790697675, 'recall': 0.8837920489296636, 'f_score': 0.8614008941877794}, 
'observation': {'precision': 0.4473684210526316, 'recall': 0.422360248447205, 'f_score': 0.43450479233226846}, 'person': {'precision': 0.8809523809523809, 'recall': 0.7985611510791367, 'f_score': 0.8377358490566038}, 'drug': {'precision': 0.7806122448979592, 'recall': 0.9216867469879518, 'f_score': 0.845303867403315}, 'pregnancy_considerations': {'precision': 0.14130434782608695, 'recall': 0.9285714285714286, 'f_score': 0.2452830188679245}, 'device': {'precision': 0.5185185185185185, 'recall': 0.5185185185185185, 'f_score': 0.5185185185185185}, 'mood': {'precision': 0.5428571428571428, 'recall': 0.2923076923076923, 'f_score': 0.38000000000000006}}}}\",\n",
+       " \"        full evaluation metrix: {'overall': {'acc': 0.8182, 'strict': {'precision': 0.6247777119146414, 'recall': 0.695939253879168, 'f_score': 0.6584413556145556}, 'relax': {'precision': 0.7593360995850622, 'recall': 0.845823704192803, 'f_score': 0.8002498828674058}}, 'category': {'strict': {'condition': {'precision': 0.6767036450079239, 'recall': 0.7842056932966024, 'f_score': 0.7264993619736283}, 'drug': {'precision': 0.6795865633074936, 'recall': 0.7921686746987951, 'f_score': 0.7315716272600834}, 'procedure': {'precision': 0.5284090909090909, 'recall': 0.6058631921824105, 'f_score': 0.5644916540212442}, 'temporal': {'precision': 0.5582089552238806, 'recall': 0.6151315789473685, 'f_score': 0.5852895148669797}, 'measurement': {'precision': 0.6202090592334495, 'recall': 0.6742424242424242, 'f_score': 0.6460980036297641}, 'value': {'precision': 0.6902173913043478, 'recall': 0.7767584097859327, 'f_score': 0.730935251798561}, 'observation': {'precision': 0.2975206611570248, 'recall': 0.2236024844720497, 'f_score': 0.2553191489361702}, 'mood': {'precision': 0.33962264150943394, 'recall': 0.27692307692307694, 'f_score': 0.30508474576271183}, 'person': {'precision': 0.7577639751552795, 'recall': 0.8776978417266187, 'f_score': 0.8133333333333332}, 'pregnancy_considerations': {'precision': 0.0, 'recall': 0.0, 'f_score': 0.0}, 'device': {'precision': 0.45454545454545453, 'recall': 0.37037037037037035, 'f_score': 0.40816326530612246}}, 'relax': {'condition': {'precision': 0.803486529318542, 'recall': 0.931129476584022, 'f_score': 0.8626116546150574}, 'drug': {'precision': 0.7855297157622739, 'recall': 0.9156626506024096, 'f_score': 0.8456189151599444}, 'procedure': {'precision': 0.6988636363636364, 'recall': 0.8013029315960912, 'f_score': 0.7465857359635812}, 'temporal': {'precision': 0.7134328358208956, 'recall': 0.7861842105263158, 'f_score': 0.7480438184663537}, 'measurement': {'precision': 0.7979094076655052, 'recall': 0.8674242424242424, 'f_score': 
0.8312159709618874}, 'value': {'precision': 0.8070652173913043, 'recall': 0.908256880733945, 'f_score': 0.8546762589928057}, 'observation': {'precision': 0.47107438016528924, 'recall': 0.35403726708074534, 'f_score': 0.40425531914893614}, 'mood': {'precision': 0.5471698113207547, 'recall': 0.4461538461538462, 'f_score': 0.4915254237288135}, 'person': {'precision': 0.7701863354037267, 'recall': 0.8920863309352518, 'f_score': 0.8266666666666668}, 'pregnancy_considerations': {'precision': 0.3076923076923077, 'recall': 0.5714285714285714, 'f_score': 0.4}, 'device': {'precision': 0.6818181818181818, 'recall': 0.5555555555555556, 'f_score': 0.6122448979591836}}}}\",\n",
+       " \"        full evaluation metrix: {'overall': {'acc': 0.8189, 'strict': {'precision': 0.6260355029585799, 'recall': 0.698580389567514, 'f_score': 0.6603214229989077}, 'relax': {'precision': 0.7615384615384615, 'recall': 0.8497854077253219, 'f_score': 0.8032454361054767}}, 'category': {'strict': {'condition': {'precision': 0.6889250814332247, 'recall': 0.7768595041322314, 'f_score': 0.7302546396201985}, 'procedure': {'precision': 0.5439560439560439, 'recall': 0.6449511400651465, 'f_score': 0.5901639344262295}, 'temporal': {'precision': 0.5696969696969697, 'recall': 0.618421052631579, 'f_score': 0.5930599369085174}, 'measurement': {'precision': 0.5811688311688312, 'recall': 0.678030303030303, 'f_score': 0.6258741258741259}, 'value': {'precision': 0.6843501326259946, 'recall': 0.7889908256880734, 'f_score': 0.7329545454545454}, 'observation': {'precision': 0.31297709923664124, 'recall': 0.2546583850931677, 'f_score': 0.2808219178082192}, 'person': {'precision': 0.7880794701986755, 'recall': 0.8561151079136691, 'f_score': 0.8206896551724139}, 'mood': {'precision': 0.3137254901960784, 'recall': 0.24615384615384617, 'f_score': 0.27586206896551724}, 'drug': {'precision': 0.6921052631578948, 'recall': 0.7921686746987951, 'f_score': 0.7387640449438202}, 'pregnancy_considerations': {'precision': 0.0, 'recall': 0.0, 'f_score': 0.0}, 'device': {'precision': 0.26666666666666666, 'recall': 0.2962962962962963, 'f_score': 0.28070175438596495}}, 'relax': {'condition': {'precision': 0.8249185667752443, 'recall': 0.9302112029384757, 'f_score': 0.8744065602071645}, 'procedure': {'precision': 0.6978021978021978, 'recall': 0.8273615635179153, 'f_score': 0.7570789865871832}, 'temporal': {'precision': 0.7242424242424242, 'recall': 0.7861842105263158, 'f_score': 0.7539432176656152}, 'measurement': {'precision': 0.7662337662337663, 'recall': 0.8939393939393939, 'f_score': 0.8251748251748252}, 'value': {'precision': 0.7931034482758621, 'recall': 0.9143730886850153, 'f_score': 
0.8494318181818183}, 'observation': {'precision': 0.4580152671755725, 'recall': 0.37267080745341613, 'f_score': 0.410958904109589}, 'person': {'precision': 0.8013245033112583, 'recall': 0.8705035971223022, 'f_score': 0.8344827586206897}, 'mood': {'precision': 0.49019607843137253, 'recall': 0.38461538461538464, 'f_score': 0.43103448275862066}, 'drug': {'precision': 0.7921052631578948, 'recall': 0.9066265060240963, 'f_score': 0.8455056179775281}, 'pregnancy_considerations': {'precision': 0.26666666666666666, 'recall': 0.5714285714285714, 'f_score': 0.36363636363636365}, 'device': {'precision': 0.6, 'recall': 0.6666666666666666, 'f_score': 0.631578947368421}}}}\",\n",
+       " \"        full evaluation metrix: {'overall': {'acc': 0.8155, 'strict': {'precision': 0.6268788682581786, 'recall': 0.7022119511389898, 'f_score': 0.6624104640298973}, 'relax': {'precision': 0.7606837606837606, 'recall': 0.8520964014526247, 'f_score': 0.8037994394269697}}, 'category': {'strict': {'condition': {'precision': 0.6882690730106645, 'recall': 0.7704315886134068, 'f_score': 0.7270363951473136}, 'temporal': {'precision': 0.5223097112860893, 'recall': 0.6546052631578947, 'f_score': 0.5810218978102191}, 'procedure': {'precision': 0.5333333333333333, 'recall': 0.6775244299674267, 'f_score': 0.5968436154949784}, 'measurement': {'precision': 0.6423357664233577, 'recall': 0.6666666666666666, 'f_score': 0.654275092936803}, 'value': {'precision': 0.7126760563380282, 'recall': 0.7737003058103975, 'f_score': 0.7419354838709676}, 'observation': {'precision': 0.3333333333333333, 'recall': 0.2732919254658385, 'f_score': 0.3003412969283276}, 'person': {'precision': 0.7894736842105263, 'recall': 0.8633093525179856, 'f_score': 0.8247422680412372}, 'mood': {'precision': 0.25925925925925924, 'recall': 0.2153846153846154, 'f_score': 0.23529411764705882}, 'drug': {'precision': 0.6902887139107612, 'recall': 0.7921686746987951, 'f_score': 0.7377279102384292}, 'pregnancy_considerations': {'precision': 0.0, 'recall': 0.0, 'f_score': 0.0}, 'device': {'precision': 0.34375, 'recall': 0.4074074074074074, 'f_score': 0.37288135593220334}}, 'relax': {'condition': {'precision': 0.8252666119770303, 'recall': 0.9237832874196511, 'f_score': 0.8717504332755632}, 'temporal': {'precision': 0.6745406824146981, 'recall': 0.8453947368421053, 'f_score': 0.7503649635036497}, 'procedure': {'precision': 0.6666666666666666, 'recall': 0.8469055374592834, 'f_score': 0.7460545193687231}, 'measurement': {'precision': 0.8321167883211679, 'recall': 0.8636363636363636, 'f_score': 0.8475836431226766}, 'value': {'precision': 0.8225352112676056, 'recall': 0.8929663608562691, 'f_score': 
0.8563049853372435}, 'observation': {'precision': 0.45454545454545453, 'recall': 0.37267080745341613, 'f_score': 0.40955631399317405}, 'person': {'precision': 0.8026315789473685, 'recall': 0.8776978417266187, 'f_score': 0.8384879725085911}, 'mood': {'precision': 0.5185185185185185, 'recall': 0.4307692307692308, 'f_score': 0.47058823529411764}, 'drug': {'precision': 0.7952755905511811, 'recall': 0.9126506024096386, 'f_score': 0.849929873772791}, 'pregnancy_considerations': {'precision': 0.2608695652173913, 'recall': 0.42857142857142855, 'f_score': 0.3243243243243243}, 'device': {'precision': 0.59375, 'recall': 0.7037037037037037, 'f_score': 0.6440677966101694}}}}\"]"
+      ]
+     },
+     "execution_count": 98,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# BERT model case: collect the log lines that carry a 'full evaluation metrix' summary\n",
+    "with open('../data/chia_bert_log 2.txt') as fd:\n",
+    "    log = fd.read()\n",
+    "\n",
+    "lines = log.split('\\n')\n",
+    "eval_lines = list(filter(lambda line: 'full evaluation metrix' in line, lines))\n",
+    "\n",
+    "eval_lines"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 99,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "\"        full evaluation metrix: {'overall': {'acc': 0.8155, 'strict': {'precision': 0.6268788682581786, 'recall': 0.7022119511389898, 'f_score': 0.6624104640298973}, 'relax': {'precision': 0.7606837606837606, 'recall': 0.8520964014526247, 'f_score': 0.8037994394269697}}, 'category': {'strict': {'condition': {'precision': 0.6882690730106645, 'recall': 0.7704315886134068, 'f_score': 0.7270363951473136}, 'temporal': {'precision': 0.5223097112860893, 'recall': 0.6546052631578947, 'f_score': 0.5810218978102191}, 'procedure': {'precision': 0.5333333333333333, 'recall': 0.6775244299674267, 'f_score': 0.5968436154949784}, 'measurement': {'precision': 0.6423357664233577, 'recall': 0.6666666666666666, 'f_score': 0.654275092936803}, 'value': {'precision': 0.7126760563380282, 'recall': 0.7737003058103975, 'f_score': 0.7419354838709676}, 'observation': {'precision': 0.3333333333333333, 'recall': 0.2732919254658385, 'f_score': 0.3003412969283276}, 'person': {'precision': 0.7894736842105263, 'recall': 0.8633093525179856, 'f_score': 0.8247422680412372}, 'mood': {'precision': 0.25925925925925924, 'recall': 0.2153846153846154, 'f_score': 0.23529411764705882}, 'drug': {'precision': 0.6902887139107612, 'recall': 0.7921686746987951, 'f_score': 0.7377279102384292}, 'pregnancy_considerations': {'precision': 0.0, 'recall': 0.0, 'f_score': 0.0}, 'device': {'precision': 0.34375, 'recall': 0.4074074074074074, 'f_score': 0.37288135593220334}}, 'relax': {'condition': {'precision': 0.8252666119770303, 'recall': 0.9237832874196511, 'f_score': 0.8717504332755632}, 'temporal': {'precision': 0.6745406824146981, 'recall': 0.8453947368421053, 'f_score': 0.7503649635036497}, 'procedure': {'precision': 0.6666666666666666, 'recall': 0.8469055374592834, 'f_score': 0.7460545193687231}, 'measurement': {'precision': 0.8321167883211679, 'recall': 0.8636363636363636, 'f_score': 0.8475836431226766}, 'value': {'precision': 0.8225352112676056, 'recall': 0.8929663608562691, 'f_score': 
0.8563049853372435}, 'observation': {'precision': 0.45454545454545453, 'recall': 0.37267080745341613, 'f_score': 0.40955631399317405}, 'person': {'precision': 0.8026315789473685, 'recall': 0.8776978417266187, 'f_score': 0.8384879725085911}, 'mood': {'precision': 0.5185185185185185, 'recall': 0.4307692307692308, 'f_score': 0.47058823529411764}, 'drug': {'precision': 0.7952755905511811, 'recall': 0.9126506024096386, 'f_score': 0.849929873772791}, 'pregnancy_considerations': {'precision': 0.2608695652173913, 'recall': 0.42857142857142855, 'f_score': 0.3243243243243243}, 'device': {'precision': 0.59375, 'recall': 0.7037037037037037, 'f_score': 0.6440677966101694}}}}\""
+      ]
+     },
+     "execution_count": 99,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Stop with a readable message when the log contained no evaluation summaries,\n",
+    "# instead of an IndexError from indexing an empty list.\n",
+    "assert eval_lines, 'no full-evaluation lines were found in the log'\n",
+    "\n",
+    "# The last matching line is the one analysed in the following cells.\n",
+    "last_eval = eval_lines[-1]\n",
+    "last_eval"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 100,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "\"{'overall': {'acc': 0.8155, 'strict': {'precision': 0.6268788682581786, 'recall': 0.7022119511389898, 'f_score': 0.6624104640298973}, 'relax': {'precision': 0.7606837606837606, 'recall': 0.8520964014526247, 'f_score': 0.8037994394269697}}, 'category': {'strict': {'condition': {'precision': 0.6882690730106645, 'recall': 0.7704315886134068, 'f_score': 0.7270363951473136}, 'temporal': {'precision': 0.5223097112860893, 'recall': 0.6546052631578947, 'f_score': 0.5810218978102191}, 'procedure': {'precision': 0.5333333333333333, 'recall': 0.6775244299674267, 'f_score': 0.5968436154949784}, 'measurement': {'precision': 0.6423357664233577, 'recall': 0.6666666666666666, 'f_score': 0.654275092936803}, 'value': {'precision': 0.7126760563380282, 'recall': 0.7737003058103975, 'f_score': 0.7419354838709676}, 'observation': {'precision': 0.3333333333333333, 'recall': 0.2732919254658385, 'f_score': 0.3003412969283276}, 'person': {'precision': 0.7894736842105263, 'recall': 0.8633093525179856, 'f_score': 0.8247422680412372}, 'mood': {'precision': 0.25925925925925924, 'recall': 0.2153846153846154, 'f_score': 0.23529411764705882}, 'drug': {'precision': 0.6902887139107612, 'recall': 0.7921686746987951, 'f_score': 0.7377279102384292}, 'pregnancy_considerations': {'precision': 0.0, 'recall': 0.0, 'f_score': 0.0}, 'device': {'precision': 0.34375, 'recall': 0.4074074074074074, 'f_score': 0.37288135593220334}}, 'relax': {'condition': {'precision': 0.8252666119770303, 'recall': 0.9237832874196511, 'f_score': 0.8717504332755632}, 'temporal': {'precision': 0.6745406824146981, 'recall': 0.8453947368421053, 'f_score': 0.7503649635036497}, 'procedure': {'precision': 0.6666666666666666, 'recall': 0.8469055374592834, 'f_score': 0.7460545193687231}, 'measurement': {'precision': 0.8321167883211679, 'recall': 0.8636363636363636, 'f_score': 0.8475836431226766}, 'value': {'precision': 0.8225352112676056, 'recall': 0.8929663608562691, 'f_score': 0.8563049853372435}, 'observation': {'precision': 
0.45454545454545453, 'recall': 0.37267080745341613, 'f_score': 0.40955631399317405}, 'person': {'precision': 0.8026315789473685, 'recall': 0.8776978417266187, 'f_score': 0.8384879725085911}, 'mood': {'precision': 0.5185185185185185, 'recall': 0.4307692307692308, 'f_score': 0.47058823529411764}, 'drug': {'precision': 0.7952755905511811, 'recall': 0.9126506024096386, 'f_score': 0.849929873772791}, 'pregnancy_considerations': {'precision': 0.2608695652173913, 'recall': 0.42857142857142855, 'f_score': 0.3243243243243243}, 'device': {'precision': 0.59375, 'recall': 0.7037037037037037, 'f_score': 0.6440677966101694}}}}\""
+      ]
+     },
+     "execution_count": 100,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Cut off the 'full evaluation metrix: ' label and keep the text that follows it.\n",
+    "# NOTE: `eval` shadows the built-in of the same name; the name is kept because\n",
+    "# the cell that writes chia_bert_eval.json reads it.\n",
+    "label = 'full evaluation metrix: '\n",
+    "eval = last_eval.split(label)[1]\n",
+    "eval"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 101,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import ast\n",
+    "import json\n",
+    "\n",
+    "# `eval` holds the raw text cut out of the log line (a Python dict repr).\n",
+    "# Parse it before saving: dumping the string itself stores one JSON-encoded\n",
+    "# string, and json.load() then returns a str instead of the nested metrics dict.\n",
+    "# ast.literal_eval only accepts Python literals, so no code from the log is run.\n",
+    "eval_metrics = ast.literal_eval(eval) if isinstance(eval, str) else eval\n",
+    "\n",
+    "with open('../data/chia_bert_eval.json', 'w') as fd:\n",
+    "    json.dump(eval_metrics, fd, indent=4)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 137,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import ast\n",
+    "import json\n",
+    "\n",
+    "with open('../data/chia_bert_eval.json', 'r') as fd:\n",
+    "    eval_dict = json.load(fd)\n",
+    "\n",
+    "# The file may hold the metrics as a single JSON-encoded string (the dict repr\n",
+    "# taken from the log) rather than a JSON object; decode that form as well so the\n",
+    "# cells below can index the result by key.\n",
+    "if isinstance(eval_dict, str):\n",
+    "    eval_dict = ast.literal_eval(eval_dict)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 138,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'overall': {'acc': 0.8155, 'strict': {'precision': 0.6268788682581786, 'recall': 0.7022119511389898, 'f_score': 0.6624104640298973}, 'relax': {'precision': 0.7606837606837606, 'recall': 0.8520964014526247, 'f_score': 0.8037994394269697}}, 'category': {'strict': {'condition': {'precision': 0.6882690730106645, 'recall': 0.7704315886134068, 'f_score': 0.7270363951473136}, 'temporal': {'precision': 0.5223097112860893, 'recall': 0.6546052631578947, 'f_score': 0.5810218978102191}, 'procedure': {'precision': 0.5333333333333333, 'recall': 0.6775244299674267, 'f_score': 0.5968436154949784}, 'measurement': {'precision': 0.6423357664233577, 'recall': 0.6666666666666666, 'f_score': 0.654275092936803}, 'value': {'precision': 0.7126760563380282, 'recall': 0.7737003058103975, 'f_score': 0.7419354838709676}, 'observation': {'precision': 0.3333333333333333, 'recall': 0.2732919254658385, 'f_score': 0.3003412969283276}, 'person': {'precision': 0.7894736842105263, 'recall': 0.8633093525179856, 'f_score': 0.8247422680412372}, 'mood': {'precision': 0.25925925925925924, 'recall': 0.2153846153846154, 'f_score': 0.23529411764705882}, 'drug': {'precision': 0.6902887139107612, 'recall': 0.7921686746987951, 'f_score': 0.7377279102384292}, 'pregnancy_considerations': {'precision': 0.0, 'recall': 0.0, 'f_score': 0.0}, 'device': {'precision': 0.34375, 'recall': 0.4074074074074074, 'f_score': 0.37288135593220334}}, 'relax': {'condition': {'precision': 0.8252666119770303, 'recall': 0.9237832874196511, 'f_score': 0.8717504332755632}, 'temporal': {'precision': 0.6745406824146981, 'recall': 0.8453947368421053, 'f_score': 0.7503649635036497}, 'procedure': {'precision': 0.6666666666666666, 'recall': 0.8469055374592834, 'f_score': 0.7460545193687231}, 'measurement': {'precision': 0.8321167883211679, 'recall': 0.8636363636363636, 'f_score': 0.8475836431226766}, 'value': {'precision': 0.8225352112676056, 'recall': 0.8929663608562691, 'f_score': 0.8563049853372435}, 'observation': {'precision': 
0.45454545454545453, 'recall': 0.37267080745341613, 'f_score': 0.40955631399317405}, 'person': {'precision': 0.8026315789473685, 'recall': 0.8776978417266187, 'f_score': 0.8384879725085911}, 'mood': {'precision': 0.5185185185185185, 'recall': 0.4307692307692308, 'f_score': 0.47058823529411764}, 'drug': {'precision': 0.7952755905511811, 'recall': 0.9126506024096386, 'f_score': 0.849929873772791}, 'pregnancy_considerations': {'precision': 0.2608695652173913, 'recall': 0.42857142857142855, 'f_score': 0.3243243243243243}, 'device': {'precision': 0.59375, 'recall': 0.7037037037037037, 'f_score': 0.6440677966101694}}}}\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(eval_dict)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 108,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "eval_df = pd.DataFrame(columns=['type', 'prec_strict', 'rec_strict', 'f1_strict', 'prec_relaxed', 'rec_relaxed', 'f1_relaxed'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 109,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['Overall',\n",
+       " 'condition',\n",
+       " 'temporal',\n",
+       " 'procedure',\n",
+       " 'measurement',\n",
+       " 'value',\n",
+       " 'observation',\n",
+       " 'person',\n",
+       " 'mood',\n",
+       " 'drug',\n",
+       " 'pregnancy_considerations',\n",
+       " 'device']"
+      ]
+     },
+     "execution_count": 109,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Row labels: the aggregate 'Overall' entry first, then every category key of the strict results.\n",
+    "types = ['Overall', *eval_dict['category']['strict']]\n",
+    "types"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 110,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "eval_df['type'] = types"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 111,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Strict precision: the overall value first, then one value per category in dict order.\n",
+    "pre_strict = [eval_dict['overall']['strict']['precision']] + [\n",
+    "    scores['precision'] for scores in eval_dict['category']['strict'].values()\n",
+    "]\n",
+    "eval_df['prec_strict'] = pre_strict"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 112,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Strict recall: the overall value first, then one value per category in dict order.\n",
+    "rec_strict = [eval_dict['overall']['strict']['recall']] + [\n",
+    "    scores['recall'] for scores in eval_dict['category']['strict'].values()\n",
+    "]\n",
+    "eval_df['rec_strict'] = rec_strict"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 113,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Strict F1: the overall value first, then one value per category in dict order.\n",
+    "f1_strict = [eval_dict['overall']['strict']['f_score']] + [\n",
+    "    scores['f_score'] for scores in eval_dict['category']['strict'].values()\n",
+    "]\n",
+    "eval_df['f1_strict'] = f1_strict"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 114,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Relaxed precision, overall value first. The row labels in eval_df['type'] were taken from the\n",
+    "# 'strict' dict, so each category is looked up by its label instead of relying on the 'relax'\n",
+    "# dict listing its keys in the same order (a different order would misalign rows silently).\n",
+    "pre_relaxed = [eval_dict['overall']['relax']['precision']] + [\n",
+    "    eval_dict['category']['relax'][label]['precision'] for label in eval_df['type'].iloc[1:]\n",
+    "]\n",
+    "eval_df['prec_relaxed'] = pre_relaxed"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 115,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Relaxed recall, overall value first. The row labels in eval_df['type'] were taken from the\n",
+    "# 'strict' dict, so each category is looked up by its label instead of relying on the 'relax'\n",
+    "# dict listing its keys in the same order (a different order would misalign rows silently).\n",
+    "rec_relaxed = [eval_dict['overall']['relax']['recall']] + [\n",
+    "    eval_dict['category']['relax'][label]['recall'] for label in eval_df['type'].iloc[1:]\n",
+    "]\n",
+    "eval_df['rec_relaxed'] = rec_relaxed"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 116,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Relaxed F1, overall value first. The row labels in eval_df['type'] were taken from the\n",
+    "# 'strict' dict, so each category is looked up by its label instead of relying on the 'relax'\n",
+    "# dict listing its keys in the same order (a different order would misalign rows silently).\n",
+    "f1_relaxed = [eval_dict['overall']['relax']['f_score']] + [\n",
+    "    eval_dict['category']['relax'][label]['f_score'] for label in eval_df['type'].iloc[1:]\n",
+    "]\n",
+    "eval_df['f1_relaxed'] = f1_relaxed"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 117,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>type</th>\n",
+       "      <th>prec_strict</th>\n",
+       "      <th>rec_strict</th>\n",
+       "      <th>f1_strict</th>\n",
+       "      <th>prec_relaxed</th>\n",
+       "      <th>rec_relaxed</th>\n",
+       "      <th>f1_relaxed</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Overall</td>\n",
+       "      <td>0.626879</td>\n",
+       "      <td>0.702212</td>\n",
+       "      <td>0.662410</td>\n",
+       "      <td>0.760684</td>\n",
+       "      <td>0.852096</td>\n",
+       "      <td>0.803799</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>condition</td>\n",
+       "      <td>0.688269</td>\n",
+       "      <td>0.770432</td>\n",
+       "      <td>0.727036</td>\n",
+       "      <td>0.825267</td>\n",
+       "      <td>0.923783</td>\n",
+       "      <td>0.871750</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>temporal</td>\n",
+       "      <td>0.522310</td>\n",
+       "      <td>0.654605</td>\n",
+       "      <td>0.581022</td>\n",
+       "      <td>0.674541</td>\n",
+       "      <td>0.845395</td>\n",
+       "      <td>0.750365</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>procedure</td>\n",
+       "      <td>0.533333</td>\n",
+       "      <td>0.677524</td>\n",
+       "      <td>0.596844</td>\n",
+       "      <td>0.666667</td>\n",
+       "      <td>0.846906</td>\n",
+       "      <td>0.746055</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>measurement</td>\n",
+       "      <td>0.642336</td>\n",
+       "      <td>0.666667</td>\n",
+       "      <td>0.654275</td>\n",
+       "      <td>0.832117</td>\n",
+       "      <td>0.863636</td>\n",
+       "      <td>0.847584</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>value</td>\n",
+       "      <td>0.712676</td>\n",
+       "      <td>0.773700</td>\n",
+       "      <td>0.741935</td>\n",
+       "      <td>0.822535</td>\n",
+       "      <td>0.892966</td>\n",
+       "      <td>0.856305</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>observation</td>\n",
+       "      <td>0.333333</td>\n",
+       "      <td>0.273292</td>\n",
+       "      <td>0.300341</td>\n",
+       "      <td>0.454545</td>\n",
+       "      <td>0.372671</td>\n",
+       "      <td>0.409556</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>person</td>\n",
+       "      <td>0.789474</td>\n",
+       "      <td>0.863309</td>\n",
+       "      <td>0.824742</td>\n",
+       "      <td>0.802632</td>\n",
+       "      <td>0.877698</td>\n",
+       "      <td>0.838488</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>mood</td>\n",
+       "      <td>0.259259</td>\n",
+       "      <td>0.215385</td>\n",
+       "      <td>0.235294</td>\n",
+       "      <td>0.518519</td>\n",
+       "      <td>0.430769</td>\n",
+       "      <td>0.470588</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>drug</td>\n",
+       "      <td>0.690289</td>\n",
+       "      <td>0.792169</td>\n",
+       "      <td>0.737728</td>\n",
+       "      <td>0.795276</td>\n",
+       "      <td>0.912651</td>\n",
+       "      <td>0.849930</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10</th>\n",
+       "      <td>pregnancy_considerations</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.260870</td>\n",
+       "      <td>0.428571</td>\n",
+       "      <td>0.324324</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11</th>\n",
+       "      <td>device</td>\n",
+       "      <td>0.343750</td>\n",
+       "      <td>0.407407</td>\n",
+       "      <td>0.372881</td>\n",
+       "      <td>0.593750</td>\n",
+       "      <td>0.703704</td>\n",
+       "      <td>0.644068</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                        type  prec_strict  rec_strict  f1_strict  \\\n",
+       "0                    Overall     0.626879    0.702212   0.662410   \n",
+       "1                  condition     0.688269    0.770432   0.727036   \n",
+       "2                   temporal     0.522310    0.654605   0.581022   \n",
+       "3                  procedure     0.533333    0.677524   0.596844   \n",
+       "4                measurement     0.642336    0.666667   0.654275   \n",
+       "5                      value     0.712676    0.773700   0.741935   \n",
+       "6                observation     0.333333    0.273292   0.300341   \n",
+       "7                     person     0.789474    0.863309   0.824742   \n",
+       "8                       mood     0.259259    0.215385   0.235294   \n",
+       "9                       drug     0.690289    0.792169   0.737728   \n",
+       "10  pregnancy_considerations     0.000000    0.000000   0.000000   \n",
+       "11                    device     0.343750    0.407407   0.372881   \n",
+       "\n",
+       "    prec_relaxed  rec_relaxed  f1_relaxed  \n",
+       "0       0.760684     0.852096    0.803799  \n",
+       "1       0.825267     0.923783    0.871750  \n",
+       "2       0.674541     0.845395    0.750365  \n",
+       "3       0.666667     0.846906    0.746055  \n",
+       "4       0.832117     0.863636    0.847584  \n",
+       "5       0.822535     0.892966    0.856305  \n",
+       "6       0.454545     0.372671    0.409556  \n",
+       "7       0.802632     0.877698    0.838488  \n",
+       "8       0.518519     0.430769    0.470588  \n",
+       "9       0.795276     0.912651    0.849930  \n",
+       "10      0.260870     0.428571    0.324324  \n",
+       "11      0.593750     0.703704    0.644068  "
+      ]
+     },
+     "execution_count": 117,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "eval_df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 118,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>type</th>\n",
+       "      <th>prec_strict</th>\n",
+       "      <th>rec_strict</th>\n",
+       "      <th>f1_strict</th>\n",
+       "      <th>prec_relaxed</th>\n",
+       "      <th>rec_relaxed</th>\n",
+       "      <th>f1_relaxed</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>person</td>\n",
+       "      <td>0.789474</td>\n",
+       "      <td>0.863309</td>\n",
+       "      <td>0.824742</td>\n",
+       "      <td>0.802632</td>\n",
+       "      <td>0.877698</td>\n",
+       "      <td>0.838488</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>value</td>\n",
+       "      <td>0.712676</td>\n",
+       "      <td>0.773700</td>\n",
+       "      <td>0.741935</td>\n",
+       "      <td>0.822535</td>\n",
+       "      <td>0.892966</td>\n",
+       "      <td>0.856305</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>drug</td>\n",
+       "      <td>0.690289</td>\n",
+       "      <td>0.792169</td>\n",
+       "      <td>0.737728</td>\n",
+       "      <td>0.795276</td>\n",
+       "      <td>0.912651</td>\n",
+       "      <td>0.849930</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>condition</td>\n",
+       "      <td>0.688269</td>\n",
+       "      <td>0.770432</td>\n",
+       "      <td>0.727036</td>\n",
+       "      <td>0.825267</td>\n",
+       "      <td>0.923783</td>\n",
+       "      <td>0.871750</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>measurement</td>\n",
+       "      <td>0.642336</td>\n",
+       "      <td>0.666667</td>\n",
+       "      <td>0.654275</td>\n",
+       "      <td>0.832117</td>\n",
+       "      <td>0.863636</td>\n",
+       "      <td>0.847584</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>procedure</td>\n",
+       "      <td>0.533333</td>\n",
+       "      <td>0.677524</td>\n",
+       "      <td>0.596844</td>\n",
+       "      <td>0.666667</td>\n",
+       "      <td>0.846906</td>\n",
+       "      <td>0.746055</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>temporal</td>\n",
+       "      <td>0.522310</td>\n",
+       "      <td>0.654605</td>\n",
+       "      <td>0.581022</td>\n",
+       "      <td>0.674541</td>\n",
+       "      <td>0.845395</td>\n",
+       "      <td>0.750365</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>device</td>\n",
+       "      <td>0.343750</td>\n",
+       "      <td>0.407407</td>\n",
+       "      <td>0.372881</td>\n",
+       "      <td>0.593750</td>\n",
+       "      <td>0.703704</td>\n",
+       "      <td>0.644068</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>observation</td>\n",
+       "      <td>0.333333</td>\n",
+       "      <td>0.273292</td>\n",
+       "      <td>0.300341</td>\n",
+       "      <td>0.454545</td>\n",
+       "      <td>0.372671</td>\n",
+       "      <td>0.409556</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>mood</td>\n",
+       "      <td>0.259259</td>\n",
+       "      <td>0.215385</td>\n",
+       "      <td>0.235294</td>\n",
+       "      <td>0.518519</td>\n",
+       "      <td>0.430769</td>\n",
+       "      <td>0.470588</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10</th>\n",
+       "      <td>pregnancy_considerations</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.260870</td>\n",
+       "      <td>0.428571</td>\n",
+       "      <td>0.324324</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                        type  prec_strict  rec_strict  f1_strict  \\\n",
+       "0                     person     0.789474    0.863309   0.824742   \n",
+       "1                      value     0.712676    0.773700   0.741935   \n",
+       "2                       drug     0.690289    0.792169   0.737728   \n",
+       "3                  condition     0.688269    0.770432   0.727036   \n",
+       "4                measurement     0.642336    0.666667   0.654275   \n",
+       "5                  procedure     0.533333    0.677524   0.596844   \n",
+       "6                   temporal     0.522310    0.654605   0.581022   \n",
+       "7                     device     0.343750    0.407407   0.372881   \n",
+       "8                observation     0.333333    0.273292   0.300341   \n",
+       "9                       mood     0.259259    0.215385   0.235294   \n",
+       "10  pregnancy_considerations     0.000000    0.000000   0.000000   \n",
+       "\n",
+       "    prec_relaxed  rec_relaxed  f1_relaxed  \n",
+       "0       0.802632     0.877698    0.838488  \n",
+       "1       0.822535     0.892966    0.856305  \n",
+       "2       0.795276     0.912651    0.849930  \n",
+       "3       0.825267     0.923783    0.871750  \n",
+       "4       0.832117     0.863636    0.847584  \n",
+       "5       0.666667     0.846906    0.746055  \n",
+       "6       0.674541     0.845395    0.750365  \n",
+       "7       0.593750     0.703704    0.644068  \n",
+       "8       0.454545     0.372671    0.409556  \n",
+       "9       0.518519     0.430769    0.470588  \n",
+       "10      0.260870     0.428571    0.324324  "
+      ]
+     },
+     "execution_count": 118,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Rank the per-category rows by strict F1, best first; row 0 ('Overall') is left out.\n",
+    "per_category = eval_df.iloc[1:]\n",
+    "per_category.sort_values('f1_strict', ascending=False).reset_index(drop=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 119,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "image/png": "",
+      "text/plain": [
+       "<Figure size 640x480 with 1 Axes>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "# Grouped bars of strict vs. relaxed F1 for every row of eval_df.\n",
+    "# DataFrame.plot returns the Axes it drew on, so the x label is cleared on that Axes directly\n",
+    "# rather than through pyplot's implicit current axes. The title is a plain string: it has no\n",
+    "# placeholders, so the f-prefix was doing nothing.\n",
+    "ax = eval_df.plot(x='type', y=['f1_strict', 'f1_relaxed'], title='Strict and Relax F1_Score for Bert base model on Chia', kind='bar', rot=90)\n",
+    "ax.set_xlabel('')\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 139,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[\"        full evaluation metrix: {'overall': {'acc': 0.7955, 'strict': {'precision': 0.5658914728682171, 'recall': 0.6266094420600858, 'f_score': 0.594704684317719}, 'relax': {'precision': 0.7185450208706022, 'recall': 0.7956421261142291, 'f_score': 0.7551308162306125}}, 'category': {'strict': {'condition': {'precision': 0.5995575221238938, 'recall': 0.7465564738292011, 'f_score': 0.6650306748466257}, 'drug': {'precision': 0.5701559020044543, 'recall': 0.7710843373493976, 'f_score': 0.6555697823303457}, 'procedure': {'precision': 0.4103194103194103, 'recall': 0.5439739413680782, 'f_score': 0.4677871148459384}, 'temporal': {'precision': 0.47619047619047616, 'recall': 0.4605263157894737, 'f_score': 0.46822742474916385}, 'measurement': {'precision': 0.5236486486486487, 'recall': 0.5871212121212122, 'f_score': 0.5535714285714286}, 'value': {'precision': 0.6628895184135978, 'recall': 0.7155963302752294, 'f_score': 0.6882352941176472}, 'observation': {'precision': 0.5652173913043478, 'recall': 0.2422360248447205, 'f_score': 0.3391304347826087}, 'person': {'precision': 0.7642276422764228, 'recall': 0.6762589928057554, 'f_score': 0.7175572519083969}, 'pregnancy_considerations': {'precision': 0.0, 'recall': 0.0, 'f_score': 0.0}}, 'relax': {'condition': {'precision': 0.7463126843657817, 'recall': 0.9292929292929293, 'f_score': 0.8278118609406953}, 'drug': {'precision': 0.6859688195991092, 'recall': 0.927710843373494, 'f_score': 0.7887323943661971}, 'procedure': {'precision': 0.5847665847665847, 'recall': 0.7752442996742671, 'f_score': 0.6666666666666666}, 'temporal': {'precision': 0.6768707482993197, 'recall': 0.6546052631578947, 'f_score': 0.6655518394648829}, 'measurement': {'precision': 0.75, 'recall': 0.8409090909090909, 'f_score': 0.7928571428571429}, 'value': {'precision': 0.8101983002832861, 'recall': 0.8746177370030581, 'f_score': 0.8411764705882353}, 'observation': {'precision': 0.6811594202898551, 'recall': 0.2919254658385093, 'f_score': 
0.408695652173913}, 'person': {'precision': 0.7723577235772358, 'recall': 0.6834532374100719, 'f_score': 0.7251908396946565}, 'pregnancy_considerations': {'precision': 0.42857142857142855, 'recall': 0.21428571428571427, 'f_score': 0.2857142857142857}}}}\",\n",
+       " \"        full evaluation metrix: {'overall': {'acc': 0.7878, 'strict': {'precision': 0.5638719915366305, 'recall': 0.703862660944206, 'f_score': 0.6261380323054332}, 'relax': {'precision': 0.6857974080930971, 'recall': 0.8560581049851436, 'f_score': 0.7615271659324522}}, 'category': {'strict': {'condition': {'precision': 0.5894085281980743, 'recall': 0.7869605142332415, 'f_score': 0.6740070782540307}, 'temporal': {'precision': 0.4962962962962963, 'recall': 0.6611842105263158, 'f_score': 0.5669957686882934}, 'procedure': {'precision': 0.4050387596899225, 'recall': 0.6807817589576547, 'f_score': 0.5078979343863912}, 'measurement': {'precision': 0.584717607973422, 'recall': 0.6666666666666666, 'f_score': 0.6230088495575221}, 'value': {'precision': 0.6963788300835655, 'recall': 0.764525993883792, 'f_score': 0.7288629737609329}, 'observation': {'precision': 0.366412213740458, 'recall': 0.2981366459627329, 'f_score': 0.3287671232876712}, 'person': {'precision': 0.7349397590361446, 'recall': 0.8776978417266187, 'f_score': 0.7999999999999999}, 'drug': {'precision': 0.6397058823529411, 'recall': 0.786144578313253, 'f_score': 0.7054054054054053}, 'device': {'precision': 0.0, 'recall': 0.0, 'f_score': 0.0}, 'mood': {'precision': 0.25, 'recall': 0.12307692307692308, 'f_score': 0.16494845360824742}}, 'relax': {'condition': {'precision': 0.7070151306740028, 'recall': 0.9439853076216712, 'f_score': 0.8084939048368069}, 'temporal': {'precision': 0.6320987654320988, 'recall': 0.8421052631578947, 'f_score': 0.7221438645980254}, 'procedure': {'precision': 0.5193798449612403, 'recall': 0.8729641693811075, 'f_score': 0.6512758201701094}, 'measurement': {'precision': 0.7574750830564784, 'recall': 0.8636363636363636, 'f_score': 0.8070796460176991}, 'value': {'precision': 0.8189415041782729, 'recall': 0.8990825688073395, 'f_score': 0.8571428571428572}, 'observation': {'precision': 0.48091603053435117, 'recall': 0.391304347826087, 'f_score': 0.43150684931506855}, 'person': 
{'precision': 0.7409638554216867, 'recall': 0.8848920863309353, 'f_score': 0.8065573770491803}, 'drug': {'precision': 0.7573529411764706, 'recall': 0.9307228915662651, 'f_score': 0.8351351351351352}, 'device': {'precision': 0.7777777777777778, 'recall': 0.25925925925925924, 'f_score': 0.3888888888888889}, 'mood': {'precision': 0.53125, 'recall': 0.26153846153846155, 'f_score': 0.3505154639175258}}}}\",\n",
+       " \"        full evaluation metrix: {'overall': {'acc': 0.814, 'strict': {'precision': 0.6220472440944882, 'recall': 0.7041928029052492, 'f_score': 0.6605760297305667}, 'relax': {'precision': 0.7430737824438611, 'recall': 0.8412017167381974, 'f_score': 0.7890987921957261}}, 'category': {'strict': {'condition': {'precision': 0.6640986132511556, 'recall': 0.7915518824609734, 'f_score': 0.7222454964390448}, 'temporal': {'precision': 0.5188916876574308, 'recall': 0.6776315789473685, 'f_score': 0.587731811697575}, 'procedure': {'precision': 0.4678111587982833, 'recall': 0.7100977198697068, 'f_score': 0.5640362225097024}, 'measurement': {'precision': 0.6589147286821705, 'recall': 0.6439393939393939, 'f_score': 0.6513409961685823}, 'value': {'precision': 0.7697160883280757, 'recall': 0.746177370030581, 'f_score': 0.7577639751552795}, 'observation': {'precision': 0.5757575757575758, 'recall': 0.2360248447204969, 'f_score': 0.33480176211453744}, 'person': {'precision': 0.8297872340425532, 'recall': 0.841726618705036, 'f_score': 0.8357142857142857}, 'drug': {'precision': 0.7280701754385965, 'recall': 0.75, 'f_score': 0.7388724035608308}, 'pregnancy_considerations': {'precision': 0.0, 'recall': 0.0, 'f_score': 0.0}, 'device': {'precision': 0.3939393939393939, 'recall': 0.48148148148148145, 'f_score': 0.43333333333333324}, 'mood': {'precision': 0.2807017543859649, 'recall': 0.24615384615384617, 'f_score': 0.26229508196721313}}, 'relax': {'condition': {'precision': 0.7850539291217258, 'recall': 0.9357208448117539, 'f_score': 0.8537913699204022}, 'temporal': {'precision': 0.6599496221662469, 'recall': 0.8618421052631579, 'f_score': 0.7475035663338089}, 'procedure': {'precision': 0.5772532188841202, 'recall': 0.8762214983713354, 'f_score': 0.6959896507115135}, 'measurement': {'precision': 0.8294573643410853, 'recall': 0.8106060606060606, 'f_score': 0.8199233716475095}, 'value': {'precision': 0.886435331230284, 'recall': 0.8593272171253823, 'f_score': 0.8726708074534162}, 
'observation': {'precision': 0.6818181818181818, 'recall': 0.2795031055900621, 'f_score': 0.3964757709251101}, 'person': {'precision': 0.8368794326241135, 'recall': 0.8489208633093526, 'f_score': 0.8428571428571429}, 'drug': {'precision': 0.8245614035087719, 'recall': 0.8493975903614458, 'f_score': 0.8367952522255192}, 'pregnancy_considerations': {'precision': 0.16666666666666666, 'recall': 0.6428571428571429, 'f_score': 0.2647058823529412}, 'device': {'precision': 0.5454545454545454, 'recall': 0.6666666666666666, 'f_score': 0.6}, 'mood': {'precision': 0.543859649122807, 'recall': 0.47692307692307695, 'f_score': 0.5081967213114754}}}}\",\n",
+       " \"        full evaluation metrix: {'overall': {'acc': 0.8285, 'strict': {'precision': 0.6570648878107944, 'recall': 0.7154176295807197, 'f_score': 0.6850007902639481}, 'relax': {'precision': 0.7786537295330503, 'recall': 0.8478045559590623, 'f_score': 0.8117591275486012}}, 'category': {'strict': {'condition': {'precision': 0.7032418952618454, 'recall': 0.7768595041322314, 'f_score': 0.7382198952879582}, 'temporal': {'precision': 0.5580110497237569, 'recall': 0.6644736842105263, 'f_score': 0.6066066066066066}, 'procedure': {'precision': 0.5608465608465608, 'recall': 0.6905537459283387, 'f_score': 0.618978102189781}, 'measurement': {'precision': 0.6762589928057554, 'recall': 0.7121212121212122, 'f_score': 0.6937269372693727}, 'value': {'precision': 0.7159420289855073, 'recall': 0.7553516819571865, 'f_score': 0.7351190476190476}, 'observation': {'precision': 0.5280898876404494, 'recall': 0.2919254658385093, 'f_score': 0.37599999999999995}, 'person': {'precision': 0.7469135802469136, 'recall': 0.8705035971223022, 'f_score': 0.8039867109634551}, 'drug': {'precision': 0.7176165803108808, 'recall': 0.8343373493975904, 'f_score': 0.7715877437325905}, 'pregnancy_considerations': {'precision': 0.0, 'recall': 0.0, 'f_score': 0.0}, 'device': {'precision': 0.5185185185185185, 'recall': 0.5185185185185185, 'f_score': 0.5185185185185185}, 'mood': {'precision': 0.24074074074074073, 'recall': 0.2, 'f_score': 0.2184873949579832}}, 'relax': {'condition': {'precision': 0.8337489609310058, 'recall': 0.921028466483012, 'f_score': 0.87521815008726}, 'temporal': {'precision': 0.6933701657458563, 'recall': 0.8256578947368421, 'f_score': 0.7537537537537538}, 'procedure': {'precision': 0.6772486772486772, 'recall': 0.8338762214983714, 'f_score': 0.7474452554744526}, 'measurement': {'precision': 0.8057553956834532, 'recall': 0.8484848484848485, 'f_score': 0.8265682656826568}, 'value': {'precision': 0.8521739130434782, 'recall': 0.8990825688073395, 'f_score': 0.8750000000000001}, 
'observation': {'precision': 0.6067415730337079, 'recall': 0.33540372670807456, 'f_score': 0.432}, 'person': {'precision': 0.7592592592592593, 'recall': 0.8848920863309353, 'f_score': 0.8172757475083057}, 'drug': {'precision': 0.8031088082901554, 'recall': 0.9337349397590361, 'f_score': 0.8635097493036212}, 'pregnancy_considerations': {'precision': 0.42857142857142855, 'recall': 0.42857142857142855, 'f_score': 0.42857142857142855}, 'device': {'precision': 0.7037037037037037, 'recall': 0.7037037037037037, 'f_score': 0.7037037037037037}, 'mood': {'precision': 0.5185185185185185, 'recall': 0.4307692307692308, 'f_score': 0.47058823529411764}}}}\"]"
+      ]
+     },
+     "execution_count": 139,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Roberta model case\n",
+    "# Read the whole Roberta log, then keep only the lines that contain the metrics marker.\n",
+    "with open('../data/chia_roberta_log.txt') as fd:\n",
+    "    log = fd.read()\n",
+    "\n",
+    "lines = log.split('\\n')\n",
+    "eval_lines = list(filter(lambda entry: 'full evaluation metrix' in entry, lines))\n",
+    "\n",
+    "eval_lines"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 140,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "\"        full evaluation metrix: {'overall': {'acc': 0.8285, 'strict': {'precision': 0.6570648878107944, 'recall': 0.7154176295807197, 'f_score': 0.6850007902639481}, 'relax': {'precision': 0.7786537295330503, 'recall': 0.8478045559590623, 'f_score': 0.8117591275486012}}, 'category': {'strict': {'condition': {'precision': 0.7032418952618454, 'recall': 0.7768595041322314, 'f_score': 0.7382198952879582}, 'temporal': {'precision': 0.5580110497237569, 'recall': 0.6644736842105263, 'f_score': 0.6066066066066066}, 'procedure': {'precision': 0.5608465608465608, 'recall': 0.6905537459283387, 'f_score': 0.618978102189781}, 'measurement': {'precision': 0.6762589928057554, 'recall': 0.7121212121212122, 'f_score': 0.6937269372693727}, 'value': {'precision': 0.7159420289855073, 'recall': 0.7553516819571865, 'f_score': 0.7351190476190476}, 'observation': {'precision': 0.5280898876404494, 'recall': 0.2919254658385093, 'f_score': 0.37599999999999995}, 'person': {'precision': 0.7469135802469136, 'recall': 0.8705035971223022, 'f_score': 0.8039867109634551}, 'drug': {'precision': 0.7176165803108808, 'recall': 0.8343373493975904, 'f_score': 0.7715877437325905}, 'pregnancy_considerations': {'precision': 0.0, 'recall': 0.0, 'f_score': 0.0}, 'device': {'precision': 0.5185185185185185, 'recall': 0.5185185185185185, 'f_score': 0.5185185185185185}, 'mood': {'precision': 0.24074074074074073, 'recall': 0.2, 'f_score': 0.2184873949579832}}, 'relax': {'condition': {'precision': 0.8337489609310058, 'recall': 0.921028466483012, 'f_score': 0.87521815008726}, 'temporal': {'precision': 0.6933701657458563, 'recall': 0.8256578947368421, 'f_score': 0.7537537537537538}, 'procedure': {'precision': 0.6772486772486772, 'recall': 0.8338762214983714, 'f_score': 0.7474452554744526}, 'measurement': {'precision': 0.8057553956834532, 'recall': 0.8484848484848485, 'f_score': 0.8265682656826568}, 'value': {'precision': 0.8521739130434782, 'recall': 0.8990825688073395, 'f_score': 0.8750000000000001}, 
'observation': {'precision': 0.6067415730337079, 'recall': 0.33540372670807456, 'f_score': 0.432}, 'person': {'precision': 0.7592592592592593, 'recall': 0.8848920863309353, 'f_score': 0.8172757475083057}, 'drug': {'precision': 0.8031088082901554, 'recall': 0.9337349397590361, 'f_score': 0.8635097493036212}, 'pregnancy_considerations': {'precision': 0.42857142857142855, 'recall': 0.42857142857142855, 'f_score': 0.42857142857142855}, 'device': {'precision': 0.7037037037037037, 'recall': 0.7037037037037037, 'f_score': 0.7037037037037037}, 'mood': {'precision': 0.5185185185185185, 'recall': 0.4307692307692308, 'f_score': 0.47058823529411764}}}}\""
+      ]
+     },
+     "execution_count": 140,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# The last matching line is the one analysed below. Fail with a readable message instead of\n",
+    "# a bare 'list index out of range' when the log contained no such line.\n",
+    "if not eval_lines:\n",
+    "    raise IndexError(\"no 'full evaluation metrix' line found in the log\")\n",
+    "last_eval = eval_lines[-1]\n",
+    "last_eval"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 141,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "\"{'overall': {'acc': 0.8285, 'strict': {'precision': 0.6570648878107944, 'recall': 0.7154176295807197, 'f_score': 0.6850007902639481}, 'relax': {'precision': 0.7786537295330503, 'recall': 0.8478045559590623, 'f_score': 0.8117591275486012}}, 'category': {'strict': {'condition': {'precision': 0.7032418952618454, 'recall': 0.7768595041322314, 'f_score': 0.7382198952879582}, 'temporal': {'precision': 0.5580110497237569, 'recall': 0.6644736842105263, 'f_score': 0.6066066066066066}, 'procedure': {'precision': 0.5608465608465608, 'recall': 0.6905537459283387, 'f_score': 0.618978102189781}, 'measurement': {'precision': 0.6762589928057554, 'recall': 0.7121212121212122, 'f_score': 0.6937269372693727}, 'value': {'precision': 0.7159420289855073, 'recall': 0.7553516819571865, 'f_score': 0.7351190476190476}, 'observation': {'precision': 0.5280898876404494, 'recall': 0.2919254658385093, 'f_score': 0.37599999999999995}, 'person': {'precision': 0.7469135802469136, 'recall': 0.8705035971223022, 'f_score': 0.8039867109634551}, 'drug': {'precision': 0.7176165803108808, 'recall': 0.8343373493975904, 'f_score': 0.7715877437325905}, 'pregnancy_considerations': {'precision': 0.0, 'recall': 0.0, 'f_score': 0.0}, 'device': {'precision': 0.5185185185185185, 'recall': 0.5185185185185185, 'f_score': 0.5185185185185185}, 'mood': {'precision': 0.24074074074074073, 'recall': 0.2, 'f_score': 0.2184873949579832}}, 'relax': {'condition': {'precision': 0.8337489609310058, 'recall': 0.921028466483012, 'f_score': 0.87521815008726}, 'temporal': {'precision': 0.6933701657458563, 'recall': 0.8256578947368421, 'f_score': 0.7537537537537538}, 'procedure': {'precision': 0.6772486772486772, 'recall': 0.8338762214983714, 'f_score': 0.7474452554744526}, 'measurement': {'precision': 0.8057553956834532, 'recall': 0.8484848484848485, 'f_score': 0.8265682656826568}, 'value': {'precision': 0.8521739130434782, 'recall': 0.8990825688073395, 'f_score': 0.8750000000000001}, 'observation': {'precision': 
0.6067415730337079, 'recall': 0.33540372670807456, 'f_score': 0.432}, 'person': {'precision': 0.7592592592592593, 'recall': 0.8848920863309353, 'f_score': 0.8172757475083057}, 'drug': {'precision': 0.8031088082901554, 'recall': 0.9337349397590361, 'f_score': 0.8635097493036212}, 'pregnancy_considerations': {'precision': 0.42857142857142855, 'recall': 0.42857142857142855, 'f_score': 0.42857142857142855}, 'device': {'precision': 0.7037037037037037, 'recall': 0.7037037037037037, 'f_score': 0.7037037037037037}, 'mood': {'precision': 0.5185185185185185, 'recall': 0.4307692307692308, 'f_score': 0.47058823529411764}}}}\""
+      ]
+     },
+     "execution_count": 141,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# The text after the log prefix is the metrics dict, still as a string.\n",
+    "log_prefix = 'full evaluation metrix: '\n",
+    "eval_roberta = last_eval.split(log_prefix)[1]\n",
+    "eval_roberta"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 155,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import ast\n",
+    "import json\n",
+    "\n",
+    "# eval_roberta is the text of a Python dict literal (single-quoted keys),\n",
+    "# not JSON. Parse it before dumping: json.dump on the raw string would write\n",
+    "# one quoted JSON string, and json.load would then return a str, which\n",
+    "# breaks the eval_dict_roberta['category'] lookups further down.\n",
+    "eval_metrics_roberta = ast.literal_eval(eval_roberta)\n",
+    "with open('../data/chia_roberta_eval.json', 'w') as fd:\n",
+    "    json.dump(eval_metrics_roberta, fd, indent=4)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 156,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read the saved metrics back from disk.\n",
+    "with open('../data/chia_roberta_eval.json', 'r') as metrics_file:\n",
+    "    eval_dict_roberta = json.loads(metrics_file.read())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 157,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Empty results table; the following cells fill it in one column at a time.\n",
+    "eval_df_roberta = pd.DataFrame(\n",
+    "    columns=[\n",
+    "        'type',\n",
+    "        'prec_strict', 'rec_strict', 'f1_strict',\n",
+    "        'prec_relaxed', 'rec_relaxed', 'f1_relaxed',\n",
+    "    ]\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 158,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['Overall',\n",
+       " 'condition',\n",
+       " 'temporal',\n",
+       " 'procedure',\n",
+       " 'measurement',\n",
+       " 'value',\n",
+       " 'observation',\n",
+       " 'person',\n",
+       " 'drug',\n",
+       " 'pregnancy_considerations',\n",
+       " 'device',\n",
+       " 'mood']"
+      ]
+     },
+     "execution_count": 158,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Row labels: 'Overall' first, then every category key of the strict results.\n",
+    "types_roberta = ['Overall', *eval_dict_roberta['category']['strict']]\n",
+    "types_roberta"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 166,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Fill the label column (assign returns a new frame, so rebind the name).\n",
+    "eval_df_roberta = eval_df_roberta.assign(type=types_roberta)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 159,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Strict precision: overall score first, then one value per category.\n",
+    "pre_strict = [\n",
+    "    eval_dict_roberta['overall']['strict']['precision'],\n",
+    "    *(scores['precision'] for scores in eval_dict_roberta['category']['strict'].values()),\n",
+    "]\n",
+    "eval_df_roberta['prec_strict'] = pre_strict"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 160,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Strict recall: overall score first, then one value per category.\n",
+    "rec_strict = [\n",
+    "    eval_dict_roberta['overall']['strict']['recall'],\n",
+    "    *(scores['recall'] for scores in eval_dict_roberta['category']['strict'].values()),\n",
+    "]\n",
+    "eval_df_roberta['rec_strict'] = rec_strict"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 161,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Strict F1: overall score first, then one value per category.\n",
+    "f1_strict = [\n",
+    "    eval_dict_roberta['overall']['strict']['f_score'],\n",
+    "    *(scores['f_score'] for scores in eval_dict_roberta['category']['strict'].values()),\n",
+    "]\n",
+    "eval_df_roberta['f1_strict'] = f1_strict"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 162,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Relaxed precision: overall score first, then one value per category.\n",
+    "# Categories are looked up by the strict-result keys, the same keys that\n",
+    "# label the rows, so each value lands on its own row even if the 'relax'\n",
+    "# dict lists its categories in a different order.\n",
+    "relax_by_type = eval_dict_roberta['category']['relax']\n",
+    "pre_relaxed = [\n",
+    "    eval_dict_roberta['overall']['relax']['precision'],\n",
+    "    *(relax_by_type[label]['precision'] for label in eval_dict_roberta['category']['strict']),\n",
+    "]\n",
+    "eval_df_roberta['prec_relaxed'] = pre_relaxed"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 163,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Relaxed recall: overall score first, then one value per category.\n",
+    "# Categories are looked up by the strict-result keys, the same keys that\n",
+    "# label the rows, so each value lands on its own row even if the 'relax'\n",
+    "# dict lists its categories in a different order.\n",
+    "relax_by_type = eval_dict_roberta['category']['relax']\n",
+    "rec_relaxed = [\n",
+    "    eval_dict_roberta['overall']['relax']['recall'],\n",
+    "    *(relax_by_type[label]['recall'] for label in eval_dict_roberta['category']['strict']),\n",
+    "]\n",
+    "eval_df_roberta['rec_relaxed'] = rec_relaxed"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 164,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Relaxed F1: overall score first, then one value per category.\n",
+    "# Categories are looked up by the strict-result keys, the same keys that\n",
+    "# label the rows, so each value lands on its own row even if the 'relax'\n",
+    "# dict lists its categories in a different order.\n",
+    "relax_by_type = eval_dict_roberta['category']['relax']\n",
+    "f1_relaxed = [\n",
+    "    eval_dict_roberta['overall']['relax']['f_score'],\n",
+    "    *(relax_by_type[label]['f_score'] for label in eval_dict_roberta['category']['strict']),\n",
+    "]\n",
+    "eval_df_roberta['f1_relaxed'] = f1_relaxed"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 167,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>type</th>\n",
+       "      <th>prec_strict</th>\n",
+       "      <th>rec_strict</th>\n",
+       "      <th>f1_strict</th>\n",
+       "      <th>prec_relaxed</th>\n",
+       "      <th>rec_relaxed</th>\n",
+       "      <th>f1_relaxed</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Overall</td>\n",
+       "      <td>0.657065</td>\n",
+       "      <td>0.715418</td>\n",
+       "      <td>0.685001</td>\n",
+       "      <td>0.778654</td>\n",
+       "      <td>0.847805</td>\n",
+       "      <td>0.811759</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>condition</td>\n",
+       "      <td>0.703242</td>\n",
+       "      <td>0.776860</td>\n",
+       "      <td>0.738220</td>\n",
+       "      <td>0.833749</td>\n",
+       "      <td>0.921028</td>\n",
+       "      <td>0.875218</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>temporal</td>\n",
+       "      <td>0.558011</td>\n",
+       "      <td>0.664474</td>\n",
+       "      <td>0.606607</td>\n",
+       "      <td>0.693370</td>\n",
+       "      <td>0.825658</td>\n",
+       "      <td>0.753754</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>procedure</td>\n",
+       "      <td>0.560847</td>\n",
+       "      <td>0.690554</td>\n",
+       "      <td>0.618978</td>\n",
+       "      <td>0.677249</td>\n",
+       "      <td>0.833876</td>\n",
+       "      <td>0.747445</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>measurement</td>\n",
+       "      <td>0.676259</td>\n",
+       "      <td>0.712121</td>\n",
+       "      <td>0.693727</td>\n",
+       "      <td>0.805755</td>\n",
+       "      <td>0.848485</td>\n",
+       "      <td>0.826568</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>value</td>\n",
+       "      <td>0.715942</td>\n",
+       "      <td>0.755352</td>\n",
+       "      <td>0.735119</td>\n",
+       "      <td>0.852174</td>\n",
+       "      <td>0.899083</td>\n",
+       "      <td>0.875000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>observation</td>\n",
+       "      <td>0.528090</td>\n",
+       "      <td>0.291925</td>\n",
+       "      <td>0.376000</td>\n",
+       "      <td>0.606742</td>\n",
+       "      <td>0.335404</td>\n",
+       "      <td>0.432000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>person</td>\n",
+       "      <td>0.746914</td>\n",
+       "      <td>0.870504</td>\n",
+       "      <td>0.803987</td>\n",
+       "      <td>0.759259</td>\n",
+       "      <td>0.884892</td>\n",
+       "      <td>0.817276</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>drug</td>\n",
+       "      <td>0.717617</td>\n",
+       "      <td>0.834337</td>\n",
+       "      <td>0.771588</td>\n",
+       "      <td>0.803109</td>\n",
+       "      <td>0.933735</td>\n",
+       "      <td>0.863510</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>pregnancy_considerations</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.428571</td>\n",
+       "      <td>0.428571</td>\n",
+       "      <td>0.428571</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10</th>\n",
+       "      <td>device</td>\n",
+       "      <td>0.518519</td>\n",
+       "      <td>0.518519</td>\n",
+       "      <td>0.518519</td>\n",
+       "      <td>0.703704</td>\n",
+       "      <td>0.703704</td>\n",
+       "      <td>0.703704</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11</th>\n",
+       "      <td>mood</td>\n",
+       "      <td>0.240741</td>\n",
+       "      <td>0.200000</td>\n",
+       "      <td>0.218487</td>\n",
+       "      <td>0.518519</td>\n",
+       "      <td>0.430769</td>\n",
+       "      <td>0.470588</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                        type  prec_strict  rec_strict  f1_strict  \\\n",
+       "0                    Overall     0.657065    0.715418   0.685001   \n",
+       "1                  condition     0.703242    0.776860   0.738220   \n",
+       "2                   temporal     0.558011    0.664474   0.606607   \n",
+       "3                  procedure     0.560847    0.690554   0.618978   \n",
+       "4                measurement     0.676259    0.712121   0.693727   \n",
+       "5                      value     0.715942    0.755352   0.735119   \n",
+       "6                observation     0.528090    0.291925   0.376000   \n",
+       "7                     person     0.746914    0.870504   0.803987   \n",
+       "8                       drug     0.717617    0.834337   0.771588   \n",
+       "9   pregnancy_considerations     0.000000    0.000000   0.000000   \n",
+       "10                    device     0.518519    0.518519   0.518519   \n",
+       "11                      mood     0.240741    0.200000   0.218487   \n",
+       "\n",
+       "    prec_relaxed  rec_relaxed  f1_relaxed  \n",
+       "0       0.778654     0.847805    0.811759  \n",
+       "1       0.833749     0.921028    0.875218  \n",
+       "2       0.693370     0.825658    0.753754  \n",
+       "3       0.677249     0.833876    0.747445  \n",
+       "4       0.805755     0.848485    0.826568  \n",
+       "5       0.852174     0.899083    0.875000  \n",
+       "6       0.606742     0.335404    0.432000  \n",
+       "7       0.759259     0.884892    0.817276  \n",
+       "8       0.803109     0.933735    0.863510  \n",
+       "9       0.428571     0.428571    0.428571  \n",
+       "10      0.703704     0.703704    0.703704  \n",
+       "11      0.518519     0.430769    0.470588  "
+      ]
+     },
+     "execution_count": 167,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Assembled table: row 0 is the overall score, the other rows are per category.\n",
+    "eval_df_roberta"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 168,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>type</th>\n",
+       "      <th>prec_strict</th>\n",
+       "      <th>rec_strict</th>\n",
+       "      <th>f1_strict</th>\n",
+       "      <th>prec_relaxed</th>\n",
+       "      <th>rec_relaxed</th>\n",
+       "      <th>f1_relaxed</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>person</td>\n",
+       "      <td>0.746914</td>\n",
+       "      <td>0.870504</td>\n",
+       "      <td>0.803987</td>\n",
+       "      <td>0.759259</td>\n",
+       "      <td>0.884892</td>\n",
+       "      <td>0.817276</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>drug</td>\n",
+       "      <td>0.717617</td>\n",
+       "      <td>0.834337</td>\n",
+       "      <td>0.771588</td>\n",
+       "      <td>0.803109</td>\n",
+       "      <td>0.933735</td>\n",
+       "      <td>0.863510</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>condition</td>\n",
+       "      <td>0.703242</td>\n",
+       "      <td>0.776860</td>\n",
+       "      <td>0.738220</td>\n",
+       "      <td>0.833749</td>\n",
+       "      <td>0.921028</td>\n",
+       "      <td>0.875218</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>value</td>\n",
+       "      <td>0.715942</td>\n",
+       "      <td>0.755352</td>\n",
+       "      <td>0.735119</td>\n",
+       "      <td>0.852174</td>\n",
+       "      <td>0.899083</td>\n",
+       "      <td>0.875000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>measurement</td>\n",
+       "      <td>0.676259</td>\n",
+       "      <td>0.712121</td>\n",
+       "      <td>0.693727</td>\n",
+       "      <td>0.805755</td>\n",
+       "      <td>0.848485</td>\n",
+       "      <td>0.826568</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>procedure</td>\n",
+       "      <td>0.560847</td>\n",
+       "      <td>0.690554</td>\n",
+       "      <td>0.618978</td>\n",
+       "      <td>0.677249</td>\n",
+       "      <td>0.833876</td>\n",
+       "      <td>0.747445</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>temporal</td>\n",
+       "      <td>0.558011</td>\n",
+       "      <td>0.664474</td>\n",
+       "      <td>0.606607</td>\n",
+       "      <td>0.693370</td>\n",
+       "      <td>0.825658</td>\n",
+       "      <td>0.753754</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>device</td>\n",
+       "      <td>0.518519</td>\n",
+       "      <td>0.518519</td>\n",
+       "      <td>0.518519</td>\n",
+       "      <td>0.703704</td>\n",
+       "      <td>0.703704</td>\n",
+       "      <td>0.703704</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>observation</td>\n",
+       "      <td>0.528090</td>\n",
+       "      <td>0.291925</td>\n",
+       "      <td>0.376000</td>\n",
+       "      <td>0.606742</td>\n",
+       "      <td>0.335404</td>\n",
+       "      <td>0.432000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>mood</td>\n",
+       "      <td>0.240741</td>\n",
+       "      <td>0.200000</td>\n",
+       "      <td>0.218487</td>\n",
+       "      <td>0.518519</td>\n",
+       "      <td>0.430769</td>\n",
+       "      <td>0.470588</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10</th>\n",
+       "      <td>pregnancy_considerations</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.428571</td>\n",
+       "      <td>0.428571</td>\n",
+       "      <td>0.428571</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                        type  prec_strict  rec_strict  f1_strict  \\\n",
+       "0                     person     0.746914    0.870504   0.803987   \n",
+       "1                       drug     0.717617    0.834337   0.771588   \n",
+       "2                  condition     0.703242    0.776860   0.738220   \n",
+       "3                      value     0.715942    0.755352   0.735119   \n",
+       "4                measurement     0.676259    0.712121   0.693727   \n",
+       "5                  procedure     0.560847    0.690554   0.618978   \n",
+       "6                   temporal     0.558011    0.664474   0.606607   \n",
+       "7                     device     0.518519    0.518519   0.518519   \n",
+       "8                observation     0.528090    0.291925   0.376000   \n",
+       "9                       mood     0.240741    0.200000   0.218487   \n",
+       "10  pregnancy_considerations     0.000000    0.000000   0.000000   \n",
+       "\n",
+       "    prec_relaxed  rec_relaxed  f1_relaxed  \n",
+       "0       0.759259     0.884892    0.817276  \n",
+       "1       0.803109     0.933735    0.863510  \n",
+       "2       0.833749     0.921028    0.875218  \n",
+       "3       0.852174     0.899083    0.875000  \n",
+       "4       0.805755     0.848485    0.826568  \n",
+       "5       0.677249     0.833876    0.747445  \n",
+       "6       0.693370     0.825658    0.753754  \n",
+       "7       0.703704     0.703704    0.703704  \n",
+       "8       0.606742     0.335404    0.432000  \n",
+       "9       0.518519     0.430769    0.470588  \n",
+       "10      0.428571     0.428571    0.428571  "
+      ]
+     },
+     "execution_count": 168,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Rank the categories by strict F1, leaving out the 'Overall' row at position 0.\n",
+    "(\n",
+    "    eval_df_roberta.iloc[1:]\n",
+    "    .sort_values('f1_strict', ascending=False)\n",
+    "    .reset_index(drop=True)\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 169,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "image/png": "",
+      "text/plain": [
+       "<Figure size 640x480 with 1 Axes>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# Bar chart of strict vs. relaxed F1 for every row of the table.\n",
+    "ax = eval_df_roberta.plot.bar(\n",
+    "    x='type',\n",
+    "    y=['f1_strict', 'f1_relaxed'],\n",
+    "    title='Strict and Relax F1_Score for Roberta base model on Chia',\n",
+    "    rot=90,\n",
+    ")\n",
+    "ax.set_xlabel(\"\")\n",
+    "plt.show()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "honlp",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}