--- a +++ b/pregnant.ipynb @@ -0,0 +1,881 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 17, + "id": "3f87254e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Last modified by Xiaoqing: 211210\n" + ] + } + ], + "source": [ + "from datetime import datetime\n", + "date = datetime.today().strftime('%y%m%d')\n", + "print ('Last modified by Xiaoqing: ' + date)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "6114c11a", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "id": "d3e710b5", + "metadata": {}, + "source": [ + "# Problem statement\n", + "Words such as 'pregnant,' 'breast-feeding,' and 'lactating' frequently appear in clinical trial eligibility. Unfortunately they are not recognized by stanza NER (stanza can however, recognize 'pregnancy' as a problem). \n", + "\n", + "Let's fix this." + ] + }, + { + "cell_type": "markdown", + "id": "86e4c32b", + "metadata": {}, + "source": [ + "# Note:\n", + " For this notebook to work, all key words must be extracted from a clinical trial. \n", + " \n", + " One clinical trial can have multiple rows; each row corresponds to a different key word.\n", + " \n", + " Under criteria, we should have ALL the bullet points, without separting them into different rows." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "49cd1778", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv('input_pregnant_121021.csv')\n", + "df['criteria']= df['criteria'].str.lower()\n", + "df['key_words']= df['key_words'].str.lower()" + ] + }, + { + "cell_type": "markdown", + "id": "cb993e1c", + "metadata": {}, + "source": [ + "# The less common scenarios\n", + "In the less common scenarios, for example, a clinical trial may want to study pregnancy related diabetes. These studies DO want to recruit pregnant women." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "c7a7d7d8", + "metadata": {}, + "outputs": [], + "source": [ + "# Does a study want to recruit pregnant women? \n", + "# If pregnant = 1, it means they want to INCLUDE pregnant women.\n", + "# If pregnant = 0, it means they want to EXCLUDE pregnant women.\n", + "\n", + "df['pregnant'] = np.nan\n", + "\n", + "for index, row in df.iterrows():\n", + " if 'pregnant' in row['key_words'] or 'pregnancy' in row['key_words']:\n", + " df.loc[index,'pregnant'] = 1\n" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "903e2281", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>id</th>\n", + " <th>criteria</th>\n", + " <th>key_words</th>\n", + " <th>pregnant</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>14</td>\n", + " <td>women pregnant with one fetus between 16 and 2...</td>\n", + " <td>sedentary time</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>14</td>\n", + " <td>women pregnant with one fetus between 16 and 2...</td>\n", + " <td>pregnancy</td>\n", + " <td>1.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18</th>\n", + " <td>14</td>\n", + " <td>women pregnant with one fetus between 16 and 2...</td>\n", + " <td>pregnant women</td>\n", + " <td>1.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>19</th>\n", + " <td>14</td>\n", + " <td>women pregnant with one fetus between 16 and 2...</td>\n", + " <td>weight gain</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>20</th>\n", + " <td>15</td>\n", + " <td>this is a study that does not mention anything...</td>\n", + " <td>testing</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " id criteria key_words \\\n", + "16 14 women pregnant with one fetus between 16 and 2... sedentary time \n", + "17 14 women pregnant with one fetus between 16 and 2... pregnancy \n", + "18 14 women pregnant with one fetus between 16 and 2... pregnant women \n", + "19 14 women pregnant with one fetus between 16 and 2... weight gain \n", + "20 15 this is a study that does not mention anything... testing \n", + "\n", + " pregnant \n", + "16 NaN \n", + "17 1.0 \n", + "18 1.0 \n", + "19 NaN \n", + "20 NaN " + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.tail()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "14d19db1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>id</th>\n", + " <th>pregnant</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>11</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>12</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>13</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>14</td>\n", + " <td>1.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + " <td>15</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " id pregnant\n", + "10 11 NaN\n", + "11 12 NaN\n", + "12 13 NaN\n", + "13 14 1.0\n", + "14 15 NaN" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# grouping by id, if one of the key words contain Pregnancy related words, we label that entire study as pregnant = 1 \n", + "df1 = df.groupby(['id'])['pregnant'].agg('max').reset_index()\n", + "df1.tail()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "3f32dd16", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/n3/x7vymc8s2cj1p5yx96j08wlr0000gn/T/ipykernel_1469/2839200337.py:2: FutureWarning: In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only\n", + " df = df.drop('pregnant', 1)\n" + ] + } + ], + "source": [ + "# now merge this with the long format df\n", + "df = df.drop('pregnant', 1)\n", + "df2 = df.merge(df1, on='id', how='outer')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "4bcb95f1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>id</th>\n", + " <th>criteria</th>\n", + " <th>key_words</th>\n", + " <th>pregnant</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>1</td>\n", + " <td>for female participants, currently breastfeedi...</td>\n", + " <td>testing</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>1</td>\n", + " <td>for female participants, currently breastfeedi...</td>\n", + " <td>testing</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>1</td>\n", + " <td>for female participants, currently breastfeedi...</td>\n", + " <td>testing</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>2</td>\n", + " <td>patients will be excluded if they are pregnant.</td>\n", + " <td>testing</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>3</td>\n", + " <td>pregnant women or women currently breastfeeding;</td>\n", + " <td>testing</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>4</td>\n", + " <td>females who are pregnant or nursing</td>\n", + " <td>testing</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>5</td>\n", + " <td>in the case of women of childbearing age, urin...</td>\n", + " <td>testing</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>6</td>\n", + " <td>are pregnant or lactating or planning to becom...</td>\n", + " <td>testing</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>7</td>\n", + " <td>pregnant or breast-feeding</td>\n", + " <td>testing</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>8</td>\n", + " <td>patients who are pregnant or may be pregnant</td>\n", + " <td>testing</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>9</td>\n", + " <td>not pregnant or nursing</td>\n", + " <td>testing</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>10</td>\n", + " <td>females who are pregnant.</td>\n", + " <td>testing</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>11</td>\n", + " <td>for females of child-bearing age, current preg...</td>\n", + " <td>testing</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>12</td>\n", + " <td>for females of child-bearing age, current brea...</td>\n", + " <td>testing</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + " <td>13</td>\n", + " <td>are not pregnant (negative urine test) or brea...</td>\n", + " <td>testing</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>14</td>\n", + " <td>women pregnant with one fetus between 16 and 2...</td>\n", + " <td>physical activity</td>\n", + " <td>1.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>14</td>\n", + " <td>women pregnant with one fetus between 16 and 2...</td>\n", + " <td>sedentary time</td>\n", + " <td>1.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>14</td>\n", + " <td>women pregnant with one fetus between 16 and 2...</td>\n", + " <td>pregnancy</td>\n", + " <td>1.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18</th>\n", + " <td>14</td>\n", + " <td>women pregnant with one fetus between 16 and 2...</td>\n", + " <td>pregnant women</td>\n", + " <td>1.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>19</th>\n", + " <td>14</td>\n", + " <td>women pregnant with one fetus between 16 and 2...</td>\n", + " <td>weight gain</td>\n", + " <td>1.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>20</th>\n", + " <td>15</td>\n", + " <td>this is a study that does not mention anything...</td>\n", + " <td>testing</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " id criteria key_words \\\n", + "0 1 for female participants, currently breastfeedi... testing \n", + "1 1 for female participants, currently breastfeedi... testing \n", + "2 1 for female participants, currently breastfeedi... testing \n", + "3 2 patients will be excluded if they are pregnant. testing \n", + "4 3 pregnant women or women currently breastfeeding; testing \n", + "5 4 females who are pregnant or nursing testing \n", + "6 5 in the case of women of childbearing age, urin... testing \n", + "7 6 are pregnant or lactating or planning to becom... testing \n", + "8 7 pregnant or breast-feeding testing \n", + "9 8 patients who are pregnant or may be pregnant testing \n", + "10 9 not pregnant or nursing testing \n", + "11 10 females who are pregnant. testing \n", + "12 11 for females of child-bearing age, current preg... testing \n", + "13 12 for females of child-bearing age, current brea... testing \n", + "14 13 are not pregnant (negative urine test) or brea... testing \n", + "15 14 women pregnant with one fetus between 16 and 2... physical activity \n", + "16 14 women pregnant with one fetus between 16 and 2... sedentary time \n", + "17 14 women pregnant with one fetus between 16 and 2... pregnancy \n", + "18 14 women pregnant with one fetus between 16 and 2... pregnant women \n", + "19 14 women pregnant with one fetus between 16 and 2... weight gain \n", + "20 15 this is a study that does not mention anything... testing \n", + "\n", + " pregnant \n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN \n", + "5 NaN \n", + "6 NaN \n", + "7 NaN \n", + "8 NaN \n", + "9 NaN \n", + "10 NaN \n", + "11 NaN \n", + "12 NaN \n", + "13 NaN \n", + "14 NaN \n", + "15 1.0 \n", + "16 1.0 \n", + "17 1.0 \n", + "18 1.0 \n", + "19 1.0 \n", + "20 NaN " + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2" + ] + }, + { + "cell_type": "markdown", + "id": "a15339fb", + "metadata": {}, + "source": [ + "# Most common scenarios" + ] + }, + { + "cell_type": "markdown", + "id": "d8aa02c4", + "metadata": {}, + "source": [ + "In the most common scenario, clinical trials do not want to recruit women who are pregnant or breast-feeding, out of concern for the baby.\n", + "\n", + "If a clinical trial's key words do not contain pregnancy related words AND the study eligibility mentioned pregnancy related words, we will mark them as a study that wants to AVOID recruiting pregnant women." + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "ca59627d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>id</th>\n", + " <th>criteria</th>\n", + " <th>key_words</th>\n", + " <th>pregnant</th>\n", + " <th>lactating</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>1</td>\n", + " <td>for female participants, currently breastfeedi...</td>\n", + " <td>testing</td>\n", + " <td>NaN</td>\n", + " <td>0.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>1</td>\n", + " <td>for female participants, currently breastfeedi...</td>\n", + " <td>testing</td>\n", + " <td>NaN</td>\n", + " <td>0.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>1</td>\n", + " <td>for female participants, currently breastfeedi...</td>\n", + " <td>testing</td>\n", + " <td>NaN</td>\n", + " <td>0.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>2</td>\n", + " <td>patients will be excluded if they are pregnant.</td>\n", + " <td>testing</td>\n", + " <td>0.0</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>3</td>\n", + " <td>pregnant women or women currently breastfeeding;</td>\n", + " <td>testing</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>4</td>\n", + " <td>females who are pregnant or nursing</td>\n", + " <td>testing</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>5</td>\n", + " <td>in the case of women of childbearing age, urin...</td>\n", + " <td>testing</td>\n", + " <td>0.0</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>6</td>\n", + " <td>are pregnant or lactating or planning to becom...</td>\n", + " <td>testing</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>7</td>\n", + " <td>pregnant or breast-feeding</td>\n", + " <td>testing</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>8</td>\n", + " <td>patients who are pregnant or may be pregnant</td>\n", + " <td>testing</td>\n", + " <td>0.0</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>9</td>\n", + " <td>not pregnant or nursing</td>\n", + " <td>testing</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>10</td>\n", + " <td>females who are pregnant.</td>\n", + " <td>testing</td>\n", + " <td>0.0</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>11</td>\n", + " <td>for females of child-bearing age, current preg...</td>\n", + " <td>testing</td>\n", + " <td>0.0</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>12</td>\n", + " <td>for females of child-bearing age, current brea...</td>\n", + " <td>testing</td>\n", + " <td>NaN</td>\n", + " <td>0.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + " <td>13</td>\n", + " <td>are not pregnant (negative urine test) or brea...</td>\n", + " <td>testing</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>14</td>\n", + " <td>women pregnant with one fetus between 16 and 2...</td>\n", + " <td>physical activity</td>\n", + " <td>1.0</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>14</td>\n", + " <td>women pregnant with one fetus between 16 and 2...</td>\n", + " <td>sedentary time</td>\n", + " <td>1.0</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>14</td>\n", + " <td>women pregnant with one fetus between 16 and 2...</td>\n", + " <td>pregnancy</td>\n", + " <td>1.0</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18</th>\n", + " <td>14</td>\n", + " <td>women pregnant with one fetus between 16 and 2...</td>\n", + " <td>pregnant women</td>\n", + " <td>1.0</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>19</th>\n", + " <td>14</td>\n", + " <td>women pregnant with one fetus between 16 and 2...</td>\n", + " <td>weight gain</td>\n", + " <td>1.0</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>20</th>\n", + " <td>15</td>\n", + " <td>this is a study that does not mention anything...</td>\n", + " <td>testing</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " id criteria key_words \\\n", + "0 1 for female participants, currently breastfeedi... testing \n", + "1 1 for female participants, currently breastfeedi... testing \n", + "2 1 for female participants, currently breastfeedi... testing \n", + "3 2 patients will be excluded if they are pregnant. testing \n", + "4 3 pregnant women or women currently breastfeeding; testing \n", + "5 4 females who are pregnant or nursing testing \n", + "6 5 in the case of women of childbearing age, urin... testing \n", + "7 6 are pregnant or lactating or planning to becom... testing \n", + "8 7 pregnant or breast-feeding testing \n", + "9 8 patients who are pregnant or may be pregnant testing \n", + "10 9 not pregnant or nursing testing \n", + "11 10 females who are pregnant. testing \n", + "12 11 for females of child-bearing age, current preg... testing \n", + "13 12 for females of child-bearing age, current brea... testing \n", + "14 13 are not pregnant (negative urine test) or brea... testing \n", + "15 14 women pregnant with one fetus between 16 and 2... physical activity \n", + "16 14 women pregnant with one fetus between 16 and 2... sedentary time \n", + "17 14 women pregnant with one fetus between 16 and 2... pregnancy \n", + "18 14 women pregnant with one fetus between 16 and 2... pregnant women \n", + "19 14 women pregnant with one fetus between 16 and 2... weight gain \n", + "20 15 this is a study that does not mention anything... testing \n", + "\n", + " pregnant lactating \n", + "0 NaN 0.0 \n", + "1 NaN 0.0 \n", + "2 NaN 0.0 \n", + "3 0.0 NaN \n", + "4 0.0 0.0 \n", + "5 0.0 0.0 \n", + "6 0.0 NaN \n", + "7 0.0 0.0 \n", + "8 0.0 0.0 \n", + "9 0.0 NaN \n", + "10 0.0 0.0 \n", + "11 0.0 NaN \n", + "12 0.0 NaN \n", + "13 NaN 0.0 \n", + "14 0.0 0.0 \n", + "15 1.0 NaN \n", + "16 1.0 NaN \n", + "17 1.0 NaN \n", + "18 1.0 NaN \n", + "19 1.0 NaN \n", + "20 NaN NaN " + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2['lactating'] = np.nan\n", + "\n", + "for index, row in df2.iterrows():\n", + " if row['pregnant'] != 1:\n", + " if 'pregnant' in row['criteria'] or 'pregnancy' in row['criteria']:\n", + " df2.loc[index,'pregnant'] = 0\n", + " if 'nursing' in row['criteria'] or 'breast-feeding' in row['criteria'] or 'breastfeeding' in row['criteria'] or 'breast feeding' in row['criteria'] or 'lactating' in row['criteria']:\n", + " df2.loc[index,'lactating'] = 0\n", + " \n", + "df2\n", + " " + ] + }, + { + "cell_type": "markdown", + "id": "a62e65a9", + "metadata": {}, + "source": [ + "Now we see that for each study we are indicating whether they want to...\n", + "- exclude women who are pregnant (pregnant = 0)\n", + "- include women who are pregnant (pregnant = 1)\n", + "- exclude women who are lactating (lactating = 0)\n", + "- or they did not specify whether they care about pregnancy (NaN)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "54038d07", + "metadata": {}, + "outputs": [], + "source": [ + "df2.to_csv(('output_pregnant_'+ date + '.csv'),index = False)" + ] + }, + { + "cell_type": "markdown", + "id": "240c04d5", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}