textual-data-analysis / Git / Diff of /data

Models:
philipB/
textual-data-analysis
Downloads: 1
Diff of /data_visualization.py [000000] .. [ab5bfb]
Switch to side-by-side view

--- a
+++ b/data_visualization.py
@@ -0,0 +1,212 @@
+import json
+import os
+from urllib.parse import quote_plus
+
+import matplotlib.pyplot as plt
+import pandas as pd
+import plotly.express as px
+import pymongo
+from dotenv import load_dotenv
+
+
+load_dotenv()
+
+username = os.environ.get("USERNAME")
+password = os.environ.get("PASSWORD")
+
+client = pymongo.MongoClient(f"mongodb+srv://{username}:{password}@cluster0.lxsbb.mongodb.net/?retryWrites=true&w=majority")
+db = client.medical_record
+collection = db["report"]
+
+
+a = collection.find({})
+b = list(a)
+
+
+patient = {
+    k:[] for k in b[0]["demographics"].keys()
+}
+patient["patient_id"] = []
+for d in b:
+    patient["patient_id"].append(d["_id"]),
+    for k, v in d["demographics"].items():
+        patient[k].append(v)
+
+df_patient = pd.DataFrame(patient)
+
+fig_patient_gender = px.bar(
+    x=df_patient["gender"].value_counts().index,
+    y=df_patient["gender"].value_counts(),
+)
+
+fig_patient_gender.update_layout(
+    title="Distribution of Gender",
+    xaxis_title="Gender",
+    yaxis_title="Frequency",
+)
+
+fig_patient_race = px.bar(
+    x=df_patient["race"].value_counts().index,
+    y=df_patient["race"].value_counts()
+)
+
+fig_patient_race.update_layout(
+    title="Distribution of race",
+    xaxis_title="Race",
+    yaxis_title="Frequency",
+)
+
+allergies = {
+    k:[] for k in b[0]["allergies"][0].keys()
+}
+allergies["patient_id"] = []
+for d in b:
+    if d["allergies"]:
+        for allergy in d["allergies"]:
+            allergies["patient_id"].append(d["_id"])
+            for k, v in allergy.items():
+                allergies[k].append(v)
+    else:
+        allergies["patient_id"].append(d["_id"])
+        keys = list(allergies.keys())[:-1]
+        for k in keys:
+            allergies[k].append(None)
+
+df_allergy = pd.DataFrame(allergies)
+
+allergic = df_allergy.groupby("patient_id")["to"].count()
+allergic = allergic.gt(0).replace({True:"allergic", False:"not allergic"})
+
+
+fig_allergy = px.pie(
+    values=allergic.value_counts(),
+    names=allergic.value_counts().index,
+)
+
+fig_allergy.update_layout(
+    title="Percentage of Allergic and Non-allergic patients Patients",
+)
+
+fig_allergy_type = px.bar(
+    x=df_allergy["type"].value_counts().index,
+    y=df_allergy["type"].value_counts(),
+)
+
+fig_allergy_type.update_layout(
+    title="Different types of allergies",
+    xaxis_title="Types of allergies",
+    yaxis_title="Number of patients",
+)
+
+
+conditions = {
+     k:[] for k in b[0]["conditions"][0].keys()
+}
+conditions["patient_id"] = []
+for d in b:
+    if d["conditions"]:
+        for condition in d["conditions"]:
+            conditions["patient_id"].append(d["_id"])
+            for k, v in condition.items():
+                conditions[k].append(v)
+    else:
+        conditions["patient_id"].append(d["_id"])
+        keys = list(conditions.keys())[:-1]
+        for k in keys:
+            conditions[k].append(None)
+
+
+df_conditions = pd.DataFrame(conditions)
+
+
+top_10_conditions = df_conditions["condition"].value_counts().head(10)
+
+
+# top_10_conditions.plot(
+#     kind="bar",
+#     title="Top ten conditions of patients",
+#     xlabel="Conditions",
+#     ylabel="Number of patients"
+# );
+
+fig_top_conditions = px.bar(
+    x=top_10_conditions.index,
+    y=top_10_conditions,
+)
+
+fig_top_conditions.update_layout(
+    title="Top ten conditions of patients",
+    xaxis_title="Different Conditions",
+    yaxis_title="Number of patients",
+)
+
+
+df_demographic_condition = pd.merge(
+    df_patient, df_conditions, how='inner', on=["patient_id"]
+)[["gender", "condition"]]
+
+
+top_conds = top_10_conditions.index
+
+
+df_demographic_condition["gender"] = df_demographic_condition["gender"].replace({"M": "male", "F": "female"})
+
+cond_gender = {k:{"male":0, "female":0} for k in top_conds}
+
+for index, row in df_demographic_condition.iterrows():
+    if row["condition"] in top_conds:
+        cond_gender[row["condition"]][row["gender"]] += 1
+
+df_cond_gender = pd.DataFrame(cond_gender).T
+
+fig_cond_gender = px.bar(
+    df_cond_gender,
+    barmode='group'
+)
+
+fig_cond_gender.update_layout(
+    title="Top ten conditions of patients distributed among genders",
+    xaxis_title="Different Conditions",
+    yaxis_title="Number of patients",
+)
+
+
+immunizations = {
+    k:[] for k in b[0]["immunization"][0].keys()
+}
+
+immunizations["patient_id"] = []
+
+for d in b:
+    if d["immunization"]:
+        for imm in d["immunization"]:
+            immunizations["patient_id"].append(d["_id"])
+            for k, v in imm.items():
+                immunizations[k].append(v)
+    else:
+        immunizations["patient_id"].append(d["_id"])
+        keys = list(immunizations.keys())[:-1]
+        for k in keys:
+            immunizations[k].append(None)
+
+
+df_imm = pd.DataFrame(immunizations)
+df_imm["date"] = pd.to_datetime(df_imm['date'])
+df_imm["immunization"].value_counts()
+df_imm_covid = df_imm[df_imm["immunization"].str.contains("COVID")==True].sort_values(by='date')
+df_imm["month_year"] = df_imm_covid["date"].dt.to_period('M')
+covid_21_22 = df_imm.groupby("month_year")["immunization"].count()
+
+fig_covid_21_22 = px.line(
+    x=covid_21_22.index.strftime("%Y-%m"),
+    y=covid_21_22
+)
+
+fig_covid_21_22.update_layout(
+    title="Time plot of COVID vaccination from Jan 2021 to June 2022",
+    xaxis_title="Month and Year of vaccination",
+    yaxis_title="Number of patients",
+)
+# covid_21_22.plot(
+#     title="Time plot of COVID vaccination from Jan 2021 to June 2022",
+#     xlabel="Time",
+#     ylabel="Frequency of vaccination"
+# );
\ No newline at end of file