|
a |
|
b/EDA.py |
|
|
1 |
#1 Importing Needed Libraries |
|
|
2 |
|
|
|
3 |
import pandas as pd |
|
|
4 |
import seaborn as sns |
|
|
5 |
import numpy as np |
|
|
6 |
from matplotlib import pyplot as plt |
|
|
7 |
from sklearn.compose import ColumnTransformer |
|
|
8 |
from sklearn.pipeline import Pipeline |
|
|
9 |
from sklearn.experimental import enable_iterative_imputer |
|
|
10 |
from sklearn.impute import SimpleImputer, IterativeImputer |
|
|
11 |
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder |
|
|
12 |
from sklearn.model_selection import cross_val_score |
|
|
13 |
from scipy import stats |
|
|
14 |
from sklearn.cluster import AgglomerativeClustering |
|
|
15 |
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster |
|
|
16 |
from sklearn.decomposition import PCA |
|
|
17 |
from sklearn.model_selection import train_test_split |
|
|
18 |
from sklearn.cluster import KMeans |
|
|
19 |
from sklearn.linear_model import LogisticRegression |
|
|
20 |
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier |
|
|
21 |
from sklearn.svm import SVC |
|
|
22 |
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score |
|
|
23 |
from keras.models import Model |
|
|
24 |
from keras.layers import Input, Dense |
|
|
25 |
import missingno as msno |
|
|
26 |
|
|
|
27 |
#2 Loading and Exploring the Data

# Load the patient records from the CSV file (path is relative to the
# current working directory).
patient_df = pd.read_csv('patients_01.csv')

#Exploring the Dataset
print(patient_df.describe())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
patient_df.info()

# Number of samples
num_rows = len(patient_df)
print(num_rows)

# Boolean mask of missing cells, computed once and reused below.
missing_mask = patient_df.isnull()

# Identify the number of missing values in each feature
missing_values = missing_mask.sum()
print(missing_values)

# Calculating the percentage of missing values for each feature.
# The mean of a boolean mask is the fraction of True (missing) cells per
# column. The result is printed because a bare expression is silently
# discarded when this file is run as a script.
missing_percent = (missing_mask.mean() * 100).round(2)
print(missing_percent)

# Number of missing values in each sample (row).
missing_per_row = missing_mask.sum(axis=1)

# Extracting samples with 3 or more missing values
rows_missing_3_plus = patient_df.loc[missing_per_row >= 3, :]
print(rows_missing_3_plus)

# Extracting samples with 4 or more missing values
rows_missing_4_plus = patient_df.loc[missing_per_row >= 4, :]
print(rows_missing_4_plus)
|
|
45 |
|
|
|
46 |
#3 Data Visualisation


def _plot_distribution(values, *, bins, color, alpha, xlabel, title):
    """Draw and display one histogram on a new 10x6 inch figure.

    Args:
        values: pandas Series holding the column to plot.
        bins: Number of histogram bins.
        color: Fill colour of the bars.
        alpha: Opacity of the bars.
        xlabel: Label for the x axis.
        title: Title of the figure.
    """
    plt.figure(figsize=(10, 6))
    # Drop missing entries explicitly so the histogram does not rely on how
    # the plotting library treats NaN values.
    plt.hist(values.dropna(), bins=bins, color=color, alpha=alpha,
             edgecolor="black")
    plt.xlabel(xlabel)
    plt.ylabel('Frequency')
    plt.title(title)
    # Horizontal dashed guide lines only, to make bar heights easier to read.
    plt.grid(axis='y', linestyle='--')
    plt.show()


# One entry per figure, shown in this order:
# (column, bins, color, alpha, x-axis label, title)
_DISTRIBUTION_PLOTS = (
    #3.1 Distribution of age
    ('age', 20, "seagreen", 0.7, 'Age', 'Distribution of Age'),
    #3.2 Distribution of BMI
    ('bmi', 20, "darkorange", 0.8, 'BMI', 'Distribution of BMI'),
    #3.3 Distribution of drinking alcohol
    ('alcohol_misuse', 20, "steelblue", 0.9,
     'Amount of Drinking Alcohol', 'Distribution of Drinking Alcohol'),
    #3.4 Distribution of mental health issues
    ('health_ment', 20, "brown", 0.7,
     'Mental Health', 'Distribution of Mental Health Issues'),
    #3.5 Distribution of general health
    ('health_gen', 10, "red", 0.7,
     'General Health', 'Distribution of General Health'),
    #3.6 Distribution of physical health
    ('health_phys', 15, "purple", 0.7,
     'Physical Health', 'Distribution of Physical Health'),
)

for _column, _bins, _color, _alpha, _xlabel, _title in _DISTRIBUTION_PLOTS:
    _plot_distribution(
        patient_df[_column],
        bins=_bins,
        color=_color,
        alpha=_alpha,
        xlabel=_xlabel,
        title=_title,
    )