a b/EDA.py
1
#1 Importing Needed Libraries 
2
3
import pandas as pd
4
import seaborn as sns
5
import numpy as np
6
from matplotlib import pyplot as plt
7
from sklearn.compose import ColumnTransformer
8
from sklearn.pipeline import Pipeline
9
from sklearn.experimental import enable_iterative_imputer
10
from sklearn.impute import SimpleImputer, IterativeImputer
11
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
12
from sklearn.model_selection import cross_val_score
13
from scipy import stats
14
from sklearn.cluster import AgglomerativeClustering
15
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
16
from sklearn.decomposition import PCA
17
from sklearn.model_selection import train_test_split
18
from sklearn.cluster import KMeans
19
from sklearn.linear_model import LogisticRegression
20
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
21
from sklearn.svm import SVC
22
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
23
from keras.models import Model
24
from keras.layers import Input, Dense
25
import missingno as msno
26
27
#2 Loading and Exploring the Data
28
29
patient_df = pd.read_csv('patients_01.csv')

#Exploring the Dataset
# Summary statistics as returned by DataFrame.describe()
print(patient_df.describe())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the report.
patient_df.info()

# Number of samples
num_rows = len(patient_df)
print(num_rows)

# Identify the number of missing values in each feature
missing_values = patient_df.isnull().sum()
print(missing_values)

# Calculating the percentage of missing values for each feature.
# Printed explicitly: a bare expression is only echoed in a notebook/REPL and
# is silently discarded when this file is run as a script.
missing_pct = (100 * (1 - patient_df.count() / len(patient_df))).round(2)
print(missing_pct)

# Count the missing values per sample once and reuse it for both thresholds.
missing_per_row = patient_df.isnull().sum(axis=1)

# Extracting samples with 3 or more missing values
print(patient_df.loc[missing_per_row >= 3, :])

# Extracting samples with 4 or more missing values
print(patient_df.loc[missing_per_row >= 4, :])
45
46
#3 Data Visualisation 
47
48
#3.1 Histogram of the 'age' column
# Create the figure and its single Axes explicitly instead of relying on
# pyplot's implicit "current axes".
_, age_ax = plt.subplots(figsize=(10, 6))
# 20 bins; semi-transparent bars with a black outline
age_ax.hist(
    patient_df['age'],
    bins=20,
    color="seagreen",
    alpha=0.7,
    edgecolor="black",
)
age_ax.set_xlabel('Age')
age_ax.set_ylabel('Frequency')
age_ax.set_title('Distribution of Age')
# Dashed grid lines on the y axis only
age_ax.grid(axis='y', linestyle='--')
plt.show()
57
58
#3.2 Histogram of the 'bmi' column
# Create the figure and its single Axes explicitly instead of relying on
# pyplot's implicit "current axes".
_, bmi_ax = plt.subplots(figsize=(10, 6))
# 20 bins; slightly transparent bars with a black outline
bmi_ax.hist(
    patient_df['bmi'],
    bins=20,
    color="darkorange",
    alpha=0.8,
    edgecolor="black",
)
bmi_ax.set_xlabel('BMI')
bmi_ax.set_ylabel('Frequency')
bmi_ax.set_title('Distribution of BMI')
# Dashed grid lines on the y axis only
bmi_ax.grid(axis='y', linestyle='--')
plt.show()
67
68
#3.3 Histogram of the 'alcohol_misuse' column
# Create the figure and its single Axes explicitly instead of relying on
# pyplot's implicit "current axes".
_, alcohol_ax = plt.subplots(figsize=(10, 6))
# 20 bins; nearly opaque bars with a black outline
alcohol_ax.hist(
    patient_df['alcohol_misuse'],
    bins=20,
    color="steelblue",
    alpha=0.9,
    edgecolor="black",
)
alcohol_ax.set_xlabel('Amount of Drinking Alcohol')
alcohol_ax.set_ylabel('Frequency')
alcohol_ax.set_title('Distribution of Drinking Alcohol')
# Dashed grid lines on the y axis only
alcohol_ax.grid(axis='y', linestyle='--')
plt.show()
77
78
#3.4 Histogram of the 'health_ment' column
# Create the figure and its single Axes explicitly instead of relying on
# pyplot's implicit "current axes".
_, ment_ax = plt.subplots(figsize=(10, 6))
# 20 bins; semi-transparent bars with a black outline
ment_ax.hist(
    patient_df['health_ment'],
    bins=20,
    color="brown",
    alpha=0.7,
    edgecolor="black",
)
ment_ax.set_xlabel('Mental Health')
ment_ax.set_ylabel('Frequency')
ment_ax.set_title('Distribution of Mental Health Issues')
# Dashed grid lines on the y axis only
ment_ax.grid(axis='y', linestyle='--')
plt.show()
87
88
#3.5 Histogram of the 'health_gen' column
# Create the figure and its single Axes explicitly instead of relying on
# pyplot's implicit "current axes".
_, gen_ax = plt.subplots(figsize=(10, 6))
# 10 bins; semi-transparent bars with a black outline
gen_ax.hist(
    patient_df['health_gen'],
    bins=10,
    color="red",
    alpha=0.7,
    edgecolor="black",
)
gen_ax.set_xlabel('General Health')
gen_ax.set_ylabel('Frequency')
gen_ax.set_title('Distribution of General Health')
# Dashed grid lines on the y axis only
gen_ax.grid(axis='y', linestyle='--')
plt.show()
97
98
#3.6 Histogram of the 'health_phys' column
# Create the figure and its single Axes explicitly instead of relying on
# pyplot's implicit "current axes".
_, phys_ax = plt.subplots(figsize=(10, 6))
# 15 bins; semi-transparent bars with a black outline
phys_ax.hist(
    patient_df['health_phys'],
    bins=15,
    color="purple",
    alpha=0.7,
    edgecolor="black",
)
phys_ax.set_xlabel('Physical Health')
phys_ax.set_ylabel('Frequency')
phys_ax.set_title('Distribution of Physical Health')
# Dashed grid lines on the y axis only
phys_ax.grid(axis='y', linestyle='--')
plt.show()