[efae2c]: / EDA.py

Download this file

107 lines (96 with data), 4.0 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
#1 Importing Needed Libraries
import pandas as pd
import seaborn as sns
import numpy as np
from matplotlib import pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import cross_val_score
from scipy import stats
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from keras.models import Model
from keras.layers import Input, Dense
import missingno as msno
#2 Loading and Exploring the Data
patient_df = pd.read_csv ('patients_01.csv')
#Exploring the Dataset
print(patient_df.describe())
print(patient_df.info())
# Number of samples
num_rows = len(patient_df)
print(num_rows)
# Identify the number of missing values in each feature
missing_values = patient_df.isnull().sum()
print(missing_values)
# Calculating the percentage of missing values for each feature
round(100*(1-patient_df.count()/len(patient_df)),2)
# Extracting samples with 3 or more missing values
patient_df.loc[patient_df.isnull().sum(axis=1)>=3, :]
# Extracting samples with 4 or more missing values
patient_df.loc[patient_df.isnull().sum(axis=1)>=4, :]
#3 Data Visualisation
#3.1 Plotting the distribution of age
plt.figure(figsize=(10, 6))
# Plotting the histogram with specified bins and color
plt.hist(patient_df['age'], bins=20, color="seagreen",alpha = 0.7, edgecolor="black")
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.title('Distribution of Age')
plt.grid(axis='y', linestyle='--')
plt.show()
# Plotting the distribution of BMI
plt.figure(figsize=(10, 6))
#3.2 Plotting the histogram with specified bins and color
plt.hist(patient_df['bmi'], bins=20, color="darkorange", alpha = 0.8,edgecolor="black")
plt.xlabel('BMI')
plt.ylabel('Frequency')
plt.title('Distribution of BMI')
plt.grid(axis='y', linestyle='--')
plt.show()
#3.3 Plotting the distribution of drinking alcohol
plt.figure(figsize=(10, 6))
# Plotting the histogram with specified bins and color
plt.hist(patient_df['alcohol_misuse'], bins=20, color="steelblue",alpha = 0.9, edgecolor="black")
plt.xlabel('Amount of Drinking Alcohol')
plt.ylabel('Frequency')
plt.title('Distribution of Drinking Alcohol')
plt.grid(axis='y', linestyle='--')
plt.show()
#3.4 Plotting the distribution of having mental health issues
plt.figure(figsize=(10, 6))
# Plotting the histogram with specified bins and color
plt.hist(patient_df['health_ment'], bins=20, color="brown",alpha = 0.7, edgecolor="black")
plt.xlabel('Mental Health')
plt.ylabel('Frequency')
plt.title('Distribution of Mental Health Issues')
plt.grid(axis='y', linestyle='--')
plt.show()
#3.5 Plotting the distribution of having general health
plt.figure(figsize=(10, 6))
# Plotting the histogram with specified bins and color
plt.hist(patient_df['health_gen'], bins=10, color="red", alpha=0.7, edgecolor="black")
plt.xlabel('General Health')
plt.ylabel('Frequency')
plt.title('Distribution of General Health')
plt.grid(axis='y', linestyle='--')
plt.show()
#3.6 Plotting the distribution of having healthy body
plt.figure(figsize=(10, 6))
# Plotting the histogram with specified bins, color, transparency, and edgecolor
plt.hist(patient_df['health_phys'], bins=15, color="purple", alpha=0.7, edgecolor="black")
plt.xlabel('Physical Health')
plt.ylabel('Frequency')
plt.title('Distribution of Physical Health')
plt.grid(axis='y', linestyle='--')
plt.show()