Diff of /EDA.py [000000] .. [efae2c]

Switch to side-by-side view

--- a
+++ b/EDA.py
@@ -0,0 +1,106 @@
+#1 Importing Needed Libraries 
+
+import pandas as pd
+import seaborn as sns
+import numpy as np
+from matplotlib import pyplot as plt
+from sklearn.compose import ColumnTransformer
+from sklearn.pipeline import Pipeline
+from sklearn.experimental import enable_iterative_imputer
+from sklearn.impute import SimpleImputer, IterativeImputer
+from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
+from sklearn.model_selection import cross_val_score
+from scipy import stats
+from sklearn.cluster import AgglomerativeClustering
+from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
+from sklearn.decomposition import PCA
+from sklearn.model_selection import train_test_split
+from sklearn.cluster import KMeans
+from sklearn.linear_model import LogisticRegression
+from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
+from sklearn.svm import SVC
+from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
+from keras.models import Model
+from keras.layers import Input, Dense
+import missingno as msno
+
+#2 Loading and Exploring the Data
+
+patient_df = pd.read_csv('patients_01.csv')
+# Summary statistics of the numeric columns
+print(patient_df.describe())
+patient_df.info()  # info() prints its own report and returns None, so it is not wrapped in print()
+# Number of samples
+num_rows = len(patient_df)
+print(num_rows)
+# Identify the number of missing values in each feature
+missing_values = patient_df.isnull().sum()
+print(missing_values)
+# Percentage of missing values for each feature (printed: a bare expression shows nothing in a script)
+print((100 * missing_values / num_rows).round(2))
+# Missing values per sample, counted once and reused for both thresholds below
+missing_per_row = patient_df.isnull().sum(axis=1)
+print(patient_df.loc[missing_per_row >= 3, :])  # samples with 3 or more missing values
+print(patient_df.loc[missing_per_row >= 4, :])  # samples with 4 or more missing values
+
+#3 Data Visualisation
+
+
+def _plot_histogram(column, bins, color, alpha, xlabel, title):
+    """Show a histogram of one ``patient_df`` column in its own figure.
+
+    Every plot shares the same 10x6 figure size, black bar edges, 'Frequency'
+    y-label and dashed horizontal grid lines; only the arguments below vary.
+
+    Args:
+        column: Name of the ``patient_df`` column to plot.
+        bins: Number of histogram bins.
+        color: Fill colour of the bars.
+        alpha: Opacity of the bars, from 0 (transparent) to 1 (opaque).
+        xlabel: Label for the x-axis.
+        title: Title shown above the plot.
+    """
+    plt.figure(figsize=(10, 6))
+    plt.hist(patient_df[column], bins=bins, color=color, alpha=alpha, edgecolor="black")
+    plt.xlabel(xlabel)
+    plt.ylabel('Frequency')
+    plt.title(title)
+    plt.grid(axis='y', linestyle='--')
+    plt.show()
+
+
+#3.1 Plotting the distribution of age
+_plot_histogram(
+    'age', bins=20, color="seagreen", alpha=0.7,
+    xlabel='Age', title='Distribution of Age',
+)
+
+#3.2 Plotting the distribution of BMI
+_plot_histogram(
+    'bmi', bins=20, color="darkorange", alpha=0.8,
+    xlabel='BMI', title='Distribution of BMI',
+)
+
+#3.3 Plotting the distribution of drinking alcohol
+_plot_histogram(
+    'alcohol_misuse', bins=20, color="steelblue", alpha=0.9,
+    xlabel='Amount of Drinking Alcohol', title='Distribution of Drinking Alcohol',
+)
+
+#3.4 Plotting the distribution of mental health issues
+_plot_histogram(
+    'health_ment', bins=20, color="brown", alpha=0.7,
+    xlabel='Mental Health', title='Distribution of Mental Health Issues',
+)
+
+#3.5 Plotting the distribution of general health
+_plot_histogram(
+    'health_gen', bins=10, color="red", alpha=0.7,
+    xlabel='General Health', title='Distribution of General Health',
+)
+
+#3.6 Plotting the distribution of physical health
+_plot_histogram(
+    'health_phys', bins=15, color="purple", alpha=0.7,
+    xlabel='Physical Health', title='Distribution of Physical Health',
+)