--- a
+++ b/DataAnalysis.py
@@ -0,0 +1,298 @@
+import pandas as pd
+import numpy as np
+
+# Modelling Algorithms
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.linear_model import LogisticRegression
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.naive_bayes import GaussianNB
+from sklearn.svm import SVC, LinearSVC
+from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
+from sklearn.metrics import make_scorer, accuracy_score
+from sklearn.model_selection import train_test_split
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import classification_report
+from sklearn.model_selection import GridSearchCV
+from sklearn.metrics import confusion_matrix
+from sklearn.metrics import accuracy_score
+from sklearn import preprocessing
+from sklearn.utils.multiclass import unique_labels
+# import plotly.graph_objects as go
+
+# Visualisation
+import matplotlib as mpl
+import matplotlib.pyplot as mpl
+import matplotlib.pylab as pylab
+import seaborn as sns
+
+
+class DataAnalysisUtils:
+
+    def plotCorrelationHeatMap(self, dataFrame):
+        """Draw an annotated correlation heatmap and save it as ``Correlation.png``.
+
+        Args:
+            dataFrame: pandas DataFrame; its pairwise column correlations
+                (``DataFrame.corr()``), rounded to two decimals, are plotted.
+
+        Returns:
+            The matplotlib Figure that holds the heatmap.
+        """
+        corr = dataFrame.corr().round(2)
+
+        # NOTE: an upper-triangle mask used to be built here with the
+        # ``np.bool`` alias (removed in NumPy 1.24, so the call raised
+        # AttributeError) but it was never passed to ``sns.heatmap``.
+        # It is dropped so the full matrix keeps being drawn as before.
+
+        # Set up the matplotlib figure
+        f, ax = mpl.subplots(figsize=(26, 26))
+
+        # Generate a custom diverging colormap
+        cmap = sns.diverging_palette(220, 10, as_cmap=True)
+
+        # Draw explicitly on the axes created above instead of relying on
+        # whatever axes happen to be "current".
+        sns.heatmap(corr, cmap=cmap, vmax=1, vmin=-1, center=0,
+                    linewidths=.2, cbar_kws={"shrink": .7}, annot=True,
+                    annot_kws={"fontsize": 5, 'fontweight': 'bold'}, ax=ax)
+
+        f.savefig("Correlation.png")
+        return f
+
+    def plotUnivariateDistribution(self, df, column, fileName='importance.png'):
+        """Plot the distribution of one column and save the current figure.
+
+        Args:
+            df: pandas DataFrame holding the data.
+            column: label of the column whose distribution is drawn.
+            fileName: path of the image to write. The default is kept for
+                backward compatibility, but note that ``plotModelVarImp``
+                writes to the same ``importance.png``; pass a distinct name
+                to avoid one plot overwriting the other.
+        """
+        sns.set(rc={'figure.figsize': (9, 7)})
+        # NOTE(review): ``distplot`` is deprecated in recent seaborn releases;
+        # it is kept because ``histplot``/``displot`` render differently.
+        sns.distplot(df[column])
+        mpl.savefig(fileName, dpi=150)
+
+
+    def plotBivariateDistribution(self, df, var, target, **kwargs):
+        """Draw shaded KDE curves of ``var``, one hue per ``target`` value.
+
+        Args:
+            df: pandas DataFrame holding the data.
+            var: column whose density is estimated.
+            target: column used as the FacetGrid ``hue``.
+            **kwargs: optional ``row`` and ``col`` column names used to
+                facet the grid; both default to ``None``.
+        """
+        grid = sns.FacetGrid(
+            df,
+            hue=target,
+            aspect=4,
+            row=kwargs.get('row'),
+            col=kwargs.get('col'),
+        )
+        grid.map(sns.kdeplot, var, shade=True)
+        # The x axis always starts at 0 and ends at the largest observed value.
+        upper_bound = df[var].max()
+        grid.set(xlim=(0, upper_bound))
+        grid.add_legend()
+
+    def plotPairAllFeatureByHue(self, dataFrame, hue):
+        """Draw a lower-triangle pair plot coloured by ``hue`` and save it.
+
+        Args:
+            dataFrame: pandas DataFrame whose columns are plotted pairwise.
+            hue: column name used to colour the points.
+
+        The current figure is written to ``pairFeatures2.png`` at 150 dpi.
+        """
+        pair_options = {'hue': hue, 'corner': True}
+        sns.pairplot(dataFrame, **pair_options)
+        mpl.savefig('pairFeatures2.png', dpi=150)
+
+    def plotJoint(self, dataFrame, columnX, columnY, size=6):
+        """Draw a KDE joint plot of two columns.
+
+        Args:
+            dataFrame: pandas DataFrame holding the data.
+            columnX: column plotted on the x axis.
+            columnY: column plotted on the y axis.
+            size: size of the (square) figure; forwarded to seaborn as
+                ``height``.
+        """
+        # seaborn renamed jointplot's ``size`` argument to ``height``; newer
+        # releases no longer accept ``size``, so the caller-facing ``size``
+        # parameter is mapped onto ``height`` here.
+        sns.jointplot(x=columnX, y=columnY, data=dataFrame,
+                      height=size, kind='kde', color='#800000', space=0)
+
+    def plotScatterAllFeatures(self, dataFrame):
+        """Draw a scatter matrix of all columns and save it as ``scatter.png``.
+
+        Args:
+            dataFrame: pandas DataFrame whose columns are plotted pairwise.
+        """
+        pd.plotting.scatter_matrix(
+            frame=dataFrame,
+            alpha=0.2,
+            figsize=(14, 14),
+        )
+        mpl.savefig("scatter.png")
+
+    def plotColumnVersusColumn(self, dataFrame, columnX, columnY, kind='scatter', color='red'):
+        """Plot one column against another and save ``ColumnVersusColumn.png``.
+
+        Args:
+            dataFrame: pandas DataFrame holding the data.
+            columnX: column used for the x axis (also the x-axis label).
+            columnY: column used for the y axis (also the y-axis label).
+            kind: pandas plot kind, e.g. ``'line'`` or ``'scatter'``.
+            color: colour passed through to ``DataFrame.plot``.
+        """
+        plot_options = dict(kind=kind, x=columnX, y=columnY, color=color)
+        dataFrame.plot(**plot_options)
+        # Label the axes with the column names themselves.
+        mpl.xlabel(columnX)
+        mpl.ylabel(columnY)
+        mpl.savefig("ColumnVersusColumn.png")
+
+    def plotHistograms(self, df, variables, n_rows, n_cols):
+        """Draw one 10-bin histogram per variable on an n_rows x n_cols grid.
+
+        Args:
+            df: pandas DataFrame holding the data.
+            variables: iterable of column names, one subplot each.
+            n_rows: number of subplot rows.
+            n_cols: number of subplot columns.
+
+        Returns:
+            The matplotlib Figure, after it has been shown with ``show()``.
+        """
+        sns.set(style="white")
+        fig = mpl.figure(figsize=(16, 12))
+        # Subplot positions are 1-based, hence ``start=1``.
+        for position, name in enumerate(variables, start=1):
+            axis = fig.add_subplot(n_rows, n_cols, position)
+            df[name].hist(bins=10, ax=axis)
+            axis.set_title(name, fontweight='bold')
+        fig.tight_layout()  # keeps titles and tick labels from overlapping
+
+        mpl.show()
+        return fig
+
+    def plotCategories(self, df, cat, target, **kwargs):
+        """Draw bar plots of ``target`` per ``cat`` value on a FacetGrid.
+
+        Args:
+            df: pandas DataFrame holding the data.
+            cat: column placed on the x axis of each bar plot.
+            target: column placed on the y axis of each bar plot.
+            **kwargs: optional ``row`` and ``col`` column names used to
+                facet the grid; both default to ``None``.
+        """
+        grid = sns.FacetGrid(
+            df,
+            row=kwargs.get('row'),
+            col=kwargs.get('col'),
+        )
+        grid.map(sns.barplot, cat, target)
+        grid.add_legend()
+
+    def describeMore(self, df):
+        """Summarise every column's number of levels and its dtype.
+
+        Args:
+            df: pandas DataFrame to describe.
+
+        Returns:
+            A DataFrame with one row per column of ``df`` and the columns
+            ``Variable`` (column label), ``Levels`` (number of entries in the
+            column's ``value_counts()``) and ``Datatype`` (the column dtype),
+            sorted by ascending ``Levels``.
+        """
+        columns = list(df)
+        levels = pd.DataFrame({
+            'Variable': columns,
+            # ``Series.value_counts`` replaces the deprecated top-level
+            # ``pd.value_counts`` and yields the same counts.
+            'Levels': [len(df[name].value_counts()) for name in columns],
+            'Datatype': [df[name].dtypes for name in columns],
+        })
+        levels.sort_values(by='Levels', inplace=True)
+        return levels
+
+    def plotVariableImportance(self, X, y):
+        """Fit a decision tree and return its feature importances.
+
+        Despite the name, nothing is plotted here; the values are only
+        returned.
+
+        Args:
+            X: feature DataFrame (its ``columns`` are returned alongside).
+            y: target values.
+
+        Returns:
+            Tuple ``(feature_importances_, X.columns)``.
+        """
+        # Fixed seed so repeated calls give the same tree.
+        classifier = DecisionTreeClassifier(random_state=99).fit(X, y)
+        importances = classifier.feature_importances_
+        return importances, X.columns
+
+    def plotModelVarImp(self, model, X, y):
+        """Plot the 20 most important features of a fitted model.
+
+        Prints ``model.score(X, y)`` and writes the bar chart to
+        ``importance.png`` at 150 dpi.
+
+        Args:
+            model: fitted estimator exposing ``feature_importances_`` and
+                ``score``.
+            X: feature DataFrame; its columns label the bars.
+            y: target values used for scoring.
+        """
+        imp = pd.DataFrame(
+            model.feature_importances_,
+            columns=['Importance'],
+            index=X.columns
+        )
+        imp = imp.sort_values(['Importance'], ascending=False)
+        # ``DataFrame.plot`` opens its own figure, so the 60x80 inch figure
+        # that used to be created beforehand stayed empty and was never
+        # closed; it is no longer created. The saved image is unchanged.
+        imp[: 20].plot(kind='barh')
+        print(model.score(X, y))
+        mpl.tight_layout()
+        mpl.savefig('importance.png', dpi=150)
+
+    def boxPlotOnTwoColumn(self, dataFrame, column, columnBy):
+        """Draw a box plot of ``columnBy`` (y) grouped by ``column`` (x).
+
+        Args:
+            dataFrame: pandas DataFrame holding the data.
+            column: column placed on the x axis.
+            columnBy: column placed on the y axis.
+        """
+        # Open a fresh 12x8 figure; seaborn then draws on its axes.
+        mpl.subplots(figsize=(12, 8))
+        sns.boxplot(data=dataFrame, x=column, y=columnBy)
+
+    def convertColumnsToRow(self, dataFrame, id, columns):
+        """Unpivot ``columns`` into rows (wide to long), keeping ``id``.
+
+        Args:
+            dataFrame: pandas DataFrame to melt.
+            id: column(s) kept as identifier variables.
+            columns: column(s) to unpivot.
+
+        Returns:
+            The melted DataFrame.
+        """
+        melted = dataFrame.melt(id_vars=id, value_vars=columns)
+        return melted
+
+    def convertRowsToColumn(self, melted, id, columns, values):
+        """Pivot a long DataFrame back to wide form.
+
+        Args:
+            melted: long-format pandas DataFrame.
+            id: column used as the new index.
+            columns: column whose values become the new column labels.
+            values: column that fills the new cells.
+
+        Returns:
+            The pivoted DataFrame.
+        """
+        pivot_options = {'index': id, 'columns': columns, 'values': values}
+        return melted.pivot(**pivot_options)
+
+    def concatDataFramesFromRow(self, dataFrame1, dataFrame2):
+        """Stack two DataFrames vertically and renumber the index from 0.
+
+        Args:
+            dataFrame1: DataFrame whose rows come first.
+            dataFrame2: DataFrame whose rows are appended below.
+
+        Returns:
+            The concatenated DataFrame.
+        """
+        frames = [dataFrame1, dataFrame2]
+        return pd.concat(frames, axis=0, ignore_index=True)
+
+    def concatDataFramesFromColumn(self, dataFrame1, dataFrame2):
+        """Place two DataFrames side by side, aligned on their index.
+
+        Args:
+            dataFrame1: DataFrame whose columns come first.
+            dataFrame2: DataFrame whose columns are appended to the right.
+
+        Returns:
+            The concatenated DataFrame.
+        """
+        frames = [dataFrame1, dataFrame2]
+        return pd.concat(frames, axis=1)
+
+    def checkMissingData(self, df):
+        """Report missing values per column.
+
+        Args:
+            df: pandas DataFrame to inspect.
+
+        Returns:
+            ``False`` when ``df`` has no missing value. Otherwise a DataFrame
+            with one column per column of ``df`` and the rows ``Total``
+            (number of missing values), ``Percent`` (share of missing rows,
+            0-100) and ``Types`` (dtype name).
+        """
+        total = df.isnull().sum()
+        if not total.any():
+            return False
+
+        # The percentage used to be computed as ``sum / (count * 100)``,
+        # which is 10,000 times too small; the factor 100 belongs in the
+        # numerator. ``len(df)`` is non-zero here because at least one value
+        # is missing.
+        percent = total / len(df) * 100
+        output = pd.concat([total, percent], axis=1,
+                           keys=['Total', 'Percent'])
+        # written by MJ Bahmani
+        output['Types'] = [str(dtype) for dtype in df.dtypes]
+        return output.T
+
+    def randomForestClassifierGridSearch(self, X_train, y_train, estimator=[4, 6, 9], depth=[2, 3, 5, 10], sampleSplit=[2, 3, 5], sampleLeaf=[1, 5, 8]):
+        """Grid-search a RandomForestClassifier and return the best model.
+
+        Args:
+            X_train: training features.
+            y_train: training targets.
+            estimator: candidate values for ``n_estimators``.
+            depth: candidate values for ``max_depth``.
+            sampleSplit: candidate values for ``min_samples_split``.
+            sampleLeaf: candidate values for ``min_samples_leaf``.
+
+        Returns:
+            The best estimator found, scored by accuracy and already fitted
+            on ``X_train``/``y_train``.
+        """
+        # The list defaults in the signature are only read, never mutated,
+        # so sharing them between calls is harmless.
+        parameters = {'n_estimators': estimator,
+                      # 'auto' was an alias of 'sqrt' for classifiers and was
+                      # removed in scikit-learn 1.3; listing it only
+                      # duplicated work (and now makes those fits fail).
+                      'max_features': ['log2', 'sqrt'],
+                      'criterion': ['entropy', 'gini'],
+                      'max_depth': depth,
+                      'min_samples_split': sampleSplit,
+                      'min_samples_leaf': sampleLeaf
+                      }
+
+        # Type of scoring used to compare parameter combinations
+        acc_scorer = make_scorer(accuracy_score)
+
+        # Run the grid search
+        grid_obj = GridSearchCV(RandomForestClassifier(), parameters,
+                                scoring=acc_scorer)
+        grid_obj.fit(X_train, y_train)
+
+        # GridSearchCV (refit=True by default) has already refitted the best
+        # parameter combination on the full training data, so a second
+        # ``fit`` call is not needed.
+        return grid_obj.best_estimator_
+
+    def plot_confusion_matrix(self, y_true, y_pred, classes,
+                              normalize=False,
+                              title=None,
+                              cmap=mpl.cm.Blues):
+        """Print and plot the confusion matrix of ``y_pred`` against ``y_true``.
+
+        Args:
+            y_true: ground-truth labels.
+            y_pred: predicted labels.
+            classes: display names, indexed by label value; the labels found
+                in ``y_true``/``y_pred`` must therefore be valid integer
+                indices into it. A list or a numpy array is accepted.
+            normalize: when True, each row is divided by its sum.
+            title: plot title; a default is chosen from ``normalize`` when
+                empty.
+            cmap: matplotlib colormap for the image.
+
+        Returns:
+            The matplotlib Axes holding the plot.
+        """
+        if not title:
+            if normalize:
+                title = 'Normalized confusion matrix'
+            else:
+                title = 'Confusion matrix, without normalization'
+
+        # Compute confusion matrix
+        cm = confusion_matrix(y_true, y_pred)
+        # Only use the labels that appear in the data. ``np.asarray`` lets a
+        # plain list be indexed with the label array (a list would raise
+        # TypeError).
+        classes = np.asarray(classes)[unique_labels(y_true, y_pred)]
+        if normalize:
+            cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
+            print("Normalized confusion matrix")
+        else:
+            print('Confusion matrix, without normalization')
+
+        print(cm)
+
+        fig, ax = mpl.subplots()
+        im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
+        ax.figure.colorbar(im, ax=ax)
+        # We want to show all ticks...
+        ax.set(xticks=np.arange(cm.shape[1]),
+               yticks=np.arange(cm.shape[0]),
+               # ... and label them with the respective list entries
+               xticklabels=classes, yticklabels=classes,
+               title=title,
+               ylabel='True label',
+               xlabel='Predicted label')
+
+        # Rotate the tick labels and set their alignment.
+        mpl.setp(ax.get_xticklabels(), rotation=45, ha="right",
+                 rotation_mode="anchor")
+
+        # Loop over data dimensions and create text annotations.
+        fmt = '.2f' if normalize else 'd'
+        # Cells above half the maximum get white text for contrast.
+        thresh = cm.max() / 2.
+        for i in range(cm.shape[0]):
+            for j in range(cm.shape[1]):
+                ax.text(j, i, format(cm[i, j], fmt),
+                        ha="center", va="center",
+                        color="white" if cm[i, j] > thresh else "black")
+        fig.tight_layout()
+        return ax
+
+    def plotGeoData(self, lati, longi, df, column, dtick=10):
+        """Plot ``df[column]`` on a geographic scatter map, show and save it.
+
+        Args:
+            lati: latitudes of the points.
+            longi: longitudes of the points.
+            df: pandas DataFrame holding ``column``.
+            column: column used for marker colour and hover text; also the
+                colour-bar title and the prefix of the output file name.
+            dtick: tick step of the colour bar.
+
+        The figure is shown and then written to ``<column>_MAP.png``.
+        """
+        # The module-level plotly import is commented out, so ``go`` was an
+        # undefined name here. Importing locally fixes the NameError while
+        # keeping plotly optional for the rest of the module.
+        import plotly.graph_objects as go
+
+        # The scale used to be split across two statements, which silently
+        # discarded the last three stops (0.75, 0.875 and 1).
+        scl = [
+            [0, "rgb(150,0,90)"],
+            [0.125, "rgb(0, 0, 200)"],
+            [0.25, "rgb(0, 25, 255)"],
+            [0.375, "rgb(0, 152, 255)"],
+            [0.5, "rgb(44, 255, 150)"],
+            [0.625, "rgb(151, 255, 0)"],
+            [0.75, "rgb(255, 234, 0)"],
+            [0.875, "rgb(255, 111, 0)"],
+            [1, "rgb(255, 0, 0)"],
+        ]
+        fig = go.Figure(data=go.Scattergeo(
+            lon=longi,
+            lat=lati,
+            text=df[column],
+            marker=dict(
+                color=df[column],
+                colorscale=scl,
+                reversescale=True,
+                opacity=0.7,
+                colorbar=dict(
+                    # ``title.side`` replaces the legacy ``titleside``
+                    # attribute, which current plotly no longer accepts.
+                    title=dict(text=column, side="right"),
+                    outlinecolor="rgba(68, 68, 68, 0)",
+                    dtick=dtick)
+            )
+        ))
+
+        fig.update_layout(
+            geo=dict(
+
+                showland=True,
+                landcolor="rgb(212, 212, 212)",
+                subunitcolor="rgb(140, 255, 0)",
+                countrycolor="rgb(150, 255, 100)",
+                showlakes=True,
+                showcoastlines=True,
+                lakecolor="rgb(0, 150, 255)",
+
+                resolution=50,
+
+                # The visible window is fixed to longitudes -180..-55 and
+                # latitudes 45..85.
+                lonaxis=dict(
+                    showgrid=True,
+                    gridwidth=0.5,
+                    range=[-180.0, -55.0],
+                    dtick=5
+                ),
+                lataxis=dict(
+                    showgrid=True,
+                    gridwidth=0.5,
+                    range=[45, 85],
+                    dtick=5
+                )
+            ),
+        )
+        fig.show()
+        nameOfFile = column + '_MAP.png'
+        fig.write_image(nameOfFile)