Multi-SpectralBleedingDet / Git / [f85ae2] /DataAnalysis.py

Models:
DavidFeaster/
Multi-SpectralBleedingDet
Downloads: 1
[f85ae2]: / DataAnalysis.py
History
Download this file
299 lines (252 with data), 11.1 kB

import pandas as pd
import numpy as np

# Modelling Algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.utils.multiclass import unique_labels
# import plotly.graph_objects as go

# Visualisation
import matplotlib as mpl
import matplotlib.pyplot as mpl
import matplotlib.pylab as pylab
import seaborn as sns


class DataAnalysisUtils:
    """Plotting, reshaping and model-selection helpers built on pandas data.

    Throughout this class ``mpl`` is ``matplotlib.pyplot``: the module imports
    ``matplotlib.pyplot as mpl`` after ``matplotlib as mpl``, so the later
    binding wins. Several plotting methods save a PNG into the current working
    directory under a fixed file name.
    """

    def plotCorrelationHeatMap(self, dataFrame, maskUpperTriangle=False):
        """Draw an annotated correlation heatmap and save it as ``Correlation.png``.

        Args:
            dataFrame: frame whose pairwise column correlations
                (``dataFrame.corr()``, rounded to 2 decimals) are plotted.
            maskUpperTriangle: when True, hide the upper triangle (diagonal
                included) of the symmetric matrix. Defaults to False, which
                shows the full matrix.

        Returns:
            The matplotlib figure holding the heatmap.
        """
        corr = dataFrame.corr().round(2)

        mask = None
        if maskUpperTriangle:
            # Plain ``bool`` dtype: the ``np.bool`` alias no longer exists in
            # current NumPy releases.
            mask = np.zeros(corr.shape, dtype=bool)
            mask[np.triu_indices_from(mask)] = True

        # Set up the matplotlib figure
        f, ax = mpl.subplots(figsize=(26, 26))

        # Generate a custom diverging colormap
        cmap = sns.diverging_palette(220, 10, as_cmap=True)

        # Draw on the axes created above rather than relying on the implicit
        # "current axes".
        sns.heatmap(corr, mask=mask, cmap=cmap, vmax=1, vmin=-1, center=0,
                    linewidths=.2, cbar_kws={"shrink": .7}, annot=True,
                    annot_kws={"fontsize": 5, 'fontweight': 'bold'}, ax=ax)

        f.savefig("Correlation.png")
        return f

    def plotUnivariateDistribution(self, df, column):
        """Plot the distribution of ``df[column]`` and save it as ``importance.png``.

        NOTE(review): the output file name is the same one ``plotModelVarImp``
        writes, so each call overwrites the other's image - confirm whether
        that is intended.
        """
        sns.set(rc={'figure.figsize': (9, 7)})
        sns.distplot(df[column])
        mpl.savefig('importance.png', dpi=150)

    def plotBivariateDistribution(self, df, var, target, **kwargs):
        """Plot KDE curves of ``var``, one curve per value of ``target``.

        Args:
            df: source data frame.
            var: name of the column whose density is drawn.
            target: name of the column used as the hue of the facet grid.
            **kwargs: optional ``row`` and ``col`` column names used to split
                the grid into facets.
        """
        row = kwargs.get('row', None)
        col = kwargs.get('col', None)
        facet = sns.FacetGrid(df, hue=target, aspect=4, row=row, col=col)
        facet.map(sns.kdeplot, var, shade=True)
        # The x axis always starts at 0 and ends at the largest observed value.
        facet.set(xlim=(0, df[var].max()))
        facet.add_legend()

    def plotPairAllFeatureByHue(self, dataFrame, hue):
        """Draw a corner pair plot coloured by ``hue``; save ``pairFeatures2.png``."""
        sns.pairplot(dataFrame, hue=hue, corner=True)
        mpl.savefig('pairFeatures2.png', dpi=150)

    def plotJoint(self, dataFrame, columnX, columnY, size=6):
        """Draw a KDE joint plot of ``columnX`` against ``columnY``.

        Args:
            dataFrame: source data frame.
            columnX: column plotted on the x axis.
            columnY: column plotted on the y axis.
            size: figure height passed to seaborn.
        """
        # seaborn names this argument ``height``; the older ``size`` spelling
        # is not accepted by ``jointplot`` in recent releases.
        sns.jointplot(x=columnX, y=columnY, data=dataFrame,
                      height=size, kind='kde', color='#800000', space=0)

    def plotScatterAllFeatures(self, dataFrame):
        """Draw a scatter matrix of all columns and save it as ``scatter.png``."""
        pd.plotting.scatter_matrix(dataFrame, figsize=(14, 14), alpha=0.2)
        mpl.savefig("scatter.png")

    def plotColumnVersusColumn(self, dataFrame, columnX, columnY, kind='scatter', color='red'):
        """Plot ``columnY`` against ``columnX``; save ``ColumnVersusColumn.png``.

        Args:
            dataFrame: source data frame.
            columnX: column used for the x axis (also the x label).
            columnY: column used for the y axis (also the y label).
            kind: pandas plot kind, ``'line'`` or ``'scatter'``.
            color: colour passed to the pandas plot call.
        """
        dataFrame.plot(kind=kind, x=columnX, y=columnY, color=color)
        mpl.xlabel(columnX)
        mpl.ylabel(columnY)
        mpl.savefig("ColumnVersusColumn.png")

    def plotHistograms(self, df, variables, n_rows, n_cols):
        """Draw one 10-bin histogram per column in ``variables`` on a grid.

        Args:
            df: source data frame.
            variables: iterable of column names to plot, in grid order.
            n_rows: number of subplot rows.
            n_cols: number of subplot columns.

        Returns:
            The matplotlib figure, after it has been shown.
        """
        sns.set(style="white")
        fig = mpl.figure(figsize=(16, 12))
        for i, var_name in enumerate(variables):
            # Subplot positions are 1-based.
            ax = fig.add_subplot(n_rows, n_cols, i + 1)
            df[var_name].hist(bins=10, ax=ax)
            ax.set_title(var_name, fontweight='bold')
        fig.tight_layout()  # Improves appearance a bit.

        mpl.show()
        return fig

    def plotCategories(self, df, cat, target, **kwargs):
        """Draw bar plots of ``target`` per ``cat`` value on a facet grid.

        Args:
            df: source data frame.
            cat: categorical column placed on the x axis.
            target: column whose values give the bar heights.
            **kwargs: optional ``row`` and ``col`` column names used to split
                the grid into facets.
        """
        row = kwargs.get('row', None)
        col = kwargs.get('col', None)
        facet = sns.FacetGrid(df, row=row, col=col)
        facet.map(sns.barplot, cat, target)
        facet.add_legend()

    def describeMore(self, df):
        """Summarise every column by its number of distinct values and dtype.

        Args:
            df: data frame to describe.

        Returns:
            A DataFrame with columns ``Variable``, ``Levels`` (count of
            distinct values as reported by ``value_counts``) and ``Datatype``,
            sorted by ``Levels`` ascending.
        """
        names = list(df)
        levels = pd.DataFrame({
            'Variable': names,
            # Series.value_counts replaces the deprecated top-level
            # pd.value_counts and yields the same counts.
            'Levels': [len(df[name].value_counts()) for name in names],
            'Datatype': [df[name].dtypes for name in names],
        })
        return levels.sort_values(by='Levels')

    def plotVariableImportance(self, X, y):
        """Fit a decision tree on ``(X, y)`` and report its feature importances.

        Despite the name, nothing is plotted here.

        Returns:
            A tuple ``(feature_importances, X.columns)``.
        """
        tree = DecisionTreeClassifier(random_state=99)
        tree.fit(X, y)
        return tree.feature_importances_, X.columns

    def plotModelVarImp(self, model, X, y):
        """Bar-plot the 20 largest feature importances of a fitted model.

        Prints ``model.score(X, y)`` and saves the chart as ``importance.png``.

        Args:
            model: fitted estimator exposing ``feature_importances_`` and
                ``score``.
            X: feature frame; its columns label the bars.
            y: targets passed to ``model.score``.
        """
        imp = pd.DataFrame(
            model.feature_importances_,
            columns=['Importance'],
            index=X.columns
        )
        imp = imp.sort_values(['Importance'], ascending=False)

        # Plot onto an explicit axes. Previously a separate 60x80 inch figure
        # was opened first, but the pandas plot call created its own figure,
        # so the large one stayed blank and was never saved.
        fig, ax = mpl.subplots()
        imp[: 20].plot(kind='barh', ax=ax)
        print(model.score(X, y))
        fig.tight_layout()
        fig.savefig('importance.png', dpi=150)

    def boxPlotOnTwoColumn(self, dataFrame, column, columnBy):
        """Draw a box plot with ``column`` on x and ``columnBy`` on y."""
        _, ax = mpl.subplots(figsize=(12, 8))
        sns.boxplot(x=column, y=columnBy, data=dataFrame, ax=ax)

    def convertColumnsToRow(self, dataFrame, id, columns):
        """Melt ``columns`` into rows, keeping ``id`` as identifier column(s)."""
        return pd.melt(frame=dataFrame, id_vars=id, value_vars=columns)

    def convertRowsToColumn(self, melted, id, columns, values):
        """Pivot a melted frame back to wide form (inverse of a melt)."""
        return melted.pivot(index=id, columns=columns, values=values)

    def concatDataFramesFromRow(self, dataFrame1, dataFrame2):
        """Stack two frames vertically and renumber the index from 0."""
        return pd.concat([dataFrame1, dataFrame2], axis=0, ignore_index=True)

    def concatDataFramesFromColumn(self, dataFrame1, dataFrame2):
        """Join two frames side by side, aligned on their index."""
        return pd.concat([dataFrame1, dataFrame2], axis=1)

    def checkMissingData(self, df):
        """Report missing values per column.

        Args:
            df: data frame to inspect.

        Returns:
            ``False`` when ``df`` contains no missing values. Otherwise a
            DataFrame with rows ``Total`` (missing count), ``Percent``
            (missing share of all rows, 0-100) and ``Types`` (dtype name),
            and one column per column of ``df``.
        """
        total = df.isnull().sum()
        if not total.any():
            return False

        # Multiply after dividing: the earlier ``sum / (count * 100)`` form
        # divided by 100 instead of scaling to a percentage.
        percent = total / len(df) * 100
        output = pd.concat([total, percent], axis=1,
                           keys=['Total', 'Percent'])
        # written by MJ Bahmani
        output['Types'] = [str(df[col].dtype) for col in df.columns]
        return output.T

    def randomForestClassifierGridSearch(self, X_train, y_train, estimator=(4, 6, 9),
                                         depth=(2, 3, 5, 10), sampleSplit=(2, 3, 5),
                                         sampleLeaf=(1, 5, 8)):
        """Grid-search a random forest on accuracy and return the best model.

        Args:
            X_train: training features.
            y_train: training labels.
            estimator: candidate values for ``n_estimators``.
            depth: candidate values for ``max_depth``.
            sampleSplit: candidate values for ``min_samples_split``.
            sampleLeaf: candidate values for ``min_samples_leaf``.

        Returns:
            The best ``RandomForestClassifier`` found, fitted on the full
            training data.
        """
        # 'auto' is left out of max_features: for classifiers it is the same
        # setting as 'sqrt', and newer scikit-learn releases reject it.
        parameters = {'n_estimators': list(estimator),
                      'max_features': ['log2', 'sqrt'],
                      'criterion': ['entropy', 'gini'],
                      'max_depth': list(depth),
                      'min_samples_split': list(sampleSplit),
                      'min_samples_leaf': list(sampleLeaf)
                      }

        # Type of scoring used to compare parameter combinations
        acc_scorer = make_scorer(accuracy_score)

        grid_obj = GridSearchCV(RandomForestClassifier(), parameters, scoring=acc_scorer)
        grid_obj.fit(X_train, y_train)

        # GridSearchCV refits the winning combination on all of the training
        # data by default, so no further fit is needed.
        return grid_obj.best_estimator_

    def plot_confusion_matrix(self, y_true, y_pred, classes,
                              normalize=False,
                              title=None,
                              cmap=None):
        """Print and plot the confusion matrix of ``y_pred`` against ``y_true``.

        Args:
            y_true: ground-truth labels.
            y_pred: predicted labels.
            classes: display names, indexed by label value - labels are
                assumed to be integer positions into ``classes``.
            normalize: when True, each row is divided by its sum.
            title: plot title; a default is chosen from ``normalize`` when
                empty.
            cmap: matplotlib colormap; defaults to ``Blues``.

        Returns:
            The matplotlib axes containing the plot.
        """
        # Resolved at call time so defining the class does not evaluate a
        # matplotlib attribute in the signature.
        if cmap is None:
            cmap = mpl.cm.Blues

        if not title:
            if normalize:
                title = 'Normalized confusion matrix'
            else:
                title = 'Confusion matrix, without normalization'

        # Compute confusion matrix
        cm = confusion_matrix(y_true, y_pred)
        # Only use the labels that appear in the data; asarray lets callers
        # pass a plain list of names as well as an array.
        classes = np.asarray(classes)[unique_labels(y_true, y_pred)]
        if normalize:
            row_totals = cm.sum(axis=1)[:, np.newaxis]
            # A label that is only ever predicted has an all-zero row; divide
            # it by 1 so it stays 0 instead of turning into NaN.
            safe_totals = np.where(row_totals == 0, 1, row_totals)
            cm = cm.astype('float') / safe_totals
            print("Normalized confusion matrix")
        else:
            print('Confusion matrix, without normalization')

        print(cm)

        fig, ax = mpl.subplots()
        im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
        ax.figure.colorbar(im, ax=ax)
        # We want to show all ticks...
        ax.set(xticks=np.arange(cm.shape[1]),
               yticks=np.arange(cm.shape[0]),
               # ... and label them with the respective list entries
               xticklabels=classes, yticklabels=classes,
               title=title,
               ylabel='True label',
               xlabel='Predicted label')

        # Rotate the tick labels and set their alignment.
        mpl.setp(ax.get_xticklabels(), rotation=45, ha="right",
                 rotation_mode="anchor")

        # Annotate every cell; text turns white on cells darker than half of
        # the maximum so it stays readable.
        fmt = '.2f' if normalize else 'd'
        thresh = cm.max() / 2.
        for i in range(cm.shape[0]):
            for j in range(cm.shape[1]):
                ax.text(j, i, format(cm[i, j], fmt),
                        ha="center", va="center",
                        color="white" if cm[i, j] > thresh else "black")
        fig.tight_layout()
        return ax

    def plotGeoData(self, lati, longi, df, column, dtick=10):
        """Plot ``df[column]`` on a geographic scatter map and save it as PNG.

        The figure is shown and then written to ``<column>_MAP.png``. The
        visible longitude/latitude window is fixed at [-180, -55] x [45, 85].

        Args:
            lati: latitudes of the points.
            longi: longitudes of the points.
            df: data frame providing the plotted values.
            column: column of ``df`` used for marker colour and hover text;
                also the colour-bar title and the output file-name prefix.
            dtick: tick step of the colour bar.
        """
        # Imported here because the module-level plotly import is commented
        # out; without it ``go`` is undefined.
        import plotly.graph_objects as go

        # One list holding all nine stops. The earlier layout ended the first
        # line with a comma, so the last three stops were a separate, discarded
        # expression and the scale stopped at 0.625.
        scl = [
            [0, "rgb(150,0,90)"],
            [0.125, "rgb(0, 0, 200)"],
            [0.25, "rgb(0, 25, 255)"],
            [0.375, "rgb(0, 152, 255)"],
            [0.5, "rgb(44, 255, 150)"],
            [0.625, "rgb(151, 255, 0)"],
            [0.75, "rgb(255, 234, 0)"],
            [0.875, "rgb(255, 111, 0)"],
            [1, "rgb(255, 0, 0)"],
        ]
        fig = go.Figure(data=go.Scattergeo(
            lon=longi,
            lat=lati,
            text=df[column],
            marker=dict(
                color=df[column],
                colorscale=scl,
                reversescale=True,
                opacity=0.7,
                colorbar=dict(
                    # title.side replaces the deprecated ``titleside`` key.
                    title=dict(text=column, side="right"),
                    outlinecolor="rgba(68, 68, 68, 0)",
                    dtick=dtick)
            )
        ))

        fig.update_layout(
            geo=dict(

                showland=True,
                landcolor="rgb(212, 212, 212)",
                subunitcolor="rgb(140, 255, 0)",
                countrycolor="rgb(150, 255, 100)",
                showlakes=True,
                showcoastlines=True,
                lakecolor="rgb(0, 150, 255)",

                resolution=50,

                lonaxis=dict(
                    showgrid=True,
                    gridwidth=0.5,
                    range=[-180.0, -55.0],
                    dtick=5
                ),
                lataxis=dict(
                    showgrid=True,
                    gridwidth=0.5,
                    range=[45, 85],
                    dtick=5
                )
            ),
        )
        fig.show()
        nameOfFile = column + '_MAP.png'
        fig.write_image(nameOfFile)