Diff of /DataAnalysis.py [000000] .. [f85ae2]

import pandas as pd
import numpy as np

# Modelling Algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import make_scorer, accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import preprocessing
from sklearn.utils.multiclass import unique_labels

import plotly.graph_objects as go  # needed by plotGeoData

# Visualisation
import matplotlib.pyplot as mpl  # pyplot is aliased as mpl and used for all plotting calls below
import matplotlib.pylab as pylab
import seaborn as sns


class DataAnalysisUtils:

    def plotCorrelationHeatMap(self, dataFrame):
        corr = round(dataFrame.corr(), 2)
        # Mask the upper triangle so each pairwise correlation is drawn only once
        mask = np.zeros_like(corr, dtype=bool)
        mask[np.triu_indices_from(mask)] = True

        # Set up the matplotlib figure
        f, ax = mpl.subplots(figsize=(26, 26))

        # Generate a custom diverging colormap
        cmap = sns.diverging_palette(220, 10, as_cmap=True)

        # Draw the heatmap with the mask and correct aspect ratio
        sns.heatmap(corr, mask=mask, cmap=cmap, vmax=1, vmin=-1, center=0,
                    linewidths=.2, cbar_kws={"shrink": .7}, annot=True,
                    annot_kws={"fontsize": 5, 'fontweight': 'bold'}, ax=ax)

        f.savefig("Correlation.png")
        return f

    def plotUnivariateDistribution(self, df, column):
        sns.set(rc={'figure.figsize': (9, 7)})
        # distplot was removed from recent seaborn releases; histplot is the replacement
        sns.histplot(df[column], kde=True)
        mpl.savefig('distribution.png', dpi=150)

    def plotBivariateDistribution(self, df, var, target, **kwargs):
        row = kwargs.get('row', None)
        col = kwargs.get('col', None)
        facet = sns.FacetGrid(df, hue=target, aspect=4, row=row, col=col)
        facet.map(sns.kdeplot, var, fill=True)  # 'shade' was renamed to 'fill' in seaborn
        facet.set(xlim=(0, df[var].max()))
        facet.add_legend()

    def plotPairAllFeatureByHue(self, dataFrame, hue):
        sns.pairplot(dataFrame, hue=hue, corner=True)
        mpl.savefig('pairFeatures2.png', dpi=150)

    def plotJoint(self, dataFrame, columnX, columnY, size=6):
        # seaborn renamed the jointplot 'size' argument to 'height'
        sns.jointplot(x=columnX, y=columnY, data=dataFrame,
                      height=size, kind='kde', color='#800000', space=0)

    def plotScatterAllFeatures(self, dataFrame):
        pd.plotting.scatter_matrix(dataFrame, figsize=(14, 14), alpha=0.2)
        mpl.savefig("scatter.png")

    def plotColumnVersusColumn(self, dataFrame, columnX, columnY, kind='scatter', color='red'):
        """kind: 'line' or 'scatter'"""
        dataFrame.plot(kind=kind, x=columnX, y=columnY, color=color)
        mpl.xlabel(columnX)
        mpl.ylabel(columnY)
        mpl.savefig("ColumnVersusColumn.png")

    def plotHistograms(self, df, variables, n_rows, n_cols):
        sns.set(style="white")
        fig = mpl.figure(figsize=(16, 12))
        for i, var_name in enumerate(variables):
            ax = fig.add_subplot(n_rows, n_cols, i + 1)
            df[var_name].hist(bins=10, ax=ax)
            ax.set_title(var_name, fontweight='bold')
            # ax.set_xticklabels([], visible=False)
            # ax.set_yticklabels([], visible=False)
        fig.tight_layout()  # Improves appearance a bit.
        mpl.show()
        return fig

    def plotCategories(self, df, cat, target, **kwargs):
        row = kwargs.get('row', None)
        col = kwargs.get('col', None)
        facet = sns.FacetGrid(df, row=row, col=col)
        facet.map(sns.barplot, cat, target)
        facet.add_legend()

    def describeMore(self, df):
        variables, levels, dtypes = [], [], []
        for col in df:
            variables.append(col)
            levels.append(df[col].nunique())
            dtypes.append(df[col].dtype)
        summary = pd.DataFrame({'Variable': variables, 'Levels': levels, 'Datatype': dtypes})
        summary.sort_values(by='Levels', inplace=True)
        return summary

    def plotVariableImportance(self, X, y):
        tree = DecisionTreeClassifier(random_state=99)
        tree.fit(X, y)
        # self.plotModelVarImp(tree, X, y)
        return tree.feature_importances_, X.columns

    def plotModelVarImp(self, model, X, y):
        imp = pd.DataFrame(
            model.feature_importances_,
            columns=['Importance'],
            index=X.columns
        )
        fig, ax = mpl.subplots(figsize=(60, 80))
        imp = imp.sort_values(['Importance'], ascending=False)
        imp[:20].plot(kind='barh', ax=ax)  # plot the 20 most important features
        print(model.score(X, y))
        fig.tight_layout()
        fig.savefig('importance.png', dpi=150)

    def boxPlotOnTwoColumn(self, dataFrame, column, columnBy):
        f, ax = mpl.subplots(figsize=(12, 8))
        sns.boxplot(x=column, y=columnBy, data=dataFrame, ax=ax)

    def convertColumnsToRow(self, dataFrame, id, columns):
        return pd.melt(frame=dataFrame, id_vars=id, value_vars=columns)

    def convertRowsToColumn(self, melted, id, columns, values):
        return melted.pivot(index=id, columns=columns, values=values)

    def concatDataFramesFromRow(self, dataFrame1, dataFrame2):
        return pd.concat([dataFrame1, dataFrame2], axis=0, ignore_index=True)

    def concatDataFramesFromColumn(self, dataFrame1, dataFrame2):
        return pd.concat([dataFrame1, dataFrame2], axis=1)

    def checkMissingData(self, df):
        flag = df.isna().sum().any()
        if flag:
            total = df.isnull().sum()
            # Share of missing values per column, as a percentage
            percent = df.isnull().sum() / df.isnull().count() * 100
            output = pd.concat([total, percent], axis=1,
                               keys=['Total', 'Percent'])
            data_type = []
            # written by MJ Bahmani
            for col in df.columns:
                dtype = str(df[col].dtype)
                data_type.append(dtype)
            output['Types'] = data_type
            return np.transpose(output)
        else:
            return False

    def randomForestClassifierGridSearch(self, X_train, y_train, estimator=[4, 6, 9], depth=[2, 3, 5, 10], sampleSplit=[2, 3, 5], sampleLeaf=[1, 5, 8]):
        rfc = RandomForestClassifier()

        # Choose some parameter combinations to try
        parameters = {'n_estimators': estimator,
                      'max_features': ['log2', 'sqrt'],  # 'auto' was removed in recent scikit-learn releases
                      'criterion': ['entropy', 'gini'],
                      'max_depth': depth,
                      'min_samples_split': sampleSplit,
                      'min_samples_leaf': sampleLeaf
                      }

        # Type of scoring used to compare parameter combinations
        acc_scorer = make_scorer(accuracy_score)

        # Run the grid search
        grid_obj = GridSearchCV(rfc, parameters, scoring=acc_scorer)
        grid_obj = grid_obj.fit(X_train, y_train)

        # Set the clf to the best combination of parameters
        rfc = grid_obj.best_estimator_

        # Fit the best algorithm to the data.
        rfc.fit(X_train, y_train)

        return rfc

    def plot_confusion_matrix(self, y_true, y_pred, classes,
                              normalize=False,
                              title=None,
                              cmap=mpl.cm.Blues):
        """
        This function prints and plots the confusion matrix.
        Normalization can be applied by setting `normalize=True`.
        """
        if not title:
            if normalize:
                title = 'Normalized confusion matrix'
            else:
                title = 'Confusion matrix, without normalization'

        # Compute confusion matrix
        cm = confusion_matrix(y_true, y_pred)
        # Only use the labels that appear in the data
        classes = classes[unique_labels(y_true, y_pred)]
        if normalize:
            cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
            print("Normalized confusion matrix")
        else:
            print('Confusion matrix, without normalization')

        print(cm)

        fig, ax = mpl.subplots()
        im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
        ax.figure.colorbar(im, ax=ax)
        # We want to show all ticks...
        ax.set(xticks=np.arange(cm.shape[1]),
               yticks=np.arange(cm.shape[0]),
               # ... and label them with the respective list entries
               xticklabels=classes, yticklabels=classes,
               title=title,
               ylabel='True label',
               xlabel='Predicted label')

        # Rotate the tick labels and set their alignment.
        mpl.setp(ax.get_xticklabels(), rotation=45, ha="right",
                 rotation_mode="anchor")

        # Loop over data dimensions and create text annotations.
        fmt = '.2f' if normalize else 'd'
        thresh = cm.max() / 2.
        for i in range(cm.shape[0]):
            for j in range(cm.shape[1]):
                ax.text(j, i, format(cm[i, j], fmt),
                        ha="center", va="center",
                        color="white" if cm[i, j] > thresh else "black")
        fig.tight_layout()
        return ax

    def plotGeoData(self, lati, longi, df, column, dtick=10):
        # Continuous colour scale from low (purple/blue) to high (red)
        scl = [[0, "rgb(150,0,90)"], [0.125, "rgb(0, 0, 200)"],
               [0.25, "rgb(0, 25, 255)"], [0.375, "rgb(0, 152, 255)"],
               [0.5, "rgb(44, 255, 150)"], [0.625, "rgb(151, 255, 0)"],
               [0.75, "rgb(255, 234, 0)"], [0.875, "rgb(255, 111, 0)"],
               [1, "rgb(255, 0, 0)"]]
        fig = go.Figure(data=go.Scattergeo(
            lon=longi,
            lat=lati,
            text=df[column],
            marker=dict(
                color=df[column],
                colorscale=scl,
                reversescale=True,
                opacity=0.7,
                colorbar=dict(
                    titleside="right",
                    outlinecolor="rgba(68, 68, 68, 0)",
                    title=column,
                    dtick=dtick)
            )
        ))

        fig.update_layout(
            geo=dict(
                showland=True,
                landcolor="rgb(212, 212, 212)",
                subunitcolor="rgb(140, 255, 0)",
                countrycolor="rgb(150, 255, 100)",
                showlakes=True,
                showcoastlines=True,
                lakecolor="rgb(0, 150, 255)",
                resolution=50,
                lonaxis=dict(
                    showgrid=True,
                    gridwidth=0.5,
                    range=[-180.0, -55.0],
                    dtick=5
                ),
                lataxis=dict(
                    showgrid=True,
                    gridwidth=0.5,
                    range=[45, 85],
                    dtick=5
                )
            ),
        )
        fig.show()
        nameOfFile = column + '_MAP.png'
        fig.write_image(nameOfFile)
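

# Minimal usage sketch showing how these helpers might be wired together.
# The file name 'data.csv' and the target column 'label' are hypothetical
# placeholders, assumed only for illustration.
if __name__ == "__main__":
    utils = DataAnalysisUtils()

    # Load an arbitrary tabular dataset (placeholder path).
    data = pd.read_csv("data.csv")

    # Quick structural overview and missing-value report.
    print(utils.describeMore(data))
    print(utils.checkMissingData(data))

    # Correlation heatmap over the numeric columns only.
    utils.plotCorrelationHeatMap(data.select_dtypes(include=np.number))

    # Decision-tree feature importances, assuming a 'label' target column.
    X = data.drop(columns=["label"]).select_dtypes(include=np.number)
    y = data["label"]
    importances, names = utils.plotVariableImportance(X, y)
    print(dict(zip(names, importances)))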