DataAnalysis.py

import pandas as pd
import numpy as np

# Modelling Algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import make_scorer, accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import preprocessing
from sklearn.utils.multiclass import unique_labels
import plotly.graph_objects as go  # needed by plotGeoData below; was commented out

# Visualisation
# Note: pyplot (not the matplotlib package) is aliased as `mpl` throughout this
# module; the original also bound `matplotlib` to the same name, shadowing it.
import matplotlib.pyplot as mpl
import matplotlib.pylab as pylab
import seaborn as sns

class DataAnalysisUtils:

    def plotCorrelationHeatMap(self, dataFrame):
        corr = round(dataFrame.corr(), 2)
        # Mask the upper triangle so each pair is shown only once
        mask = np.zeros_like(corr, dtype=bool)  # np.bool was removed in NumPy >= 1.24
        mask[np.triu_indices_from(mask)] = True

        # Set up the matplotlib figure
        f, ax = mpl.subplots(figsize=(26, 26))

        # Generate a custom diverging colormap
        cmap = sns.diverging_palette(220, 10, as_cmap=True)

        # Draw the heatmap with the mask and correct aspect ratio
        # (the original computed `mask` but never passed it)
        sns.heatmap(corr, mask=mask, cmap=cmap, vmax=1, vmin=-1, center=0,
                    linewidths=.2, cbar_kws={"shrink": .7}, annot=True,
                    annot_kws={"fontsize": 5, 'fontweight': 'bold'})

        f.savefig("Correlation.png")
        return f

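    # Example (hypothetical data; numeric columns only, since DataFrame.corr()
    # handles non-numeric columns differently across pandas versions):
    #   utils = DataAnalysisUtils()
    #   utils.plotCorrelationHeatMap(df.select_dtypes(include=[np.number]))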
    def plotUnivariateDistribution(self, df, column):
        sns.set(rc={'figure.figsize': (9, 7)})
        # sns.distplot was deprecated in seaborn 0.11 and later removed;
        # histplot(..., kde=True) is the closest replacement.
        sns.histplot(df[column], kde=True)
        # The original saved to 'importance.png', which collides with
        # plotModelVarImp's output; use a distinct file name.
        mpl.savefig('distribution.png', dpi=150)


    def plotBivariateDistribution(self, df, var, target, **kwargs):
        row = kwargs.get('row', None)
        col = kwargs.get('col', None)
        facet = sns.FacetGrid(df, hue=target, aspect=4, row=row, col=col)
        # kdeplot's `shade` keyword was renamed `fill` in seaborn 0.11
        facet.map(sns.kdeplot, var, fill=True)
        facet.set(xlim=(0, df[var].max()))
        facet.add_legend()

    def plotPairAllFeatureByHue(self, dataFrame, hue):
        sns.pairplot(dataFrame, hue=hue, corner=True)
        mpl.savefig('pairFeatures2.png', dpi=150)

    def plotJoint(self, dataFrame, columnX, columnY, size=6):
        # jointplot's `size` keyword was renamed `height` in seaborn 0.9
        sns.jointplot(x=columnX, y=columnY, data=dataFrame,
                      height=size, kind='kde', color='#800000', space=0)

    def plotScatterAllFeatures(self, dataFrame):
        pd.plotting.scatter_matrix(dataFrame, figsize=(14, 14), alpha=0.2)
        mpl.savefig("scatter.png")

    def plotColumnVersusColumn(self, dataFrame, columnX, columnY, kind='scatter', color='red'):
        """kind: 'line' or 'scatter'"""
        dataFrame.plot(kind=kind, x=columnX, y=columnY, color=color)
        mpl.xlabel(columnX)
        mpl.ylabel(columnY)
        mpl.savefig("ColumnVersusColumn.png")

    def plotHistograms(self, df, variables, n_rows, n_cols):
        sns.set(style="white")
        fig = mpl.figure(figsize=(16, 12))
        for i, var_name in enumerate(variables):
            ax = fig.add_subplot(n_rows, n_cols, i + 1)
            df[var_name].hist(bins=10, ax=ax)
            ax.set_title(var_name, fontweight='bold')
        fig.tight_layout()  # Improves appearance a bit.

        mpl.show()
        return fig

    def plotCategories(self, df, cat, target, **kwargs):
        row = kwargs.get('row', None)
        col = kwargs.get('col', None)
        facet = sns.FacetGrid(df, row=row, col=col)
        facet.map(sns.barplot, cat, target)
        facet.add_legend()

    def describeMore(self, df):
        var = []
        level_counts = []
        dtypes = []
        for x in df:
            var.append(x)
            # nunique() replaces the deprecated top-level pd.value_counts()
            level_counts.append(df[x].nunique())
            dtypes.append(df[x].dtypes)
        levels = pd.DataFrame({'Variable': var, 'Levels': level_counts, 'Datatype': dtypes})
        levels.sort_values(by='Levels', inplace=True)
        return levels

    def plotVariableImportance(self, X, y):
        tree = DecisionTreeClassifier(random_state=99)
        tree.fit(X, y)
        # self.plotModelVarImp(tree, X, y)
        return tree.feature_importances_, X.columns

    def plotModelVarImp(self, model, X, y):
        imp = pd.DataFrame(
            model.feature_importances_,
            columns=['Importance'],
            index=X.columns
        )
        imp = imp.sort_values(['Importance'], ascending=False)
        # DataFrame.plot creates its own figure, so size it here rather than
        # via a separate (and otherwise unused) mpl.figure() call.
        imp[:20].plot(kind='barh', figsize=(12, 8))
        print(model.score(X, y))
        mpl.tight_layout()
        mpl.savefig('importance.png', dpi=150)

    def boxPlotOnTwoColumn(self, dataFrame, column, columnBy):
        f, ax = mpl.subplots(figsize=(12, 8))
        # Draw on the axes created above; sns.boxplot returns an Axes, not a figure
        sns.boxplot(x=column, y=columnBy, data=dataFrame, ax=ax)

    def convertColumnsToRow(self, dataFrame, id, columns):
        return pd.melt(frame=dataFrame, id_vars=id, value_vars=columns)

    def convertRowsToColumn(self, melted, id, columns, values):
        return melted.pivot(index=id, columns=columns, values=values)

    def concatDataFramesFromRow(self, dataFrame1, dataFrame2):
        return pd.concat([dataFrame1, dataFrame2], axis=0, ignore_index=True)

    def concatDataFramesFromColumn(self, dataFrame1, dataFrame2):
        return pd.concat([dataFrame1, dataFrame2], axis=1)

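    # Example round trip (hypothetical column names): melt stacks the chosen
    # columns into (variable, value) pairs and pivot undoes it.
    #   melted = utils.convertColumnsToRow(df, id='name', columns=['math', 'physics'])
    #   wide = utils.convertRowsToColumn(melted, id='name', columns='variable', values='value')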
    def checkMissingData(self, df):
        flag = df.isna().sum().any()
        if flag:
            total = df.isnull().sum()
            # Percentage of missing values per column; the original divided by
            # (count * 100), which shrank the ratio instead of scaling it up.
            percent = df.isnull().sum() / df.isnull().count() * 100
            output = pd.concat([total, percent], axis=1,
                               keys=['Total', 'Percent'])
            data_type = []
            # written by MJ Bahmani
            for col in df.columns:
                dtype = str(df[col].dtype)
                data_type.append(dtype)
            output['Types'] = data_type
            return np.transpose(output)
        else:
            return False

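    # Example (hypothetical): for a column with 2 missing values out of 100
    # rows, the returned 'Percent' entry for that column is 2.0.
    #   report = utils.checkMissingData(df)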
    def randomForestClassifierGridSearch(self, X_train, y_train, estimator=[4, 6, 9], depth=[2, 3, 5, 10], sampleSplit=[2, 3, 5], sampleLeaf=[1, 5, 8]):
        rfc = RandomForestClassifier()

        # Choose some parameter combinations to try
        # ('auto' was removed from max_features in scikit-learn 1.3)
        parameters = {'n_estimators': estimator,
                      'max_features': ['log2', 'sqrt'],
                      'criterion': ['entropy', 'gini'],
                      'max_depth': depth,
                      'min_samples_split': sampleSplit,
                      'min_samples_leaf': sampleLeaf
                      }

        # Type of scoring used to compare parameter combinations
        acc_scorer = make_scorer(accuracy_score)

        # Run the grid search
        grid_obj = GridSearchCV(rfc, parameters, scoring=acc_scorer)
        grid_obj = grid_obj.fit(X_train, y_train)

        # best_estimator_ is already refit on the full training data
        # (GridSearchCV defaults to refit=True), so no extra fit is needed.
        rfc = grid_obj.best_estimator_

        return rfc

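    # Example (hypothetical split; assumes X, y are defined):
    #   X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    #   best_rfc = utils.randomForestClassifierGridSearch(X_train, y_train)
    #   print(accuracy_score(y_test, best_rfc.predict(X_test)))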
    def plot_confusion_matrix(self, y_true, y_pred, classes,
                              normalize=False,
                              title=None,
                              cmap=mpl.cm.Blues):
        """
        This function prints and plots the confusion matrix.
        Normalization can be applied by setting `normalize=True`.
        """
        if not title:
            if normalize:
                title = 'Normalized confusion matrix'
            else:
                title = 'Confusion matrix, without normalization'

        # Compute confusion matrix
        cm = confusion_matrix(y_true, y_pred)
        # Only use the labels that appear in the data
        # (assumes integer-coded labels that index into `classes`)
        classes = np.asarray(classes)[unique_labels(y_true, y_pred)]
        if normalize:
            cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
            print("Normalized confusion matrix")
        else:
            print('Confusion matrix, without normalization')

        print(cm)

        fig, ax = mpl.subplots()
        im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
        ax.figure.colorbar(im, ax=ax)
        # We want to show all ticks...
        ax.set(xticks=np.arange(cm.shape[1]),
               yticks=np.arange(cm.shape[0]),
               # ... and label them with the respective list entries
               xticklabels=classes, yticklabels=classes,
               title=title,
               ylabel='True label',
               xlabel='Predicted label')

        # Rotate the tick labels and set their alignment.
        mpl.setp(ax.get_xticklabels(), rotation=45, ha="right",
                 rotation_mode="anchor")

        # Loop over data dimensions and create text annotations.
        fmt = '.2f' if normalize else 'd'
        thresh = cm.max() / 2.
        for i in range(cm.shape[0]):
            for j in range(cm.shape[1]):
                ax.text(j, i, format(cm[i, j], fmt),
                        ha="center", va="center",
                        color="white" if cm[i, j] > thresh else "black")
        fig.tight_layout()
        return ax

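    # Example (hypothetical; assumes integer-coded labels and a fitted model):
    #   y_pred = best_rfc.predict(X_test)
    #   utils.plot_confusion_matrix(y_test, y_pred,
    #                               classes=np.array(['class0', 'class1']),
    #                               normalize=True)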
    def plotGeoData(self, lati, longi, df, column, dtick=10):
        # Colorscale as an explicit list of [position, color] pairs
        scl = [[0, "rgb(150,0,90)"], [0.125, "rgb(0, 0, 200)"],
               [0.25, "rgb(0, 25, 255)"], [0.375, "rgb(0, 152, 255)"],
               [0.5, "rgb(44, 255, 150)"], [0.625, "rgb(151, 255, 0)"],
               [0.75, "rgb(255, 234, 0)"], [0.875, "rgb(255, 111, 0)"],
               [1, "rgb(255, 0, 0)"]]
        fig = go.Figure(data=go.Scattergeo(
            lon=longi,
            lat=lati,
            text=df[column],
            marker=dict(
                color=df[column],
                colorscale=scl,
                reversescale=True,
                opacity=0.7,
                colorbar=dict(
                    titleside="right",
                    outlinecolor="rgba(68, 68, 68, 0)",
                    title=column,
                    dtick=dtick)
            )
        ))

        fig.update_layout(
            geo=dict(
                showland=True,
                landcolor="rgb(212, 212, 212)",
                subunitcolor="rgb(140, 255, 0)",
                countrycolor="rgb(150, 255, 100)",
                showlakes=True,
                showcoastlines=True,
                lakecolor="rgb(0, 150, 255)",
                resolution=50,
                lonaxis=dict(
                    showgrid=True,
                    gridwidth=0.5,
                    range=[-180.0, -55.0],
                    dtick=5
                ),
                lataxis=dict(
                    showgrid=True,
                    gridwidth=0.5,
                    range=[45, 85],
                    dtick=5
                )
            ),
        )
        fig.show()
        nameOfFile = column + '_MAP.png'
        fig.write_image(nameOfFile)  # static export requires the kaleido package
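

# Minimal usage sketch (hypothetical, synthetic data; not part of the original
# module). Exercises a few of the utilities end to end with a tiny grid so it
# runs quickly.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    demo = pd.DataFrame({
        'a': rng.normal(size=200),
        'b': rng.normal(size=200),
    })
    demo['label'] = (demo['a'] + demo['b'] > 0).astype(int)

    utils = DataAnalysisUtils()
    utils.plotCorrelationHeatMap(demo)
    print(utils.describeMore(demo))
    print(utils.checkMissingData(demo))  # False: the demo frame has no NaNs

    X, y = demo[['a', 'b']], demo['label']
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    best_rfc = utils.randomForestClassifierGridSearch(
        X_train, y_train, estimator=[10], depth=[3], sampleSplit=[2], sampleLeaf=[1])
    print(accuracy_score(y_test, best_rfc.predict(X_test)))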