--- /dev/null
+++ b/DataAnalysis.py
@@ -0,0 +1,298 @@
+import pandas as pd
+import numpy as np
+
+# Modelling Algorithms
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.linear_model import LogisticRegression
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.naive_bayes import GaussianNB
+from sklearn.svm import SVC, LinearSVC
+from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
+from sklearn.metrics import make_scorer, accuracy_score, classification_report, confusion_matrix
+from sklearn.model_selection import train_test_split, GridSearchCV
+from sklearn import preprocessing
+from sklearn.utils.multiclass import unique_labels
+
+# Geographical plots (required by plotGeoData)
+import plotly.graph_objects as go
+
+# Visualisation
+import matplotlib.pyplot as mpl
+import seaborn as sns
+
+
+class DataAnalysisUtils:
+
+    def plotCorrelationHeatMap(self, dataFrame):
+        corr = round(dataFrame.corr(), 2)
+
+        # Mask for the upper triangle so each correlation appears only once
+        mask = np.zeros_like(corr, dtype=bool)
+        mask[np.triu_indices_from(mask)] = True
+
+        # Set up the matplotlib figure
+        f, ax = mpl.subplots(figsize=(26, 26))
+
+        # Generate a custom diverging colormap
+        cmap = sns.diverging_palette(220, 10, as_cmap=True)
+
+        # Draw the heatmap with the mask and correct aspect ratio
+        sns.heatmap(corr, mask=mask, cmap=cmap, vmax=1, vmin=-1, center=0,
+                    linewidths=.2, cbar_kws={"shrink": .7}, annot=True,
+                    annot_kws={"fontsize": 5, 'fontweight': 'bold'})
+
+        f.savefig("Correlation.png")
+        return f
+
+    def plotUnivariateDistribution(self, df, column):
+        sns.set(rc={'figure.figsize': (9, 7)})
+        # histplot replaces the deprecated distplot
+        sns.histplot(df[column], kde=True)
+        mpl.savefig('distribution.png', dpi=150)
+
+    def plotBivariateDistribution(self, df, var, target, **kwargs):
+        row = kwargs.get('row', None)
+        col = kwargs.get('col', None)
+        facet = sns.FacetGrid(df, hue=target, aspect=4, row=row, col=col)
+        facet.map(sns.kdeplot, var, fill=True)
+        facet.set(xlim=(0, df[var].max()))
+        facet.add_legend()
+
+    def plotPairAllFeatureByHue(self, dataFrame, hue):
+        sns.pairplot(dataFrame, hue=hue, corner=True)
+        mpl.savefig('pairFeatures2.png', dpi=150)
+
+    def plotJoint(self, dataFrame, columnX, columnY, size=6):
+        # 'height' replaces the deprecated 'size' argument of jointplot
+        sns.jointplot(x=columnX, y=columnY, data=dataFrame,
+                      height=size, kind='kde', color='#800000', space=0)
+
+    def plotScatterAllFeatures(self, dataFrame):
+        pd.plotting.scatter_matrix(dataFrame, figsize=(14, 14), alpha=0.2)
+        mpl.savefig("scatter.png")
+
+    def plotColumnVersusColumn(self, dataFrame, columnX, columnY, kind='scatter', color='red'):
+        """kind: 'line' | 'scatter'"""
+        dataFrame.plot(kind=kind, x=columnX, y=columnY, color=color)
+        mpl.xlabel(columnX)
+        mpl.ylabel(columnY)
+        mpl.savefig("ColumnVersusColumn.png")
+
+    def plotHistograms(self, df, variables, n_rows, n_cols):
+        sns.set(style="white")
+        fig = mpl.figure(figsize=(16, 12))
+        for i, var_name in enumerate(variables):
+            ax = fig.add_subplot(n_rows, n_cols, i + 1)
+            df[var_name].hist(bins=10, ax=ax)
+            ax.set_title(var_name, fontweight='bold')
+            # ax.set_xticklabels([], visible=False)
+            # ax.set_yticklabels([], visible=False)
+        fig.tight_layout()  # Improves appearance a bit.
+        mpl.show()
+        return fig
+
+    def plotCategories(self, df, cat, target, **kwargs):
+        row = kwargs.get('row', None)
+        col = kwargs.get('col', None)
+        facet = sns.FacetGrid(df, row=row, col=col)
+        facet.map(sns.barplot, cat, target)
+        facet.add_legend()
+
+    def describeMore(self, df):
+        var = []
+        l = []
+        t = []
+        for x in df:
+            var.append(x)
+            l.append(df[x].nunique())  # number of distinct levels
+            t.append(df[x].dtypes)
+        levels = pd.DataFrame({'Variable': var, 'Levels': l, 'Datatype': t})
+        levels.sort_values(by='Levels', inplace=True)
+        return levels
+
+    def plotVariableImportance(self, X, y):
+        tree = DecisionTreeClassifier(random_state=99)
+        tree.fit(X, y)
+        # self.plotModelVarImp(tree, X, y)
+        return tree.feature_importances_, X.columns
+
+    def plotModelVarImp(self, model, X, y):
+        imp = pd.DataFrame(
+            model.feature_importances_,
+            columns=['Importance'],
+            index=X.columns
+        )
+        mpl.figure(figsize=(60, 80))
+        imp = imp.sort_values(['Importance'], ascending=False)
+        imp[:20].plot(kind='barh')
+        print(model.score(X, y))
+        mpl.tight_layout()
+        mpl.savefig('importance.png', dpi=150)
+
+    def boxPlotOnTwoColumn(self, dataFrame, column, columnBy):
+        f, ax = mpl.subplots(figsize=(12, 8))
+        sns.boxplot(x=column, y=columnBy, data=dataFrame, ax=ax)
+
+    def convertColumnsToRow(self, dataFrame, id, columns):
+        return pd.melt(frame=dataFrame, id_vars=id, value_vars=columns)
+
+    def convertRowsToColumn(self, melted, id, columns, values):
+        return melted.pivot(index=id, columns=columns, values=values)
+
+    def concatDataFramesFromRow(self, dataFrame1, dataFrame2):
+        return pd.concat([dataFrame1, dataFrame2], axis=0, ignore_index=True)
+
+    def concatDataFramesFromColumn(self, dataFrame1, dataFrame2):
+        return pd.concat([dataFrame1, dataFrame2], axis=1)
+
+    def checkMissingData(self, df):
+        if df.isna().sum().any():
+            total = df.isnull().sum()
+            percent = df.isnull().sum() / df.isnull().count() * 100
+            output = pd.concat([total, percent], axis=1,
+                               keys=['Total', 'Percent'])
+            data_type = []
+            # written by MJ Bahmani
+            for col in df.columns:
+                dtype = str(df[col].dtype)
+                data_type.append(dtype)
+            output['Types'] = data_type
+            return np.transpose(output)
+        else:
+            return False
+
+    def randomForestClassifierGridSearch(self, X_train, y_train, estimator=[4, 6, 9],
+                                         depth=[2, 3, 5, 10], sampleSplit=[2, 3, 5],
+                                         sampleLeaf=[1, 5, 8]):
+        rfc = RandomForestClassifier()
+
+        # Choose some parameter combinations to try
+        # ('auto' is no longer accepted for max_features by recent scikit-learn)
+        parameters = {'n_estimators': estimator,
+                      'max_features': ['log2', 'sqrt'],
+                      'criterion': ['entropy', 'gini'],
+                      'max_depth': depth,
+                      'min_samples_split': sampleSplit,
+                      'min_samples_leaf': sampleLeaf
+                      }
+
+        # Type of scoring used to compare parameter combinations
+        acc_scorer = make_scorer(accuracy_score)
+
+        # Run the grid search
+        grid_obj = GridSearchCV(rfc, parameters, scoring=acc_scorer)
+        grid_obj = grid_obj.fit(X_train, y_train)
+
+        # Keep the best combination of parameters
+        rfc = grid_obj.best_estimator_
+
+        # Fit the best algorithm to the data.
+        rfc.fit(X_train, y_train)
+
+        return rfc
+
+    def plot_confusion_matrix(self, y_true, y_pred, classes,
+                              normalize=False,
+                              title=None,
+                              cmap=mpl.cm.Blues):
+        """
+        This function prints and plots the confusion matrix.
+        Normalization can be applied by setting `normalize=True`.
+ """ + if not title: + if normalize: + title = 'Normalized confusion matrix' + else: + title = 'Confusion matrix, without normalization' + + # Compute confusion matrix + cm = confusion_matrix(y_true, y_pred) + # Only use the labels that appear in the data + classes = classes[unique_labels(y_true, y_pred)] + if normalize: + cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] + print("Normalized confusion matrix") + else: + print('Confusion matrix, without normalization') + + print(cm) + + fig, ax = mpl.subplots() + im = ax.imshow(cm, interpolation='nearest', cmap=cmap) + ax.figure.colorbar(im, ax=ax) + # We want to show all ticks... + ax.set(xticks=np.arange(cm.shape[1]), + yticks=np.arange(cm.shape[0]), + # ... and label them with the respective list entries + xticklabels=classes, yticklabels=classes, + title=title, + ylabel='True label', + xlabel='Predicted label') + + # Rotate the tick labels and set their alignment. + mpl.setp(ax.get_xticklabels(), rotation=45, ha="right", + rotation_mode="anchor") + + # Loop over data dimensions and create text annotations. + fmt = '.2f' if normalize else 'd' + thresh = cm.max() / 2. + for i in range(cm.shape[0]): + for j in range(cm.shape[1]): + ax.text(j, i, format(cm[i, j], fmt), + ha="center", va="center", + color="white" if cm[i, j] > thresh else "black") + fig.tight_layout() + return ax + + def plotGeoData(self, lati, longi, df, column, dtick=10): + scl = [0, "rgb(150,0,90)"], [0.125, "rgb(0, 0, 200)"], [0.25, "rgb(0, 25, 255)"], [0.375, "rgb(0, 152, 255)"], [0.5, "rgb(44, 255, 150)"], [0.625, "rgb(151, 255, 0)"], + [0.75, "rgb(255, 234, 0)"], [0.875, "rgb(255, 111, 0)"], [1, "rgb(255, 0, 0)"] + fig = go.Figure(data=go.Scattergeo( + lon=longi, + lat=lati, + text=df[column], + marker=dict( + color=df[column], + colorscale=scl, + reversescale=True, + opacity=0.7, + colorbar=dict( + titleside="right", + outlinecolor="rgba(68, 68, 68, 0)", + title=column, + dtick=dtick) + ) + )) + + fig.update_layout( + geo=dict( + + showland=True, + landcolor="rgb(212, 212, 212)", + subunitcolor="rgb(140, 255, 0)", + countrycolor="rgb(150, 255, 100)", + showlakes=True, + showcoastlines=True, + lakecolor="rgb(0, 150, 255)", + + resolution=50, + + lonaxis=dict( + showgrid=True, + gridwidth=0.5, + range=[-180.0, -55.0], + dtick=5 + ), + lataxis=dict( + showgrid=True, + gridwidth=0.5, + range=[45, 85], + dtick=5 + ) + ), + ) + fig.show() + nameOfFile = column + '_MAP.png' + fig.write_image(nameOfFile)