--- a/datasort.py
+++ b/datasort.py
@@ -0,0 +1,175 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Aug 15 11:20:27 2018
+
+@author: Wesley
+"""
+
+import os
+import re
+
+import pandas as pd
+
+os.getcwd()
+# os.chdir(r'D:\Data Science\Text for Findings')
+
+# Load the dataset and keep only the first eight columns.
+data = pd.read_csv(r"C:\Shashank Reddy\DataSet.csv", encoding="ISO-8859-1")
+print(data.count())
+data = data[data.columns[0:8]]
+# data = data.dropna(subset=['Random_ID', 'Year', 'Month', 'dt', 'Indications', 'Findings'])
+# data = data.dropna(subset=['Findings'])
+
+# Cast the free-text Findings column to string for the regex work below.
+data['PredictionColumn'] = data['Findings'].map(str)
+
+print(data.head(1))
+print(data.count())
+# data['PredictionColumn'].to_csv(r'D:\Data Science\Cancer dataset', header=None, index=None, sep=' ')
+
+list_data = list(data['PredictionColumn'])
+print(list_data[0:5])
+
+# Remove NaN from the list first, otherwise later string operations fail:
+# cleanedList = [x for x in list_data if str(x) != 'nan']
+
+# Regex cleansing of the data: strip special characters and punctuation,
+# then lowercase and trim surrounding whitespace.
+Cleansed_data = []
+for j in list_data:
+    no_special_chars = re.sub(r'[-!@#$%^&*()_+\[\];\'.,/{}:"<>?|]', '', j)
+    Cleansed_data.append(no_special_chars.lower().strip())
+print(Cleansed_data[0:5])
+print(len(Cleansed_data))
+
+# Reference vocabulary of keywords searched across the extractions below.
+words = ['piecemeal', 'cold snare', 'hot snare', 'snare', 'electrocautery snare',
+         'excisional biopsy', 'biopsy forcep', 'cold biopsy', 'resection',
+         'removed', 'not removed', 'retrieval', 'non retrieval']
+
+# Extraction 1: resection technique. The text has already been lowercased,
+# so the patterns must be lowercase too; multi-word phrases are listed
+# before their substrings so they match as a unit.
+data1 = []
+for w in Cleansed_data:
+    tokens = re.findall(
+        'piecemeal|electrocautery snare|cold snare|hot snare|snare|excisional biopsy|biopsy forcep|cold biopsy',
+        w)
+    data1.append(tokens)
+print("DATA 1**************************************")
+print(len(data1))
+print(data1)
+data_mat1 = pd.DataFrame(data1)
+data_mat1.to_csv(r"C:\Shashank Reddy\1.csv")
+
+# Extraction 2: removal/retrieval status.
+data2 = []
+for w in Cleansed_data:
+    tokens = re.findall('not removed|removed|non retrieval|retrieval', w)
+    data2.append(tokens)
+data_mat2 = pd.DataFrame(data2)
+data_mat2.to_csv(r"C:\Shashank Reddy\2.csv")
+
+# Extraction 3: laterality.
+data3 = []
+for w in Cleansed_data:
+    tokens = re.findall('right|left', w)
+    data3.append(tokens)
+data_mat3 = pd.DataFrame(data3)
+data_mat3.to_csv(r"C:\Shashank Reddy\3.csv")
+
+# Extraction 4: anatomical location.
+data4 = []
+for w in Cleansed_data:
+    tokens = re.findall('cecal|ascending|ileum|ileocecal|hepatic|transverse|splenic|descending|sigmoid|recto-sigmoid|rectal|appendix|cecum', w)
+    data4.append(tokens)
+data_mat4 = pd.DataFrame(data4)
+data_mat4.to_csv(r"C:\Shashank Reddy\4.csv")
+
+# Extraction 5: polyp morphology.
+data5 = []
+for w in Cleansed_data:
+    tokens = re.findall('sessile|pedunculated|flat|mass|smooth|serrated', w)
+    data5.append(tokens)
+data_mat5 = pd.DataFrame(data5)
+data_mat5.to_csv(r"C:\Shashank Reddy\5.csv")
+
+# Extraction 6: size.
+data6 = []
+for w in Cleansed_data:
+    tokens = re.findall('small|medium|large|diminutive', w)
+    data6.append(tokens)
+data_mat6 = pd.DataFrame(data6)
+data_mat6.to_csv(r"C:\Shashank Reddy\6.csv")
+
+# Extraction 7: polyp count as written numbers. Word boundaries keep e.g.
+# 'one' from matching inside words like 'stone' or 'done'.
+data7 = []
+for w in Cleansed_data:
+    tokens = re.findall(r'\b(?:one|two|three|four|five|six|seven|eight|nine|ten)\b', w)
+    data7.append(tokens)
+data_mat7 = pd.DataFrame(data7)
+data_mat7.to_csv(r"C:\Shashank Reddy\SessileNumber.csv")
+
+# df = pd.DataFrame(Cleansed_data)
+# Full_data = pd.concat([data['Random_ID'], df, data_mat], axis=1)
+# data.to_excel("CountVectorizerOutput.xls", index=False)
+# Full_data.to_csv(r"C:\Shashank Reddy\DataSet_Final.csv", sep='\t', index=False)
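The commented-out tail of the patch suggests the per-category match matrices were meant to be merged back with Random_ID into a single export. A minimal sketch of one way to do that, not part of the patch itself; the category prefixes ('technique_', 'removal_', etc.) and the 'cleansed_text' column name are illustrative assumptions, and the output path is taken from the commented-out line above:

# Sketch only: combine Random_ID, the cleansed text, and the seven match
# matrices into one frame. Prefixes and column names below are assumptions.
import pandas as pd

matrices = {
    'technique': data_mat1, 'removal': data_mat2, 'side': data_mat3,
    'location': data_mat4, 'morphology': data_mat5, 'size': data_mat6,
    'count': data_mat7,
}
# Prefix each matrix's integer columns so the source category stays visible,
# e.g. 'technique_0', 'technique_1', ...
labeled = [m.add_prefix(name + '_') for name, m in matrices.items()]
Full_data = pd.concat(
    [data['Random_ID'], pd.Series(Cleansed_data, name='cleansed_text')] + labeled,
    axis=1)
Full_data.to_csv(r"C:\Shashank Reddy\DataSet_Final.csv", sep='\t', index=False)

This relies on all frames sharing the default RangeIndex, which holds here because no dropna call is active; if rows were ever dropped upstream, the inputs would need a reset_index first.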