Diff of /dataWrapping.py [000000] .. [16d75d]

Switch to unified view

a b/dataWrapping.py
1
import pandas
2
# from pattern.en import sentiment
3
# import HTMLParser
4
import re
5
import pandas as pd
6
from collections import Counter
7
from nltk.corpus import stopwords
8
import string
9
from collections import OrderedDict
10
from nltk import bigrams
11
from nltk.tokenize import word_tokenize
12
import matplotlib.pyplot as plt
13
import numpy as np
14
# import plotly.plotly as py
15
16
# import pandas as pd
17
# import matplotlib.pyplot as plt
18
import numpy as np
19
from sklearn.metrics import recall_score, precision_score, accuracy_score
20
import math
21
from sklearn.feature_extraction.text import CountVectorizer
22
from sklearn.model_selection import train_test_split
23
from sklearn.feature_extraction.text import TfidfVectorizer
24
from sklearn.naive_bayes import MultinomialNB
25
from sklearn.metrics import confusion_matrix
26
from sklearn.feature_selection import RFE
27
import requests
28
from bs4 import BeautifulSoup
29
# import numpy as np
30
# import matplotlib.pyplot as plt
31
# from matplotlib import style
32
# style.use("ggplot")
33
import os
34
35
# --- Outcome ------------------------------------------------------------------
# Load the outcome labels. The separator is a regular expression that also
# swallows the whitespace around each comma; the python parsing engine is
# requested explicitly alongside it.
# Raw string literals are used for the Windows path and the regex so that the
# backslashes are taken literally instead of relying on unrecognised escape
# sequences (\S, \O, \s), which Python warns about.
data_outcome = pd.read_csv(
    r"C:\Shashank Reddy\Outcome.csv",
    sep=r'\s*,\s*',
    header=0,
    encoding='ascii',
    engine='python',
)

# Missing cells become the sentinel label "zero", which is encoded as 0 below.
data_outcome = data_outcome.fillna("zero")

# outcome dictionary: text label -> integer code
outcome = {'removed': 1, 'not removed': 2, 'retrieval': 3, 'non retrieval': 4, 'zero': 0}
# A label that is not in the dictionary raises KeyError instead of being
# silently dropped.
data_outcome["Outcome"] = [outcome[item] for item in data_outcome["Outcome"]]

# Persist the encoded outcomes as a single tab-separated column, without the
# row index.
list_outcome = pd.DataFrame(list(data_outcome["Outcome"]))
list_outcome.to_csv(r"C:\Shashank Reddy\FinalOutcome.csv", sep='\t', index=False)

# --- Sessile location ---------------------------------------------------------
# Load the location table; empty cells are replaced by the sentinel label
# "zero". Raw strings keep the backslashes of the Windows path and of the
# regex separator literal (no reliance on unrecognised escape sequences).
sessilelocation = pd.read_csv(
    r"C:\Shashank Reddy\SessileLocation.csv",
    sep=r'\s*,\s*',
    header=0,
    encoding='ascii',
    engine='python',
).fillna("zero")

# Text label -> integer code.
# NOTE(review): 'ileum' and 'ileocecal' both map to 3 -- presumably a
# deliberate grouping; confirm before relying on the codes being unique.
location = {
    'cecal': 1,
    'ascending': 2,
    'ileum': 3,
    'ileocecal': 3,
    'hepatic': 4,
    'transverse': 5,
    'splenic': 6,
    'descending': 7,
    'sigmoid': 8,
    'recto-sigmoid': 9,
    'rectal': 10,
    'appendix': 11,
    'zero': 0,
}

# Encode every position column in place, in the same A..G order as before.
# An unknown label raises KeyError.
for position_column in (
    "PositionA",
    "PositionB",
    "PositionC",
    "PositionD",
    "PositionE",
    "PositionF",
    "PositionG",
):
    sessilelocation[position_column] = [
        location[item] for item in sessilelocation[position_column]
    ]

# --- Sessile shape ------------------------------------------------------------
# Load the shape table; empty cells are replaced by the sentinel label "zero".
# Raw strings keep the backslashes of the Windows path and of the regex
# separator literal (no reliance on unrecognised escape sequences).
sessileshape = pd.read_csv(
    r"C:\Shashank Reddy\SessileShape.csv",
    sep=r'\s*,\s*',
    header=0,
    encoding='ascii',
    engine='python',
).fillna("zero")

# Text label -> integer code.
shape = {'zero': 0, 'sessile': 1, 'pedunculated': 2, 'flat': 3, 'mass': 4, 'smooth': 5, 'serrated': 6}

# Encode the column in place; an unknown label raises KeyError.
sessileshape["Shape"] = [shape[item] for item in sessileshape["Shape"]]

# --- Sessile size -------------------------------------------------------------
# Load the size table; empty cells are replaced by the sentinel label "zero".
# Raw strings keep the backslashes of the Windows path and of the regex
# separator literal (no reliance on unrecognised escape sequences).
sessilesize = pd.read_csv(
    r"C:\Shashank Reddy\SessileSize.csv",
    sep=r'\s*,\s*',
    header=0,
    encoding='ascii',
    engine='python',
).fillna("zero")

# Text label -> integer code.
size = {'zero': 0, 'diminutive': 1, 'small': 2, 'medium': 3, 'large': 4}

# Encode the column in place; an unknown label raises KeyError.
sessilesize["Size"] = [size[item] for item in sessilesize["Size"]]

# --- Sides --------------------------------------------------------------------
# Load the side table; empty cells are replaced by the sentinel label "zero".
# Raw strings keep the backslashes of the Windows path and of the regex
# separator literal (no reliance on unrecognised escape sequences).
sessileside = pd.read_csv(
    r"C:\Shashank Reddy\Sides.csv",
    sep=r'\s*,\s*',
    header=0,
    encoding='ascii',
    engine='python',
).fillna("zero")

# Text label -> integer code.
side = {'zero': 0, 'left': 1, 'right': 2}

# Encode the column in place; an unknown label raises KeyError.
sessileside["Sides"] = [side[item] for item in sessileside["Sides"]]

# --- Treatment ----------------------------------------------------------------
# Load the treatment table; empty cells are replaced by the sentinel label
# "zero". Raw strings keep the backslashes of the Windows path and of the
# regex separator literal (no reliance on unrecognised escape sequences).
cancer_treatment = pd.read_csv(
    r"C:\Shashank Reddy\Treatment.csv",
    sep=r'\s*,\s*',
    header=0,
    encoding='ascii',
    engine='python',
).fillna("zero")

# Text label -> integer code.
# NOTE(review): 'piermeal' and 'electocautery snare' look like misspellings,
# but the keys have to match whatever spelling Treatment.csv contains, so
# they are kept exactly as they were -- verify against the data.
treatment = {
    'zero': 0,
    'piermeal': 1,
    'cold snare': 2,
    'hot snare': 3,
    'snare': 4,
    'electocautery snare': 5,
    'excisional biopsy': 6,
    'biopsy forcep': 7,
    'cold biopsy': 8,
}

# Encode the column in place; an unknown label raises KeyError.
cancer_treatment["Treatment"] = [treatment[item] for item in cancer_treatment["Treatment"]]

# Persist the encoded treatments as a single tab-separated column, without
# the row index.
list_treatment = pd.DataFrame(list(cancer_treatment["Treatment"]))
list_treatment.to_csv(r"C:\Shashank Reddy\FinalTreatment.csv", sep='\t', index=False)

# --- Sessile number -----------------------------------------------------------
# Load the count table; empty cells are replaced by the sentinel label "zero".
# Raw strings keep the backslashes of the Windows path and of the regex
# separator literal (no reliance on unrecognised escape sequences).
sessile_number = pd.read_csv(
    r"C:\Shashank Reddy\SessileNumber.csv",
    sep=r'\s*,\s*',
    header=0,
    encoding='ascii',
    engine='python',
).fillna("zero")

# Number word -> integer value.
# 'six' used to map to 7 and 'seven' was absent, so "six" was encoded wrongly
# and "seven" raised KeyError; both entries now carry their own value.
number = {
    'zero': 0,
    'one': 1,
    'two': 2,
    'three': 3,
    'four': 4,
    'five': 5,
    'six': 6,
    'seven': 7,
    'eight': 8,
    'nine': 9,
    'ten': 10,
}

# Encode every count column in place, in the same 1..4 order as before.
# An unknown word raises KeyError.
for number_column in ("Number1", "Number2", "Number3", "Number4"):
    sessile_number[number_column] = [
        number[item] for item in sessile_number[number_column]
    ]

#************************************* Data Union *********************************************************************************

# Every encoded column is rebuilt from a plain Python list, so each resulting
# one-column frame has a fresh default row index and the default column
# label 0.
(
    list_mat1,
    list_mat2,
    list_mat3,
    list_mat4,
    list_mat5,
    list_mat6,
    list_mat7,
) = (
    pd.DataFrame(list(sessilelocation[position_name]))
    for position_name in (
        "PositionA",
        "PositionB",
        "PositionC",
        "PositionD",
        "PositionE",
        "PositionF",
        "PositionG",
    )
)

list_mat8 = pd.DataFrame(list(sessileshape["Shape"]))
list_mat9 = pd.DataFrame(list(sessilesize["Size"]))
list_mat10 = pd.DataFrame(list(sessileside["Sides"]))

list_mat11, list_mat12, list_mat13, list_mat14 = (
    pd.DataFrame(list(sessile_number[count_name]))
    for count_name in ("Number1", "Number2", "Number3", "Number4")
)

# Place the fourteen one-column frames side by side: locations first, then
# shape, size, side and finally the four counts.
feature_frames = [
    list_mat1,
    list_mat2,
    list_mat3,
    list_mat4,
    list_mat5,
    list_mat6,
    list_mat7,
    list_mat8,
    list_mat9,
    list_mat10,
    list_mat11,
    list_mat12,
    list_mat13,
    list_mat14,
]
Final_Data = pd.concat(feature_frames, axis=1)

print(Final_Data)

# Write the combined feature table as comma-separated values without the row
# index.
Final_Data.to_csv(r"C:\Shashank Reddy\DataSet_Final.csv", index=False)