|
a |
|
b/data.py |
|
|
1 |
import pandas as pd |
|
|
2 |
|
|
|
3 |
|
|
|
4 |
|
|
|
5 |
def process_raw_data(data_path, top_categories_num=5, min_unseen_samples=50):
    """Load a medical-transcription CSV and split it by specialty frequency.

    The CSV must contain ``description``, ``transcription`` and
    ``medical_specialty`` columns. Rows missing any of them are dropped, the
    two text columns are lower-cased and the specialty is stripped of
    surrounding whitespace. Broad "general" specialties are removed and
    "Neurosurgery" is merged into "Neurology".

    Args:
        data_path: Path (or anything ``pandas.read_csv`` accepts) of the CSV.
        top_categories_num: Number of most frequent specialties kept in the
            first returned frame.
        min_unseen_samples: A specialty outside the top ones is kept in the
            second returned frame only if it has strictly more rows than this.

    Returns:
        A ``(data, test_unseen_categories)`` tuple of DataFrames.
        ``data`` holds the rows of the ``top_categories_num`` most frequent
        specialties, with a fresh 0..n-1 index. ``test_unseen_categories``
        holds the rows of the remaining specialties that pass the
        ``min_unseen_samples`` threshold. Both frames carry a ``labels``
        column with the categorical code of ``medical_specialty``.

    Raises:
        ValueError: If ``top_categories_num`` is negative.

    NOTE: ``labels`` are assigned over *all* specialties that survive the
    general-category removal, before the top-N selection. The codes found in
    ``data`` are therefore not guaranteed to be contiguous or to start at 0.
    """
    if top_categories_num < 0:
        raise ValueError("top_categories_num must be non-negative")

    # Specialties that act as supersets of the others (e.g. a "Surgery" note
    # may belong to cardiology, neurology, ...), so they are excluded.
    general_categories = [
        "Surgery",
        "SOAP / Chart / Progress Notes",
        "Office Notes",
        "Consult - History and Phy.",
        "Emergency Room Reports",
        "Discharge Summary",
        "Pain Management",
        "General Medicine",
    ]
    text_columns = ["description", "transcription"]

    full_data = pd.read_csv(data_path)

    # Drop rows with missing values in the relevant columns.
    full_data = full_data.dropna(
        subset=text_columns + ["medical_specialty"]
    ).reset_index(drop=True)

    for column in text_columns:
        full_data[column] = full_data[column].str.lower()
    # Remove leading / trailing whitespace from the specialty names.
    full_data["medical_specialty"] = full_data["medical_specialty"].str.strip()

    # Remove the general categories.
    is_general = full_data["medical_specialty"].isin(general_categories)
    data = full_data[~is_general].reset_index(drop=True)

    # Combine similar categories (plain substring replacement, not a regex).
    data["medical_specialty"] = data["medical_specialty"].str.replace(
        "Neurosurgery", "Neurology", regex=False
    )

    # Add the "labels" column from the categorical codes.
    data["medical_specialty"] = pd.Categorical(data["medical_specialty"])
    data["labels"] = data["medical_specialty"].cat.codes

    # value_counts() sorts by frequency (descending): the head is the set of
    # "seen" specialties, the tail the candidates for the unseen test set.
    specialty_counts = data["medical_specialty"].value_counts()
    top_counts = specialty_counts.iloc[:top_categories_num]
    unseen_counts = specialty_counts.iloc[top_categories_num:]

    test_unseen_categories = data[
        data["medical_specialty"].isin(unseen_counts.index)
    ].reset_index(drop=True)

    # Keep only unseen specialties with more than `min_unseen_samples` rows.
    large_unseen = unseen_counts[unseen_counts > min_unseen_samples].index
    test_unseen_categories = test_unseen_categories[
        test_unseen_categories["medical_specialty"].isin(large_unseen)
    ]

    data = data[data["medical_specialty"].isin(top_counts.index)]
    data = data.reset_index(drop=True)

    return data, test_unseen_categories
|
|
53 |
|
|
|
54 |
|
|
|
55 |
|