# data.py
import pandas as pd
2
3
4
5
def process_raw_data(data_path, *, top_categories_num=5, min_unseen_samples=50):
    """Load the raw CSV and split it into frequent and "unseen" specialties.

    Processing steps, in order:

    1. Read ``data_path`` with ``pandas.read_csv``.
    2. Drop rows with a missing ``description``, ``transcription`` or
       ``medical_specialty``.
    3. Lower-case ``description`` and ``transcription``; strip surrounding
       whitespace from ``medical_specialty``.
    4. Remove rows belonging to the general (superset-like) categories
       listed in ``general_categories`` below.
    5. Rename ``Neurosurgery`` to ``Neurology`` so both are one category.
    6. Convert ``medical_specialty`` to a categorical column and store its
       integer codes in a new ``labels`` column.
    7. Split the rows into the ``top_categories_num`` most frequent
       specialties and the remaining ("unseen") specialties.

    Args:
        data_path: Anything accepted by ``pandas.read_csv``.
        top_categories_num: How many of the most frequent specialties are
            kept in the first returned frame. Must be non-negative.
        min_unseen_samples: An unseen specialty is kept only when it has
            strictly more rows than this value.

    Returns:
        A ``(data, test_unseen_categories)`` tuple of DataFrames.

        * ``data`` holds the rows of the most frequent specialties; its
          index is reset to ``0..n-1``.
        * ``test_unseen_categories`` holds the rows of the remaining
          specialties that passed the size threshold. Its index is reset
          *before* the size filter is applied, so it may contain gaps.

        In both frames ``labels`` are the category codes assigned in
        step 6, i.e. before the split, so they are not necessarily
        contiguous within either frame, and ``medical_specialty`` keeps
        the full category list from step 6.

    Raises:
        ValueError: If ``top_categories_num`` is negative.
    """
    if top_categories_num < 0:
        raise ValueError("top_categories_num must be non-negative")

    required_columns = ['description', 'transcription', 'medical_specialty']
    # General categories act as supersets of the specialised ones (for
    # example a "Surgery" note can belong to cardiology, neurology, ...),
    # so they are excluded.
    general_categories = [
        "Surgery",
        'SOAP / Chart / Progress Notes',
        'Office Notes',
        'Consult - History and Phy.',
        'Emergency Room Reports',
        'Discharge Summary',
        'Pain Management',
        'General Medicine',
    ]

    full_data = pd.read_csv(data_path)
    full_data = full_data.dropna(subset=required_columns).reset_index(drop=True)

    # Normalise the text columns and the specialty names.
    for text_column in ('description', 'transcription'):
        full_data[text_column] = full_data[text_column].str.lower()
    full_data["medical_specialty"] = full_data["medical_specialty"].str.strip()

    is_general = full_data["medical_specialty"].isin(general_categories)
    data = full_data[~is_general].reset_index(drop=True)

    # Combine similar categories. regex=False keeps this a plain substring
    # replacement regardless of the pandas version's default.
    data["medical_specialty"] = data["medical_specialty"].str.replace(
        "Neurosurgery", "Neurology", regex=False
    )

    # The codes are assigned over every specialty still present here, i.e.
    # before the frequent/unseen split below.
    data['medical_specialty'] = pd.Categorical(data['medical_specialty'])
    data['labels'] = data['medical_specialty'].cat.codes

    # value_counts() is sorted by descending frequency, so the head holds
    # the most frequent specialties and the tail holds the rest.
    specialty_counts = data['medical_specialty'].value_counts()
    top_specialties = list(specialty_counts.index[:top_categories_num])
    other_specialties = list(specialty_counts.index[top_categories_num:])

    test_unseen_categories = data[
        data["medical_specialty"].isin(other_specialties)
    ].reset_index(drop=True)

    # Keep only unseen specialties with more than `min_unseen_samples` rows.
    unseen_sizes = test_unseen_categories['medical_specialty'].value_counts()
    large_enough = list(unseen_sizes[unseen_sizes > min_unseen_samples].index)
    test_unseen_categories = test_unseen_categories[
        test_unseen_categories['medical_specialty'].isin(large_enough)
    ]

    data = data[data["medical_specialty"].isin(top_specialties)]
    data = data.reset_index(drop=True)

    return data, test_unseen_categories
53
54
55