Diff of /src/utils.py [000000] .. [6ac965]

Switch to unified view

a b/src/utils.py
1
import seaborn
2
import pandas as pd
3
import numpy as np
4
import random
5
import string
6
from pathlib import Path
7
# pd.set_option('future.no_silent_downcasting', True) # for future compatibility of .replace method
8
9
def inter_len(set1, set2, dfs=False):
    """
    Compute the size of the intersection of two collections and its ratio.

    Parameters
    ----------
    set1, set2
        With ``dfs=False``: objects supporting ``len()`` and
        ``.intersection()`` (e.g. ``set`` or ``pandas.Index``).
        With ``dfs=True``: objects exposing an ``.index`` attribute; the
        comparison is then done on those indices.
    dfs
        Whether to compare ``set1.index`` / ``set2.index`` instead of the
        arguments themselves.

    Returns
    -------
    tuple
        ``(n, p)`` where ``n`` is the number of shared elements and ``p`` is
        ``n`` divided by the size of the larger input. ``p`` is ``0.0`` when
        both inputs are empty.
    """
    if dfs:
        set1, set2 = set1.index, set2.index

    n = len(set1.intersection(set2))
    larger = max(len(set1), len(set2))
    # Two empty inputs share nothing; report 0.0 instead of dividing by zero.
    p = float(n) / larger if larger else 0.0
    return (n, p)
20
21
def plot_pairwise_intersection_ratio(dfs, tech_names):
    """
    Draw a heatmap of how strongly the indices of the given dataframes overlap.

    For every ordered pair of dataframes the ratio returned by ``inter_len``
    for their two indices is stored in a square matrix, which is then drawn
    with ``seaborn.heatmap``. Both axes are labelled with ``tech_names``
    followed by the extra label "ids".
    """
    size = len(dfs)
    indexes = [dfs[k].index for k in range(size)]

    ratio_matrix = np.zeros((size, size))
    for row, left in enumerate(indexes):
        for col, right in enumerate(indexes):
            # inter_len returns (count, ratio); only the ratio is plotted.
            _, ratio = inter_len(left, right)
            ratio_matrix[row][col] = ratio

    tick_labels = tech_names+["ids"]
    seaborn.heatmap(ratio_matrix, cmap='hot', xticklabels=tick_labels, yticklabels=tick_labels)
33
34
def join_selected_dfs(dfs, df_names, df_names_to_join):
    """
    Inner-join the dataframes selected by name on their indices.

    Parameters
    ----------
    dfs
        Sequence of dataframes.
    df_names
        Names of the dataframes, positionally aligned with ``dfs``.
    df_names_to_join
        Names (taken from ``df_names``) of the dataframes to join, in the
        order in which they are joined.

    Returns
    -------
    pandas.DataFrame
        A copy of the first selected dataframe inner-joined with every
        following one. An empty dataframe if no names are given.

    Raises
    ------
    ValueError
        If a requested name is not contained in ``df_names``.
    """
    dfs_ids = [df_names.index(df_name) for df_name in df_names_to_join]
    if not dfs_ids:
        return pd.DataFrame()

    # Seed with the first frame explicitly. Using ``result.empty`` as the
    # "nothing joined yet" marker would discard a legitimately empty
    # intermediate join and restart from the next frame.
    result = dfs[dfs_ids[0]].copy()
    for i in dfs_ids[1:]:
        result = result.join(dfs[i], how="inner")

    return result
48
49
def add_prefix_to_cols(df, prefix):
    """
    Prepend ``prefix`` to every column name of ``df``.

    The dataframe is modified in place; nothing is returned.
    """
    prefixed_names = [prefix + name for name in df.columns]
    df.columns = prefixed_names
54
55
def generate_sample_id(characters, length=10):
    """
    Build a random sample id in the style used in medical datasets.

    ``length`` symbols are drawn with replacement from ``characters`` and a
    '-' is inserted after the first ``length // 2`` characters of the
    resulting string.
    """
    split_at = length // 2
    raw = ''.join(random.choices(characters, k=length))
    head, tail = raw[:split_at], raw[split_at:]

    return head + '-' + tail
62
63
def has_duplicates(seq):
    """
    Return True when at least one element of ``seq`` occurs more than once.

    Elements must be hashable because they are collected into a set.
    """
    distinct = set(seq)
    return len(distinct) != len(seq)
65
66
def gen_unique_ids(n):
    """
    Generate a list of ``n`` distinct random ids for a medical dataframe.

    Each id is produced by ``generate_sample_id`` from 10 lowercase ASCII
    letters. A candidate that collides with an id generated earlier is
    discarded and redrawn, so uniqueness does not depend on an ``assert``
    (which is skipped when Python runs with ``-O``).

    Returns
    -------
    list
        ``n`` unique ids in generation order; an empty list for ``n <= 0``.
    """
    length = 10
    characters = string.ascii_lowercase

    ids = []
    seen = set()
    while len(ids) < n:
        candidate = generate_sample_id(characters, length)
        if candidate in seen:
            # Collision: draw again instead of failing.
            continue
        seen.add(candidate)
        ids.append(candidate)
    return ids
76
77
def prepare_and_save_data(df: pd.DataFrame,
                          directory_path_: str,
                          file_name: str,
                          treatment_col: str = 'tre_nivolumab',
                          directory_name: str = "",
                          counterfactual_outcomes=None):
    """
    Brings data into the format required for the Tumor board and creates
    directories where necessary.

    Files are written below
    ``<directory_path_><file_name minus its last 4 characters>/<directory_name>/<treatment_col>/``:

    * ``treatment/treatment_factual.csv`` - one-hot encoding (0/1) of
      ``df[treatment_col]``, columns named ``a<i>_<value>``.
    * ``treatment/treatment_data.csv`` - same columns with every entry set
      to 1; only written when ``counterfactual_outcomes`` is given.
    * ``outcomes/outcomes_factual.csv`` - all columns whose name starts with
      "out", renamed to ``y<i>_<name>``.
    * ``outcomes/outcomes_data.csv`` - ``counterfactual_outcomes`` as is;
      only written when it is given.
    * ``features/patient_data_<k>.csv`` - one file per technology ``k``,
      i.e. per two-digit prefix of columns named ``NN_<name>``; columns are
      renamed to ``p<k>_<i>_<name>``.
    * ``features/patient_data_all.csv`` - all technology frames side by
      side, ordered by technology number.

    Parameters
    ----------
    df
        Source data; column names must be strings.
    directory_path_
        Base directory as a string; it is concatenated as is, so it should
        end with a path separator.
    file_name
        Name of the source file; its last 4 characters (e.g. ".csv") are
        dropped to form the directory name.
    treatment_col
        Column of ``df`` holding the treatment.
    directory_name
        Optional extra directory level.
    counterfactual_outcomes
        Optional dataframe with counterfactual outcomes.

    Returns
    -------
    str
        The prepared directory path, including the trailing '/'.
    """
    # The path is assembled as a plain string so the returned value keeps the
    # exact form callers receive. With the default empty directory_name it
    # contains a doubled '/'.
    prepared_path_ = directory_path_ + file_name[:-4] + '/' + directory_name + '/' + treatment_col + '/'
    outcomes_path_ = prepared_path_ + 'outcomes/'
    treatment_path_ = prepared_path_ + 'treatment/'
    features_path_ = prepared_path_ + 'features/'

    # parents=True also creates prepared_path_ itself.
    for sub_path_ in (prepared_path_, outcomes_path_, treatment_path_, features_path_):
        Path(sub_path_).mkdir(parents=True, exist_ok=True)

    # Create treatment file. astype(int) yields 0/1 without relying on the
    # silent downcasting that DataFrame.replace performs on boolean dummies.
    df_treatment_factual = pd.get_dummies(df[treatment_col]).astype(int)
    df_treatment_factual.columns = [
        f"a{i}_{name}" for i, name in enumerate(df_treatment_factual.columns)
    ]
    df_treatment_factual.to_csv(treatment_path_ + 'treatment_factual.csv', index_label='ID')

    if counterfactual_outcomes is not None:
        # Turning every 0 into 1 marks all treatments for every row.
        df_treatment_data = df_treatment_factual.replace({0: 1})
        df_treatment_data.to_csv(treatment_path_ + 'treatment_data.csv', index_label='ID')
        counterfactual_outcomes.to_csv(outcomes_path_ + 'outcomes_data.csv', index_label='ID')

    # Create outcome file. Copy so that renaming never touches ``df``.
    outcome_cols = [col for col in df.columns if col.startswith("out")]
    df_outcomes_factual = df[outcome_cols].copy()
    df_outcomes_factual.columns = [
        f"y{i}_{name}" for i, name in enumerate(df_outcomes_factual.columns)
    ]
    df_outcomes_factual.to_csv(outcomes_path_ + 'outcomes_factual.csv', index_label='ID')

    # Find out which technologies are used. Sorting makes the column order of
    # patient_data_all.csv reproducible; iterating a set of strings is not.
    feature_names = df.columns[df.columns.str.match(r'^\d{2}_')]
    tech_numbers = sorted({feature[:2] for feature in feature_names})

    # Create features files
    feature_frames = []
    for tech in tech_numbers:
        # Match on "NN_" so that only genuine feature columns are selected.
        tech_cols = [col for col in feature_names if col.startswith(tech + '_')]
        df_feature_factual = df[tech_cols].copy()
        # col[3:] strips the "NN_" prefix from the original column name.
        df_feature_factual.columns = [
            f"p{int(tech)}_{i}_{col[3:]}" for i, col in enumerate(tech_cols)
        ]
        feature_frames.append(df_feature_factual)
        df_feature_factual.to_csv(features_path_ + 'patient_data_' + str(int(tech)) + '.csv', index_label='ID')

    # Concatenate once; without any feature columns an empty frame is written.
    df_features = pd.concat(feature_frames, axis=1) if feature_frames else pd.DataFrame()
    df_features.to_csv(features_path_ + 'patient_data_all.csv', index_label='ID')

    # Return prepared path
    return prepared_path_