b/src/utils.py

import random
import string
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn

# pd.set_option('future.no_silent_downcasting', True)  # for future compatibility of the .replace method

def inter_len(set1, set2, dfs=False):
    """
    Compute the size of the intersection of two sets (or of the indices of two
    dataframes if dfs=True) and return it together with the ratio relative to
    the larger input.
    """
    if dfs:
        n = len(set1.index.intersection(set2.index))
        p = float(n) / max(len(set1.index), len(set2.index))
    else:
        n = len(set1.intersection(set2))
        p = float(n) / max(len(set1), len(set2))
    return (n, p)

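# Usage sketch (hypothetical values, not from the codebase): works on plain sets,
# or on dataframes via dfs=True, in which case their indices are compared.
# >>> inter_len({'s1', 's2', 's3'}, {'s2', 's3', 's4'})   # -> (2, 0.666...)
# >>> inter_len(df_a, df_b, dfs=True)                     # df_a, df_b: hypothetical dataframes
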
def plot_pairwise_intersection_ratio(dfs, tech_names):
    """
    Creates a heatmap of pairwise intersection ratios, i.e., what fraction of the
    indices of two dataframes coincide.
    """
    n = len(dfs)
    overlaps = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            overlaps[i][j] = inter_len(dfs[i].index, dfs[j].index)[1]

    seaborn.heatmap(overlaps, cmap='hot', xticklabels=tech_names + ["ids"], yticklabels=tech_names + ["ids"])

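# Usage sketch (hypothetical names): the tick labels tech_names + ["ids"] suggest dfs is
# expected to hold one dataframe per technology followed by an ids dataframe.
# >>> plot_pairwise_intersection_ratio([df_rna, df_wes, df_ids], ['rna', 'wes'])
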
def join_selected_dfs(dfs, df_names, df_names_to_join):
    """
    Inner-join the dataframes from dfs whose names appear in df_names_to_join.
    """
    dfs_ids = [df_names.index(df_name) for df_name in df_names_to_join]
    result = pd.DataFrame()

    for i in dfs_ids:
        if result.empty:
            result = dfs[i].copy()
        else:
            result = result.join(dfs[i], how="inner")

    return result

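# Usage sketch (hypothetical names): inner-join only the 'rna' and 'wes' dataframes,
# keeping the samples present in both.
# >>> df_joined = join_selected_dfs(dfs, ['rna', 'wes', 'cnv'], ['rna', 'wes'])
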
def add_prefix_to_cols(df, prefix):
    """
    Adds a prefix to all column names of df (in place).
    """
    df.columns = [prefix + col for col in df.columns]

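# Usage sketch (hypothetical dataframe): e.g. add_prefix_to_cols(df_rna, '01_')
# renames a column 'tpm' to '01_tpm' in place.
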
def generate_sample_id(characters, length=10):
    """
    Generates a random sample id in the same style as the ids used in medical
    datasets, e.g. 'abcde-fghij' for length=10.
    """
    rand_string = ''.join(random.choices(characters, k=length))

    return rand_string[:length // 2] + '-' + rand_string[length // 2:]


def has_duplicates(seq):
    """
    Returns True if seq contains duplicate elements.
    """
    return len(seq) != len(set(seq))

def gen_unique_ids(n):
    """
    Generate a list of n unique random ids for a medical dataframe.
    """
    length = 10
    characters = string.ascii_lowercase

    ids = [generate_sample_id(characters, length) for _ in range(n)]
    assert not has_duplicates(ids)
    return ids

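# Usage sketch (hypothetical dataframe): the ids look like 'abcde-fghij' and can serve
# as the index of a synthetic dataframe, e.g.
# >>> df_synth.index = gen_unique_ids(len(df_synth))
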
def prepare_and_save_data(df: pd.DataFrame,
                          directory_path_: str,
                          file_name: str,
                          treatment_col: str = 'tre_nivolumab',
                          directory_name: str = "",
                          counterfactual_outcomes=None):
    """
    Brings the data into the format required for the Tumor board and creates directories where necessary.
    """
    # Get the outcome column names
    outcome_cols = [col for col in df.columns if col.startswith("out")]

    # Build the output directory path
    prepared_path_ = directory_path_ + file_name[:-4] + '/' + directory_name + '/' + treatment_col + '/'

    # Return to this if errors occur
    # prepared_path_ = directory_path_ + file_name[:-4] + directory_name + '/' + treatment_col + '/'

    outcomes_path_ = prepared_path_ + 'outcomes/'
    treatment_path_ = prepared_path_ + 'treatment/'
    features_path_ = prepared_path_ + 'features/'

    # Create directory and subdirectories
    Path(prepared_path_).mkdir(parents=True, exist_ok=True)
    Path(outcomes_path_).mkdir(parents=True, exist_ok=True)
    Path(treatment_path_).mkdir(parents=True, exist_ok=True)
    Path(features_path_).mkdir(parents=True, exist_ok=True)

    # Create treatment file: one-hot encode the treatment column
    df_treatment_factual = pd.get_dummies(df[treatment_col]).replace({False: 0, True: 1})
    df_treatment_factual.columns = ['a' + str(i) + '_' + str(name) for i, name in enumerate(df_treatment_factual.columns)]
    df_treatment_factual.to_csv(treatment_path_ + 'treatment_factual.csv', index_label='ID')

    if counterfactual_outcomes is not None:
        # Set every treatment indicator to 1 for the counterfactual files
        df_treatment_data = df_treatment_factual.replace({0: 1})
        df_treatment_data.to_csv(treatment_path_ + 'treatment_data.csv', index_label='ID')
        counterfactual_outcomes.to_csv(outcomes_path_ + 'outcomes_data.csv', index_label='ID')

    # Create outcome file
    df_outcomes_factual = df[outcome_cols]
    df_outcomes_factual.columns = ['y' + str(i) + '_' + str(name) for i, name in enumerate(df_outcomes_factual.columns)]
    df_outcomes_factual.to_csv(outcomes_path_ + 'outcomes_factual.csv', index_label='ID')

    # Find out how many technologies are used (feature columns start with a two-digit technology number)
    feature_names = df.columns[df.columns.str.match(r'^\d{2}_')]
    tech_numbers = set([feature[:2] for feature in feature_names])

    # Create feature files, one per technology plus one combined file
    df_features = pd.DataFrame()
    for tech in tech_numbers:
        df_feature_factual = df[df.columns[df.columns.str.startswith(tech)]]
        df_feature_factual.columns = ['p' + str(int(tech)) + '_' + str(i) + '_' + tech_name[3:] for i, tech_name in enumerate(df_feature_factual.columns)]
        df_features = pd.concat([df_features, df_feature_factual], axis=1)
        df_feature_factual.to_csv(features_path_ + 'patient_data_' + str(int(tech)) + '.csv', index_label='ID')
    df_features.to_csv(features_path_ + 'patient_data_all.csv', index_label='ID')

    # Return the directory the prepared files were written to
    return prepared_path_
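
# Usage sketch (hypothetical paths and dataframe; the column conventions follow the code
# above: 'out*' outcome columns, a treatment column, and two-digit-prefixed feature columns):
# >>> prepare_and_save_data(df_patients,
# ...                       directory_path_='data/prepared/',
# ...                       file_name='cohort.csv',
# ...                       treatment_col='tre_nivolumab',
# ...                       directory_name='run1')
# 'data/prepared/cohort/run1/tre_nivolumab/'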