|
a |
|
b/openomics/clinical.py |
|
|
1 |
import io |
|
|
2 |
import os |
|
|
3 |
from typing import List, Union |
|
|
4 |
|
|
|
5 |
import dask.dataframe as dd |
|
|
6 |
import pandas as pd |
|
|
7 |
import validators |
|
|
8 |
|
|
|
9 |
from openomics.io.files import get_pkg_data_filename |
|
|
10 |
|
|
|
11 |
# Column names shared by the clinical tables handled in this module.
BCR_PATIENT_BARCODE_COL = "bcr_patient_barcode"
HISTOLOGIC_SUBTYPE_COL = "histologic_subtype"
PATHOLOGIC_STAGE_COL = "pathologic_stage"
TUMOR_NORMAL_COL = "tumor_normal"
PREDICTED_SUBTYPE_COL = "predicted_subtype"

# Labels written into the TUMOR_NORMAL_COL column.
TUMOR = "Tumor"
NORMAL = "Normal"

# Public API of this module.
__all__ = ["ClinicalData"]
|
|
21 |
|
|
|
22 |
class ClinicalData:
    """Manage the clinical data tables of a cohort.

    The patient phenotype table is loaded on construction and stored in
    ``self.patient`` (indexed by the patient ID column). The optional
    ``build_clinical_samples``, ``add_drug_response_data`` and
    ``add_biospecimen_data`` methods populate ``self.samples``,
    ``self.drugs`` and ``self.biospecimen`` respectively.
    """

    # Collapses pathologic sub-stages (e.g. 'Stage IA') into their parent stage.
    pathologic_stage_map = {'Stage IA': 'Stage I', 'Stage IB': 'Stage I',
                            'Stage IIA': 'Stage II', 'Stage IIB': 'Stage II',
                            'Stage IIIA': 'Stage III', 'Stage IIIB': 'Stage III'}

    def __init__(self,
                 # Quoted so the annotation is not evaluated at definition time.
                 file: "Union[str, io.StringIO, pd.DataFrame, dd.DataFrame]",
                 patient_index: str,
                 columns: List[str] = None):
        """Load the patient clinical table.

        Args:
            file (str, io.StringIO, pd.DataFrame, dd.DataFrame): a path or URL
                to the patients clinical data file, an in-memory text buffer,
                or an already loaded DataFrame.
            patient_index (str): the patient's ID column name. It becomes the
                index of ``self.patient``.
            columns (List[str]): default None. Specifies the columns to import,
                if None, then import all columns. Only applied when reading
                from a local path or a ``StringIO`` buffer.

        Raises:
            FileNotFoundError: if ``file`` is none of the supported inputs
                (including a string that is neither a URL nor an existing
                file).
        """
        self.patient_column = patient_index

        if columns and patient_index not in columns:
            # Build a new list so the caller's `columns` is not modified.
            columns = list(columns) + [patient_index]

        if isinstance(file, (pd.DataFrame, dd.DataFrame)):
            patient = file

        elif isinstance(file, io.StringIO):
            # Rewind: the buffer may already have been consumed by the caller
            # (e.g. to inspect the available column names).
            file.seek(0)
            patient = self._read_patient_table(file, columns)

        elif isinstance(file, str) and validators.url(file):
            dataurl, filename = os.path.split(file)
            file = get_pkg_data_filename(dataurl + "/", filename)
            # NOTE(review): unlike the local-file branch, this read applies no
            # `skiprows`, `na_values` or `usecols` - confirm this is intended.
            patient = pd.read_table(file)

        elif isinstance(file, str) and os.path.isfile(file):
            patient = self._read_patient_table(file, columns)

        else:
            raise FileNotFoundError("{}".format(file))

        # NOTE(review): `.tolist()` is a pandas API; confirm it is available
        # when a dask DataFrame is passed in.
        self.patient_barcodes = patient[patient_index].tolist()

        # Use the non-inplace variants so that a DataFrame supplied by the
        # caller is never mutated behind their back.
        patient = patient.set_index(patient_index)
        patient = patient.rename(columns={
            "ajcc_pathologic_tumor_stage": PATHOLOGIC_STAGE_COL,
            "histological_type": HISTOLOGIC_SUBTYPE_COL,
            "histologic_diagnosis.1": HISTOLOGIC_SUBTYPE_COL,
        })
        self.patient = patient.replace(
            {PATHOLOGIC_STAGE_COL: ClinicalData.pathologic_stage_map})

    @staticmethod
    def _read_patient_table(source, columns):
        """Parse a tab-separated patient clinical table from a path or buffer."""
        return pd.read_table(source,
                             # Skip the 2nd and 3rd lines of the file
                             # (presumably extra descriptor rows below the
                             # header - TODO confirm against the data files).
                             skiprows=[1, 2],
                             na_values=["[Not Available]", "[Unknown]", "[Not Applicable]",
                                        "[Discrepancy]"],
                             usecols=columns
                             )

    @classmethod
    def name(cls):
        """Returns the name of the class, i.e. 'ClinicalData'"""
        # `cls` is already the class object; `cls.__class__` would be the
        # metaclass and yield 'type'.
        return cls.__name__

    def build_clinical_samples(self, all_samples, index="bcr_patient_barcode"):
        """Build ``self.samples``: one row per sample with its patient's clinical data.

        Each sample barcode is split into a patient barcode (everything but
        the last four characters) and a four-character sample suffix. The
        patient barcode becomes the index and is left-joined against
        ``self.patient``. Rows whose pathologic stage is ``"[Discrepancy]"``
        are dropped, and the tumor/normal column is filled from the suffix.

        Args:
            all_samples: the sample barcodes (string values).
            index (str): name given to the resulting index; also the key
                used to join against the index of ``self.patient``.
        """
        sample_barcodes = pd.Index(all_samples)

        # Cut sample barcode for TCGA: drop the trailing 4-character suffix.
        patient_ids = sample_barcodes.str[:-4].rename(index)

        # Carry the removed suffix through the join/filter in a helper column
        # so every surviving row keeps its own sample-type code.
        suffix_col = "__sample_suffix__"
        samples = pd.DataFrame({suffix_col: sample_barcodes.str[-4:].to_numpy()},
                               index=patient_ids)

        # Merge patients clinical data with patient barcode as index
        samples = samples.join(self.patient, on=index, how="left", rsuffix="_")

        samples = samples[samples[PATHOLOGIC_STAGE_COL] != "[Discrepancy]"].copy()

        # The sample-type code lives in the suffix that was cut off, not in
        # the patient barcode, so match "-11"/"-01" against the suffix.
        suffix = samples.pop(suffix_col)
        is_normal = suffix.str.contains("-11", regex=False, na=False).to_numpy()
        is_tumor = suffix.str.contains("-01", regex=False, na=False).to_numpy()
        samples.loc[is_normal, TUMOR_NORMAL_COL] = NORMAL
        samples.loc[is_tumor, TUMOR_NORMAL_COL] = TUMOR

        self.samples = samples

    def add_drug_response_data(self, file_path="nationwidechildrens.org_clinical_drug.txt",
                               patient_column="bcr_patient_barcode",
                               columns=None,
                               drug_name_col=None, response_column=None):
        """Load the drug table into ``self.drugs``, indexed by patient.

        Args:
            file_path (str): path to the tab-separated drug table.
            patient_column (str): column used as the index of ``self.drugs``.
            columns (List[str]): columns to read. Defaults to the patient
                barcode, drug name, therapy type and best response columns.
            drug_name_col (str): stored as ``self.drug_name_col``.
            response_column (str): stored as ``self.response_column``.

        Raises:
            FileNotFoundError: if ``file_path`` does not exist.
        """
        if columns is None:
            columns = ['bcr_patient_barcode', 'pharmaceutical_therapy_drug_name',
                       'pharmaceutical_therapy_type', 'treatment_best_response']
        if patient_column not in columns:
            # The index column must be read; copy to leave the caller's list intact.
            columns = list(columns) + [patient_column]

        if not os.path.exists(file_path):
            raise FileNotFoundError(file_path)

        self.drug_name_col = drug_name_col
        self.response_column = response_column

        self.drugs = pd.read_table(file_path,
                                   sep="\t",
                                   skiprows=[1, 2],
                                   na_values=["[Not Available]", "[Unknown]", "[Not Applicable]"],
                                   usecols=columns
                                   )
        self.drugs.set_index(patient_column, inplace=True)

    def add_biospecimen_data(self, file_path="genome.wustl.edu_biospecimen_sample.txt",
                             patient_col_name="bcr_patient_barcode",
                             columns=None):
        """Load the biospecimen table into ``self.biospecimen``, indexed by patient.

        Args:
            file_path (str): path to the tab-separated biospecimen table.
            patient_col_name (str): column used as the index of
                ``self.biospecimen``.
            columns (List[str]): columns to read. Defaults to
                ``['bcr_sample_barcode', 'sample_type']``. ``patient_col_name``
                is always read in addition.

        Raises:
            FileNotFoundError: if ``file_path`` does not exist.
        """
        if columns is None:
            columns = ['bcr_sample_barcode', 'sample_type']
        if patient_col_name not in columns:
            # Without this the lookup of `patient_col_name` below fails with
            # a KeyError, because `usecols` would have filtered it out.
            columns = list(columns) + [patient_col_name]

        if not os.path.exists(file_path):
            raise FileNotFoundError(file_path)

        self.biospecimen = pd.read_table(file_path, sep="\t", skiprows=[1, ],
                                         na_values=["[Not Available]", "[Unknown]", "[Not Applicable]"],
                                         usecols=columns
                                         )
        # NOTE(review): `sample_barcodes` is filled from the patient column;
        # confirm whether 'bcr_sample_barcode' was intended instead.
        self.sample_barcodes = self.biospecimen[patient_col_name].tolist()
        self.biospecimen.set_index(patient_col_name, inplace=True)

    def get_patient_barcodes(self):
        """Return the patient ID values read from the patient table at construction."""
        return self.patient_barcodes

    def get_sample_barcodes(self):
        """Return the barcodes stored by ``add_biospecimen_data``.

        Only available after ``add_biospecimen_data`` has been called.
        """
        return self.sample_barcodes
|
|
175 |
|
|
|
176 |
|