Diff of /openomics/clinical.py [000000] .. [548210]

Switch to unified view

a b/openomics/clinical.py
1
import io
2
import os
3
from typing import List, Union
4
5
import dask.dataframe as dd
6
import pandas as pd
7
import validators
8
9
from openomics.io.files import get_pkg_data_filename
10
11
# Canonical column names shared by the clinical tables.
BCR_PATIENT_BARCODE_COL = 'bcr_patient_barcode'
HISTOLOGIC_SUBTYPE_COL = 'histologic_subtype'
PATHOLOGIC_STAGE_COL = 'pathologic_stage'
TUMOR_NORMAL_COL = "tumor_normal"
PREDICTED_SUBTYPE_COL = "predicted_subtype"

# Labels written into the TUMOR_NORMAL_COL column.
TUMOR, NORMAL = 'Tumor', 'Normal'
19
20
# Names exported by a star-import of this module: only the ClinicalData class.
__all__ = ['ClinicalData']
21
22
class ClinicalData:
    """Manage the clinical data tables of a cohort.

    Holds the patients' phenotype table and, optionally, the per-sample
    table (`build_clinical_samples`), the drug response table
    (`add_drug_response_data`) and the biospecimen table
    (`add_biospecimen_data`) associated to each patient.
    """

    # Collapses pathologic sub-stages into their parent stage.
    pathologic_stage_map = {'Stage IA': 'Stage I', 'Stage IB': 'Stage I',
                            'Stage IIA': 'Stage II', 'Stage IIB': 'Stage II',
                            'Stage IIIA': 'Stage III', 'Stage IIIB': 'Stage III'}

    # The `file` annotation is a string so that it is not evaluated when the
    # class body is executed.
    def __init__(self,
                 file: "Union[str, io.StringIO, pd.DataFrame, dd.DataFrame]",
                 patient_index: str,
                 columns: List[str] = None):
        """Load the patients clinical table and index it by patient ID.

        Args:
            file (str, io.StringIO, pd.DataFrame, dd.DataFrame): either a
                path or URL to the patients clinical data file, an in-memory
                text buffer, or an already loaded DataFrame. A DataFrame
                passed in is not modified.
            patient_index (str): the patient's ID column name.
            columns (List[str]): default None. Specifies the columns to import,
                if None, then import all columns. The list passed in is not
                modified; `patient_index` is always imported.

        Raises:
            FileNotFoundError: if `file` is none of the supported inputs, or
                is a string that is neither a URL nor an existing file.
        """
        self.patient_column = patient_index

        if columns:
            # Work on a copy so the caller's list is never mutated.
            columns = self._include_column(columns, patient_index)

        if isinstance(file, (pd.DataFrame, dd.DataFrame)):
            patient = file

        elif isinstance(file, io.StringIO):
            # Rewind: the buffer may already have been read (e.g. to extract
            # the columns information).
            file.seek(0)
            patient = self._read_patient_table(file, columns)

        elif isinstance(file, str) and validators.url(file):
            dataurl, filename = os.path.split(file)
            file = get_pkg_data_filename(dataurl + "/", filename)
            # NOTE(review): unlike the other branches, this read skips no
            # rows and applies neither `na_values` nor `columns` -- confirm
            # whether that is intended for remote files.
            patient = pd.read_table(file)

        elif isinstance(file, str) and os.path.isfile(file):
            patient = self._read_patient_table(file, columns)

        else:
            raise FileNotFoundError("{}".format(file))

        barcodes = patient[patient_index]
        if isinstance(patient, dd.DataFrame):
            # A dask Series has to be materialised before it can be listed.
            barcodes = barcodes.compute()
        self.patient_barcodes = barcodes.tolist()

        # Non-inplace operations: they leave a caller-supplied DataFrame
        # untouched and are also available on dask DataFrames.
        patient = patient.set_index(patient_index)
        patient = patient.rename(columns={
            "ajcc_pathologic_tumor_stage": PATHOLOGIC_STAGE_COL,
            "histological_type": HISTOLOGIC_SUBTYPE_COL,
            "histologic_diagnosis.1": HISTOLOGIC_SUBTYPE_COL})
        patient = patient.replace({PATHOLOGIC_STAGE_COL: self.pathologic_stage_map})
        self.patient = patient

    @staticmethod
    def _include_column(columns, required):
        """Return a copy of `columns` that is guaranteed to contain `required`."""
        selected = list(columns)
        if required not in selected:
            selected.append(required)
        return selected

    @staticmethod
    def _read_patient_table(source, columns):
        """Parse a tab-separated patients clinical table from a path or buffer."""
        return pd.read_table(source,
                             skiprows=[1, 2],
                             na_values=["[Not Available]", "[Unknown]", "[Not Applicable]",
                                        "[Discrepancy]"],
                             usecols=columns)

    @classmethod
    def name(cls) -> str:
        """Returns the name of the class, i.e. 'ClinicalData'"""
        # `cls` already is the class; `cls.__class__` would be the metaclass.
        return cls.__name__

    def build_clinical_samples(self, all_samples, index="bcr_patient_barcode"):
        """Build the table of samples annotated with their patient's clinical data.

        The result is stored in `self.samples`. Each sample barcode is cut by
        its last 4 characters to obtain the patient barcode, which is used as
        the index and joined against the patients table. Rows whose
        pathologic stage is "[Discrepancy]" are dropped, and samples are
        labelled tumor or normal from the removed barcode suffix.

        Args:
            all_samples: the sample barcodes (strings).
            index: name given to the index of the samples table, also used as
                the join key against the patients table.
        """
        suffix_col = "_sample_suffix"  # temporary helper column, removed below

        samples = pd.DataFrame(index=all_samples)
        # Keep the part that is about to be cut off: it is the only place the
        # tumor/normal information can still be read from afterwards.
        samples[suffix_col] = samples.index.str[-4:]
        samples.index = samples.index.str[:-4]  # Cut sample barcode for TCGA
        samples.index.name = index

        # Merge patients clinical data with patient barcode as index
        samples = samples.join(self.patient, on=index, how="left", rsuffix="_")

        samples = samples[samples[PATHOLOGIC_STAGE_COL] != "[Discrepancy]"].copy()

        suffix = samples.pop(suffix_col)
        is_normal = suffix.str.contains("-11", regex=False, na=False).to_numpy()
        is_tumor = suffix.str.contains("-01", regex=False, na=False).to_numpy()
        samples.loc[is_normal, TUMOR_NORMAL_COL] = NORMAL  # label normal samples
        samples.loc[is_tumor, TUMOR_NORMAL_COL] = TUMOR  # label tumor samples

        self.samples = samples

    def add_drug_response_data(self, file_path="nationwidechildrens.org_clinical_drug.txt",
                               patient_column="bcr_patient_barcode",
                               columns=None,
                               drug_name_col=None, response_column=None):
        """Load the drug response table into `self.drugs`, indexed by patient.

        Args:
            file_path: path to the tab-separated drug table.
            patient_column: name of the patient ID column, used as the index.
            columns: columns to import. Defaults to the patient barcode, drug
                name, therapy type and best response columns. The list passed
                in is not modified; `patient_column` is always imported.
            drug_name_col: stored as `self.drug_name_col`.
            response_column: stored as `self.response_column`.

        Raises:
            FileNotFoundError: if `file_path` does not exist.
        """
        if columns is None:
            columns = ['bcr_patient_barcode', 'pharmaceutical_therapy_drug_name',
                       'pharmaceutical_therapy_type', 'treatment_best_response']

        if not os.path.exists(file_path):
            raise FileNotFoundError(file_path)

        self.drug_name_col = drug_name_col
        self.response_column = response_column

        self.drugs = pd.read_table(file_path,
                                   sep="\t",
                                   skiprows=[1, 2],
                                   na_values=["[Not Available]", "[Unknown]", "[Not Applicable]"],
                                   # The index column must be read too.
                                   usecols=self._include_column(columns, patient_column)
                                   )
        self.drugs.set_index(patient_column, inplace=True)

    def add_biospecimen_data(self, file_path="genome.wustl.edu_biospecimen_sample.txt",
                             patient_col_name="bcr_patient_barcode",
                             columns=None):
        """Load the biospecimen table into `self.biospecimen`, indexed by patient.

        Args:
            file_path: path to the tab-separated biospecimen table.
            patient_col_name: name of the patient ID column, used as the index.
            columns: columns to import. Defaults to
                ['bcr_sample_barcode', 'sample_type']. The list passed in is
                not modified; `patient_col_name` is always imported.

        Raises:
            FileNotFoundError: if `file_path` does not exist.
        """
        if columns is None:
            columns = ['bcr_sample_barcode', 'sample_type']

        if not os.path.exists(file_path):
            raise FileNotFoundError(file_path)

        self.biospecimen = pd.read_table(file_path, sep="\t", skiprows=[1, ],
                                         na_values=["[Not Available]", "[Unknown]", "[Not Applicable]"],
                                         # The index column must be read too.
                                         usecols=self._include_column(columns, patient_col_name)
                                         )
        # NOTE(review): despite the attribute name, these are the values of
        # the patient column -- confirm whether 'bcr_sample_barcode' was meant.
        self.sample_barcodes = self.biospecimen[patient_col_name].tolist()
        self.biospecimen.set_index(patient_col_name, inplace=True)

    def get_patient_barcodes(self):
        """Return the patient barcodes collected when the table was loaded."""
        return self.patient_barcodes

    def get_sample_barcodes(self):
        """Return the barcodes collected by `add_biospecimen_data`.

        Raises:
            AttributeError: if `add_biospecimen_data` has not been called yet.
        """
        return self.sample_barcodes
175
176