Diff of /openomics/clinical.py [000000] .. [548210]

Switch to side-by-side view

--- a
+++ b/openomics/clinical.py
@@ -0,0 +1,214 @@
+import io
+import os
+from typing import List, Optional, Union
+
+import dask.dataframe as dd
+import pandas as pd
+import validators
+
+from openomics.io.files import get_pkg_data_filename
+
+BCR_PATIENT_BARCODE_COL = "bcr_patient_barcode"
+HISTOLOGIC_SUBTYPE_COL = "histologic_subtype"
+PATHOLOGIC_STAGE_COL = "pathologic_stage"
+TUMOR_NORMAL_COL = 'tumor_normal'
+PREDICTED_SUBTYPE_COL = 'predicted_subtype'
+
+TUMOR = "Tumor"
+NORMAL = "Normal"
+
+# Placeholder strings in the clinical tables that are read as missing values.
+_NA_VALUES = ["[Not Available]", "[Unknown]", "[Not Applicable]", "[Discrepancy]"]
+
+__all__ = ['ClinicalData']
+
+class ClinicalData:
+    """This class manages the clinical data tables to handle the patient's
+    phenotype data, as well as the treatment, and sample data associated to each
+    patient.
+    """
+
+    pathologic_stage_map = {'Stage IA': 'Stage I', 'Stage IB': 'Stage I',
+                            'Stage IIA': 'Stage II', 'Stage IIB': 'Stage II',
+                            'Stage IIIA': 'Stage III', 'Stage IIIB': 'Stage III'}
+
+    def __init__(self,
+                 file: Union[str, io.StringIO, pd.DataFrame, dd.DataFrame],
+                 patient_index: str,
+                 columns: Optional[List[str]] = None):
+        """
+        Args:
+            file (str, io.StringIO, pd.DataFrame, dd.DataFrame): a path or URL
+                to the patients clinical data file, a text buffer holding that
+                table, or an already loaded DataFrame.
+            patient_index (str): the patient's ID column name
+            columns (List[str]): default None. Specifies the columns to import,
+                if None, then import all columns. Only used when reading from
+                a local path or a text buffer. The list is not modified;
+                `patient_index` is added to a copy of it when missing.
+
+        Raises:
+            FileNotFoundError: if `file` is not a DataFrame, a StringIO, a
+                valid URL or the path of an existing file.
+        """
+        self.patient_column = patient_index
+        columns = self._with_column(columns, patient_index)
+
+        if isinstance(file, (pd.DataFrame, dd.DataFrame)):
+            patient = file
+
+        elif isinstance(file, io.StringIO):
+            file.seek(0)  # Rewind in case the buffer was already read, e.g. to inspect its columns
+            patient = pd.read_table(file, skiprows=[1, 2], na_values=_NA_VALUES, usecols=columns)
+
+        elif isinstance(file, str) and validators.url(file):
+            dataurl, filename = os.path.split(file)
+            file = get_pkg_data_filename(dataurl + "/", filename)
+            # NOTE(review): unlike the local-file branch, this read skips no rows
+            # and ignores `columns` and the NA placeholders - confirm intended.
+            patient = pd.read_table(file)
+
+        elif isinstance(file, str) and os.path.isfile(file):
+            patient = pd.read_table(file, skiprows=[1, 2], na_values=_NA_VALUES, usecols=columns)
+
+        else:
+            raise FileNotFoundError("{}".format(file))
+
+        barcodes = patient[patient_index]
+        if isinstance(barcodes, dd.Series):
+            barcodes = barcodes.compute()  # dask Series must be materialized before listing
+        self.patient_barcodes = barcodes.tolist()
+
+        # Use the non-inplace forms: dask DataFrames do not support `inplace`,
+        # and a DataFrame handed in by the caller is left unmodified.
+        patient = patient.set_index(patient_index)
+        patient = patient.rename(columns={"ajcc_pathologic_tumor_stage": PATHOLOGIC_STAGE_COL,
+                                          "histological_type": HISTOLOGIC_SUBTYPE_COL,
+                                          "histologic_diagnosis.1": HISTOLOGIC_SUBTYPE_COL})
+        # Collapse the pathologic sub-stages (e.g. 'Stage IA') into their main stage.
+        self.patient = patient.replace({PATHOLOGIC_STAGE_COL: ClinicalData.pathologic_stage_map})
+
+    @staticmethod
+    def _with_column(columns, required):
+        """Returns a copy of `columns` that contains `required`, or None when
+        `columns` is None or empty (meaning all columns are imported). The
+        caller's list is never modified.
+        """
+        if not columns:
+            return None
+        columns = list(columns)
+        if required not in columns:
+            columns.append(required)
+        return columns
+
+    @classmethod
+    def name(cls):
+        """Returns the name of the class, i.e. 'ClinicalData'"""
+        return cls.__name__
+
+    def build_clinical_samples(self, all_samples, index="bcr_patient_barcode"):
+        """Build the table of samples clinical data by joining each sample to
+        its patient's record. The result is stored in `self.samples`.
+
+        Args:
+            all_samples: the sample barcodes (strings). The last 4 characters
+                of each barcode are cut off to get the patient barcode.
+            index (str): the name given to the index of `self.samples`.
+        """
+        samples = pd.DataFrame(index=all_samples)
+
+        # The tumor/normal code sits in the 4 characters that get cut off below
+        # (e.g. "-01A", "-11A"), so it must be read before the barcode is cut.
+        suffix = samples.index.str[-4:]
+        samples.loc[suffix.str.contains("-11", regex=False), TUMOR_NORMAL_COL] = NORMAL
+        samples.loc[suffix.str.contains("-01", regex=False), TUMOR_NORMAL_COL] = TUMOR
+
+        samples.index = samples.index.str[:-4]  # Cut sample barcode for TCGA
+        samples.index.name = index
+
+        # Merge patients clinical data with patient barcode as index
+        samples = samples.join(self.patient, on=index, how="left", rsuffix="_")
+        samples[TUMOR_NORMAL_COL] = samples.pop(TUMOR_NORMAL_COL)  # Keep the label as last column
+
+        if PATHOLOGIC_STAGE_COL in samples.columns:
+            samples = samples[samples[PATHOLOGIC_STAGE_COL] != "[Discrepancy]"]
+
+        self.samples = samples
+
+    def add_drug_response_data(self, file_path="nationwidechildrens.org_clinical_drug.txt",
+                               patient_column="bcr_patient_barcode",
+                               columns=None,
+                               drug_name_col=None, response_column=None):
+        """Load the drug treatment table into `self.drugs`, indexed by patient.
+
+        Args:
+            file_path: path to the tab-separated drug table.
+            patient_column: the patient's ID column, used as the index.
+            columns: the columns to import; `patient_column` is added when
+                missing. Defaults to the barcode, drug name, therapy type and
+                best response columns.
+            drug_name_col: stored as `self.drug_name_col`.
+            response_column: stored as `self.response_column`.
+
+        Raises:
+            FileNotFoundError: if `file_path` does not exist.
+        """
+        if columns is None:
+            columns = ['bcr_patient_barcode', 'pharmaceutical_therapy_drug_name',
+                       'pharmaceutical_therapy_type', 'treatment_best_response']
+
+        if not os.path.exists(file_path):
+            raise FileNotFoundError(file_path)
+
+        self.drug_name_col = drug_name_col
+        self.response_column = response_column
+
+        self.drugs = pd.read_table(file_path,
+                                   sep="\t",
+                                   skiprows=[1, 2],
+                                   na_values=["[Not Available]", "[Unknown]", "[Not Applicable]"],
+                                   usecols=self._with_column(columns, patient_column)
+                                   )
+        self.drugs.set_index(patient_column, inplace=True)
+
+    def add_biospecimen_data(self, file_path="genome.wustl.edu_biospecimen_sample.txt",
+                             patient_col_name="bcr_patient_barcode",
+                             columns=None):
+        """Load the biospecimen table into `self.biospecimen`, indexed by
+        `patient_col_name`.
+
+        Args:
+            file_path: path to the tab-separated biospecimen table.
+            patient_col_name: the column used as the index.
+            columns: the columns to import, by default 'bcr_sample_barcode'
+                and 'sample_type'; `patient_col_name` is added when missing.
+
+        Raises:
+            FileNotFoundError: if `file_path` does not exist.
+        """
+        if columns is None:
+            columns = ['bcr_sample_barcode', 'sample_type']
+
+        if not os.path.exists(file_path):
+            raise FileNotFoundError(file_path)
+
+        # `patient_col_name` has to be imported too, it is read right below.
+        self.biospecimen = pd.read_table(file_path, sep="\t", skiprows=[1, ],
+                                         na_values=["[Not Available]", "[Unknown]", "[Not Applicable]"],
+                                         usecols=self._with_column(columns, patient_col_name)
+                                         )
+        # NOTE(review): despite its name this holds the `patient_col_name` values - confirm.
+        self.sample_barcodes = self.biospecimen[patient_col_name].tolist()
+        self.biospecimen.set_index(patient_col_name, inplace=True)
+
+    def get_patient_barcodes(self):
+        """Returns the list of patient IDs read from the patient table."""
+        return self.patient_barcodes
+
+    def get_sample_barcodes(self):
+        """Returns the barcodes stored by `add_biospecimen_data`, which must
+        have been called first.
+        """
+        return self.sample_barcodes
+
+