OpenOmics / Git / [548210] /openomics/clinical.py

Models:
AlyssaS/
OpenOmics
Downloads: 1
[548210]: / openomics / clinical.py
History
Download this file
177 lines (140 with data), 7.1 kB

import io
import os
from typing import List, Union

import dask.dataframe as dd
import pandas as pd
import validators

from openomics.io.files import get_pkg_data_filename

# Column names shared by the clinical tables handled in this module.
BCR_PATIENT_BARCODE_COL = "bcr_patient_barcode"
PATHOLOGIC_STAGE_COL = "pathologic_stage"
HISTOLOGIC_SUBTYPE_COL = "histologic_subtype"
PREDICTED_SUBTYPE_COL = "predicted_subtype"
TUMOR_NORMAL_COL = "tumor_normal"

# Labels written into the TUMOR_NORMAL_COL column of the samples table.
TUMOR, NORMAL = "Tumor", "Normal"

# Public API of this module: a star-import exposes only the ClinicalData class.
__all__ = ['ClinicalData']

class ClinicalData:
    """Manage the clinical data tables of a cohort.

    The patient-level table is loaded at construction time and stored in
    ``self.patient`` (indexed by the patient ID column). The other tables are
    only available after the matching loader has been called:

    * ``self.samples`` - built by :meth:`build_clinical_samples`
    * ``self.drugs`` - loaded by :meth:`add_drug_response_data`
    * ``self.biospecimen`` - loaded by :meth:`add_biospecimen_data`
    """

    # Collapses the A/B sub-stages into their parent stage. Applied to the
    # PATHOLOGIC_STAGE_COL column when the patient table is loaded.
    pathologic_stage_map = {'Stage IA': 'Stage I', 'Stage IB': 'Stage I',
                            'Stage IIA': 'Stage II', 'Stage IIB': 'Stage II',
                            'Stage IIIA': 'Stage III', 'Stage IIIB': 'Stage III'}

    # Placeholder strings that are parsed as missing values.
    _NA_VALUES = ["[Not Available]", "[Unknown]", "[Not Applicable]"]
    # The patient table additionally treats "[Discrepancy]" as missing.
    _PATIENT_NA_VALUES = _NA_VALUES + ["[Discrepancy]"]

    def __init__(self,
                 file: Union[str, io.StringIO, pd.DataFrame, dd.DataFrame],
                 patient_index: str,
                 columns: List[str] = None):
        """Load the patient table and index it by the patient ID column.

        Args:
            file (str, io.StringIO, pd.DataFrame, dd.DataFrame): a DataFrame,
                an in-memory text buffer, a URL, or a path to a local
                tab-separated file holding the patients clinical data.
            patient_index (str): the patient's ID column name.
            columns (List[str]): default None. Specifies the columns to import,
                if None, then import all columns. Only honoured when ``file``
                is a buffer or a local path. The list passed by the caller is
                never modified.

        Raises:
            FileNotFoundError: if ``file`` is none of the supported inputs
                (e.g. a string that is neither a valid URL nor an existing
                file).
        """
        # self.cohort_name = cohort_name
        self.patient_column = patient_index

        # The patient ID column is needed below to build the index, so make
        # sure it is read even when the caller left it out of `columns`.
        columns = self._with_column(columns, patient_index)

        if isinstance(file, (pd.DataFrame, dd.DataFrame)):
            patient = file

        elif isinstance(file, io.StringIO):
            file.seek(0)  # Needed since the file was previously read to extract columns information
            patient = self._read_patient_table(file, columns)

        elif isinstance(file, str) and validators.url(file):
            dataurl, filename = os.path.split(file)
            resolved = get_pkg_data_filename(dataurl + "/", filename)
            # NOTE(review): unlike the other readers, this branch applies no
            # `skiprows`, `na_values` or `usecols` - confirm that remote
            # files really have a different layout.
            patient = pd.read_table(resolved)

        elif isinstance(file, str) and os.path.isfile(file):
            patient = self._read_patient_table(file, columns)

        else:
            raise FileNotFoundError("{}".format(file))

        self.patient_barcodes = patient[patient_index].tolist()

        # Every step below returns a new object instead of using
        # `inplace=True`, so a DataFrame handed in by the caller is left
        # untouched.
        patient = patient.set_index(patient_index)

        # Rename columns
        patient = patient.rename(columns={"ajcc_pathologic_tumor_stage": PATHOLOGIC_STAGE_COL,
                                          "histological_type": HISTOLOGIC_SUBTYPE_COL,
                                          "histologic_diagnosis.1": HISTOLOGIC_SUBTYPE_COL})

        # Nested mapping: only values inside PATHOLOGIC_STAGE_COL are replaced.
        patient = patient.replace({PATHOLOGIC_STAGE_COL: ClinicalData.pathologic_stage_map})

        self.patient = patient

    @staticmethod
    def _with_column(columns, required):
        """Return ``columns`` extended with ``required`` if it is missing.

        A new list is returned when an addition is needed, so the caller's
        list is never mutated. ``None`` or an empty selection is returned
        unchanged.
        """
        if columns and required not in columns:
            return list(columns) + [required]
        return columns

    @classmethod
    def _read_patient_table(cls, source, columns):
        """Read a tab-separated patient table from a path or text buffer.

        The two lines following the header are skipped and the placeholder
        strings in ``_PATIENT_NA_VALUES`` are parsed as missing values.
        """
        return pd.read_table(source,
                             skiprows=[1, 2],
                             na_values=cls._PATIENT_NA_VALUES,
                             usecols=columns)

    @classmethod
    def name(cls):
        """Returns the name of the class, i.e. 'ClinicalData'"""
        # `cls` is already the class object; `cls.__class__` would be its
        # metaclass, whose name is 'type'.
        return cls.__name__

    def build_clinical_samples(self, all_samples, index="bcr_patient_barcode"):
        """Build ``self.samples``: one row per sample with its patient's clinical data.

        The last four characters of every sample barcode are cut off to
        obtain the patient barcode, which becomes the index and is used to
        left-join the patient table. The removed suffix is what decides the
        tumor/normal label: a suffix containing "-11" is labelled ``NORMAL``
        and one containing "-01" is labelled ``TUMOR``. Other rows get no
        label.

        Args:
            all_samples: sample barcodes (strings), i.e. the patient barcode
                followed by a 4-character sample suffix.
            index (str): name given to the index of the samples table; also
                the key used to join against the patient table's index.
        """
        barcodes = pd.Index(all_samples)
        patient_ids = barcodes.str[:-4]  # Cut sample barcode for TCGA

        # Carry the removed suffix through the join as a temporary column so
        # that it stays aligned with its row even if the join or the filter
        # below changes the number of rows.
        suffix_col = "__sample_suffix__"
        samples = pd.DataFrame({suffix_col: barcodes.str[-4:].to_numpy()},
                               index=patient_ids)
        samples.index.name = index

        # Merge patients clinical data with patient barcode as index
        samples = samples.join(self.patient, on=index, how="left", rsuffix="_")

        # Skip the filter when the patient table has no stage column instead
        # of failing with a KeyError.
        if PATHOLOGIC_STAGE_COL in samples.columns:
            samples = samples[samples[PATHOLOGIC_STAGE_COL] != "[Discrepancy]"]

        # Label from the sample suffix, not from the truncated index: the
        # patient barcode itself may contain "-01"/"-11" by coincidence.
        suffix = samples.pop(suffix_col)
        is_normal = suffix.str.contains("-11", regex=False, na=False).to_numpy(dtype=bool)
        is_tumor = suffix.str.contains("-01", regex=False, na=False).to_numpy(dtype=bool)
        samples.loc[is_normal, TUMOR_NORMAL_COL] = NORMAL
        samples.loc[is_tumor, TUMOR_NORMAL_COL] = TUMOR

        self.samples = samples

    def add_drug_response_data(self, file_path="nationwidechildrens.org_clinical_drug.txt",
                               patient_column="bcr_patient_barcode",
                               columns=None,
                               drug_name_col=None, response_column=None):
        """Load the drug table into ``self.drugs``, indexed by patient.

        Args:
            file_path (str): path to the tab-separated drug file. The two
                lines following the header are skipped.
            patient_column (str): name of the patient ID column; it becomes
                the index and is always read, even if absent from
                ``columns``.
            columns (List[str]): columns to read. Defaults to the patient
                barcode, drug name, therapy type and best response columns.
            drug_name_col: stored as ``self.drug_name_col``.
            response_column: stored as ``self.response_column``.

        Raises:
            FileNotFoundError: if ``file_path`` does not exist.
        """
        if columns is None:
            columns = ['bcr_patient_barcode', 'pharmaceutical_therapy_drug_name',
                       'pharmaceutical_therapy_type', 'treatment_best_response']
        # `set_index` below needs the patient column to have been read.
        columns = self._with_column(columns, patient_column)

        if not os.path.exists(file_path):
            raise FileNotFoundError(file_path)

        self.drug_name_col = drug_name_col
        self.response_column = response_column

        drugs = pd.read_table(file_path,
                              sep="\t",
                              skiprows=[1, 2],
                              na_values=self._NA_VALUES,
                              usecols=columns)
        self.drugs = drugs.set_index(patient_column)

    def add_biospecimen_data(self, file_path="genome.wustl.edu_biospecimen_sample.txt",
                             patient_col_name="bcr_patient_barcode",
                             columns=['bcr_sample_barcode', 'sample_type']):
        """Load the biospecimen table into ``self.biospecimen``, indexed by patient.

        Args:
            file_path (str): path to the tab-separated biospecimen file. The
                line following the header is skipped.
            patient_col_name (str): name of the patient ID column; it becomes
                the index and is always read, even if absent from
                ``columns``.
            columns (List[str]): columns to read; None reads every column.
                The default list is never modified.

        Raises:
            FileNotFoundError: if ``file_path`` does not exist.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(file_path)

        # The default `columns` does not contain the patient column, which is
        # required right below; add it to a copy of the selection.
        columns = self._with_column(columns, patient_col_name)

        biospecimen = pd.read_table(file_path, sep="\t", skiprows=[1, ],
                                    na_values=self._NA_VALUES,
                                    usecols=columns)
        # NOTE(review): despite its name this list is filled from the patient
        # column, not from 'bcr_sample_barcode' - kept as-is for backward
        # compatibility; confirm the intended source.
        self.sample_barcodes = biospecimen[patient_col_name].tolist()
        self.biospecimen = biospecimen.set_index(patient_col_name)

    def get_patient_barcodes(self):
        """Return the patient barcodes read from the patient table, in file order."""
        return self.patient_barcodes

    def get_sample_barcodes(self):
        """Return the barcodes collected by :meth:`add_biospecimen_data`.

        Only available after :meth:`add_biospecimen_data` has been called.
        """
        return self.sample_barcodes