[de9ba9]: / notebooks / GenomicDataProcessing.ipynb

Download this file

1 lines (1 with data), 21.8 kB

{"cells":[{"cell_type":"code","source":["# %% [markdown]\n","# # Genomic Data Processing Pipeline\n","# Comprehensive analysis of biospecimen data with proper handling of both numeric and categorical columns\n","\n","# %%\n","# Initialize Spark Session\n","from pyspark.sql import SparkSession\n","from pyspark.sql.functions import col, when, isnull, isnan, count, lit\n","from pyspark.sql.types import DoubleType, IntegerType, StringType\n","import pandas as pd\n","import matplotlib.pyplot as plt\n","from sklearn.decomposition import PCA\n","\n","# Configuration\n","input_path = \"Files/PDC_biospecimen_manifest_03272025_214257.csv\"\n","output_path = \"Files/processed-data/analysis_results.parquet\"\n","\n","# Initialize Spark\n","spark = SparkSession.builder.appName(\"GenomicDataProcessing\").getOrCreate()\n","\n","# %%\n","def load_and_analyze_data(spark, input_path):\n","    \"\"\"Load and perform initial analysis of the biospecimen data\"\"\"\n","    try:\n","        # Load data with schema inference\n","        df = (spark.read\n","              .format(\"csv\")\n","              .option(\"header\", \"true\")\n","              .option(\"inferSchema\", \"true\")\n","              .load(input_path))\n","        \n","        print(\"Initial data schema:\")\n","        df.printSchema()\n","        \n","        # Show basic stats\n","        print(f\"\\nTotal records: {df.count()}\")\n","        print(f\"Columns: {len(df.columns)}\")\n","        \n","        return df\n","        \n","    except Exception as e:\n","        print(f\"Data loading error: {str(e)}\")\n","        raise\n","\n","# %%\n","def analyze_numeric_data(df):\n","    \"\"\"Analyze numeric columns if they exist\"\"\"\n","    try:\n","        numeric_cols = [f.name for f in df.schema.fields \n","                       if isinstance(f.dataType, (DoubleType, IntegerType))]\n","        \n","        if not numeric_cols:\n","            print(\"\\nNo numeric columns found in dataset\")\n","            return None\n","        \n","        print(\"\\nNumeric columns analysis:\")\n","        \n","        # Count nulls in numeric columns\n","        null_counts = df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) \n","                       for c in numeric_cols]).collect()[0]\n","        \n","        print(\"\\nNull values in numeric columns:\")\n","        for col_name, null_count in zip(numeric_cols, null_counts):\n","            print(f\"{col_name}: {null_count} nulls\")\n","        \n","        # Show summary stats for non-null columns\n","        non_null_cols = [c for c in numeric_cols if null_counts[numeric_cols.index(c)] < df.count()]\n","        \n","        if non_null_cols:\n","            print(\"\\nSummary statistics:\")\n","            df.select(non_null_cols).describe().show()\n","            return non_null_cols\n","        else:\n","            print(\"\\nAll numeric columns are completely null\")\n","            return None\n","            \n","    except Exception as e:\n","        print(f\"Numeric analysis failed: {str(e)}\")\n","        return None\n","\n","# %%\n","def analyze_categorical_data(df, top_n=5):\n","    \"\"\"Analyze categorical columns in the data\"\"\"\n","    try:\n","        categorical_cols = [f.name for f in df.schema.fields \n","                          if isinstance(f.dataType, StringType)]\n","        \n","        if not categorical_cols:\n","            print(\"\\nNo categorical columns found\")\n","            return None\n","            \n","        print(\"\\nCategorical columns analysis:\")\n","        \n","        for col_name in categorical_cols:\n","            print(f\"\\nColumn: {col_name}\")\n","            df.groupBy(col_name).count().orderBy(\"count\", ascending=False).show(top_n, truncate=False)\n","            \n","        return categorical_cols\n","        \n","    except Exception as e:\n","        print(f\"Categorical analysis failed: {str(e)}\")\n","        return None\n","\n","# %%\n","def perform_pca_if_possible(df, numeric_cols):\n","    \"\"\"Perform PCA if we have sufficient numeric data\"\"\"\n","    try:\n","        if not numeric_cols or len(numeric_cols) < 2:\n","            print(\"\\nInsufficient numeric data for PCA\")\n","            return None\n","            \n","        # Prepare data - fill nulls with mean\n","        from pyspark.sql.functions import mean\n","        for col_name in numeric_cols:\n","            col_mean = df.select(mean(col(col_name))).collect()[0][0]\n","            df = df.withColumn(col_name, when(col(col_name).isNull(), col_mean).otherwise(col(col_name)))\n","        \n","        # Convert to pandas for PCA\n","        pdf = df.select(numeric_cols).toPandas()\n","        \n","        # Perform PCA\n","        pca = PCA(n_components=2)\n","        X_pca = pca.fit_transform(pdf)\n","        \n","        # Create results DataFrame\n","        pca_results = pd.DataFrame(X_pca, columns=['PC1', 'PC2'])\n","        \n","        # Visualize\n","        plt.figure(figsize=(10, 6))\n","        plt.scatter(X_pca[:, 0], X_pca[:, 1], alpha=0.5)\n","        plt.xlabel(f\"PC1 ({pca.explained_variance_ratio_[0]:.2%} variance)\")\n","        plt.ylabel(f\"PC2 ({pca.explained_variance_ratio_[1]:.2%} variance)\")\n","        plt.title(\"PCA of Numeric Features\")\n","        plt.grid(True)\n","        plt.show()\n","        \n","        return spark.createDataFrame(pca_results)\n","        \n","    except Exception as e:\n","        print(f\"PCA failed: {str(e)}\")\n","        return None\n","\n","# %%\n","# Main execution pipeline\n","try:\n","    print(\"Starting analysis pipeline...\")\n","    \n","    # 1. Load data\n","    print(\"\\n=== Loading Data ===\")\n","    df = load_and_analyze_data(spark, input_path)\n","    \n","    # 2. Analyze numeric data\n","    print(\"\\n=== Numeric Analysis ===\")\n","    numeric_cols = analyze_numeric_data(df)\n","    \n","    # 3. Analyze categorical data\n","    print(\"\\n=== Categorical Analysis ===\")\n","    categorical_cols = analyze_categorical_data(df)\n","    \n","    # 4. Try PCA if we have numeric data\n","    print(\"\\n=== Dimensionality Reduction ===\")\n","    pca_results = perform_pca_if_possible(df, numeric_cols)\n","    \n","    # 5. Save results if we have them\n","    if pca_results:\n","        print(\"\\nSaving PCA results...\")\n","        (pca_results.write\n","         .mode(\"overwrite\")\n","         .format(\"parquet\")\n","         .save(output_path))\n","        print(f\"Results saved to {output_path}\")\n","    else:\n","        print(\"\\nNo PCA results to save\")\n","    \n","    print(\"\\nPipeline completed successfully!\")\n","    \n","except Exception as e:\n","    print(f\"\\nPipeline failed: {str(e)}\")\n","    \n","finally:\n","    spark.stop()\n","    print(\"Spark session closed\")"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":22,"statement_ids":[22],"state":"finished","livy_statement_state":"available","session_id":"56701f99-4897-447d-b694-5a8c7be6acc3","normalized_state":"finished","queued_time":"2025-04-01T02:03:31.7785243Z","session_start_time":null,"execution_start_time":"2025-04-01T02:03:31.7800208Z","execution_finish_time":"2025-04-01T02:03:44.1011225Z","parent_msg_id":"a0750c0c-7e4c-4002-b12a-b343924d38d9"},"text/plain":"StatementMeta(, 56701f99-4897-447d-b694-5a8c7be6acc3, 22, Finished, Available, Finished)"},"metadata":{}},{"output_type":"stream","name":"stdout","text":["Starting analysis pipeline...\n\n=== Loading Data ===\nInitial data schema:\nroot\n |-- Aliquot ID: string (nullable = true)\n |-- Aliquot Submitter ID: string (nullable = true)\n |-- Sample ID: string (nullable = true)\n |-- Sample Submitter ID: string (nullable = true)\n |-- Case ID: string (nullable = true)\n |-- Case Submitter ID: string (nullable = true)\n |-- Project Name: string (nullable = true)\n |-- Sample Type: string (nullable = true)\n |-- Primary Site: string (nullable = true)\n |-- Disease Type: string (nullable = true)\n |-- Aliquot Is Ref: string (nullable = true)\n |-- Aliquot Status: string (nullable = true)\n |-- Aliquot Quantity: string (nullable = true)\n |-- Aliquot Volume: string (nullable = true)\n |-- Amount: string (nullable = true)\n |-- Analyte Type: string (nullable = true)\n |-- Concentration: string (nullable = true)\n |-- Case Status: string (nullable = true)\n |-- Sample Status: string (nullable = true)\n |-- Sample Is Ref: string (nullable = true)\n |-- Biospecimen Anatomic Site: string (nullable = true)\n |-- Biospecimen Laterality: string (nullable = true)\n |-- Composition: string (nullable = true)\n |-- Current Weight: string (nullable = true)\n |-- Days To Collection: string (nullable = true)\n |-- Days To Sample Procurement: string (nullable = true)\n |-- Diagnosis Pathologically Confirmed: string (nullable = true)\n |-- Freezing Method: string (nullable = true)\n |-- Initial Weight: string (nullable = true)\n |-- Intermediate Dimension: string (nullable = true)\n |-- Longest Dimension: string (nullable = true)\n |-- Method Of Sample Procurement: string (nullable = true)\n |-- Pathology Report UUID: string (nullable = true)\n |-- Preservation Method: string (nullable = true)\n |-- Sample Type id: string (nullable = true)\n |-- Sample Ordinal: string (nullable = true)\n |-- Shortest Dimension: string (nullable = true)\n |-- Time Between Clamping And Freezing: string (nullable = true)\n |-- Time Between Excision and Freezing: string (nullable = true)\n |-- Tissue Collection Type: string (nullable = true)\n |-- Tissue Type: string (nullable = true)\n |-- Tumor Code: string (nullable = true)\n |-- Tumor Code ID: string (nullable = true)\n |-- Tumor Descriptor: string (nullable = true)\n |-- Program Name: string (nullable = true)\n\n\nTotal records: 452\nColumns: 45\n\n=== Numeric Analysis ===\n\nNo numeric columns found in dataset\n\n=== Categorical Analysis ===\n\nCategorical columns analysis:\n\nColumn: Aliquot ID\n+------------------------------------+-----+\n|Aliquot ID                          |count|\n+------------------------------------+-----+\n|e5a2428c-9391-4169-ba99-7a76a5da3216|1    |\n|15c01b6e-6429-11e8-bcf1-0a2705229b82|1    |\n|32813d1d-0a5d-11eb-bc0e-0aad30af8a83|1    |\n|8501e008-ea04-45ea-bfe9-b72e0c34b0da|1    |\n|8dbd6471-9ef4-4be6-89e3-c26409b030ee|1    |\n+------------------------------------+-----+\nonly showing top 5 rows\n\n\nColumn: Aliquot Submitter ID\n+---------------------------------+-----+\n|Aliquot Submitter ID             |count|\n+---------------------------------+-----+\n|TCGA-BH-A0HP-01A-41-A21V-30      |1    |\n|e2c7113f-957f-4088-b57d-55d2f2_D2|1    |\n|1990853a-0e82-40b3-a025-91cdc1_D2|1    |\n|x1048334_1                       |1    |\n|X1509534                         |1    |\n+---------------------------------+-----+\nonly showing top 5 rows\n\n\nColumn: Sample ID\n+------------------------------------+-----+\n|Sample ID                           |count|\n+------------------------------------+-----+\n|b1f9c781-7efb-4ae8-a97b-a3031fe3dd50|2    |\n|9181e16e-363a-4593-b921-b8c2ab086b1c|1    |\n|e07fe22f-a657-472c-8d1e-b8b38eb81fe5|1    |\n|5df08a96-12cb-444a-860f-7a4f4cd1ba21|1    |\n|3280cf23-0a5d-11eb-bc0e-0aad30af8a83|1    |\n+------------------------------------+-----+\nonly showing top 5 rows\n\n\nColumn: Sample Submitter ID\n+------------------------------+-----+\n|Sample Submitter ID           |count|\n+------------------------------+-----+\n|7316UP-1883                   |2    |\n|e45c73a8-da96-4269-a039-d43942|1    |\n|c875a9a4-035f-425a-9348-26129c|1    |\n|a7d58d61-5049-488f-9d0a-0a4411|1    |\n|TCGA-A8-A076-01A              |1    |\n+------------------------------+-----+\nonly showing top 5 rows\n\n\nColumn: Case ID\n+------------------------------------+-----+\n|Case ID                             |count|\n+------------------------------------+-----+\n|e544fb5a-5b53-4e1d-a3ab-e80239baca87|6    |\n|3dd29f9b-382e-44c5-9f89-51799a668359|4    |\n|45dcaca6-c4bb-4f11-a014-c55c78ee175d|4    |\n|af051b71-95cf-495b-93cb-da7d5457a586|4    |\n|327fcac6-0a5d-11eb-bc0e-0aad30af8a83|2    |\n+------------------------------------+-----+\nonly showing top 5 rows\n\n\nColumn: Case Submitter ID\n+-----------------+-----+\n|Case Submitter ID|count|\n+-----------------+-----+\n|BCN 1326         |6    |\n|BCN 1367         |4    |\n|BCN 1369         |4    |\n|BCN 1300         |4    |\n|WHIM14           |3    |\n+-----------------+-----+\nonly showing top 5 rows\n\n\nColumn: Project Name\n+---------------------------------------------------+-----+\n|Project Name                                       |count|\n+---------------------------------------------------+-----+\n|Proteogenomic Translational Research Centers (PTRC)|156  |\n|CPTAC2 Confirmatory                                |143  |\n|CPTAC2 Retrospective                               |108  |\n|CPTAC3-Other                                       |43   |\n|CPTAC3 Discovery and Confirmatory                  |2    |\n+---------------------------------------------------+-----+\n\n\nColumn: Sample Type\n+----------------------+-----+\n|Sample Type           |count|\n+----------------------+-----+\n|Primary Tumor         |377  |\n|Tumor                 |35   |\n|Solid Tissue Normal   |18   |\n|Xenograft Tissue      |17   |\n|Normal Adjacent Tissue|3    |\n+----------------------+-----+\nonly showing top 5 rows\n\n\nColumn: Primary Site\n+------------+-----+\n|Primary Site|count|\n+------------+-----+\n|Breast      |444  |\n|Not Reported|8    |\n+------------+-----+\n\n\nColumn: Disease Type\n+-------------------------+-----+\n|Disease Type             |count|\n+-------------------------+-----+\n|Breast Invasive Carcinoma|452  |\n+-------------------------+-----+\n\n\nColumn: Aliquot Is Ref\n+--------------+-----+\n|Aliquot Is Ref|count|\n+--------------+-----+\n|No            |452  |\n+--------------+-----+\n\n\nColumn: Aliquot Status\n+--------------+-----+\n|Aliquot Status|count|\n+--------------+-----+\n|Qualified     |451  |\n|Disqualified  |1    |\n+--------------+-----+\n\n\nColumn: Aliquot Quantity\n+----------------+-----+\n|Aliquot Quantity|count|\n+----------------+-----+\n|null            |452  |\n+----------------+-----+\n\n\nColumn: Aliquot Volume\n+--------------+-----+\n|Aliquot Volume|count|\n+--------------+-----+\n|null          |452  |\n+--------------+-----+\n\n\nColumn: Amount\n+------+-----+\n|Amount|count|\n+------+-----+\n|null  |452  |\n+------+-----+\n\n\nColumn: Analyte Type\n+------------+-----+\n|Analyte Type|count|\n+------------+-----+\n|Protein     |376  |\n|null        |76   |\n+------------+-----+\n\n\nColumn: Concentration\n+-------------+-----+\n|Concentration|count|\n+-------------+-----+\n|null         |452  |\n+-------------+-----+\n\n\nColumn: Case Status\n+------------+-----+\n|Case Status |count|\n+------------+-----+\n|Qualified   |451  |\n|Disqualified|1    |\n+------------+-----+\n\n\nColumn: Sample Status\n+-------------+-----+\n|Sample Status|count|\n+-------------+-----+\n|Qualified    |376  |\n|qualified    |75   |\n|Disqualified |1    |\n+-------------+-----+\n\n\nColumn: Sample Is Ref\n+-------------+-----+\n|Sample Is Ref|count|\n+-------------+-----+\n|No           |294  |\n|null         |158  |\n+-------------+-----+\n\n\nColumn: Biospecimen Anatomic Site\n+-------------------------+-----+\n|Biospecimen Anatomic Site|count|\n+-------------------------+-----+\n|Not Reported             |251  |\n|null                     |201  |\n+-------------------------+-----+\n\n\nColumn: Biospecimen Laterality\n+----------------------+-----+\n|Biospecimen Laterality|count|\n+----------------------+-----+\n|null                  |452  |\n+----------------------+-----+\n\n\nColumn: Composition\n+------------+-----+\n|Composition |count|\n+------------+-----+\n|Not Reported|375  |\n|Solid Tissue|77   |\n+------------+-----+\n\n\nColumn: Current Weight\n+--------------+-----+\n|Current Weight|count|\n+--------------+-----+\n|null          |452  |\n+--------------+-----+\n\n\nColumn: Days To Collection\n+------------------+-----+\n|Days To Collection|count|\n+------------------+-----+\n|null              |347  |\n|57                |5    |\n|69                |3    |\n|162               |2    |\n|343               |2    |\n+------------------+-----+\nonly showing top 5 rows\n\n\nColumn: Days To Sample Procurement\n+--------------------------+-----+\n|Days To Sample Procurement|count|\n+--------------------------+-----+\n|null                      |452  |\n+--------------------------+-----+\n\n\nColumn: Diagnosis Pathologically Confirmed\n+----------------------------------+-----+\n|Diagnosis Pathologically Confirmed|count|\n+----------------------------------+-----+\n|Not Reported                      |251  |\n|null                              |201  |\n+----------------------------------+-----+\n\n\nColumn: Freezing Method\n+---------------+-----+\n|Freezing Method|count|\n+---------------+-----+\n|null           |452  |\n+---------------+-----+\n\n\nColumn: Initial Weight\n+--------------+-----+\n|Initial Weight|count|\n+--------------+-----+\n|null          |347  |\n|310.00        |5    |\n|340.00        |4    |\n|330.00        |4    |\n|300.00        |4    |\n+--------------+-----+\nonly showing top 5 rows\n\n\nColumn: Intermediate Dimension\n+----------------------+-----+\n|Intermediate Dimension|count|\n+----------------------+-----+\n|null                  |347  |\n|NULL                  |105  |\n+----------------------+-----+\n\n\nColumn: Longest Dimension\n+-----------------+-----+\n|Longest Dimension|count|\n+-----------------+-----+\n|null             |347  |\n|NULL             |105  |\n+-----------------+-----+\n\n\nColumn: Method Of Sample Procurement\n+----------------------------+-----+\n|Method Of Sample Procurement|count|\n+----------------------------+-----+\n|Not Reported                |251  |\n|Biopsy                      |110  |\n|null                        |91   |\n+----------------------------+-----+\n\n\nColumn: Pathology Report UUID\n+---------------------+-----+\n|Pathology Report UUID|count|\n+---------------------+-----+\n|null                 |452  |\n+---------------------+-----+\n\n\nColumn: Preservation Method\n+-------------------+-----+\n|Preservation Method|count|\n+-------------------+-----+\n|Not Reported       |251  |\n|null               |201  |\n+-------------------+-----+\n\n\nColumn: Sample Type id\n+--------------+-----+\n|Sample Type id|count|\n+--------------+-----+\n|null          |344  |\n|\"\"\"01\"\"\"      |108  |\n+--------------+-----+\n\n\nColumn: Sample Ordinal\n+--------------+-----+\n|Sample Ordinal|count|\n+--------------+-----+\n|null          |452  |\n+--------------+-----+\n\n\nColumn: Shortest Dimension\n+------------------+-----+\n|Shortest Dimension|count|\n+------------------+-----+\n|null              |347  |\n|NULL              |105  |\n+------------------+-----+\n\n\nColumn: Time Between Clamping And Freezing\n+----------------------------------+-----+\n|Time Between Clamping And Freezing|count|\n+----------------------------------+-----+\n|null                              |452  |\n+----------------------------------+-----+\n\n\nColumn: Time Between Excision and Freezing\n+----------------------------------+-----+\n|Time Between Excision and Freezing|count|\n+----------------------------------+-----+\n|null                              |452  |\n+----------------------------------+-----+\n\n\nColumn: Tissue Collection Type\n+----------------------+-----+\n|Tissue Collection Type|count|\n+----------------------+-----+\n|null                  |452  |\n+----------------------+-----+\n\n\nColumn: Tissue Type\n+------------+-----+\n|Tissue Type |count|\n+------------+-----+\n|Not Reported|259  |\n|Tumor       |193  |\n+------------+-----+\n\n\nColumn: Tumor Code\n+----------+-----+\n|Tumor Code|count|\n+----------+-----+\n|null      |452  |\n+----------+-----+\n\n\nColumn: Tumor Code ID\n+-------------+-----+\n|Tumor Code ID|count|\n+-------------+-----+\n|null         |452  |\n+-------------+-----+\n\n\nColumn: Tumor Descriptor\n+----------------+-----+\n|Tumor Descriptor|count|\n+----------------+-----+\n|Not Reported    |251  |\n|null            |199  |\n|Metastatic      |2    |\n+----------------+-----+\n\n\nColumn: Program Name\n+--------------------------------------------+-----+\n|Program Name                                |count|\n+--------------------------------------------+-----+\n|Clinical Proteomic Tumor Analysis Consortium|452  |\n+--------------------------------------------+-----+\n\n\n=== Dimensionality Reduction ===\n\nInsufficient numeric data for PCA\n\nNo PCA results to save\n\nPipeline completed successfully!\nSpark session closed\n"]}],"execution_count":20,"metadata":{"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"de0ecaf3-d336-4b39-a729-8bfa1c3bc626"}],"metadata":{"kernel_info":{"name":"synapse_pyspark"},"kernelspec":{"name":"synapse_pyspark","language":"Python","display_name":"Synapse PySpark"},"language_info":{"name":"python"},"microsoft":{"language":"python","language_group":"synapse_pyspark","ms_spell_check":{"ms_spell_check_language":"en"}},"nteract":{"version":"nteract-front-end@1.0.0"},"spark_compute":{"compute_id":"/trident/default","session_options":{"conf":{"spark.synapse.nbs.session.timeout":"1200000"}}},"dependencies":{"lakehouse":{"default_lakehouse":"53477481-ba13-4a4f-a8ea-d1f736d0f87e","known_lakehouses":[{"id":"53477481-ba13-4a4f-a8ea-d1f736d0f87e"}],"default_lakehouse_name":"GenomeLH","default_lakehouse_workspace_id":"cde60769-1208-4712-9d88-602cb5dae476"}}},"nbformat":4,"nbformat_minor":5}