[2ff6cc]: / notebooks / Biospecimen_Report_Generator.ipynb

Download this file

1 lines (1 with data), 13.7 kB

{"cells":[{"cell_type":"code","source":["# Install required packages\n","%pip install openai pandas matplotlib seaborn --quiet\n","print(\"Packages installed successfully!\")"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":10,"statement_ids":[5,6,7,8,9,10],"state":"finished","livy_statement_state":"available","session_id":"fb2683e7-6b16-4aa6-87c7-0baa65e90922","normalized_state":"finished","queued_time":"2025-04-02T01:10:12.9879675Z","session_start_time":null,"execution_start_time":"2025-04-02T01:10:12.9900412Z","execution_finish_time":"2025-04-02T01:10:35.8202238Z","parent_msg_id":"02c7cf21-a729-4659-8b59-2a567b6a1753"},"text/plain":"StatementMeta(, fb2683e7-6b16-4aa6-87c7-0baa65e90922, 10, Finished, Available, Finished)"},"metadata":{}},{"output_type":"stream","name":"stdout","text":["\n\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.0.1\u001b[0m\n\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython -m pip install --upgrade pip\u001b[0m\nNote: you may need to restart the kernel to use updated packages.\nPackages installed successfully!\nWarning: PySpark kernel has been restarted to use updated packages.\n\n"]}],"execution_count":3,"metadata":{"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"1be969b8-444e-4171-8553-aef664360a27"},{"cell_type":"code","source":["from notebookutils import mssparkutils\n","import pandas as pd\n","import os\n","\n","# Azure OpenAI Configuration\n","key_vault_name = \"your-key-vault-name\"              # Update with your Key Vault name\n","openai_secret_name = \"azure-openai-api-key\"        # Update with your secret name\n","openai_endpoint = \"your-resource.openai.azure.com\" # Update with your endpoint\n","openai_version = \"2023-05-15\"\n","openai_model = \"gpt-4\"\n","\n","# File paths\n","data_path = \"/lakehouse/default/Files/PDC_biospecimen_manifest_03272025_214257.csv\"\n","report_path = \"/lakehouse/default/Files/biospecimen_analysis_report.html\""],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":12,"statement_ids":[12],"state":"finished","livy_statement_state":"available","session_id":"fb2683e7-6b16-4aa6-87c7-0baa65e90922","normalized_state":"finished","queued_time":"2025-04-02T01:11:09.6737809Z","session_start_time":null,"execution_start_time":"2025-04-02T01:11:12.5156367Z","execution_finish_time":"2025-04-02T01:11:12.8255574Z","parent_msg_id":"4b3ffa4b-cbc8-4d32-acc7-dd10e02d4382"},"text/plain":"StatementMeta(, fb2683e7-6b16-4aa6-87c7-0baa65e90922, 12, Finished, Available, Finished)"},"metadata":{}}],"execution_count":4,"metadata":{"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"9bfb1abd-28b9-4e3b-bf4b-562aff34f8e1"},{"cell_type":"code","source":["from openai import AzureOpenAI\n","import pandas as pd\n","import matplotlib.pyplot as plt\n","import seaborn as sns\n","from notebookutils import mssparkutils\n","import os\n","from IPython.display import display, HTML\n","\n","def generate_biospecimen_report():\n","    \"\"\"\n","    Generate comprehensive biospecimen report with statistics, visualizations,\n","    and AI-powered insights in Microsoft Fabric.\n","    \"\"\"\n","    try:\n","        # 1. Load data from Fabric Lakehouse\n","        print(\"⏳ Loading data from Lakehouse...\")\n","        df = pd.read_csv(data_path)\n","        print(f\"✅ Successfully loaded data with shape: {df.shape}\")\n","        \n","        # 2. Generate basic statistics\n","        print(\"\\n📊 Generating statistics...\")\n","        numerical_summary = df.describe(include=['number']).to_string()\n","        categorical_summary = \"\"\n","        \n","        for col in df.select_dtypes(include=['object']).columns:\n","            categorical_summary += f\"\\n\\n=== {col} ===\\n{df[col].value_counts(dropna=False).to_string()}\"\n","        \n","        # 3. Create visualizations\n","        print(\"\\n🎨 Creating visualizations...\")\n","        os.makedirs('temp_visualizations', exist_ok=True)\n","        \n","        # Visualization 1: Sample Type Distribution\n","        plt.figure(figsize=(12, 6))\n","        df['Sample Type'].value_counts().plot(kind='bar', color='skyblue')\n","        plt.title('Sample Type Distribution', fontsize=14)\n","        plt.xlabel('Sample Type', fontsize=12)\n","        plt.ylabel('Count', fontsize=12)\n","        plt.xticks(rotation=45, ha='right')\n","        plt.tight_layout()\n","        sample_type_path = 'temp_visualizations/sample_type_distribution.png'\n","        plt.savefig(sample_type_path, dpi=300)\n","        plt.close()\n","        \n","        # Visualization 2: Disease Type Distribution (Top 15)\n","        plt.figure(figsize=(12, 6))\n","        df['Disease Type'].value_counts().nlargest(15).plot(kind='barh', color='lightgreen')\n","        plt.title('Top 15 Disease Types', fontsize=14)\n","        plt.xlabel('Count', fontsize=12)\n","        plt.ylabel('Disease Type', fontsize=12)\n","        plt.tight_layout()\n","        disease_type_path = 'temp_visualizations/disease_type_distribution.png'\n","        plt.savefig(disease_type_path, dpi=300)\n","        plt.close()\n","        \n","        # 4. Generate AI Insights\n","        print(\"\\n🧠 Generating AI insights...\")\n","        client = AzureOpenAI(\n","            api_key=mssparkutils.credentials.getSecret(key_vault_name, openai_secret_name),\n","            api_version=openai_version,\n","            azure_endpoint=f\"https://{openai_endpoint}\"\n","        )\n","        \n","        prompt = f\"\"\"You are a biomedical data analyst. Analyze this biospecimen dataset:\n","\n","        NUMERICAL SUMMARY:\n","        {numerical_summary}\n","\n","        CATEGORICAL DISTRIBUTIONS:\n","        {categorical_summary}\n","\n","        Provide:\n","        1. 3 key observations about the data composition\n","        2. 2 potential data quality issues to investigate\n","        3. 3 recommendations for further analysis\n","        4. 1 interesting pattern worth exploring\n","        \n","        Format the response with clear headings and bullet points.\"\"\"\n","        \n","        response = client.chat.completions.create(\n","            model=openai_model,\n","            messages=[{\"role\": \"user\", \"content\": prompt}],\n","            temperature=0.3,\n","            max_tokens=1000\n","        )\n","        insights = response.choices[0].message.content\n","        \n","        # 5. Compile and Save Report\n","        print(\"\\n📝 Compiling final report...\")\n","        \n","        # Create HTML content with proper escaping\n","        insights_html = insights.replace('\\n', '<br>')\n","        html_content = f\"\"\"\n","<html>\n","<head>\n","    <title>Biospecimen Analysis Report</title>\n","    <style>\n","        body {{ font-family: Arial, sans-serif; line-height: 1.6; }}\n","        h1 {{ color: #2e6c80; }}\n","        h2 {{ color: #3e7c90; margin-top: 30px; }}\n","        img {{ max-width: 100%; height: auto; margin: 20px 0; border: 1px solid #ddd; }}\n","        .insights {{ background-color: #f5f9fa; padding: 15px; border-radius: 5px; }}\n","        pre {{ white-space: pre-wrap; background-color: #f5f5f5; padding: 10px; border-radius: 5px; }}\n","    </style>\n","</head>\n","<body>\n","    <h1>Biospecimen Analysis Report</h1>\n","    <p>Generated on: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M')}</p>\n","    \n","    <h2>1. Data Overview</h2>\n","    <p>Total records: {len(df):,}</p>\n","    <p>Total columns: {len(df.columns)}</p>\n","    \n","    <h2>2. Sample Type Distribution</h2>\n","    <img src=\"sample_type_distribution.png\" alt=\"Sample Type Distribution\">\n","    \n","    <h2>3. Disease Type Distribution (Top 15)</h2>\n","    <img src=\"disease_type_distribution.png\" alt=\"Disease Type Distribution\">\n","    \n","    <h2>4. Statistical Summary</h2>\n","    <pre>{numerical_summary}</pre>\n","    \n","    <h2>5. AI-Generated Insights</h2>\n","    <div class=\"insights\">\n","        {insights_html}\n","    </div>\n","</body>\n","</html>\n","        \"\"\"\n","        \n","        # Save HTML report\n","        with open(report_path, 'w') as f:\n","            f.write(html_content)\n","        \n","        # Save visualizations to Lakehouse\n","        mssparkutils.fs.cp(f\"file:{sample_type_path}\", \"/lakehouse/default/Files/sample_type_distribution.png\")\n","        mssparkutils.fs.cp(f\"file:{disease_type_path}\", \"/lakehouse/default/Files/disease_type_distribution.png\")\n","        \n","        print(f\"\\n🎉 Report successfully generated and saved to:\")\n","        print(f\"- HTML Report: {report_path}\")\n","        print(f\"- Visualization 1: /lakehouse/default/Files/sample_type_distribution.png\")\n","        print(f\"- Visualization 2: /lakehouse/default/Files/disease_type_distribution.png\")\n","        \n","        # Display report preview\n","        display(HTML(f\"<a href='{report_path}' target='_blank'>Open Full Report</a>\"))\n","        \n","    except Exception as e:\n","        print(f\"\\n❌ Error generating report: {str(e)}\")\n","        print(\"\\n🛠️ Troubleshooting steps:\")\n","        print(\"1. Verify packages are installed (%pip install openai pandas matplotlib seaborn)\")\n","        print(\"2. Check the data file exists in your Lakehouse Files\")\n","        print(\"3. Verify Azure OpenAI credentials in Key Vault\")\n","        print(\"4. Ensure your Fabric capacity has network access to Azure OpenAI\")\n","\n","# Execute the report generation\n","generate_biospecimen_report()"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":13,"statement_ids":[13],"state":"finished","livy_statement_state":"available","session_id":"fb2683e7-6b16-4aa6-87c7-0baa65e90922","normalized_state":"finished","queued_time":"2025-04-02T01:12:02.3428096Z","session_start_time":null,"execution_start_time":"2025-04-02T01:12:02.3442411Z","execution_finish_time":"2025-04-02T01:12:12.1568723Z","parent_msg_id":"aa22cbaf-ee1e-4eb4-84be-36f28d07a873"},"text/plain":"StatementMeta(, fb2683e7-6b16-4aa6-87c7-0baa65e90922, 13, Finished, Available, Finished)"},"metadata":{}},{"output_type":"stream","name":"stdout","text":["⏳ Loading data from Lakehouse...\n✅ Successfully loaded data with shape: (452, 45)\n\n📊 Generating statistics...\n\n🎨 Creating visualizations...\n\n🧠 Generating AI insights...\n\n❌ Error generating report: An error occurred while calling z:mssparkutils.credentials.getSecret.\n: com.microsoft.azure.trident.tokenlibrary.util.AkvHttpClientException: Invalid vault uri. Uri should match azure key vault URI like https://<keyVaultName>.vault.azure.net/\n\tat com.microsoft.azure.trident.tokenlibrary.util.AkvBasedSecretProviderClientImpl.invokeGetTarget(AkvBasedSecretProviderClient.scala:122)\n\tat com.microsoft.azure.trident.tokenlibrary.util.AkvBasedSecretProviderClientImpl.getAkvSecretWithAccessToken(AkvBasedSecretProviderClient.scala:153)\n\tat com.microsoft.azure.trident.tokenlibrary.TokenLibrary.getSecretWithToken(TokenLibrary.scala:806)\n\tat com.microsoft.azure.trident.tokenlibrary.TokenLibrary$.getSecretWithToken(TokenLibrary.scala:1359)\n\tat mssparkutils.credentials$.getSecret(credentials.scala:166)\n\tat mssparkutils.credentials.getSecret(credentials.scala)\n\tat java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n\tat java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.base/java.lang.reflect.Method.invoke(Method.java:566)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)\n\tat py4j.Gateway.invoke(Gateway.java:282)\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\n\tat py4j.GatewayConnection.run(GatewayConnection.java:238)\n\tat java.base/java.lang.Thread.run(Thread.java:829)\n\n\n🛠️ Troubleshooting steps:\n1. Verify packages are installed (%pip install openai pandas matplotlib seaborn)\n2. Check the data file exists in your Lakehouse Files\n3. Verify Azure OpenAI credentials in Key Vault\n4. Ensure your Fabric capacity has network access to Azure OpenAI\n"]}],"execution_count":5,"metadata":{"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"4d2a38ac-f4c3-4425-83a2-0262b5cab948"}],"metadata":{"kernel_info":{"name":"synapse_pyspark"},"kernelspec":{"name":"synapse_pyspark","language":"Python","display_name":"Synapse PySpark"},"language_info":{"name":"python"},"microsoft":{"language":"python","language_group":"synapse_pyspark","ms_spell_check":{"ms_spell_check_language":"en"}},"nteract":{"version":"nteract-front-end@1.0.0"},"spark_compute":{"compute_id":"/trident/default","session_options":{"conf":{"spark.synapse.nbs.session.timeout":"1200000"}}},"dependencies":{"lakehouse":{"known_lakehouses":[{"id":"53477481-ba13-4a4f-a8ea-d1f736d0f87e"}],"default_lakehouse":"53477481-ba13-4a4f-a8ea-d1f736d0f87e","default_lakehouse_name":"GenomeLH","default_lakehouse_workspace_id":"cde60769-1208-4712-9d88-602cb5dae476"}}},"nbformat":4,"nbformat_minor":5}