a b/scripts/data-prepare.py
1
from openai import OpenAI
2
from dotenv import load_dotenv
3
import openai
4
import json
5
import os
6
import argparse
7
8
def generate_adverse_event_report(prompt, model="gpt-4-1106-preview", max_tokens=3500, temperature=1, top_p=1, frequency_penalty=0, presence_penalty=0):
    """
    Ask the OpenAI Chat Completions API for Adverse Event Reports.

    Args:
        prompt (str): User message sent to the model
        model (str): Model identifier forwarded to the API
        max_tokens (int): Token limit forwarded to the API
        temperature (float): Sampling temperature forwarded to the API
        top_p (float): `top_p` value forwarded to the API
        frequency_penalty (float): Frequency penalty forwarded to the API
        presence_penalty (float): Presence penalty forwarded to the API

    Returns:
        response (str): Message content of the first returned choice,
            with leading/trailing whitespace stripped
    """
    # The API key is read from the environment on every call
    api_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    # Conversation: a fixed system instruction followed by the caller's prompt
    system_message = {
        "role": "system",
        "content": "Act as an expert Analyst with 20+ years of experience in Pharma and Healthcare industry. You have to generate Adverse Event Reports in properly formatted JSON",
    }
    user_message = {"role": "user", "content": prompt}

    # All request options gathered in one place, then expanded into the call
    request_options = {
        "model": model,
        "messages": [system_message, user_message],
        "response_format": {"type": "json_object"},
        "temperature": temperature,
        "max_tokens": max_tokens,
        "top_p": top_p,
        "frequency_penalty": frequency_penalty,
        "presence_penalty": presence_penalty,
    }
    completion = api_client.chat.completions.create(**request_options)

    # Only the first choice is used
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()
35
36
def create_prompt(drug_name, drug_report):
    """
    Create a prompt for the OpenAI API using the drug name and the drug report.

    The prompt consists of two hand-written sample reports (input text plus
    the expected "drug_name" / "adverse_events" extraction), followed by the
    drug-specific instructions and the output format the model should follow.

    Args:
        drug_name (str): Name of the drug
        drug_report (str): Information about the drug

    Returns:
        prompt (str): Prompt for the OpenAI API
    """
    # Doubled braces are f-string escapes for literal "{" / "}"; only
    # {drug_name} and {drug_report} are interpolated.
    # The example arrays deliberately have no trailing comma after their last
    # element so the samples do not demonstrate invalid JSON to the model.
    return f"""Sample Adverse Event reports:
    [
        {{
            "input": "Nicole Moore
                    moore123nicole@hotmail.com
                    32 McMurray Court, Columbia, SC 41250
                    9840105113, United States 
                    
                    Relationship to XYZ Pharma Inc.: Patient or Caregiver
                    Reason for contacting: Adverse Event
                    
                    Message: Yes, I have been taking Mylan’s brand of Metroprolol for two years now and with no problem. I recently had my prescription refilled with the same Mylan Metoprolol and I’m having a hard time sleeping at night along with running nose. Did you possibly change something with the pill...possibly different fillers? The pharmacist at CVS didn’t have any information for me. Thank you, Nicole Moore", 
            "output": {{
                "drug_name":"Metroprolol",
                "adverse_events": ["hard time sleeping at night", "running nose"]
            }}
        }},
        {{
            "input": "Jack Ryan,
                    jack3rayan@gmail.com
                    120 Erwin RD, Canonsburg, PA 21391,
                    2133681441, United States
                    
                    Relationship to XYZ Pharma Inc.: Patient
                    Reason for contacting: Defective Product
                    
                    Message: I recently purchased a Wixela inhub 250/50 at my local CVS pharmacy and the inhaler appears to be defective. When I try and activate it, the yellow knob only goes down halfway. I just removed this one from the wrapper so I know it's not empty. The pharmacy wouldn't exchange it so I am contacting you to get a replacement. Thank you for your time and consideration in this matter",
            "output": {{
                "drug_name":"Wixela inhub 250/50",
                "adverse_events": ["defective inhaler"]
            }}
        }}
    ]

    Now create Adverse Event Reports in a similar way for the Drug - {drug_name}. 

    You have more information about the drug's use and its side effects below:
    {drug_report}

    Generate 15 different reports each with different side effects. Mention one or two side effects in each report at max. You have to prepare data for Entity Extraction of 2 entities: "drug_name" and "adverse_events" only.
    Follow the following format for the final output:

    [
        {{
        "input":"## Generated Report Here",
        "output": {{ "drug_name":"## Name of Drug", "adverse_events": ["side effect 1", "side effect 2"] }}
        }},
        {{
        "input":"## Generated Report Here",
        "output": {{ "drug_name":"## Name of Drug", "adverse_events": ["side effect 1", "side effect 2"] }}
        }}
    ]
    """
100
101
def create_dataset(folder_path, output_dir="data/entity_extraction_reports/"):
    """
    Create a dataset of Adverse Event Reports for the Drugs using the OpenAI Chat Completions API.

    Every `.txt` file in `folder_path` is treated as the description of one
    drug: the file name (without its extension) is the drug name and the file
    contents are the drug information. For each file a prompt is built, the
    reports are generated, the response is parsed as JSON, and the parsed
    data is written to `<output_dir>/<drug_name>.txt`.

    Args:
        folder_path (str): Path to the folder containing the Drug Information files
        output_dir (str): Folder the generated reports are written to; created
            if it does not exist

    Returns:
        None

    Raises:
        json.JSONDecodeError: If a generated response is not valid JSON
    """
    # exist_ok avoids the check-then-create race and makes reruns harmless
    os.makedirs(output_dir, exist_ok=True)

    # Sorted so the files are processed in a reproducible order
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.txt'):
            continue

        file_path = os.path.join(folder_path, filename)

        # The file contents are the information about the drug
        with open(file_path, 'r', encoding='utf-8') as file:
            drug_report = file.read()

        # splitext strips only the final extension, so a drug name that
        # itself contains a dot is not truncated
        drug_name = os.path.splitext(filename)[0]

        # Create a dynamic prompt
        prompt = create_prompt(drug_name, drug_report)

        # Generate Adverse Event Reports for the Drug
        reports = generate_adverse_event_report(prompt)

        # Parsing validates that the response is well-formed JSON
        output_list = json.loads(reports)

        # file.write() only accepts str, so the parsed object has to be
        # serialised again; json.dump writes it as readable JSON text
        output_path = os.path.join(output_dir, f"{drug_name}.txt")
        with open(output_path, 'w', encoding='utf-8') as text_file:
            json.dump(output_list, text_file, ensure_ascii=False, indent=2)
142
143
144
if __name__ == '__main__':
    # Load the .env file with the API key
    load_dotenv()

    # Parse the arguments
    parser = argparse.ArgumentParser(description="Data Preparation Script")
    parser.add_argument(
        '--folder-path',
        type=str,
        default='data/raw_drug_info/',
        help="Path to the folder containing the raw Drug Information files scraped from web",
    )
    args = parser.parse_args()

    # Create the input folder if it doesn't exist; exist_ok avoids the
    # check-then-create race of a separate os.path.exists() test
    os.makedirs(args.folder_path, exist_ok=True)

    create_dataset(args.folder_path)