Diff of /CleanData.py [000000] .. [6e0d8e]

Switch to unified view

a b/CleanData.py
1
# Import packages
2
import numpy as np
3
import matplotlib
4
import pd
5
import pandas as pd
6
from datetime import datetime
7
import us
8
import uszipcode
9
matplotlib.rcParams['figure.figsize'] = (10, 10)
10
11
12
# first step: vectorize the columns to convert strings into floats
13
# grab the columns
14
15
clin_addresses = pd.read_csv("/Users/shania/PycharmProjects/ClinicalAttritionRateMap/updated_clinical_attrition.csv")
16
17
# define a function that converts the dates into just the month and year
18
def parse_date(date_str):
19
    if pd.isna(date_str):
20
        return np.nan  # Return null
21
    try: # if it's in a month day year format, convert to month year
22
        return datetime.strptime(date_str, '%B %d, %Y').strftime('%B %Y')
23
    except ValueError:
24
        try:
25
            # already in the month year format, keep it/no change
26
            return datetime.strptime(date_str, '%B %Y').strftime('%B %Y')
27
        except ValueError:
28
            return np.nan  # Return null if both formats fail
29
30
# Apply the parse_date function the dates columns
31
clin_addresses['New Completion Date'] = clin_addresses['Completion Date'].apply(parse_date)
32
clin_addresses['New Start Date'] = clin_addresses['Start Date'].apply(parse_date)
33
34
# convert from strings to integers
35
clin_addresses['New Completion Date'] = pd.to_datetime(clin_addresses['New Completion Date'], errors='coerce', format='%B %Y')
36
clin_addresses['New Start Date'] = pd.to_datetime(clin_addresses['New Start Date'], errors='coerce', format='%B %Y')
37
38
#  difference in days
39
clin_addresses['length_of_trial'] = (clin_addresses['New Completion Date'] - clin_addresses['New Start Date']).dt.days
40
41
# Drop the original start_date and completion_date columns
42
# clin_addresses = clin_addresses.drop(columns=['Start Date', 'Completion Date'])
43
44
# Save the updated DataFrame to a new CSV file
45
clin_addresses.to_csv("/Users/shania/PycharmProjects/ClinicalAttritionRateMap/updated_clinical_attrition.csv", index=False)
46
47
# add the ruca codes
48
ruca_codes = pd.read_excel("2006 Complete Excel RUCA file 3.xls")
49
clin_addresses['Zipcode'] = clin_addresses['Zipcode'].astype(str)
50
ruca_codes['ZIPA'] = ruca_codes['ZIPA'].astype(str)
51
final_table = pd.merge(clin_addresses, ruca_codes, left_on='Zipcode', right_on='ZIPA', how='left')
52
drop_columns = ['ZIPN', 'ZIPA', 'STNAME','COMMFLAG']
53
final_table = final_table.drop(columns=drop_columns)
54
final_table.to_csv("/Users/shania/PycharmProjects/ClinicalAttritionRateMap/final_table.csv", index=False)
55
56
# define a function for the zipcodes
57
## if the zipcodes do not have exactly 5 digits
58
## if the zipcodes contain a letter
59
## if the zipcodes contain a "dash" take the first part
60
61
def cleaned_zipcodes(final_table):
62
    search = uszipcode.SearchEngine() # search engine comes with package
63
    clean_zipcode = []
64
    for i, city in zip(final_table['Zipcode'], final_table['City']): # zip zipcodes and city together
65
        if "-" in str(i):
66
            clean_zipcode.append(str(i).split('-')[0])
67
        elif pd.isna(i):
68
            result = search.by_city(city)
69
            clean_zipcode.append(result[0].zipcode if result else None)
70
        else:
71
            clean_zipcode.append(i)
72
    return clean_zipcode
73
final_table['Cleaned_Zipcodes'] = cleaned_zipcodes(final_table)
74
final_table.to_csv("/Users/shania/PycharmProjects/ClinicalAttritionRateMap/final_table.csv", index=False)
75
76
def cleaned_states(final_table):
77
    clean_state = []
78
    for state in final_table['State']:
79
        if state in [str(s) for s in us.states.STATES]:
80
            clean_state.append(state)
81
        else:
82
            clean_state.append(None)
83
    return clean_state
84
final_table['Cleaned States'] = cleaned_states(final_table)
85
final_table.dropna(subset=['Cleaned States'], inplace=True)
86
final_table.drop(columns='Cleaned States', inplace=True)
87
final_table.to_csv("/Users/shania/PycharmProjects/ClinicalAttritionRateMap/final_table.csv", index=False)
88
89
def convert_minimum_age(age):
90
    age = str(age)
91
    if "Years" in age:
92
        return int(''.join(filter(str.isdigit, age)))
93
    elif "Months" in age:
94
        return float(''.join(filter(str.isdigit, age))) / 12
95
    else:
96
        return None
97
final_table['Minimum Age'] = final_table['Minimum Age'].apply(convert_minimum_age)
98
final_table.to_csv("/Users/shania/PycharmProjects/ClinicalAttritionRateMap/final_table.csv", index=False)