|
a |
|
b/CleanData.py |
|
|
1 |
# Import packages |
|
|
2 |
import numpy as np |
|
|
3 |
import matplotlib |
|
|
4 |
import pd |
|
|
5 |
import pandas as pd |
|
|
6 |
from datetime import datetime |
|
|
7 |
import us |
|
|
8 |
import uszipcode |
|
|
9 |
matplotlib.rcParams['figure.figsize'] = (10, 10) |
|
|
10 |
|
|
|
11 |
|
|
|
12 |
# first step: vectorize the columns to convert strings into floats |
|
|
13 |
# grab the columns |
|
|
14 |
|
|
|
15 |
clin_addresses = pd.read_csv("/Users/shania/PycharmProjects/ClinicalAttritionRateMap/updated_clinical_attrition.csv") |
|
|
16 |
|
|
|
17 |
# define a function that converts the dates into just the month and year |
|
|
18 |
def parse_date(date_str): |
|
|
19 |
if pd.isna(date_str): |
|
|
20 |
return np.nan # Return null |
|
|
21 |
try: # if it's in a month day year format, convert to month year |
|
|
22 |
return datetime.strptime(date_str, '%B %d, %Y').strftime('%B %Y') |
|
|
23 |
except ValueError: |
|
|
24 |
try: |
|
|
25 |
# already in the month year format, keep it/no change |
|
|
26 |
return datetime.strptime(date_str, '%B %Y').strftime('%B %Y') |
|
|
27 |
except ValueError: |
|
|
28 |
return np.nan # Return null if both formats fail |
|
|
29 |
|
|
|
30 |
# Apply the parse_date function the dates columns |
|
|
31 |
clin_addresses['New Completion Date'] = clin_addresses['Completion Date'].apply(parse_date) |
|
|
32 |
clin_addresses['New Start Date'] = clin_addresses['Start Date'].apply(parse_date) |
|
|
33 |
|
|
|
34 |
# convert from strings to integers |
|
|
35 |
clin_addresses['New Completion Date'] = pd.to_datetime(clin_addresses['New Completion Date'], errors='coerce', format='%B %Y') |
|
|
36 |
clin_addresses['New Start Date'] = pd.to_datetime(clin_addresses['New Start Date'], errors='coerce', format='%B %Y') |
|
|
37 |
|
|
|
38 |
# difference in days |
|
|
39 |
clin_addresses['length_of_trial'] = (clin_addresses['New Completion Date'] - clin_addresses['New Start Date']).dt.days |
|
|
40 |
|
|
|
41 |
# Drop the original start_date and completion_date columns |
|
|
42 |
# clin_addresses = clin_addresses.drop(columns=['Start Date', 'Completion Date']) |
|
|
43 |
|
|
|
44 |
# Save the updated DataFrame to a new CSV file |
|
|
45 |
clin_addresses.to_csv("/Users/shania/PycharmProjects/ClinicalAttritionRateMap/updated_clinical_attrition.csv", index=False) |
|
|
46 |
|
|
|
47 |
# add the ruca codes |
|
|
48 |
ruca_codes = pd.read_excel("2006 Complete Excel RUCA file 3.xls") |
|
|
49 |
clin_addresses['Zipcode'] = clin_addresses['Zipcode'].astype(str) |
|
|
50 |
ruca_codes['ZIPA'] = ruca_codes['ZIPA'].astype(str) |
|
|
51 |
final_table = pd.merge(clin_addresses, ruca_codes, left_on='Zipcode', right_on='ZIPA', how='left') |
|
|
52 |
drop_columns = ['ZIPN', 'ZIPA', 'STNAME','COMMFLAG'] |
|
|
53 |
final_table = final_table.drop(columns=drop_columns) |
|
|
54 |
final_table.to_csv("/Users/shania/PycharmProjects/ClinicalAttritionRateMap/final_table.csv", index=False) |
|
|
55 |
|
|
|
56 |
# define a function for the zipcodes |
|
|
57 |
## if the zipcodes do not have exactly 5 digits |
|
|
58 |
## if the zipcodes contain a letter |
|
|
59 |
## if the zipcodes contain a "dash" take the first part |
|
|
60 |
|
|
|
61 |
def cleaned_zipcodes(final_table): |
|
|
62 |
search = uszipcode.SearchEngine() # search engine comes with package |
|
|
63 |
clean_zipcode = [] |
|
|
64 |
for i, city in zip(final_table['Zipcode'], final_table['City']): # zip zipcodes and city together |
|
|
65 |
if "-" in str(i): |
|
|
66 |
clean_zipcode.append(str(i).split('-')[0]) |
|
|
67 |
elif pd.isna(i): |
|
|
68 |
result = search.by_city(city) |
|
|
69 |
clean_zipcode.append(result[0].zipcode if result else None) |
|
|
70 |
else: |
|
|
71 |
clean_zipcode.append(i) |
|
|
72 |
return clean_zipcode |
|
|
73 |
final_table['Cleaned_Zipcodes'] = cleaned_zipcodes(final_table) |
|
|
74 |
final_table.to_csv("/Users/shania/PycharmProjects/ClinicalAttritionRateMap/final_table.csv", index=False) |
|
|
75 |
|
|
|
76 |
def cleaned_states(final_table): |
|
|
77 |
clean_state = [] |
|
|
78 |
for state in final_table['State']: |
|
|
79 |
if state in [str(s) for s in us.states.STATES]: |
|
|
80 |
clean_state.append(state) |
|
|
81 |
else: |
|
|
82 |
clean_state.append(None) |
|
|
83 |
return clean_state |
|
|
84 |
final_table['Cleaned States'] = cleaned_states(final_table) |
|
|
85 |
final_table.dropna(subset=['Cleaned States'], inplace=True) |
|
|
86 |
final_table.drop(columns='Cleaned States', inplace=True) |
|
|
87 |
final_table.to_csv("/Users/shania/PycharmProjects/ClinicalAttritionRateMap/final_table.csv", index=False) |
|
|
88 |
|
|
|
89 |
def convert_minimum_age(age): |
|
|
90 |
age = str(age) |
|
|
91 |
if "Years" in age: |
|
|
92 |
return int(''.join(filter(str.isdigit, age))) |
|
|
93 |
elif "Months" in age: |
|
|
94 |
return float(''.join(filter(str.isdigit, age))) / 12 |
|
|
95 |
else: |
|
|
96 |
return None |
|
|
97 |
final_table['Minimum Age'] = final_table['Minimum Age'].apply(convert_minimum_age) |
|
|
98 |
final_table.to_csv("/Users/shania/PycharmProjects/ClinicalAttritionRateMap/final_table.csv", index=False) |