Diff of /CleanData.py [000000] .. [6e0d8e]

Switch to side-by-side view

--- a
+++ b/CleanData.py
@@ -0,0 +1,98 @@
+# Import packages
+import numpy as np
+import matplotlib
+import pd
+import pandas as pd
+from datetime import datetime
+import us
+import uszipcode
+matplotlib.rcParams['figure.figsize'] = (10, 10)
+
+
+# first step: vectorize the columns to convert strings into floats
+# grab the columns
+
+clin_addresses = pd.read_csv("/Users/shania/PycharmProjects/ClinicalAttritionRateMap/updated_clinical_attrition.csv")
+
+# define a function that converts the dates into just the month and year
+def parse_date(date_str):
+    if pd.isna(date_str):
+        return np.nan  # Return null
+    try: # if it's in a month day year format, convert to month year
+        return datetime.strptime(date_str, '%B %d, %Y').strftime('%B %Y')
+    except ValueError:
+        try:
+            # already in the month year format, keep it/no change
+            return datetime.strptime(date_str, '%B %Y').strftime('%B %Y')
+        except ValueError:
+            return np.nan  # Return null if both formats fail
+
+# Apply the parse_date function the dates columns
+clin_addresses['New Completion Date'] = clin_addresses['Completion Date'].apply(parse_date)
+clin_addresses['New Start Date'] = clin_addresses['Start Date'].apply(parse_date)
+
+# convert from strings to integers
+clin_addresses['New Completion Date'] = pd.to_datetime(clin_addresses['New Completion Date'], errors='coerce', format='%B %Y')
+clin_addresses['New Start Date'] = pd.to_datetime(clin_addresses['New Start Date'], errors='coerce', format='%B %Y')
+
+#  difference in days
+clin_addresses['length_of_trial'] = (clin_addresses['New Completion Date'] - clin_addresses['New Start Date']).dt.days
+
+# Drop the original start_date and completion_date columns
+# clin_addresses = clin_addresses.drop(columns=['Start Date', 'Completion Date'])
+
+# Save the updated DataFrame to a new CSV file
+clin_addresses.to_csv("/Users/shania/PycharmProjects/ClinicalAttritionRateMap/updated_clinical_attrition.csv", index=False)
+
+# add the ruca codes
+ruca_codes = pd.read_excel("2006 Complete Excel RUCA file 3.xls")
+clin_addresses['Zipcode'] = clin_addresses['Zipcode'].astype(str)
+ruca_codes['ZIPA'] = ruca_codes['ZIPA'].astype(str)
+final_table = pd.merge(clin_addresses, ruca_codes, left_on='Zipcode', right_on='ZIPA', how='left')
+drop_columns = ['ZIPN', 'ZIPA', 'STNAME','COMMFLAG']
+final_table = final_table.drop(columns=drop_columns)
+final_table.to_csv("/Users/shania/PycharmProjects/ClinicalAttritionRateMap/final_table.csv", index=False)
+
+# define a function for the zipcodes
+## if the zipcodes do not have exactly 5 digits
+## if the zipcodes contain a letter
+## if the zipcodes contain a "dash" take the first part
+
+def cleaned_zipcodes(final_table):
+    search = uszipcode.SearchEngine() # search engine comes with package
+    clean_zipcode = []
+    for i, city in zip(final_table['Zipcode'], final_table['City']): # zip zipcodes and city together
+        if "-" in str(i):
+            clean_zipcode.append(str(i).split('-')[0])
+        elif pd.isna(i):
+            result = search.by_city(city)
+            clean_zipcode.append(result[0].zipcode if result else None)
+        else:
+            clean_zipcode.append(i)
+    return clean_zipcode
+final_table['Cleaned_Zipcodes'] = cleaned_zipcodes(final_table)
+final_table.to_csv("/Users/shania/PycharmProjects/ClinicalAttritionRateMap/final_table.csv", index=False)
+
+def cleaned_states(final_table):
+    clean_state = []
+    for state in final_table['State']:
+        if state in [str(s) for s in us.states.STATES]:
+            clean_state.append(state)
+        else:
+            clean_state.append(None)
+    return clean_state
+final_table['Cleaned States'] = cleaned_states(final_table)
+final_table.dropna(subset=['Cleaned States'], inplace=True)
+final_table.drop(columns='Cleaned States', inplace=True)
+final_table.to_csv("/Users/shania/PycharmProjects/ClinicalAttritionRateMap/final_table.csv", index=False)
+
+def convert_minimum_age(age):
+    age = str(age)
+    if "Years" in age:
+        return int(''.join(filter(str.isdigit, age)))
+    elif "Months" in age:
+        return float(''.join(filter(str.isdigit, age))) / 12
+    else:
+        return None
+final_table['Minimum Age'] = final_table['Minimum Age'].apply(convert_minimum_age)
+final_table.to_csv("/Users/shania/PycharmProjects/ClinicalAttritionRateMap/final_table.csv", index=False)