growth-curves / Git / Diff of /code/init

Models:
RaymondKing/
growth-curves
Downloads: 1
Diff of /code/init_processing/aggregate.py [000000] .. [d8a979]
Switch to side-by-side view

--- a
+++ b/code/init_processing/aggregate.py
@@ -0,0 +1,63 @@
import numpy as np
import pandas as pd


def calculate_aggregations(df, groupby_attributes, percentiles):
    """Summarise height, weight and BMI for every group of patients.

    The rows of ``df`` are grouped by ``groupby_attributes``. For each group
    one output row is built holding the group keys, the number of rows in the
    group, and, for each of the columns ``"ht"``, ``"wt"`` and ``"bmi"``, the
    requested percentile values plus the mean and standard deviation.

    Percentile values are picked directly from the sorted column (no
    interpolation): the value at position ``round(p / 100 * count)``, with a
    position equal to ``count`` pulled back to the last element.

    Args:
        df: DataFrame containing the grouping columns and the ``"ht"``,
            ``"wt"`` and ``"bmi"`` columns.
        groupby_attributes: List of column names to group by. The group keys
            are read positionally: the first is stored as ``"gender"``, the
            second as ``"age"`` and, only when ``"race_ethnicity"`` is in
            the list, the third as ``"race_ethnicity"``.
        percentiles: Sequence (list, tuple or NumPy array) of percentiles on
            a 0-100 scale. Each one yields a column named
            ``"<attribute>_<percentile>"`` using ``str(percentile)``.

    Returns:
        A DataFrame with one row per group and the columns ``"gender"``,
        ``"age"``, ``"race_ethnicity"`` (``"All"`` when race/ethnicity is not
        a grouping attribute), ``"count"``, and for every attribute the
        percentile, ``"_mean"`` and ``"_std"`` columns.
    """
    ## Attributes to calculate percentiles for
    attributes = ["ht", "wt", "bmi"]

    ## Convert once so plain lists/tuples work as well as NumPy arrays
    percentile_fractions = np.asarray(percentiles, dtype=float) / 100.0

    ## Column-name suffixes come from the caller's original values so that
    ## e.g. an integer 50 is labelled "50" rather than "50.0"
    percentile_labels = [str(p) for p in percentiles]

    has_race_ethnicity = "race_ethnicity" in groupby_attributes

    ## One dictionary per group; turned into a DataFrame at the end
    df_row_list = []

    for name, group in df.groupby(groupby_attributes):
        ## Progress output (function-call form works on Python 2 and 3)
        print(name)

        df_row = dict()

        ## Group keys are positional: (gender, age[, race_ethnicity])
        df_row["gender"] = name[0]
        df_row["age"] = name[1]

        ## If race/ethnicity is not a groupby attribute, the percentiles
        ## cover all races/ethnicities lumped together under "All"
        if has_race_ethnicity:
            df_row["race_ethnicity"] = name[2]
        else:
            df_row["race_ethnicity"] = "All"

        ## Number of patients (rows) in the group
        num_patients = group.shape[0]
        df_row["count"] = num_patients

        ## Position of each percentile within the sorted values
        raw_indices = np.round(percentile_fractions * num_patients).astype(int).tolist()

        ## A position equal to the number of patients is one past the end
        ## (valid positions run from 0 to num_patients - 1), so step it back
        percentile_indices = [
            num_patients - 1 if position == num_patients else position
            for position in raw_indices
        ]

        for attribute in attributes:
            ## Sort only the column of interest; `group` itself is left
            ## untouched so each attribute starts from the same data
            ordered_values = group[attribute].sort_values()

            ## Retrieve and save the percentile values
            for label, position in zip(percentile_labels, percentile_indices):
                df_row[attribute + "_" + label] = ordered_values.iloc[position]

            ## Compute means and standard deviations
            df_row[attribute + "_mean"] = ordered_values.mean()
            df_row[attribute + "_std"] = ordered_values.std()

        df_row_list.append(df_row)

    ## Return dataframe from list of dictionaries
    return pd.DataFrame(df_row_list)