COVID19Tracking · julia326 · Jan 26, 2021 · Jan 26, 2021 · Jan 26, 2021
diff --git a/data-collection-scripts/ltc-scrapers/lambda-functions/aggregate-cumulative-outbreaks.py b/data-collection-scripts/ltc-scrapers/lambda-functions/aggregate-cumulative-outbreaks.py
@@ -0,0 +1,78 @@
+"""
+This is the content of the lambda function that aggregates cumulative and outbreak data for
+a set of states (e.g. ME) that otherwise have multiple rows per date/facility, in a case of
+several outbreaks.
+"""
+
+import io
+import json
+
+import pandas as pd
+
+
+def make_matching_column_name_map(df):
+    # make a map of corresponding column names (cumulative to current)
+    num_numeric_cols = 12  # number of metrics
+    first_metric_col = 14  # position of 1st metric, "Cumulative Resident Positives"
+    col_map = {}
+    for i in range(num_numeric_cols):
+        cumulative_col = df.columns[first_metric_col+i]
+        outbreak_col = df.columns[first_metric_col+i+num_numeric_cols]
+        col_map[cumulative_col] = outbreak_col
+    return col_map
+
+
+# takes a dataframe containing the same facility name/date data and collapses the rows.
+# Finds conceptually paired columns based on the content of col_map.
+def collapse_rows(df_group, col_map):
+    # if only one row, return the row
+    if df_group.shape[0] == 1:
+        return df_group
+
+    new_df_subset = df_group.loc[df_group['Outbreak Status'] == 'Open'].copy()
+    if new_df_subset.empty:  # no open outbreaks, but we may want to merge some closed ones
+        new_df_subset = df_group.head(1).copy()
+
+    # expecting only one row/open outbreak; if this isn't true, return the group as is
+    if new_df_subset.shape[0] > 1:
+        print('Check for duplicates: %s %s' % (
+            set(new_df_subset['Facility Name']), set(new_df_subset['Date Collected'])))
+        return df_group
+
+    for colname in col_map.keys():
+        try:
+            cumulative_val = int(df_group[colname].astype(float).sum())
+            current_open_val = int(df_group[col_map[colname]].astype(float).sum())
+            val = cumulative_val + current_open_val
+            if val > 0:
+                index = list(df_group.columns).index(colname)
+                new_df_subset.iat[0,index] = val
+        except ValueError:  # some date fields in numeric places, return group without collapsing
+            return df_group
+    return new_df_subset
+
+
+def process_df(df):
+    num_numeric_cols = 12  # number of metrics
+    first_metric_col = 14  # position of 1st metric, "Cumulative Resident Positives"
+    col_map = {}
+    for i in range(num_numeric_cols):
+        cumulative_col = df.columns[first_metric_col+i]
+        outbreak_col = df.columns[first_metric_col+i+num_numeric_cols]
+        col_map[cumulative_col] = outbreak_col
+
+    # group by facility name and date, collapse each group into one row
+    processed_df = df.groupby(
+        ['Date Collected', 'Facility Name']).apply(
+            lambda x: collapse_rows(x, col_map))
+    return processed_df
+
+
+def lambda_handler(event, context):
+    df = pd.read_csv(io.StringIO(event['body']))
+    processed_df = process_df(df)
+
+    return {
+        'statusCode': 200,
+        'body': processed_df.to_csv(index=False, header=False)  # don't return the header row
+    }