From c54a4b69e29b5f3bdcbae3e61aeda2bb82d79552 Mon Sep 17 00:00:00 2001
From: Julia K <julia326@gmail.com>
Date: Tue, 26 Jan 2021 15:50:55 -0500
Subject: [PATCH 1/3] Adding a directory for LTC script lambda functions; these
 should correspond to what we actually have in prod.

---
 .../ltc-scrapers/lambda-functions/me.py       | 63 +++++++++++++++++++
 1 file changed, 63 insertions(+)
 create mode 100644 data-collection-scripts/ltc-scrapers/lambda-functions/me.py

diff --git a/data-collection-scripts/ltc-scrapers/lambda-functions/me.py b/data-collection-scripts/ltc-scrapers/lambda-functions/me.py
new file mode 100644
index 00000000000..e2161cf31d1
--- /dev/null
+++ b/data-collection-scripts/ltc-scrapers/lambda-functions/me.py
@@ -0,0 +1,63 @@
+"""
+This is the content of the lambda function that aggregates cumulative and outbreak data for
+a set of states (e.g. ME) that would otherwise have multiple rows per date/facility when a
+facility has had several outbreaks.
+"""
+
+import io
+import json
+
+import pandas as pd
+
+
+def make_matching_column_name_map(df):
+    # map each cumulative column name to the name of its matching current-outbreak column
+    num_numeric_cols = 12  # number of metrics
+    first_metric_col = 14  # position of 1st metric, "Cumulative Resident Positives"
+    col_map = {}
+    for i in range(num_numeric_cols):
+        cumulative_col = df.columns[first_metric_col+i]
+        outbreak_col = df.columns[first_metric_col+i+num_numeric_cols]
+        col_map[cumulative_col] = outbreak_col
+    return col_map
+
+
+# takes a dataframe containing the same facility name/date data and collapses the rows.
+# Finds conceptually paired columns based on the content of col_map.
+def collapse_rows(df_group, col_map):
+    new_df_subset = df_group.loc[df_group['Outbreak Status'] == 'Open'].copy()
+    if new_df_subset.empty:  # no open outbreaks
+        new_df_subset = df_group.head(1).copy()
+    assert new_df_subset.shape[0] == 1  # expecting only one row/open outbreak
+    for colname in col_map.keys():
+        try:
+            cumulative_val = int(df_group[colname].astype(float).sum())
+            current_open_val = int(df_group[col_map[colname]].astype(float).sum())
+            val = cumulative_val + current_open_val
+            if val > 0:
+                index = list(df_group.columns).index(colname)
+                new_df_subset.iat[0,index] = val
+        except ValueError:  # some date fields in numeric places, return group as is without collapsing
+            return df_group
+    return new_df_subset
+
+
+def lambda_handler(event, context):
+    me_df = pd.read_csv(io.StringIO(event['body']))
+    num_numeric_cols = 12  # number of metrics
+    first_metric_col = 14  # position of 1st metric, "Cumulative Resident Positives"
+    col_map = {}
+    for i in range(num_numeric_cols):
+        cumulative_col = me_df.columns[first_metric_col+i]
+        outbreak_col = me_df.columns[first_metric_col+i+num_numeric_cols]
+        col_map[cumulative_col] = outbreak_col
+
+    # group by facility name and date, collapse each group into one row
+    processed_df = me_df.groupby(
+        ['Date Collected', 'Facility Name']).apply(
+            lambda x: collapse_rows(x, col_map))
+    
+    return {
+        'statusCode': 200,
+        'body': processed_df.to_csv(index=False, header=False)  # don't return the header row
+    }

From 476d1bc4ecb8e43a560d1a9dea3bb38c3c7b7afd Mon Sep 17 00:00:00 2001
From: Julia K <julia326@gmail.com>
Date: Tue, 26 Jan 2021 16:46:45 -0500
Subject: [PATCH 2/3] Making the aggregation function more efficient for large
 datasets

---
 .../ltc-scrapers/lambda-functions/me.py       | 31 ++++++++++++++-----
 1 file changed, 23 insertions(+), 8 deletions(-)

diff --git a/data-collection-scripts/ltc-scrapers/lambda-functions/me.py b/data-collection-scripts/ltc-scrapers/lambda-functions/me.py
index e2161cf31d1..2315599b1d4 100644
--- a/data-collection-scripts/ltc-scrapers/lambda-functions/me.py
+++ b/data-collection-scripts/ltc-scrapers/lambda-functions/me.py
@@ -25,10 +25,20 @@ def make_matching_column_name_map(df):
 # takes a dataframe containing the same facility name/date data and collapses the rows.
 # Finds conceptually paired columns based on the content of col_map.
 def collapse_rows(df_group, col_map):
+    # a single-row group has nothing to collapse; return it unchanged
+    if df_group.shape[0] == 1:
+        return df_group
+
     new_df_subset = df_group.loc[df_group['Outbreak Status'] == 'Open'].copy()
-    if new_df_subset.empty:  # no open outbreaks
+    if new_df_subset.empty:  # no open outbreaks, but we may want to merge some closed ones
         new_df_subset = df_group.head(1).copy()
-    assert new_df_subset.shape[0] == 1  # expecting only one row/open outbreak
+
+    # expecting only one row/open outbreak; if this isn't true, return the group as is
+    if new_df_subset.shape[0] > 1:
+        print('Check for duplicates: %s %s' % (
+            set(new_df_subset['Facility Name']), set(new_df_subset['Date Collected'])))
+        return df_group
+
     for colname in col_map.keys():
         try:
             cumulative_val = int(df_group[colname].astype(float).sum())
@@ -37,25 +47,30 @@ def collapse_rows(df_group, col_map):
             if val > 0:
                 index = list(df_group.columns).index(colname)
                 new_df_subset.iat[0,index] = val
-        except ValueError:  # some date fields in numeric places, return group as is without collapsing
+        except ValueError:  # some date fields in numeric places, return group without collapsing
             return df_group
     return new_df_subset
 
 
-def lambda_handler(event, context):
-    me_df = pd.read_csv(io.StringIO(event['body']))
+def process_df(df):
     num_numeric_cols = 12  # number of metrics
     first_metric_col = 14  # position of 1st metric, "Cumulative Resident Positives"
     col_map = {}
     for i in range(num_numeric_cols):
-        cumulative_col = me_df.columns[first_metric_col+i]
-        outbreak_col = me_df.columns[first_metric_col+i+num_numeric_cols]
+        cumulative_col = df.columns[first_metric_col+i]
+        outbreak_col = df.columns[first_metric_col+i+num_numeric_cols]
         col_map[cumulative_col] = outbreak_col
 
     # group by facility name and date, collapse each group into one row
-    processed_df = me_df.groupby(
+    processed_df = df.groupby(
         ['Date Collected', 'Facility Name']).apply(
             lambda x: collapse_rows(x, col_map))
+    return processed_df
+
+
+def lambda_handler(event, context):
+    df = pd.read_csv(io.StringIO(event['body']))
+    processed_df = process_df(df)
     
     return {
         'statusCode': 200,

From 907e0c5abb11a6f7076e7d506ccc689d0dfb37a3 Mon Sep 17 00:00:00 2001
From: Julia K <julia326@gmail.com>
Date: Tue, 26 Jan 2021 16:48:02 -0500
Subject: [PATCH 3/3] Renaming file to what it actually does

---
 .../lambda-functions/{me.py => aggregate-cumulative-outbreaks.py} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename data-collection-scripts/ltc-scrapers/lambda-functions/{me.py => aggregate-cumulative-outbreaks.py} (100%)

diff --git a/data-collection-scripts/ltc-scrapers/lambda-functions/me.py b/data-collection-scripts/ltc-scrapers/lambda-functions/aggregate-cumulative-outbreaks.py
similarity index 100%
rename from data-collection-scripts/ltc-scrapers/lambda-functions/me.py
rename to data-collection-scripts/ltc-scrapers/lambda-functions/aggregate-cumulative-outbreaks.py