# Provenance: this module is the result of applying, in order, a three-commit
# patch series by Julia K (Tue, 26 Jan 2021):
#   c54a4b69e29b5f3bdcbae3e61aeda2bb82d79552  [PATCH 1/3] Adding a directory for
#       LTC script lambda functions; these should correspond to what we
#       actually have in prod.
#   476d1bc4ecb8e43a560d1a9dea3bb38c3c7b7afd  [PATCH 2/3] Making the aggregation
#       function more efficient for large datasets
#   907e0c5abb11a6f7076e7d506ccc689d0dfb37a3  [PATCH 3/3] Renaming file to what
#       it actually does
# Path: data-collection-scripts/ltc-scrapers/lambda-functions/
#       aggregate-cumulative-outbreaks.py (originally me.py)
"""
This is the content of the lambda function that aggregates cumulative and outbreak data for
a set of states (e.g. ME) that otherwise have multiple rows per date/facility, in the case of
several outbreaks.
"""

import io
import json

import pandas as pd

# Rows sharing both of these values describe the same facility on the same day.
_GROUP_KEYS = ['Date Collected', 'Facility Name']


def make_matching_column_name_map(df, num_numeric_cols=12, first_metric_col=14):
    """Map each cumulative metric column name to its outbreak counterpart.

    Columns are paired purely by position: the ``num_numeric_cols`` columns
    starting at ``first_metric_col`` are treated as the cumulative metrics,
    and the ``num_numeric_cols`` columns immediately after them as the
    matching outbreak metrics, in the same order.

    Args:
        df: DataFrame whose column order follows the layout above.
        num_numeric_cols: number of metrics in each of the two runs.
        first_metric_col: zero-based position of the first cumulative metric
            (the original comment names it "Cumulative Resident Positives").

    Returns:
        dict of cumulative column name -> outbreak column name.

    Raises:
        IndexError: if ``df`` has too few columns for both runs.
    """
    outbreak_start = first_metric_col + num_numeric_cols
    outbreak_end = outbreak_start + num_numeric_cols
    if len(df.columns) < outbreak_end:
        # Slicing would silently truncate; fail loudly like positional
        # indexing does, but say what was expected.
        raise IndexError(
            'expected at least %d columns (%d metric pairs starting at '
            'position %d), got %d' % (
                outbreak_end, num_numeric_cols, first_metric_col,
                len(df.columns)))
    cumulative_cols = df.columns[first_metric_col:outbreak_start]
    outbreak_cols = df.columns[outbreak_start:outbreak_end]
    return dict(zip(cumulative_cols, outbreak_cols))


def collapse_rows(df_group, col_map):
    """Collapse the rows of one facility/date group into a single row.

    The surviving row is the group's only 'Open' outbreak row or, when no
    outbreak is open, the group's first row. For every pair in ``col_map``
    the cumulative column of the surviving row is overwritten with the sum,
    over all rows of the group, of the cumulative column plus the paired
    outbreak column. Non-positive totals leave the original value in place.

    The group is returned unchanged (not collapsed) when:
      * it has exactly one row;
      * more than one row is 'Open' (a message is printed so the possible
        duplicates can be checked by hand);
      * a metric value cannot be converted to a number (e.g. a date that
        ended up in a numeric column).

    Args:
        df_group: DataFrame with the rows of one facility on one date.
        col_map: cumulative column name -> outbreak column name, as built
            by ``make_matching_column_name_map``.

    Returns:
        A one-row DataFrame, or ``df_group`` itself when not collapsed.
    """
    # if only one row, return the row
    if df_group.shape[0] == 1:
        return df_group

    collapsed = df_group.loc[df_group['Outbreak Status'] == 'Open'].copy()
    if collapsed.empty:  # no open outbreaks, but we may want to merge some closed ones
        collapsed = df_group.head(1).copy()

    # expecting only one row/open outbreak; if this isn't true, return the group as is
    if collapsed.shape[0] > 1:
        print('Check for duplicates: %s %s' % (
            set(collapsed['Facility Name']), set(collapsed['Date Collected'])))
        return df_group

    # Built once instead of once per metric; positional lookup is needed
    # because .iat addresses cells by integer position.
    column_names = list(df_group.columns)
    for cumulative_col, outbreak_col in col_map.items():
        try:
            cumulative_val = int(df_group[cumulative_col].astype(float).sum())
            current_open_val = int(df_group[outbreak_col].astype(float).sum())
            val = cumulative_val + current_open_val
            if val > 0:
                collapsed.iat[0, column_names.index(cumulative_col)] = val
        except ValueError:  # some date fields in numeric places, return group without collapsing
            return df_group
    return collapsed


def process_df(df):
    """Collapse every facility/date group of ``df`` with ``collapse_rows``.

    Groups are visited in sorted ('Date Collected', 'Facility Name') order
    and handed to ``collapse_rows`` with all of their columns. Iterating the
    groups explicitly, rather than using ``groupby(...).apply(...)``, keeps
    the grouping columns available to ``collapse_rows`` and keeps the output
    order the same regardless of the installed pandas version.

    Rows whose date or facility name is missing are not part of any group
    and therefore do not appear in the result (pandas' default ``dropna``
    behaviour for groupby).

    Args:
        df: DataFrame laid out as ``make_matching_column_name_map`` expects
            and containing 'Date Collected', 'Facility Name' and
            'Outbreak Status' columns.

    Returns:
        DataFrame with the same columns as ``df``; rows keep the index
        labels they had in ``df``.
    """
    col_map = make_matching_column_name_map(df)

    # group by facility name and date, collapse each group into one row
    pieces = [
        collapse_rows(group, col_map)
        for _, group in df.groupby(_GROUP_KEYS, sort=True)
    ]
    if not pieces:
        # pd.concat refuses an empty list; keep the column layout instead.
        return df.iloc[0:0].copy()
    return pd.concat(pieces)


def lambda_handler(event, context):
    """AWS Lambda entry point: aggregate the CSV text found in ``event['body']``.

    Args:
        event: mapping whose 'body' entry holds CSV text including a header row.
        context: Lambda context object; unused.

    Returns:
        dict with 'statusCode' 200 and 'body' holding the processed rows as
        CSV text without the header row and without the index.
    """
    df = pd.read_csv(io.StringIO(event['body']))
    processed_df = process_df(df)

    return {
        'statusCode': 200,
        'body': processed_df.to_csv(index=False, header=False)  # don't return the header row
    }