diff --git a/Data/df.Rds b/Data/df.Rds index a3d59c6..14b06ae 100644 Binary files a/Data/df.Rds and b/Data/df.Rds differ diff --git a/Functions/modelEcoli.R b/Functions/modelEcoli.R index 7984d62..d9cbd0a 100644 --- a/Functions/modelEcoli.R +++ b/Functions/modelEcoli.R @@ -5,54 +5,37 @@ modelEcoli <- function(trainData, testData, threshBegin, threshEnd, thresh, prod test_vars <- ncol(testData) model <- randomForest(Escherichia.coli ~ ., data = trainData[, - c(1:(train_vars - 1))]) + c(1:(train_vars - 2))], + ntree = 1000, + importance = TRUE, + proxmity = TRUE) + # maxnodes = 20) testData$predictionRF <- predict(model, testData[,c(1:(test_vars-2))]) tp <- c() fn <- c() tn <- c() fp <- c() - tpUSGS <- c() - fnUSGS <- c() - tnUSGS <- c() - fpUSGS <- c() tpr <- c() fpr <- c() - tprUSGS <- c() - fprUSGS <- c() precision <- c() recall <- c() - precisionUSGS <- c() - recallUSGS <- c() thresholds <- c() predictions <- data.frame() testData$actual_binary <- ifelse(testData$Escherichia.coli >= 235, 1, 0) for (threshold in seq(threshBegin, threshEnd, 1)) { testData$predictionRF_binary <- ifelse(testData$predictionRF >= threshold, 1, 0) - testData$USGS_binary <- ifelse(testData$Predicted.Level >= threshold, 1, 0) testData$true_positive <- ifelse((testData$actual_binary == 1 & testData$predictionRF_binary == 1), 1, 0) testData$true_negative <- ifelse((testData$actual_binary == 0 & testData$predictionRF_binary == 0), 1, 0) testData$false_negative <- ifelse((testData$actual_binary == 1 & testData$predictionRF_binary == 0), 1, 0) testData$false_positive <- ifelse((testData$actual_binary == 0 & testData$predictionRF_binary == 1), 1, 0) - testData$true_positiveUSGS <- ifelse((testData$actual_binary == 1 & testData$USGS_binary == 1), 1, 0) - testData$true_negativeUSGS <- ifelse((testData$actual_binary == 0 & testData$USGS_binary == 0), 1, 0) - testData$false_negativeUSGS <- ifelse((testData$actual_binary == 1 & testData$USGS_binary == 0), 1, 0) - testData$false_positiveUSGS <- ifelse((testData$actual_binary == 0 & testData$USGS_binary == 1), 1, 0) tp <- c(tp, sum(testData$true_positive)) fn <- c(fn, sum(testData$false_negative)) tn <- c(tn, sum(testData$true_negative)) fp <- c(fp, sum(testData$false_positive)) - tpUSGS <- c(tpUSGS, sum(testData$true_positiveUSGS)) - fnUSGS <- c(fnUSGS, sum(testData$false_negativeUSGS)) - tnUSGS <- c(tnUSGS, sum(testData$true_negativeUSGS)) - fpUSGS <- c(fpUSGS, sum(testData$false_positiveUSGS)) tpr = c(tpr, sum(testData$true_positive) / (sum(testData$true_positive) + sum(testData$false_negative))) fpr = c(fpr, sum(testData$false_positive) / (sum(testData$false_positive) + sum(testData$true_negative))) - tprUSGS <- c(tprUSGS, sum(testData$true_positiveUSGS) / (sum(testData$true_positiveUSGS) + sum(testData$false_negativeUSGS))) - fprUSGS <- c(fprUSGS, sum(testData$false_positiveUSGS) / (sum(testData$false_positiveUSGS) + sum(testData$true_negativeUSGS))) precision = c(precision, sum(testData$true_positive) / (sum(testData$true_positive) + sum(testData$false_positive))) recall = c(recall, sum(testData$true_positive) / (sum(testData$true_positive) + sum(testData$false_negative))) - precisionUSGS <- c(precisionUSGS, sum(testData$true_positiveUSGS) / (sum(testData$true_positiveUSGS) + sum(testData$false_positiveUSGS))) - recallUSGS <- c(recallUSGS, sum(testData$true_positiveUSGS) / (sum(testData$true_positiveUSGS) + sum(testData$false_negativeUSGS))) thresholds <- c(thresholds, threshold) if (threshold == thresh) { predictions <- rbind(predictions, testData) @@ -64,20 +47,12 @@ modelEcoli <- function(trainData, testData, threshBegin, threshEnd, thresh, prod } list("tpr"=tpr, "fpr"=fpr, - "tprUSGS"=tprUSGS, - "fprUSGS"=fprUSGS, "precision"=precision, "recall"=recall, - "precisionUSGS"=precisionUSGS, - "recallUSGS"=recallUSGS, "tp"=tp, "fn"=fn, "tn"=tn, "fp"=fp, - "tpUSGS"=tpUSGS, - "fnUSGS"=fnUSGS, - "tnUSGS"=tnUSGS, - "fpUSGS"=fpUSGS, "thresholds"=thresholds, "predictions"=predictions) } diff --git a/Master.R b/Master.R index af2fbde..550c954 100644 --- a/Master.R +++ b/Master.R @@ -29,8 +29,8 @@ source("R/00_Startup.R") df <- readRDS(paste0(getwd(),"/Data/df.Rds")) # remove prior modeling variables before starting up a new model -keep <- list("df", "modelCurves", "modelEcoli") -rm(list=ls()[!ls() %in% keep]) +# keep <- list("df", "modelCurves", "modelEcoli") +# rm(list=ls()[!ls() %in% keep]) #------------------------------------------------------------------------------- # CHOOSE PREDICTORS @@ -42,44 +42,51 @@ df_model <- df[, c("Escherichia.coli", #dependent variable "Client.ID", # "precipProbability", # "Water.Level", - "Rogers_Escherichia.coli", + # "n12th_Escherichia.coli", + "Foster_Escherichia.coli", + # "Ohio_Escherichia.coli", + "North_Avenue_Escherichia.coli", + "n31st_Escherichia.coli", + "Leone_Escherichia.coli", + # "Albion_Escherichia.coli", + # "Rogers_Escherichia.coli", # "Howard_Escherichia.coli", - # "n57th_Escherichia.coli", + # "n57th_Escherichia.coli", # "n63rd_Escherichia.coli", - # "South_Shore_Escherichia.coli", + "South_Shore_Escherichia.coli", # "Montrose_Escherichia.coli", # "Calumet_Escherichia.coli", # "Rainbow_Escherichia.coli", # "Ohio_DNA.Geo.Mean", # "North_Avenue_DNA.Geo.Mean", - "n63rd_DNA.Geo.Mean", - "South_Shore_DNA.Geo.Mean", - "Montrose_DNA.Geo.Mean", - "Calumet_DNA.Geo.Mean", - "Rainbow_DNA.Geo.Mean", - "Date", #Must use for splitting data, not included in model - "Predicted.Level" #Must use for USGS model comparison, not included in model - )] -# to run without USGS for comparison, comment out "Predicted.Level" above and uncomment next line -# df_model$Predicted.Level <- 1 #meaningless value + # "n63rd_DNA.Geo.Mean", + # "South_Shore_DNA.Geo.Mean", + # "Montrose_DNA.Geo.Mean", + # "Calumet_DNA.Geo.Mean", + # "Rainbow_DNA.Geo.Mean", + "Year", #used for splitting data + "Date" #used for splitting data +)] + +finaltest <- df_model[df_model$Year == "2016",] #------------------------------------------------------------------------------- # CHOOSE TEST/TRAIN SETS # You can decide whether to use kFolds cross validation or define your own sets # If you set kFolds to TRUE, the data will be separated into 10 sets # If you set kFolds to FALSE, the model will use trainStart, trainEnd, etc. (see below) +# CANNOT BE USED IF productionMode = TRUE #------------------------------------------------------------------------------- -kFolds <- TRUE #If TRUE next 4 lines will not be used but cannot be commented out -trainStart <- "2006-01-01" -trainEnd <- "2015-12-31" -testStart <- "2016-01-01" -testEnd <- "2016-12-31" +kFolds <- FALSE #If TRUE next 2 lines will not be used but cannot be commented out +testYears <- c("2016") +trainYears <- c("2006", "2007", "2008", "2009","2010", "2011", "2012", "2013", "2014", "2015") +# trainYears <- trainYears[! trainYears %in% testYears] # If productionMode is set to TRUE, a file named model.Rds will be generated # Its used is explained at https://github.com/Chicago/clear-water-app -# Set trainStart and trainEnd to what you would like the model to train on -# testStart and testEnd must still be specified, although not applicable +# Set trainYears to what you would like the model to train on +# testYears must still be specified, although not applicable # plots will not be accurate productionMode <- FALSE @@ -93,9 +100,9 @@ productionMode <- FALSE # downsample settings downsample <- FALSE #If FALSE comment out the next 3 lines -# highMin <- 200 -# highMax <- 2500 -# lowMax <- 200 +highMin <- 235 +highMax <- 2500 +lowMax <- 235 #------------------------------------------------------------------------------- @@ -108,24 +115,24 @@ downsample <- FALSE #If FALSE comment out the next 3 lines excludeBeaches <- c( # "12th", - # "31st", + "31st", # "39th", # "57th", "63rd", # "Albion", "Calumet", - # "Foster", + "Foster", # "Howard", # "Jarvis", # "Juneway", - # "Leone", + "Leone", "Montrose", - # "North Avenue", + "North Avenue", # "Oak Street", - # "Ohio", + "Ohio", # "Osterman", "Rainbow", - "Rogers", + # "Rogers", "South Shore" ) @@ -137,15 +144,11 @@ excludeBeaches <- c( title1 <- paste0("ROC", if(kFolds == TRUE) " - kFolds", if(kFolds == FALSE) " - validate on ", - if(kFolds == FALSE) testStart, - if(kFolds == FALSE) " to ", - if(kFolds == FALSE) testEnd) + if(kFolds == FALSE) testYears) title2 <- paste0("PR Curve", if(kFolds == TRUE) " - kFolds", if(kFolds == FALSE) " - validate on ", - if(kFolds == FALSE) testStart, - if(kFolds == FALSE) " to ", - if(kFolds == FALSE) testEnd) + if(kFolds == FALSE) testYears) #------------------------------------------------------------------------------- @@ -154,18 +157,18 @@ title2 <- paste0("PR Curve", #------------------------------------------------------------------------------- threshBegin <- 1 -threshEnd <- 500 +threshEnd <- 1000 thresh <- 235 #------------------------------------------------------------------------------- # RUN MODEL -# Plots will generate and results will be saved in "model_summary) +# Plots will generate and results will be saved in "model_summary" #------------------------------------------------------------------------------- # runs all modeling code -source("R/30_model.R", print.eval=TRUE) +source("R/30_Model.R", print.eval=TRUE) # creates a data frame with all model results # this aggregates the folds to generate one single curve @@ -174,18 +177,31 @@ model_summary <- plot_data %>% group_by(thresholds) %>% summarize(tpr = mean(tpr), fpr = mean(fpr), - tprUSGS = mean(tprUSGS), - fprUSGS = mean(fprUSGS), precision = mean(precision, na.rm = TRUE), recall = mean(recall), - precisionUSGS = mean(precisionUSGS, na.rm = TRUE), - recallUSGS = mean(recallUSGS), tp = mean(tp), fn = mean(fn), tn = mean(tn), - fp = mean(fp), - tpUSGS = mean(tpUSGS), - fnUSGS = mean(fnUSGS), - tnUSGS = mean(tnUSGS), - fpUSGS = mean(fpUSGS) - ) + fp = mean(fp) + ) + +# saveRDS(model_summary, paste0("model_results/y", testYears, ".Rds")) + +## final holdout validation + +model <- readRDS("model.Rds") +finalthresh <- 381 +finaltest <- finaltest[!finaltest$Client.ID %in% excludeBeaches,] +finaltest <- finaltest[complete.cases(finaltest),] +finaltest$prediction <- predict(model, finaltest) +finaltest$actualbin <- ifelse(finaltest$Escherichia.coli >= 235, 1, 0) +finaltest$predbin <- ifelse(finaltest$prediction >= finalthresh, 1, 0) +finaltest$tp <- ifelse(finaltest$actualbin == 1 & finaltest$predbin == 1, 1, 0) +finaltest$tn <- ifelse(finaltest$actualbin == 0 & finaltest$predbin == 0, 1, 0) +finaltest$fp <- ifelse(finaltest$actualbin == 0 & finaltest$predbin == 1, 1, 0) +finaltest$fn <- ifelse(finaltest$actualbin == 1 & finaltest$predbin == 0, 1, 0) +sum(finaltest$tp) +sum(finaltest$fn) +sum(finaltest$tn) +sum(finaltest$fp) + diff --git a/R/20_Clean.R b/R/20_Clean.R index e008f10..9f1c3be 100644 --- a/R/20_Clean.R +++ b/R/20_Clean.R @@ -83,7 +83,7 @@ for(i in 1:length(beach_dup$Year)){ if ( is.na(prod(as.numeric(check$Escherichia.coli,na.rm=TRUE)))){ df[row.names(check)[1],]$Escherichia.coli<-NA } - else{ + else if (!is.na(df[row.names(check)[1],]$Escherichia.coli)){ #Produce a geomean of all the duplicated beaches and assign it to the first #beach in a datafr),] df[row.names(check)[1],]$Escherichia.coli <-prod(as.numeric(check$Escherichia.coli,na.rm=TRUE))^(1/length(check$Escherichia.coli)) @@ -97,6 +97,9 @@ rm(beach_dup,daily_beach_obs,mult_obs_beach,check,drop_list,i,j) #Get rid of the duplicated rows df<-df[!duplicated(df[,1:4]),] +#remove artifact NA rows +df <- df[!is.na(df$DayOfYear),] + #add new columns for predictor beaches. #We will use the same-day test results for these beaches to predict other beaches for (beach in unique(df$Client.ID)) { @@ -126,6 +129,8 @@ names(df)[names(df) == "North Avenue_DNA.Geo.Mean"] <- "North_Avenue_DNA.Geo.Mea names(df)[names(df) == "Oak Street_DNA.Geo.Mean"] <- "Oak_Street_DNA.Geo.Mean" names(df)[names(df) == "South Shore_DNA.Geo.Mean"] <- "South_Shore_DNA.Geo.Mean" names(df)[names(df) == "South Shore_Escherichia.coli"] <- "South_Shore_Escherichia.coli" +names(df)[names(df) == "North Avenue_Escherichia.coli"] <- "North_Avenue_Escherichia.coli" + #remove times from Date variable diff --git a/R/30_Model.R b/R/30_Model.R index 78e9a69..a8aed93 100644 --- a/R/30_Model.R +++ b/R/30_Model.R @@ -2,12 +2,11 @@ # 2) model and 3) plot curves model_cols <- (ncol(df_model)) - +set.seed(111) if (kFolds & !productionMode) { print("Modeling with 10 folds validation") df_model <- df_model[complete.cases(df_model),] #remove NAs from df_model - set.seed(111) dates <- unique(df_model$Date) fold_size <- .1 * length(dates) dates_sample <- sample(dates, fold_size) @@ -19,15 +18,15 @@ if (kFolds & !productionMode) { testDays <- dates_sample testData <- df_model[df_model$Date %in% testDays, ] testData <- testData[which(!testData$Client.ID %in% excludeBeaches),] - trainData <- df_model[!df_model$Date %in% testDays, c(1:ncol(df_model) - 1)] + trainData <- df_model[!df_model$Date %in% testDays, c(1:(ncol(df_model) - 2))] trainData <- trainData[which(!trainData$Client.ID %in% excludeBeaches),] if (downsample) { train_high <- trainData[trainData$Escherichia.coli >= highMin & trainData$Escherichia.coli < highMax, ] train_low <- trainData[trainData$Escherichia.coli < lowMax, ] - # only use as many low days as you have high days + # use 1:5 ratio of high days to total days ind <- sample(c(1:nrow(train_low)), - nrow(train_high), + nrow(train_high) * 4, replace = TRUE) train_balanced <- rbind(train_high, train_low[ind, ]) trainData <- train_balanced @@ -36,20 +35,12 @@ if (kFolds & !productionMode) { fold_data <- data.frame(fold, "tpr" = model$tpr, "fpr" = model$fpr, - "tprUSGS" = model$tprUSGS, - "fprUSGS" = model$fprUSGS, "precision" = model$precision, "recall" = model$recall, - "precisionUSGS" = model$precisionUSGS, - "recallUSGS" = model$recallUSGS, "tp" = model$tp, "fn" = model$fn, "tn" = model$tn, "fp" = model$fp, - "tpUSGS" = model$tpUSGS, - "fnUSGS" = model$fnUSGS, - "tnUSGS" = model$tnUSGS, - "fpUSGS" = model$fpUSGS, "thresholds" = model$thresholds) plot_data <- rbind(plot_data, fold_data) predictions <- rbind(predictions, model$predictions) @@ -57,7 +48,6 @@ if (kFolds & !productionMode) { remaining_dates <- dates[!dates %in% used_dates] if (fold < 10) dates_sample <- sample(remaining_dates, fold_size) } - names(predictions)[names(predictions) == "Predicted.Level"] <- "USGS.Prediction" names(predictions)[names(predictions) == "predictionRF"] <- "DNAModel.Prediction" plot_data$fold <- as.factor(plot_data$fold) plot_data <- plot_data %>% @@ -65,65 +55,46 @@ if (kFolds & !productionMode) { summarize(tp = sum(tp), fn = sum(fn), tn = sum(tn), - fp = sum(fp), - tpUSGS = sum(tpUSGS), - fnUSGS = sum(fnUSGS), - tnUSGS = sum(tnUSGS), - fpUSGS = sum(fpUSGS) + fp = sum(fp) ) plot_data <- mutate(plot_data, tpr = tp/(tp+fn), fpr = fp/(fp+tn), precision = tp/(tp+fp), - recall = tp/(tp+fn), - tprUSGS = tpUSGS/(tpUSGS+fnUSGS), - fprUSGS = fpUSGS/(fpUSGS+tnUSGS), - precisionUSGS = tpUSGS/(tpUSGS+fpUSGS), - recallUSGS = tpUSGS/(tpUSGS+fnUSGS) + recall = tp/(tp+fn) ) p <- ggplot(data = plot_data) print(p + - geom_smooth(aes(x = fpr, y = tpr, - color = "DNA Model"), - span = .9) + - geom_smooth(aes(x = fprUSGS, y = tprUSGS, - color = "USGS Model"), - span = .9) + + geom_line(aes(x = fpr, y = tpr, + color = "DNA Model")) + ylim(0,1) + xlim(0,1) + ggtitle(title1)) print(p + - geom_smooth(aes(x = recall, y = precision, - color = "DNA Model"), - span = .9) + - geom_smooth(aes(x = recallUSGS, y = precisionUSGS, - color = "USGS Model"), - span = .9) + + geom_line(aes(x = recall, y = precision, + color = "DNA Model")) + ylim(0,1) + xlim(0,1) + ggtitle(title2)) } else { print("Modeling with user-defined validation data") - trainData <- df_model[, - c(1:model_cols - 1)] #remove EPA prediction from training data + trainData <- df_model[, c(1:model_cols)] # Reduce train set to non-predictor beaches trainData <- trainData[which(!trainData$Client.ID %in% excludeBeaches),] - trainData <- trainData[trainData$Date < trainEnd - & trainData$Date > trainStart,] + trainData <- trainData[trainData$Year %in% trainYears,] trainData <- trainData[complete.cases(trainData),] #remove NAs from train data if (downsample) { train_high <- trainData[trainData$Escherichia.coli >= highMin & trainData$Escherichia.coli < highMax, ] train_low <- trainData[trainData$Escherichia.coli < lowMax, ] - # only use as many low days as you have high days + # use 1:5 ratio of high days to total days ind <- sample(c(1:nrow(train_low)), - nrow(train_high), + nrow(train_high) * 4, replace = TRUE) train_balanced <- rbind(train_high, train_low[ind, ]) trainData <- train_balanced } - testData <- df_model[df_model$Date < testEnd - & df_model$Date > testStart, ] + testData <- df_model[df_model$Year %in% testYears, ] # Reduce test set to non-predictor beaches testData <- testData[which(!testData$Client.ID %in% excludeBeaches),] testData <- testData[complete.cases(testData),] #remove NAs from test data @@ -132,22 +103,14 @@ if (kFolds & !productionMode) { model <- modelEcoli(trainData, testData, threshBegin, threshEnd, thresh, productionMode) p <- ggplot() print(p + - geom_smooth(aes(x = model$fpr, y = model$tpr, - color = "DNA Model"), - span = .9) + - geom_smooth(aes(x = model$fprUSGS, y = model$tprUSGS, - color = "USGS Model"), - span = .9) + + geom_line(aes(x = model$fpr, y = model$tpr, + color = "DNA Model")) + ylim(0,1) + xlim(0,1) + ggtitle(title1)) print(p + - geom_smooth(aes(x = model$recall, y = model$precision, - color = "DNA Model"), - span = .9) + - geom_smooth(aes(x = model$recallUSGS, y = model$precisionUSGS, - color = "USGS Model"), - span = .9) + + geom_line(aes(x = model$recall, y = model$precision, + color = "DNA Model")) + ylim(0,1) + xlim(0,1) + ggtitle(title2)) diff --git a/R/31_RandomForest.R b/R/31_RandomForest.R deleted file mode 100644 index 8090ff4..0000000 --- a/R/31_RandomForest.R +++ /dev/null @@ -1,171 +0,0 @@ -#To be able to use the DNA testing (very expensive), we want to try and cluster -#beaches with similar beaches to be able to predict the levels of E.coli at -#other beaches to reduce the cost in the future on how much is spent on testing. - -#In this program we split the beaches into clusters, the clusters were chosen by -#the K-means method. Within the clusters we are going to : -#1- Use only one beach as our predicting beach and assume we know the correct -# mean for each day, therefore we cannot use that beach in our confusion -# matricies. -#2- Use Random Forest models to predict the level of E.Coli at the other beaches -# within the cluster. - -tp<- NULL -fn<- NULL -prec<- NULL -s<-1 -for(year in 2012:2012) -{ - for(n in 1:20) - { - ############################################################################ - # CLUSTER CREATION - ############################################################################ - - #Create the 6 clusters of beaches that are going to be used for predicting - Calumet_Cluster<- c("31st","Calumet","South Shore") - Rainbow_Cluster<- c("Rainbow") - SixtyThird_Cluster<- c("63rd") - Montrose_Cluster<- c("Montrose") - Southern_Cluster<- c("57th","12th","39th") - Northern_Cluster<- c("Albion","Foster","Howard","Jarvis","Juneway","Leone", - "North Avenue", "Oak Street", "Ohio", "Osterman", - "Rogers") - - ############################################################################ - # VARIABLE CREATION - ############################################################################ - - #Because the RF runs on factors and numeric variables only the - #qualitative_variables needed to join the data frames together later, - #but will be taken out of final data frame before the RF is run. - #These variables should not be changed! - qualitative_variables<-c("Client.ID","Day","Month","Year") - - #The numeric_variables are the variables that RF is using to form it's - #predictions, these varibles can be changed when needed. - numeric_variables<-c("Escherichia.coli", - "DayOfYear", - "precipIntensity", - # "temperatureMax", - # "temperatureMin", - "humidity") - # "Water.Level") - - - #A List of all the clusters to use later for the amount of beache in the - #cluster that is being predicted. - clusters<-list(Calumet_Cluster, Rainbow_Cluster, SixtyThird_Cluster, - Montrose_Cluster, Southern_Cluster, Northern_Cluster) - - #A list, in the order of the clusters that they are in, that are used as - #the beaches we are going to be taking the DNA test at. - client =c("South Shore","Rainbow","63rd","Montrose","57th","Rogers") - - #The year that we are looking at, this comes from the for loop above. - predict_year<-year - - #For downsampling what predictions are we going to take in from previous years - low_cutoff <-5 - high_cutoff<-10 - - #What is our cutoff for high E.Coli levels going to be? - high_ecoli_level <-150 - - #Create a df that we are going to use as the final df to create a conf_matrix - final<-NULL - - ############################################################################ - # DATA FRAME CREATION - ############################################################################ - #In this for loop we are going to take each cluster and make predictions - #within the cluster - for(j in 1:length(client)) - { - #Get the the qualitative_variables of the year that we are looking at from - #the main df, and assign them to a new df called `Cluster_df` - Cluster_df<- df[df$Client.ID%in%clusters[[j]],c(qualitative_variables,numeric_variables)] - - #Same thing as the qualitative but with the numeric variables, - #the difference is that we change these all to numeric. - for(i in 1:length(numeric_variables)){ - Cluster_df$numeric_variables[i]<-as.numeric(Cluster_df$numeric_variables[i]) - } - rm(i) - - #Take out all the rows that have an N/A in them - Cluster_df<- na.omit(Cluster_df) - - #Since we are going to know the levels at a particular beach, Use those - #levels to try and predict at other beaches in the cluster. - known_beach_df<-Cluster_df[Cluster_df$Client.ID==client[j],c("Day", - "Month", - "Year", - "Escherichia.coli")] - #Change the column name from `Escherichia.coli` to - #`Known_Beach.Escherichia.coli` so we don't have 2 `Escherichia.coli` when - #the merge happens. - names(known_beach_df)[names(known_beach_df)== 'Escherichia.coli']<-"Known_Beach.Escherichia.coli" - - #Merge cluster_df and known_beach_df and check for any N/A's - Cluster_df<-merge(Cluster_df,known_beach_df) - Cluster_df<- na.omit(Cluster_df) - - #Create a variable that has only the beaches we are predicting, so we can - #reduce the data frame in the future. - predicting_beaches<- clusters[[j]][clusters[[j]]!=client[j]] - ########################################################################## - # MODEL CREATION - ########################################################################## - - #If we are predicting for 1 or more beaches, we will do all the predictions - #for the specific beach - if(length(predicting_beaches)>0) - { - for(k in 1:length(predicting_beaches)) - { - #Build a data frame for the beach we are predicting - predicting_df<-Cluster_df[Cluster_df$Client.ID == predicting_beaches[k],] - - #Build the testing and training sets - test<-subset(predicting_df,Year == as.character(predict_year)) - train_low <- subset(predicting_df,Escherichia.coli<=low_cutoff - & Escherichia.coli>5 - & Client.ID != client[j] & Year != as.character(predict_year)) - train_high<-subset(predicting_df,Escherichia.coli>=high_cutoff - & Client.ID != client[j] & Year != as.character(predict_year)) - #Put the low and high training sets together - training<-rbind(train_low,train_high) - - #Set the binary outcome for the test and training sets - training$Escherichia.coli<- ifelse(training$Escherichia.coli= 235, exceedance := 1] +dt[Escherichia.coli < 235, exceedance := 0] +dt_byBeach <- dt[!is.na(exceedance), + .(exceedances = sum(exceedance)), + .(Client.ID, Latitude, Longitude)] +dt_byBeach$breakwater <- 0 +dt_byBeach[Client.ID == "12th"]$breakwater <- 108 +dt_byBeach[Client.ID == "31st"]$breakwater <- 415 +dt_byBeach[Client.ID == "39th"]$breakwater <- 128 +dt_byBeach[Client.ID == "57th"]$breakwater <- 442 +dt_byBeach[Client.ID == "63rd"]$breakwater <- 1215 +dt_byBeach[Client.ID == "Albion"]$breakwater <- 0 +dt_byBeach[Client.ID == "Calumet"]$breakwater <- 536 +dt_byBeach[Client.ID == "Foster"]$breakwater <- 202 +dt_byBeach[Client.ID == "Howard"]$breakwater <- 0 +dt_byBeach[Client.ID == "Juneway"]$breakwater <- 0 +dt_byBeach[Client.ID == "Leone"]$breakwater <- 41 +dt_byBeach[Client.ID == "Montrose"]$breakwater <- 977 +dt_byBeach[Client.ID == "North Avenue"]$breakwater <- 499 +dt_byBeach[Client.ID == "Oak Street"]$breakwater <- 0 +dt_byBeach[Client.ID == "Ohio"]$breakwater <- 1381 +dt_byBeach[Client.ID == "Osterman"]$breakwater <- 265 +dt_byBeach[Client.ID == "Rainbow"]$breakwater <- 1002 +dt_byBeach[Client.ID == "Rogers"]$breakwater <- 75 +dt_byBeach[Client.ID == "South Shore"]$breakwater <- 387 +dt_byBeach[Client.ID == "Jarvis"]$breakwater <- 0 + + +set.seed(317) +km <- kmeans(scale(dt_byBeach[,2:5]), + centers = clusters, + nstart = 100) +plot(dt_byBeach[,3:2], + col =(km$cluster +1), + main=paste0("K-Means result with ", clusters, " clusters"), + pch=20, + cex=2) +dt_byBeach$cluster <- km$cluster + +dt_byBeach[order(exceedances, decreasing = TRUE)] + +# remove worst clusters (these are the beaches to always test) + +km2 <- kmeans(scale(dt_byBeach[!cluster %in% c(2,5), + 2:3]), + centers = reclusters, + nstart = 100) + +plot(dt_byBeach[!cluster %in% c(2,5), + 2:3], + col =(km2$cluster +1), + main=paste0("K-Means result with ", reclusters, " reclusters"), + pch=20, + cex=2) +dt_byBeach[!cluster %in% c(2,5),"recluster"] <- km2$cluster + +dt_byBeach[order(exceedances, decreasing = TRUE)] + +lm(exceedances ~ breakwater, dt_byBeach) + +cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") +dt_byBeach$cluster <- as.factor(dt_byBeach$cluster) +ggplot(dt_byBeach, aes(breakwater, exceedances, color = cluster, shape = cluster)) + + geom_point(size = 3) + + geom_abline(slope = .1202, intercept = 60.8528, linetype = "dashed") + + labs(x = "Breakwater Length (ft)", y = "Total E. coli Exceedances", title = "Chicago Beaches 2006 - 2017") + + guides(fill = "legend") + + scale_fill_manual(values=cbPalette) + diff --git a/README.md b/README.md index 65eb396..567c02a 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # Clear Water -[![Stories in Ready](https://badge.waffle.io/Chicago/clear-water.svg?label=ready&title=Ready)](http://waffle.io/Chicago/clear-water) [![MIT License project](https://img.shields.io/github/license/mashape/apistatus.svg)](https://opensource.org/licenses/MIT) +[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.1053023.svg)](https://doi.org/10.5281/zenodo.1053023) [![MIT License project](https://img.shields.io/github/license/mashape/apistatus.svg)](https://opensource.org/licenses/MIT) The City of Chicago's Clear Water project brings an innovative approach to beach water quality monitoring. It uses a machine learning prediction technique to better forecast the bacteria levels at Chicago beaches. The model works by interpreting patterns in the results of DNA tests at a handful of beaches across the City, which are then extrapolated to forecast the water quality at other, untested beaches. This method provides a new way for beach managers to save money on expensive rapid water quality tests. @@ -36,12 +36,6 @@ If you are interested in contributing to this project, see our [Contribution Gui ## Notes -### A/B evaluation feature - -This project began as an attempt to improve on a model that was produced by the United States Geological Survey (USGS). As part of model development, an embedded feature was engineered that provides a way to do an A/B model peformance comparison against the USGS's model performance. - -This feature can be switched on and off, and another model could be substituted for the USGS model to do an A/B evaluation. - ### Collaboration with the Civic Tech Community This project originated as a [breakout group](https://chihacknight.org/projects/2016/05/01/e-coli-predictions.html) at [Chi Hack Night](https://chihacknight.org/about.html). diff --git a/model.Rds b/model.Rds new file mode 100644 index 0000000..d69b772 Binary files /dev/null and b/model.Rds differ