diff --git a/Data/df.Rds b/Data/df.Rds
index a3d59c6..14b06ae 100644
Binary files a/Data/df.Rds and b/Data/df.Rds differ
diff --git a/Functions/modelEcoli.R b/Functions/modelEcoli.R
index 7984d62..d9cbd0a 100644
--- a/Functions/modelEcoli.R
+++ b/Functions/modelEcoli.R
@@ -5,54 +5,37 @@ modelEcoli <- function(trainData, testData, threshBegin, threshEnd, thresh, prod
   test_vars <- ncol(testData)
   model <- randomForest(Escherichia.coli ~ .,
                         data = trainData[,
-                                         c(1:(train_vars - 1))])
+                                         c(1:(train_vars - 2))],
+                        ntree = 1000,
+                        importance = TRUE,
+                        proxmity = TRUE)
+                        # maxnodes = 20)
   testData$predictionRF <- predict(model, testData[,c(1:(test_vars-2))])
   tp <- c()
   fn <- c()
   tn <- c()
   fp <- c()
-  tpUSGS <- c()
-  fnUSGS <- c()
-  tnUSGS <- c()
-  fpUSGS <- c()
   tpr <- c()
   fpr <- c()
-  tprUSGS <- c()
-  fprUSGS <- c()
   precision <- c()
   recall <- c()
-  precisionUSGS <- c()
-  recallUSGS <- c()
   thresholds <- c()
   predictions <- data.frame()
   testData$actual_binary <- ifelse(testData$Escherichia.coli >= 235, 1, 0)
   for (threshold in seq(threshBegin, threshEnd, 1)) {
     testData$predictionRF_binary <- ifelse(testData$predictionRF >= threshold, 1, 0)
-    testData$USGS_binary <- ifelse(testData$Predicted.Level >= threshold, 1, 0)
     testData$true_positive <- ifelse((testData$actual_binary == 1 & testData$predictionRF_binary  == 1), 1, 0)
     testData$true_negative <- ifelse((testData$actual_binary == 0 & testData$predictionRF_binary  == 0), 1, 0)
     testData$false_negative <- ifelse((testData$actual_binary == 1 & testData$predictionRF_binary  == 0), 1, 0)
     testData$false_positive <- ifelse((testData$actual_binary == 0 & testData$predictionRF_binary  == 1), 1, 0)
-    testData$true_positiveUSGS <- ifelse((testData$actual_binary == 1 & testData$USGS_binary  == 1), 1, 0)
-    testData$true_negativeUSGS <- ifelse((testData$actual_binary == 0 & testData$USGS_binary  == 0), 1, 0)
-    testData$false_negativeUSGS <- ifelse((testData$actual_binary == 1 & testData$USGS_binary  == 0), 1, 0)
-    testData$false_positiveUSGS <- ifelse((testData$actual_binary == 0 & testData$USGS_binary  == 1), 1, 0)
     tp <- c(tp, sum(testData$true_positive))
     fn <- c(fn, sum(testData$false_negative))
     tn <- c(tn, sum(testData$true_negative))
     fp <- c(fp, sum(testData$false_positive))
-    tpUSGS <- c(tpUSGS, sum(testData$true_positiveUSGS))
-    fnUSGS <- c(fnUSGS, sum(testData$false_negativeUSGS))
-    tnUSGS <- c(tnUSGS, sum(testData$true_negativeUSGS))
-    fpUSGS <- c(fpUSGS, sum(testData$false_positiveUSGS))
     tpr = c(tpr, sum(testData$true_positive) / (sum(testData$true_positive) + sum(testData$false_negative)))
     fpr = c(fpr, sum(testData$false_positive) / (sum(testData$false_positive) + sum(testData$true_negative)))
-    tprUSGS <- c(tprUSGS, sum(testData$true_positiveUSGS) / (sum(testData$true_positiveUSGS) + sum(testData$false_negativeUSGS)))
-    fprUSGS <- c(fprUSGS, sum(testData$false_positiveUSGS) / (sum(testData$false_positiveUSGS) + sum(testData$true_negativeUSGS)))
     precision = c(precision, sum(testData$true_positive) / (sum(testData$true_positive) + sum(testData$false_positive)))
     recall = c(recall, sum(testData$true_positive) / (sum(testData$true_positive) + sum(testData$false_negative)))
-    precisionUSGS <- c(precisionUSGS, sum(testData$true_positiveUSGS) / (sum(testData$true_positiveUSGS) + sum(testData$false_positiveUSGS)))
-    recallUSGS <- c(recallUSGS, sum(testData$true_positiveUSGS) / (sum(testData$true_positiveUSGS) + sum(testData$false_negativeUSGS)))
     thresholds <- c(thresholds, threshold)
     if (threshold == thresh) {
       predictions <- rbind(predictions, testData)
@@ -64,20 +47,12 @@ modelEcoli <- function(trainData, testData, threshBegin, threshEnd, thresh, prod
   }
   list("tpr"=tpr,
        "fpr"=fpr,
-       "tprUSGS"=tprUSGS,
-       "fprUSGS"=fprUSGS,
        "precision"=precision,
        "recall"=recall,
-       "precisionUSGS"=precisionUSGS,
-       "recallUSGS"=recallUSGS,
        "tp"=tp,
        "fn"=fn,
        "tn"=tn,
        "fp"=fp,
-       "tpUSGS"=tpUSGS,
-       "fnUSGS"=fnUSGS,
-       "tnUSGS"=tnUSGS,
-       "fpUSGS"=fpUSGS,
        "thresholds"=thresholds,
        "predictions"=predictions)
 }
diff --git a/Master.R b/Master.R
index af2fbde..550c954 100644
--- a/Master.R
+++ b/Master.R
@@ -29,8 +29,8 @@ source("R/00_Startup.R")
 df <- readRDS(paste0(getwd(),"/Data/df.Rds"))
 
 # remove prior modeling variables before starting up a new model
-keep <- list("df", "modelCurves", "modelEcoli")
-rm(list=ls()[!ls() %in% keep])
+# keep <- list("df", "modelCurves", "modelEcoli")
+# rm(list=ls()[!ls() %in% keep])
 
 #-------------------------------------------------------------------------------
 #  CHOOSE PREDICTORS
@@ -42,44 +42,51 @@ df_model <- df[, c("Escherichia.coli", #dependent variable
                    "Client.ID",
                    # "precipProbability",
                    # "Water.Level",
-                   "Rogers_Escherichia.coli",
+                   # "n12th_Escherichia.coli",
+                   "Foster_Escherichia.coli",
+                   # "Ohio_Escherichia.coli",
+                   "North_Avenue_Escherichia.coli",
+                   "n31st_Escherichia.coli",
+                   "Leone_Escherichia.coli",
+                   # "Albion_Escherichia.coli",
+                   # "Rogers_Escherichia.coli",
                    # "Howard_Escherichia.coli",
-                   # "n57th_Escherichia.coli", 
+                   # "n57th_Escherichia.coli",
                    # "n63rd_Escherichia.coli",
-                   # "South_Shore_Escherichia.coli",
+                   "South_Shore_Escherichia.coli",
                    # "Montrose_Escherichia.coli",
                    # "Calumet_Escherichia.coli",
                    # "Rainbow_Escherichia.coli",
                    # "Ohio_DNA.Geo.Mean",
                    # "North_Avenue_DNA.Geo.Mean",
-                   "n63rd_DNA.Geo.Mean",
-                   "South_Shore_DNA.Geo.Mean",
-                   "Montrose_DNA.Geo.Mean",
-                   "Calumet_DNA.Geo.Mean",
-                   "Rainbow_DNA.Geo.Mean",
-                   "Date", #Must use for splitting data, not included in model
-                   "Predicted.Level" #Must use for USGS model comparison, not included in model
-                   )]
-# to run without USGS for comparison, comment out "Predicted.Level" above and uncomment next line
-# df_model$Predicted.Level <- 1 #meaningless value
+                   # "n63rd_DNA.Geo.Mean",
+                   # "South_Shore_DNA.Geo.Mean",
+                   # "Montrose_DNA.Geo.Mean",
+                   # "Calumet_DNA.Geo.Mean",
+                   # "Rainbow_DNA.Geo.Mean",
+                   "Year", #used for splitting data
+                   "Date" #used for splitting data
+)]
+
+finaltest <- df_model[df_model$Year == "2016",]
 
 #-------------------------------------------------------------------------------
 #  CHOOSE TEST/TRAIN SETS
 #  You can decide whether to use kFolds cross validation or define your own sets
 #  If you set kFolds to TRUE, the data will be separated into 10 sets
 #  If you set kFolds to FALSE, the model will use trainStart, trainEnd, etc. (see below)
+#  CANNOT BE USED IF productionMode = TRUE
 #-------------------------------------------------------------------------------
 
-kFolds <- TRUE #If TRUE next 4 lines will not be used but cannot be commented out
-trainStart <- "2006-01-01"
-trainEnd <- "2015-12-31"
-testStart <- "2016-01-01"
-testEnd <- "2016-12-31"
+kFolds <- FALSE #If TRUE next 2 lines will not be used but cannot be commented out
+testYears <- c("2016")
+trainYears <- c("2006", "2007", "2008", "2009","2010", "2011", "2012", "2013", "2014", "2015")
+# trainYears <- trainYears[! trainYears %in% testYears]
 
 # If productionMode is set to TRUE, a file named model.Rds will be generated
 # Its used is explained at https://github.com/Chicago/clear-water-app
-# Set trainStart and trainEnd to what you would like the model to train on
-# testStart and testEnd must still be specified, although not applicable
+# Set trainYears to what you would like the model to train on
+# testYears must still be specified, although not applicable
 # plots will not be accurate
 
 productionMode <- FALSE
@@ -93,9 +100,9 @@ productionMode <- FALSE
 
 # downsample settings
 downsample <- FALSE #If FALSE comment out the next 3 lines
-# highMin <- 200
-# highMax <- 2500
-# lowMax <- 200
+highMin <- 235
+highMax <- 2500
+lowMax <- 235
 
 
 #-------------------------------------------------------------------------------
@@ -108,24 +115,24 @@ downsample <- FALSE #If FALSE comment out the next 3 lines
 
 excludeBeaches <- c(
                     # "12th",
-                    # "31st",
+                    "31st",
                     # "39th",
                     # "57th",
                     "63rd",
                     # "Albion",
                     "Calumet",
-                    # "Foster",
+                    "Foster",
                     # "Howard",
                     # "Jarvis",
                     # "Juneway",
-                    # "Leone",
+                    "Leone",
                     "Montrose",
-                    # "North Avenue",
+                    "North Avenue",
                     # "Oak Street",
-                    # "Ohio",
+                    "Ohio",
                     # "Osterman",
                     "Rainbow",
-                    "Rogers",
+                    # "Rogers",
                     "South Shore"
                     )
 
@@ -137,15 +144,11 @@ excludeBeaches <- c(
 title1 <- paste0("ROC", 
                  if(kFolds == TRUE) " - kFolds",
                  if(kFolds == FALSE) " - validate on ",
-                 if(kFolds == FALSE) testStart,
-                 if(kFolds == FALSE) " to ",
-                 if(kFolds == FALSE) testEnd)
+                 if(kFolds == FALSE) testYears)
 title2 <- paste0("PR Curve", 
                  if(kFolds == TRUE) " - kFolds",
                  if(kFolds == FALSE) " - validate on ",
-                 if(kFolds == FALSE) testStart,
-                 if(kFolds == FALSE) " to ",
-                 if(kFolds == FALSE) testEnd)
+                 if(kFolds == FALSE) testYears)
 
 
 #-------------------------------------------------------------------------------
@@ -154,18 +157,18 @@ title2 <- paste0("PR Curve",
 #-------------------------------------------------------------------------------
 
 threshBegin <- 1
-threshEnd <- 500
+threshEnd <- 1000
 
 
 thresh <- 235
 
 #-------------------------------------------------------------------------------
 #  RUN MODEL
-#  Plots will generate and results will be saved in "model_summary)
+#  Plots will generate and results will be saved in "model_summary"
 #-------------------------------------------------------------------------------
 
 # runs all modeling code
-source("R/30_model.R", print.eval=TRUE)
+source("R/30_Model.R", print.eval=TRUE)
 
 # creates a data frame with all model results
 # this aggregates the folds to generate one single curve
@@ -174,18 +177,31 @@ model_summary <- plot_data %>%
   group_by(thresholds) %>%
   summarize(tpr = mean(tpr),
             fpr = mean(fpr),
-            tprUSGS = mean(tprUSGS),
-            fprUSGS = mean(fprUSGS),
             precision = mean(precision, na.rm = TRUE),
             recall = mean(recall),
-            precisionUSGS = mean(precisionUSGS, na.rm = TRUE),
-            recallUSGS = mean(recallUSGS),
             tp = mean(tp),
             fn = mean(fn),
             tn = mean(tn),
-            fp = mean(fp),
-            tpUSGS = mean(tpUSGS),
-            fnUSGS = mean(fnUSGS),
-            tnUSGS = mean(tnUSGS),
-            fpUSGS = mean(fpUSGS)
-            )
+            fp = mean(fp)
+  )
+
+# saveRDS(model_summary, paste0("model_results/y", testYears, ".Rds"))
+
+## final holdout validation
+
+model <- readRDS("model.Rds")
+finalthresh <- 381
+finaltest <- finaltest[!finaltest$Client.ID %in% excludeBeaches,]
+finaltest <- finaltest[complete.cases(finaltest),]
+finaltest$prediction <- predict(model, finaltest)
+finaltest$actualbin <- ifelse(finaltest$Escherichia.coli >= 235, 1, 0)
+finaltest$predbin <- ifelse(finaltest$prediction >= finalthresh, 1, 0)
+finaltest$tp <- ifelse(finaltest$actualbin == 1 & finaltest$predbin == 1, 1, 0)
+finaltest$tn <- ifelse(finaltest$actualbin == 0 & finaltest$predbin == 0, 1, 0)
+finaltest$fp <- ifelse(finaltest$actualbin == 0 & finaltest$predbin == 1, 1, 0)
+finaltest$fn <- ifelse(finaltest$actualbin == 1 & finaltest$predbin == 0, 1, 0)
+sum(finaltest$tp)
+sum(finaltest$fn)
+sum(finaltest$tn)
+sum(finaltest$fp)
+
diff --git a/R/20_Clean.R b/R/20_Clean.R
index e008f10..9f1c3be 100644
--- a/R/20_Clean.R
+++ b/R/20_Clean.R
@@ -83,7 +83,7 @@ for(i in 1:length(beach_dup$Year)){
     if ( is.na(prod(as.numeric(check$Escherichia.coli,na.rm=TRUE)))){
       df[row.names(check)[1],]$Escherichia.coli<-NA
     }
-    else{
+    else if (!is.na(df[row.names(check)[1],]$Escherichia.coli)){
       #Produce a geomean of all the duplicated beaches and assign it to the first
       #beach in a datafr),]
       df[row.names(check)[1],]$Escherichia.coli <-prod(as.numeric(check$Escherichia.coli,na.rm=TRUE))^(1/length(check$Escherichia.coli))
@@ -97,6 +97,9 @@ rm(beach_dup,daily_beach_obs,mult_obs_beach,check,drop_list,i,j)
 #Get rid of the duplicated rows
 df<-df[!duplicated(df[,1:4]),]
 
+#remove artifact NA rows
+df <- df[!is.na(df$DayOfYear),]
+
 #add new columns for predictor beaches.
 #We will use the same-day test results for these beaches to predict other beaches
 for (beach in unique(df$Client.ID)) {
@@ -126,6 +129,8 @@ names(df)[names(df) == "North Avenue_DNA.Geo.Mean"] <- "North_Avenue_DNA.Geo.Mea
 names(df)[names(df) == "Oak Street_DNA.Geo.Mean"] <- "Oak_Street_DNA.Geo.Mean"
 names(df)[names(df) == "South Shore_DNA.Geo.Mean"] <- "South_Shore_DNA.Geo.Mean"
 names(df)[names(df) == "South Shore_Escherichia.coli"] <- "South_Shore_Escherichia.coli"
+names(df)[names(df) == "North Avenue_Escherichia.coli"] <- "North_Avenue_Escherichia.coli"
+
 
 
 #remove times from Date variable
diff --git a/R/30_Model.R b/R/30_Model.R
index 78e9a69..a8aed93 100644
--- a/R/30_Model.R
+++ b/R/30_Model.R
@@ -2,12 +2,11 @@
 # 2) model and 3) plot curves
 
 model_cols <- (ncol(df_model))
-
+set.seed(111)
 
 if (kFolds & !productionMode) {
   print("Modeling with 10 folds validation")
   df_model <- df_model[complete.cases(df_model),] #remove NAs from df_model
-  set.seed(111)
   dates <- unique(df_model$Date)
   fold_size <- .1 * length(dates)
   dates_sample <- sample(dates, fold_size)
@@ -19,15 +18,15 @@ if (kFolds & !productionMode) {
     testDays <- dates_sample
     testData <- df_model[df_model$Date %in% testDays, ]
     testData <- testData[which(!testData$Client.ID %in% excludeBeaches),]
-    trainData <- df_model[!df_model$Date %in% testDays, c(1:ncol(df_model) - 1)]
+    trainData <- df_model[!df_model$Date %in% testDays, c(1:(ncol(df_model) - 2))]
     trainData <- trainData[which(!trainData$Client.ID %in% excludeBeaches),]
     if (downsample) {
       train_high <- trainData[trainData$Escherichia.coli >= highMin
                               & trainData$Escherichia.coli < highMax, ]
       train_low <- trainData[trainData$Escherichia.coli < lowMax, ]
-      # only use as many low days as you have high days
+      # use 1:5 ratio of high days to total days
       ind <- sample(c(1:nrow(train_low)),
-                    nrow(train_high),
+                    nrow(train_high) * 4,
                     replace = TRUE)
       train_balanced <- rbind(train_high, train_low[ind, ])
       trainData <- train_balanced
@@ -36,20 +35,12 @@ if (kFolds & !productionMode) {
     fold_data <- data.frame(fold, 
                             "tpr" = model$tpr,
                             "fpr" = model$fpr,
-                            "tprUSGS" = model$tprUSGS,
-                            "fprUSGS" = model$fprUSGS,
                             "precision" = model$precision,
                             "recall" = model$recall,
-                            "precisionUSGS" = model$precisionUSGS,
-                            "recallUSGS" = model$recallUSGS,
                             "tp" = model$tp,
                             "fn" = model$fn,
                             "tn" = model$tn,
                             "fp" = model$fp,
-                            "tpUSGS" = model$tpUSGS,
-                            "fnUSGS" = model$fnUSGS,
-                            "tnUSGS" = model$tnUSGS,
-                            "fpUSGS" = model$fpUSGS, 
                             "thresholds" = model$thresholds)
     plot_data <- rbind(plot_data, fold_data)
     predictions <- rbind(predictions, model$predictions)
@@ -57,7 +48,6 @@ if (kFolds & !productionMode) {
     remaining_dates <- dates[!dates %in% used_dates]
     if (fold < 10) dates_sample <- sample(remaining_dates, fold_size)
   }
-  names(predictions)[names(predictions) == "Predicted.Level"] <- "USGS.Prediction"
   names(predictions)[names(predictions) == "predictionRF"] <- "DNAModel.Prediction"
   plot_data$fold <- as.factor(plot_data$fold)
   plot_data <- plot_data %>%
@@ -65,65 +55,46 @@ if (kFolds & !productionMode) {
     summarize(tp = sum(tp),
               fn = sum(fn),
               tn = sum(tn),
-              fp = sum(fp),
-              tpUSGS = sum(tpUSGS),
-              fnUSGS = sum(fnUSGS),
-              tnUSGS = sum(tnUSGS),
-              fpUSGS = sum(fpUSGS)
+              fp = sum(fp)
     )
   plot_data <- mutate(plot_data,
                       tpr = tp/(tp+fn),
                       fpr = fp/(fp+tn),
                       precision = tp/(tp+fp),
-                      recall = tp/(tp+fn),
-                      tprUSGS = tpUSGS/(tpUSGS+fnUSGS),
-                      fprUSGS = fpUSGS/(fpUSGS+tnUSGS),
-                      precisionUSGS = tpUSGS/(tpUSGS+fpUSGS),
-                      recallUSGS = tpUSGS/(tpUSGS+fnUSGS)
+                      recall = tp/(tp+fn)
                       )
   p <- ggplot(data = plot_data) 
   print(p + 
-          geom_smooth(aes(x = fpr, y = tpr, 
-                          color = "DNA Model"),
-                      span = .9) + 
-          geom_smooth(aes(x = fprUSGS, y = tprUSGS, 
-                          color = "USGS Model"),
-                      span = .9) + 
+          geom_line(aes(x = fpr, y = tpr, 
+                          color = "DNA Model")) + 
           ylim(0,1) + 
           xlim(0,1) +
           ggtitle(title1))
   print(p + 
-          geom_smooth(aes(x = recall, y = precision,
-                          color = "DNA Model"),
-                      span = .9) +
-          geom_smooth(aes(x = recallUSGS, y = precisionUSGS,
-                          color = "USGS Model"),
-                      span = .9) +
+          geom_line(aes(x = recall, y = precision,
+                          color = "DNA Model")) +
           ylim(0,1) + 
           xlim(0,1) +
           ggtitle(title2))
 } else {
   print("Modeling with user-defined validation data")
-  trainData <- df_model[,
-                        c(1:model_cols - 1)] #remove EPA prediction from training data
+  trainData <- df_model[, c(1:model_cols)]
   # Reduce train set to non-predictor beaches
   trainData <- trainData[which(!trainData$Client.ID %in% excludeBeaches),]
-  trainData <- trainData[trainData$Date < trainEnd
-                         & trainData$Date > trainStart,]
+  trainData <- trainData[trainData$Year %in% trainYears,]
   trainData <- trainData[complete.cases(trainData),] #remove NAs from train data
   if (downsample) {
     train_high <- trainData[trainData$Escherichia.coli >= highMin
                             & trainData$Escherichia.coli < highMax, ]
     train_low <- trainData[trainData$Escherichia.coli < lowMax, ]
-    # only use as many low days as you have high days
+    # use 1:5 ratio of high days to total days
     ind <- sample(c(1:nrow(train_low)),
-                  nrow(train_high),
+                  nrow(train_high) * 4,
                   replace = TRUE)
     train_balanced <- rbind(train_high, train_low[ind, ])
     trainData <- train_balanced
   }
-  testData <- df_model[df_model$Date < testEnd
-                       & df_model$Date > testStart, ]
+  testData <- df_model[df_model$Year %in% testYears, ]
   # Reduce test set to non-predictor beaches
   testData <- testData[which(!testData$Client.ID %in% excludeBeaches),]
   testData <- testData[complete.cases(testData),] #remove NAs from test data
@@ -132,22 +103,14 @@ if (kFolds & !productionMode) {
   model <- modelEcoli(trainData, testData, threshBegin, threshEnd, thresh, productionMode)
   p <- ggplot() 
   print(p + 
-          geom_smooth(aes(x = model$fpr, y = model$tpr, 
-                          color = "DNA Model"), 
-                      span = .9) + 
-          geom_smooth(aes(x = model$fprUSGS, y = model$tprUSGS, 
-                          color = "USGS Model"),
-                      span = .9) + 
+          geom_line(aes(x = model$fpr, y = model$tpr, 
+                          color = "DNA Model")) + 
           ylim(0,1) + 
           xlim(0,1) + 
           ggtitle(title1))
   print(p + 
-          geom_smooth(aes(x = model$recall, y = model$precision,
-                          color = "DNA Model"),
-                      span = .9) +
-          geom_smooth(aes(x = model$recallUSGS, y = model$precisionUSGS,
-                          color = "USGS Model"),
-                      span = .9) +
+          geom_line(aes(x = model$recall, y = model$precision,
+                          color = "DNA Model")) +
           ylim(0,1) + 
           xlim(0,1) +
           ggtitle(title2))
diff --git a/R/31_RandomForest.R b/R/31_RandomForest.R
deleted file mode 100644
index 8090ff4..0000000
--- a/R/31_RandomForest.R
+++ /dev/null
@@ -1,171 +0,0 @@
-#To be able to use the DNA testing (very expensive), we want to try and cluster
-#beaches with similar beaches to be able to predict the levels of E.coli at
-#other beaches to reduce the cost in the future on how much is spent on testing.
-
-#In this program we split the beaches into clusters, the clusters were chosen by 
-#the K-means method. Within the clusters we are going to :
-#1- Use only one beach as our predicting beach and assume we know the correct
-#   mean for each day, therefore we cannot use that beach in our confusion
-#   matricies.
-#2- Use Random Forest models to predict the level of E.Coli at the other beaches
-#   within the cluster.
-
-tp<- NULL
-fn<- NULL
-prec<- NULL
-s<-1
-for(year in 2012:2012)
-{
-  for(n in 1:20)
-  {
-    ############################################################################
-    # CLUSTER CREATION
-    ############################################################################
-    
-    #Create the 6 clusters of beaches that are going to be used for predicting
-    Calumet_Cluster<- c("31st","Calumet","South Shore")
-    Rainbow_Cluster<- c("Rainbow")
-    SixtyThird_Cluster<- c("63rd")
-    Montrose_Cluster<- c("Montrose")
-    Southern_Cluster<- c("57th","12th","39th")
-    Northern_Cluster<- c("Albion","Foster","Howard","Jarvis","Juneway","Leone",
-                         "North Avenue", "Oak Street", "Ohio", "Osterman",
-                         "Rogers")
-    
-    ############################################################################
-    # VARIABLE CREATION
-    ############################################################################
-    
-    #Because the RF runs on factors and numeric variables only the
-    #qualitative_variables needed to join the data frames together later,
-    #but will be taken out of final data frame before the RF is run. 
-    #These variables should not be changed!
-    qualitative_variables<-c("Client.ID","Day","Month","Year")
-    
-    #The numeric_variables are the variables that RF is using to form it's
-    #predictions, these varibles can be changed when needed.
-    numeric_variables<-c("Escherichia.coli",
-                         "DayOfYear",
-                         "precipIntensity",
-                         #  "temperatureMax",
-                         #  "temperatureMin",
-                         "humidity")
-                         #  "Water.Level")
-    
-    
-    #A List of all the clusters to use later for the amount of beache in the
-    #cluster that is being predicted.
-    clusters<-list(Calumet_Cluster, Rainbow_Cluster, SixtyThird_Cluster,
-                   Montrose_Cluster, Southern_Cluster, Northern_Cluster)
-    
-    #A list, in the order of the clusters that they are in, that are used as 
-    #the beaches we are going to be taking the DNA test at.
-    client =c("South Shore","Rainbow","63rd","Montrose","57th","Rogers")
-    
-    #The year that we are looking at, this comes from the for loop above.
-    predict_year<-year
-    
-    #For downsampling what predictions are we going to take in from previous years
-    low_cutoff <-5
-    high_cutoff<-10
-    
-    #What is our cutoff for high E.Coli levels going to be?
-    high_ecoli_level <-150
-    
-    #Create a df that we are going to use as the final df to create a conf_matrix
-    final<-NULL
-    
-    ############################################################################
-    # DATA FRAME CREATION
-    ############################################################################
-    #In this for loop we are going to take each cluster and make predictions
-    #within the cluster
-    for(j in 1:length(client))
-    {
-      #Get the the qualitative_variables of the year that we are looking at from
-      #the main df, and assign them to a new df called `Cluster_df`
-      Cluster_df<- df[df$Client.ID%in%clusters[[j]],c(qualitative_variables,numeric_variables)]
-      
-      #Same thing as the qualitative but with the numeric variables,
-      #the difference is that we change these all to numeric.
-      for(i in 1:length(numeric_variables)){
-        Cluster_df$numeric_variables[i]<-as.numeric(Cluster_df$numeric_variables[i])
-      }
-      rm(i)
-      
-      #Take out all the rows that have an N/A in them
-      Cluster_df<- na.omit(Cluster_df)
-      
-      #Since we are going to know the levels at a particular beach, Use those
-      #levels to try and predict at other beaches in the cluster. 
-      known_beach_df<-Cluster_df[Cluster_df$Client.ID==client[j],c("Day",
-                                                                   "Month",
-                                                                   "Year",
-                                                                   "Escherichia.coli")]
-      #Change the column name from `Escherichia.coli` to 
-      #`Known_Beach.Escherichia.coli` so we don't have 2 `Escherichia.coli` when
-      #the merge happens.
-      names(known_beach_df)[names(known_beach_df)== 'Escherichia.coli']<-"Known_Beach.Escherichia.coli"
-      
-      #Merge cluster_df and known_beach_df and check for any N/A's
-      Cluster_df<-merge(Cluster_df,known_beach_df)
-      Cluster_df<- na.omit(Cluster_df)
-      
-      #Create a variable that has only the beaches we are predicting, so we can
-      #reduce the data frame in the future.
-      predicting_beaches<- clusters[[j]][clusters[[j]]!=client[j]]
-      ##########################################################################
-      # MODEL CREATION
-      ##########################################################################
-      
-      #If we are predicting for 1 or more beaches, we will do all the predictions
-      #for the specific beach 
-      if(length(predicting_beaches)>0)
-      {
-        for(k in 1:length(predicting_beaches))
-        {
-          #Build a data frame for the beach we are predicting
-          predicting_df<-Cluster_df[Cluster_df$Client.ID == predicting_beaches[k],]
-          
-          #Build the testing and training sets
-          test<-subset(predicting_df,Year == as.character(predict_year))
-          train_low <- subset(predicting_df,Escherichia.coli<=low_cutoff
-                              & Escherichia.coli>5
-                              & Client.ID != client[j] & Year != as.character(predict_year))
-          train_high<-subset(predicting_df,Escherichia.coli>=high_cutoff 
-                             & Client.ID != client[j] & Year != as.character(predict_year))
-          #Put the low and high training sets together
-          training<-rbind(train_low,train_high)
-          
-          #Set the binary outcome for the test and training sets
-          training$Escherichia.coli<- ifelse(training$Escherichia.coli<high_ecoli_level,0,1)
-          test$Escherichia.coli<- ifelse(test$Escherichia.coli<high_ecoli_level,0,1)
-          
-          #Take the Qualitative variables out of the data frame
-          training<- training[,!(names(training)%in%qualitative_variables)]
-          test<- test[,!(names(test)%in%qualitative_variables)]
-          
-          #Run the RandomForest model
-          model<-randomForest(factor(Escherichia.coli)~.,data=training)
-          #Make Predictions on the test set
-          test$pred<-predict(model,newdata = test)
-          #Put all the predictions from all beaches together into 1 data set
-          final<-rbind(final,test)
-        }
-      }
-    } 
-    
-    #Get the necessary information that we need for ascessing the model
-    tp[s]<- conf_matrix(final$Escherichia.coli,final$pred,show=FALSE)$tpr
-    fn[s]<- conf_matrix(final$Escherichia.coli,final$pred,show=FALSE)$fnr
-    prec[s]<- conf_matrix(final$Escherichia.coli,final$pred,show=FALSE)$prec
-    s<-s+1
-  }
-}
-
-
-rm(Cluster_df,final,known_beach_df,predicting_df,test,train_high,
-   train_low,training,Calumet_Cluster,client,clusters,high_cutoff,
-   high_ecoli_level,j,k,low_cutoff,model,Montrose_Cluster,Northern_Cluster,
-   numeric_variables,predict_year,qualitative_variables,predicting_beaches,
-   Rainbow_Cluster,SixtyThird_Cluster,Southern_Cluster,s)
diff --git a/R/k-means.R b/R/k-means.R
new file mode 100644
index 0000000..3f487e9
--- /dev/null
+++ b/R/k-means.R
@@ -0,0 +1,95 @@
+library(data.table)
+library(ggplot2)
+
+# x - latitude
+# y - freq of elevated occurances
+
+clusters <- 5
+reclusters <- 5
+
+source("R/00_Startup.R")
+df <- readRDS(paste0(getwd(),"/Data/df.Rds"))
+dt <- data.table(df)
+
+dt <- dt[Year %in% c("2006",
+                     "2007",
+                     "2008",
+                     "2009",
+                     "2010",
+                     "2011",
+                     "2012",
+                     "2013",
+                     "2014",
+                     "2015",
+                     "2016",
+                     "2017")]
+
+dt[Escherichia.coli >= 235, exceedance := 1]
+dt[Escherichia.coli < 235, exceedance := 0]
+dt_byBeach <- dt[!is.na(exceedance),
+                 .(exceedances = sum(exceedance)),
+                 .(Client.ID, Latitude, Longitude)]
+dt_byBeach$breakwater <- 0
+dt_byBeach[Client.ID == "12th"]$breakwater <- 108
+dt_byBeach[Client.ID == "31st"]$breakwater <- 415
+dt_byBeach[Client.ID == "39th"]$breakwater <- 128
+dt_byBeach[Client.ID == "57th"]$breakwater <- 442
+dt_byBeach[Client.ID == "63rd"]$breakwater <- 1215
+dt_byBeach[Client.ID == "Albion"]$breakwater <- 0
+dt_byBeach[Client.ID == "Calumet"]$breakwater <- 536
+dt_byBeach[Client.ID == "Foster"]$breakwater <- 202
+dt_byBeach[Client.ID == "Howard"]$breakwater <- 0
+dt_byBeach[Client.ID == "Juneway"]$breakwater <- 0
+dt_byBeach[Client.ID == "Leone"]$breakwater <- 41
+dt_byBeach[Client.ID == "Montrose"]$breakwater <- 977
+dt_byBeach[Client.ID == "North Avenue"]$breakwater <- 499
+dt_byBeach[Client.ID == "Oak Street"]$breakwater <- 0
+dt_byBeach[Client.ID == "Ohio"]$breakwater <- 1381
+dt_byBeach[Client.ID == "Osterman"]$breakwater <- 265
+dt_byBeach[Client.ID == "Rainbow"]$breakwater <- 1002
+dt_byBeach[Client.ID == "Rogers"]$breakwater <- 75
+dt_byBeach[Client.ID == "South Shore"]$breakwater <- 387
+dt_byBeach[Client.ID == "Jarvis"]$breakwater <- 0
+
+
+set.seed(317)
+km <- kmeans(scale(dt_byBeach[,2:5]),
+             centers = clusters,
+             nstart = 100)
+plot(dt_byBeach[,3:2],
+     col =(km$cluster +1),
+     main=paste0("K-Means result with ", clusters, " clusters"), 
+     pch=20, 
+     cex=2)
+dt_byBeach$cluster <- km$cluster
+
+dt_byBeach[order(exceedances, decreasing = TRUE)]
+
+# remove worst clusters (these are the beaches to always test)
+
+km2 <- kmeans(scale(dt_byBeach[!cluster %in% c(2,5),
+                         2:3]),
+             centers = reclusters,
+             nstart = 100)
+
+plot(dt_byBeach[!cluster %in% c(2,5),
+                2:3],
+     col =(km2$cluster +1),
+     main=paste0("K-Means result with ", reclusters, " reclusters"), 
+     pch=20, 
+     cex=2)
+dt_byBeach[!cluster %in% c(2,5),"recluster"] <- km2$cluster
+
+dt_byBeach[order(exceedances, decreasing = TRUE)]
+
+lm(exceedances ~ breakwater, dt_byBeach)
+
+cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
+dt_byBeach$cluster <- as.factor(dt_byBeach$cluster)
+ggplot(dt_byBeach, aes(breakwater, exceedances, color = cluster, shape = cluster)) +
+  geom_point(size = 3) + 
+  geom_abline(slope = .1202, intercept = 60.8528, linetype = "dashed") +
+  labs(x = "Breakwater Length (ft)", y = "Total E. coli Exceedances", title = "Chicago Beaches 2006 - 2017") +
+  guides(fill = "legend") +
+  scale_fill_manual(values=cbPalette)
+
diff --git a/README.md b/README.md
index 65eb396..567c02a 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
 # Clear Water
-[![Stories in Ready](https://badge.waffle.io/Chicago/clear-water.svg?label=ready&title=Ready)](http://waffle.io/Chicago/clear-water) [![MIT License project](https://img.shields.io/github/license/mashape/apistatus.svg)](https://opensource.org/licenses/MIT)
+[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.1053023.svg)](https://doi.org/10.5281/zenodo.1053023) [![MIT License project](https://img.shields.io/github/license/mashape/apistatus.svg)](https://opensource.org/licenses/MIT)
 
 
 The City of Chicago's Clear Water project brings an innovative approach to beach water quality monitoring. It uses a machine learning prediction technique to better forecast the bacteria levels at Chicago beaches. The model works by interpreting patterns in the results of DNA tests at a handful of beaches across the City, which are then extrapolated to forecast the water quality at other, untested beaches. This method provides a new way for beach managers to save money on expensive rapid water quality tests.
@@ -36,12 +36,6 @@ If you are interested in contributing to this project, see our [Contribution Gui
 
 ## Notes
 
-### A/B evaluation feature
-
-This project began as an attempt to improve on a model that was produced by the United States Geological Survey (USGS). As part of model development, an embedded feature was engineered that provides a way to do an A/B model peformance comparison against the USGS's model performance. 
-
-This feature can be switched on and off, and another model could be substituted for the USGS model to do an A/B evaluation. 
-
 ### Collaboration with the Civic Tech Community
 
 This project originated as a [breakout group](https://chihacknight.org/projects/2016/05/01/e-coli-predictions.html) at [Chi Hack Night](https://chihacknight.org/about.html).
diff --git a/model.Rds b/model.Rds
new file mode 100644
index 0000000..d69b772
Binary files /dev/null and b/model.Rds differ