Skip to content
This repository has been archived by the owner on Jul 12, 2023. It is now read-only.

Commit

Permalink
Improve service timeouts (#1916)
Browse files Browse the repository at this point in the history
* Improve service timeouts

In-request services have a timeout of 10 seconds, while background jobs have a timeout of 900 seconds. The Cloud Scheduler jobs that invoke the background jobs use an attempt deadline 60 seconds longer than the service timeout, which reduces timeout races.

* Decrease e2e-runner to 120s

* Increase adminapi and server timeouts for bulk issue

* Missed one
  • Loading branch information
sethvargo authored Mar 12, 2021
1 parent f845ce2 commit d99cc94
Show file tree
Hide file tree
Showing 10 changed files with 40 additions and 19 deletions.
6 changes: 4 additions & 2 deletions terraform/service_admin_apiserver.tf
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ resource "google_cloud_run_service" "adminapi" {
template {
spec {
service_account_name = google_service_account.adminapi.email
timeout_seconds = 25
timeout_seconds = 60

containers {
image = "gcr.io/${var.project}/github.com/google/exposure-notifications-verification-server/adminapi:initial"
Expand Down Expand Up @@ -171,10 +171,12 @@ resource "google_compute_backend_service" "adminapi" {
name = "adminapi"
project = var.project

security_policy = google_compute_security_policy.cloud-armor.name

backend {
group = google_compute_region_network_endpoint_group.adminapi[0].id
}
security_policy = google_compute_security_policy.cloud-armor.name

log_config {
enable = var.enable_lb_logging
sample_rate = var.enable_lb_logging ? 1 : null
Expand Down
6 changes: 4 additions & 2 deletions terraform/service_apiserver.tf
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ resource "google_cloud_run_service" "apiserver" {
template {
spec {
service_account_name = google_service_account.apiserver.email
timeout_seconds = 25
timeout_seconds = 10

containers {
image = "gcr.io/${var.project}/github.com/google/exposure-notifications-verification-server/apiserver:initial"
Expand Down Expand Up @@ -171,10 +171,12 @@ resource "google_compute_backend_service" "apiserver" {
name = "apiserver"
project = var.project

security_policy = google_compute_security_policy.cloud-armor.name

backend {
group = google_compute_region_network_endpoint_group.apiserver[0].id
}
security_policy = google_compute_security_policy.cloud-armor.name

log_config {
enable = var.enable_lb_logging
sample_rate = var.enable_lb_logging ? 1 : null
Expand Down
4 changes: 3 additions & 1 deletion terraform/service_appsync.tf
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,11 @@ resource "google_cloud_run_service" "appsync" {
lookup(var.service_annotations, "appsync", {})
)
}

template {
spec {
service_account_name = google_service_account.appsync.email
timeout_seconds = 900

containers {
image = "gcr.io/${var.project}/github.com/google/exposure-notifications-verification-server/appsync:initial"
Expand Down Expand Up @@ -168,7 +170,7 @@ resource "google_cloud_scheduler_job" "appsync-worker" {
region = var.cloudscheduler_location
schedule = "0 */6 * * *"
time_zone = "America/Los_Angeles"
attempt_deadline = "600s"
attempt_deadline = "${google_cloud_run_service.appsync.template[0].spec[0].timeout_seconds + 60}s"

retry_config {
retry_count = 3
Expand Down
4 changes: 3 additions & 1 deletion terraform/service_cleanup.tf
Original file line number Diff line number Diff line change
Expand Up @@ -76,9 +76,11 @@ resource "google_cloud_run_service" "cleanup" {
lookup(var.service_annotations, "cleanup", {})
)
}

template {
spec {
service_account_name = google_service_account.cleanup.email
timeout_seconds = 900

containers {
image = "gcr.io/${var.project}/github.com/google/exposure-notifications-verification-server/cleanup:initial"
Expand Down Expand Up @@ -181,7 +183,7 @@ resource "google_cloud_scheduler_job" "cleanup-worker" {
region = var.cloudscheduler_location
schedule = "0 * * * *"
time_zone = "America/Los_Angeles"
attempt_deadline = "600s"
attempt_deadline = "${google_cloud_run_service.cleanup.template[0].spec[0].timeout_seconds + 60}s"

retry_config {
retry_count = 3
Expand Down
8 changes: 5 additions & 3 deletions terraform/service_e2e_runner.tf
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,11 @@ resource "google_cloud_run_service" "e2e-runner" {
lookup(var.service_annotations, "e2e-runner", {})
)
}

template {
spec {
service_account_name = google_service_account.e2e-runner.email
timeout_seconds = 120

containers {
image = "gcr.io/${var.project}/github.com/google/exposure-notifications-verification-server/e2e-runner:initial"
Expand Down Expand Up @@ -168,7 +170,7 @@ resource "google_cloud_scheduler_job" "e2e-default-workflow" {
region = var.cloudscheduler_location
schedule = "0,10,20,30,40,50,55 * * * *"
time_zone = "America/Los_Angeles"
attempt_deadline = "30s"
attempt_deadline = "${google_cloud_run_service.e2e-runner.template[0].spec[0].timeout_seconds + 60}s"

retry_config {
retry_count = 3
Expand All @@ -195,7 +197,7 @@ resource "google_cloud_scheduler_job" "e2e-revise-workflow" {
region = var.cloudscheduler_location
schedule = "0,5,15,25,35,45,55 * * * *"
time_zone = "America/Los_Angeles"
attempt_deadline = "30s"
attempt_deadline = "${google_cloud_run_service.e2e-runner.template[0].spec[0].timeout_seconds + 60}s"

retry_config {
retry_count = 3
Expand All @@ -222,7 +224,7 @@ resource "google_cloud_scheduler_job" "e2e-enx-redirect-workflow" {
region = var.cloudscheduler_location
schedule = "0,5,15,25,35,45,55 * * * *"
time_zone = "America/Los_Angeles"
attempt_deadline = "30s"
attempt_deadline = "${google_cloud_run_service.e2e-runner.template[0].spec[0].timeout_seconds + 60}s"

retry_config {
retry_count = 3
Expand Down
5 changes: 3 additions & 2 deletions terraform/service_modeler.tf
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,11 @@ resource "google_cloud_run_service" "modeler" {
lookup(var.service_annotations, "modeler", {})
)
}

template {
spec {
service_account_name = google_service_account.modeler.email
timeout_seconds = 120
timeout_seconds = 900

containers {
image = "gcr.io/${var.project}/github.com/google/exposure-notifications-verification-server/modeler:initial"
Expand Down Expand Up @@ -162,7 +163,7 @@ resource "google_cloud_scheduler_job" "modeler-worker" {
region = var.cloudscheduler_location
schedule = "0 */6 * * *"
time_zone = "America/Los_Angeles"
attempt_deadline = "600s"
attempt_deadline = "${google_cloud_run_service.modeler.template[0].spec[0].timeout_seconds + 60}s"

retry_config {
retry_count = 3
Expand Down
9 changes: 6 additions & 3 deletions terraform/service_redirect.tf
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ resource "google_cloud_run_service" "enx-redirect" {
template {
spec {
service_account_name = google_service_account.enx-redirect.email
timeout_seconds = 25
timeout_seconds = 10

containers {
image = "gcr.io/${var.project}/github.com/google/exposure-notifications-verification-server/enx-redirect:initial"
Expand Down Expand Up @@ -168,15 +168,18 @@ resource "google_compute_region_network_endpoint_group" "enx-redirect" {
}

resource "google_compute_backend_service" "enx-redirect" {
count = local.enable_lb ? 1 : 0
count = local.enable_lb ? 1 : 0

provider = google-beta
name = "enx-redirect"
project = var.project

security_policy = google_compute_security_policy.cloud-armor.name

backend {
group = google_compute_region_network_endpoint_group.enx-redirect.id
}
security_policy = google_compute_security_policy.cloud-armor.name

log_config {
enable = var.enable_lb_logging
sample_rate = var.enable_lb_logging ? 1 : null
Expand Down
6 changes: 4 additions & 2 deletions terraform/service_rotation.tf
Original file line number Diff line number Diff line change
Expand Up @@ -76,9 +76,11 @@ resource "google_cloud_run_service" "rotation" {
lookup(var.service_annotations, "rotation", {})
)
}

template {
spec {
service_account_name = google_service_account.rotation.email
timeout_seconds = 900

containers {
image = "gcr.io/${var.project}/github.com/google/exposure-notifications-verification-server/rotation:initial"
Expand Down Expand Up @@ -178,7 +180,7 @@ resource "google_cloud_scheduler_job" "rotation-worker" {
region = var.cloudscheduler_location
schedule = "2,32 * * * *"
time_zone = "America/Los_Angeles"
attempt_deadline = "600s"
attempt_deadline = "${google_cloud_run_service.rotation.template[0].spec[0].timeout_seconds + 60}s"

retry_config {
retry_count = 3
Expand Down Expand Up @@ -207,7 +209,7 @@ resource "google_cloud_scheduler_job" "realm-key-rotation-worker" {
// This schedule is offset from the token rotation schedule.
schedule = "*/15 * * * *"
time_zone = "America/Los_Angeles"
attempt_deadline = "600s"
attempt_deadline = "${google_cloud_run_service.rotation.template[0].spec[0].timeout_seconds + 60}s"

retry_config {
retry_count = 3
Expand Down
7 changes: 5 additions & 2 deletions terraform/service_server.tf
Original file line number Diff line number Diff line change
Expand Up @@ -83,10 +83,11 @@ resource "google_cloud_run_service" "server" {
lookup(var.service_annotations, "server", {})
)
}

template {
spec {
service_account_name = google_service_account.server.email
timeout_seconds = 25
timeout_seconds = 60

containers {
image = "gcr.io/${var.project}/github.com/google/exposure-notifications-verification-server/server:initial"
Expand Down Expand Up @@ -189,10 +190,12 @@ resource "google_compute_backend_service" "server" {
name = "server"
project = var.project

security_policy = google_compute_security_policy.cloud-armor.name

backend {
group = google_compute_region_network_endpoint_group.server[0].id
}
security_policy = google_compute_security_policy.cloud-armor.name

log_config {
enable = var.enable_lb_logging
sample_rate = var.enable_lb_logging ? 1 : null
Expand Down
4 changes: 3 additions & 1 deletion terraform/service_stats_puller.tf
Original file line number Diff line number Diff line change
Expand Up @@ -70,9 +70,11 @@ resource "google_cloud_run_service" "stats-puller" {
lookup(var.service_annotations, "stats-puller", {})
)
}

template {
spec {
service_account_name = google_service_account.stats-puller.email
timeout_seconds = 900

containers {
image = "gcr.io/${var.project}/github.com/google/exposure-notifications-verification-server/stats-puller:initial"
Expand Down Expand Up @@ -171,7 +173,7 @@ resource "google_cloud_scheduler_job" "stats-puller-worker" {
region = var.cloudscheduler_location
schedule = "10,20,30 * * * *"
time_zone = "America/Los_Angeles"
attempt_deadline = "600s"
attempt_deadline = "${google_cloud_run_service.stats-puller.template[0].spec[0].timeout_seconds + 60}s"

retry_config {
retry_count = 3
Expand Down

0 comments on commit d99cc94

Please sign in to comment.