chore: update scaletest terraform with latest findings (#8249)

Updates scaletest terraform with learnings from #8213:

- Increase max pods per node to 256
- Decrease CPU requests for test workspace to allow maxing out workspaces per node
- Explicitly set CODER_ACCESS_URL for SSH to work
- Explicitly disable rate limits in coderd
- Increase DB size for medium and large scenarios
- Mount cache volume directly under /tmp/coder instead of /tmp
- Plumb through requests and limits for workspaces
- Plumb through requests for coderd
This commit is contained in:
Cian Johnston
2023-06-29 15:03:11 +01:00
committed by GitHub
parent 83fee4b192
commit 7072b8eff5
6 changed files with 80 additions and 32 deletions

View File

@ -83,6 +83,8 @@ coder:
operator: "In" operator: "In"
values: ["${local.coder_release_name}"] values: ["${local.coder_release_name}"]
env: env:
- name: "CODER_ACCESS_URL"
value: "${local.coder_url}"
- name: "CODER_CACHE_DIRECTORY" - name: "CODER_CACHE_DIRECTORY"
value: "/tmp/coder" value: "/tmp/coder"
- name: "CODER_ENABLE_TELEMETRY" - name: "CODER_ENABLE_TELEMETRY"
@ -108,17 +110,19 @@ coder:
value: "true" value: "true"
- name: "CODER_EXPERIMENTS" - name: "CODER_EXPERIMENTS"
value: "${var.coder_experiments}" value: "${var.coder_experiments}"
- name: "CODER_DANGEROUS_DISABLE_RATE_LIMITS"
value: "true"
image: image:
repo: ${var.coder_image_repo} repo: ${var.coder_image_repo}
tag: ${var.coder_image_tag} tag: ${var.coder_image_tag}
replicaCount: "${var.coder_replicas}" replicaCount: "${var.coder_replicas}"
resources: resources:
requests: requests:
cpu: "${var.coder_cpu}" cpu: "${var.coder_cpu_request}"
memory: "${var.coder_mem}" memory: "${var.coder_mem_request}"
limits: limits:
cpu: "${var.coder_cpu}" cpu: "${var.coder_cpu_limit}"
memory: "${var.coder_mem}" memory: "${var.coder_mem_limit}"
securityContext: securityContext:
readOnlyRootFilesystem: true readOnlyRootFilesystem: true
service: service:
@ -126,7 +130,7 @@ coder:
sessionAffinity: None sessionAffinity: None
loadBalancerIP: "${local.coder_address}" loadBalancerIP: "${local.coder_address}"
volumeMounts: volumeMounts:
- mountPath: "/tmp" - mountPath: "/tmp/coder"
name: cache name: cache
readOnly: false readOnly: false
volumes: volumes:
@ -197,12 +201,12 @@ resource "local_file" "kubernetes_template" {
} }
resources { resources {
requests = { requests = {
"cpu" = "0.1" "cpu" = "${var.workspace_cpu_request}"
"memory" = "128Mi" "memory" = "${var.workspace_mem_request}"
} }
limits = { limits = {
"cpu" = "1" "cpu" = "${var.workspace_cpu_limit}"
"memory" = "1Gi" "memory" = "${var.workspace_mem_limit}"
} }
} }
} }

View File

@ -3,12 +3,13 @@ data "google_compute_default_service_account" "default" {
} }
resource "google_container_cluster" "primary" { resource "google_container_cluster" "primary" {
name = var.name name = var.name
location = var.zone location = var.zone
project = var.project_id project = var.project_id
network = google_compute_network.vpc.name network = google_compute_network.vpc.name
subnetwork = google_compute_subnetwork.subnet.name subnetwork = google_compute_subnetwork.subnet.name
networking_mode = "VPC_NATIVE" networking_mode = "VPC_NATIVE"
default_max_pods_per_node = 256
ip_allocation_policy { # Required with networking_mode=VPC_NATIVE ip_allocation_policy { # Required with networking_mode=VPC_NATIVE
} }

View File

@ -1,6 +1,9 @@
nodepool_machine_type_coder = "t2d-standard-8" nodepool_machine_type_coder = "t2d-standard-8"
nodepool_size_coder = 3 nodepool_size_coder = 3
nodepool_machine_type_workspaces = "t2d-standard-8" nodepool_machine_type_workspaces = "t2d-standard-8"
coder_cpu = "6000m" # Leaving 2 CPUs for system workloads cloudsql_tier = "db-custom-2-7680"
coder_mem = "24Gi" # Leaving 8 GB for system workloads coder_cpu_request = "3000m"
coder_mem_request = "12Gi"
coder_cpu_limit = "6000m" # Leaving 2 CPUs for system workloads
coder_mem_limit = "24Gi" # Leaving 8 GB for system workloads
coder_replicas = 3 coder_replicas = 3

View File

@ -1,4 +1,7 @@
nodepool_machine_type_coder = "t2d-standard-8" nodepool_machine_type_coder = "t2d-standard-8"
nodepool_machine_type_workspaces = "t2d-standard-8" nodepool_machine_type_workspaces = "t2d-standard-8"
coder_cpu = "6000m" # Leaving 2 CPUs for system workloads cloudsql_tier = "db-custom-1-3840"
coder_mem = "24Gi" # Leaving 8 GB for system workloads coder_cpu_request = "3000m"
coder_mem_request = "12Gi"
coder_cpu_limit = "6000m" # Leaving 2 CPUs for system workloads
coder_mem_limit = "24Gi" # Leaving 8 GB for system workloads

View File

@ -1,4 +1,6 @@
nodepool_machine_type_coder = "t2d-standard-4" nodepool_machine_type_coder = "t2d-standard-4"
nodepool_machine_type_workspaces = "t2d-standard-4" nodepool_machine_type_workspaces = "t2d-standard-4"
coder_cpu = "2000m" # Leaving 2 CPUs for system workloads coder_cpu_request = "1000m"
coder_mem = "12Gi" # Leaving 4GB for system workloads coder_mem_request = "6Gi"
coder_cpu_limit = "2000m" # Leaving 2 CPUs for system workloads
coder_mem_limit = "12Gi" # Leaving 4GB for system workloads

View File

@ -94,17 +94,30 @@ variable "cloudsql_max_connections" {
// These variables control the Coder deployment. // These variables control the Coder deployment.
variable "coder_replicas" { variable "coder_replicas" {
description = "Number of Coder replicas to provision" description = "Number of Coder replicas to provision."
default = 1 default = 1
} }
variable "coder_cpu" { // Ensure that requests allow for at least two replicas to be scheduled
description = "CPU to allocate to Coder" // on a single node temporarily, otherwise deployments may fail due to
// lack of resources.
variable "coder_cpu_request" {
description = "CPU request to allocate to Coder."
default = "500m"
}
variable "coder_mem_request" {
description = "Memory request to allocate to Coder."
default = "512Mi"
}
variable "coder_cpu_limit" {
description = "CPU limit to allocate to Coder."
default = "1000m" default = "1000m"
} }
variable "coder_mem" { variable "coder_mem_limit" {
description = "Memory to allocate to Coder" description = "Memory limit to allocate to Coder."
default = "1024Mi" default = "1024Mi"
} }
@ -123,11 +136,38 @@ variable "coder_image_tag" {
default = "latest" default = "latest"
} }
variable "coder_experiments" {
description = "Coder Experiments to enable."
default = ""
}
// These variables control the default workspace template.
variable "workspace_image" { variable "workspace_image" {
description = "Image and tag to use for workspaces." description = "Image and tag to use for workspaces."
default = "docker.io/codercom/enterprise-minimal:ubuntu" default = "docker.io/codercom/enterprise-minimal:ubuntu"
} }
variable "workspace_cpu_request" {
description = "CPU request to allocate to workspaces."
default = "100m"
}
variable "workspace_cpu_limit" {
description = "CPU limit to allocate to workspaces."
default = "100m"
}
variable "workspace_mem_request" {
description = "Memory request to allocate to workspaces."
default = "128Mi"
}
variable "workspace_mem_limit" {
description = "Memory limit to allocate to workspaces."
default = "128Mi"
}
// These variables control the Prometheus deployment.
variable "prometheus_remote_write_user" { variable "prometheus_remote_write_user" {
description = "Username for Prometheus remote write." description = "Username for Prometheus remote write."
default = "" default = ""
@ -139,7 +179,7 @@ variable "prometheus_remote_write_password" {
} }
variable "prometheus_remote_write_url" { variable "prometheus_remote_write_url" {
description = "URL for Prometheus remote write. Defaults to stats.dev.c8s.io" description = "URL for Prometheus remote write. Defaults to stats.dev.c8s.io."
default = "https://stats.dev.c8s.io:9443/api/v1/write" default = "https://stats.dev.c8s.io:9443/api/v1/write"
} }
@ -157,8 +197,3 @@ variable "prometheus_remote_write_send_interval" {
description = "Prometheus remote write interval." description = "Prometheus remote write interval."
default = "15s" default = "15s"
} }
variable "coder_experiments" {
description = "Coder Experiments to enable"
default = ""
}