#!/bin/bash set -euo pipefail [[ $VERBOSE == 1 ]] && set -x # shellcheck disable=SC2153 source=scaletest/templates/scaletest-runner/scripts/lib.sh . "${SCRIPTS_DIR}/lib.sh" mapfile -t scaletest_load_scenarios < <(jq -r '. | join ("\n")' <<<"${SCALETEST_PARAM_LOAD_SCENARIOS}") export SCALETEST_PARAM_LOAD_SCENARIOS=("${scaletest_load_scenarios[@]}") log "Running scaletest..." set_status Running start_phase "Creating workspaces" if [[ ${SCALETEST_PARAM_SKIP_CREATE_WORKSPACES} == 0 ]]; then # Note that we allow up to 5 failures to bring up the workspace, since # we're creating a lot of workspaces at once and some of them may fail # due to network issues or other transient errors. coder exp scaletest create-workspaces \ --retry 5 \ --count "${SCALETEST_PARAM_NUM_WORKSPACES}" \ --template "${SCALETEST_PARAM_TEMPLATE}" \ --concurrency "${SCALETEST_PARAM_CREATE_CONCURRENCY}" \ --timeout 5h \ --job-timeout 5h \ --no-cleanup \ --output json:"${SCALETEST_RESULTS_DIR}/create-workspaces.json" show_json "${SCALETEST_RESULTS_DIR}/create-workspaces.json" fi end_phase wait_baseline "${SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION}" non_greedy_agent_traffic_args=() if [[ ${SCALETEST_PARAM_GREEDY_AGENT} != 1 ]]; then greedy_agent_traffic() { :; } else echo "WARNING: Greedy agent enabled, this may cause the load tests to fail." >&2 non_greedy_agent_traffic_args=( # Let the greedy agent traffic command be scraped. # --scaletest-prometheus-address 0.0.0.0:21113 # --trace=false ) annotate_grafana greedy_agent "Create greedy agent" coder exp scaletest create-workspaces \ --count 1 \ --template "${SCALETEST_PARAM_GREEDY_AGENT_TEMPLATE}" \ --concurrency 1 \ --timeout 5h \ --job-timeout 5h \ --no-cleanup \ --output json:"${SCALETEST_RESULTS_DIR}/create-workspaces-greedy-agent.json" wait_baseline "${SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION}" greedy_agent_traffic() { local timeout=${1} scenario=${2} # Run the greedy test for ~1/3 of the timeout. delay=$((timeout * 60 / 3)) local type=web-terminal args=() if [[ ${scenario} == "SSH Traffic" ]]; then type=ssh args+=(--ssh) fi sleep "${delay}" annotate_grafana greedy_agent "${scenario}: Greedy agent traffic" # Produce load at about 1000MB/s (25MB/40ms). set +e coder exp scaletest workspace-traffic \ --template "${SCALETEST_PARAM_GREEDY_AGENT_TEMPLATE}" \ --bytes-per-tick $((1024 * 1024 * 25)) \ --tick-interval 40ms \ --timeout "$((delay))s" \ --job-timeout "$((delay))s" \ --output json:"${SCALETEST_RESULTS_DIR}/traffic-${type}-greedy-agent.json" \ --scaletest-prometheus-address 0.0.0.0:21113 \ --trace=false \ "${args[@]}" status=${?} show_json "${SCALETEST_RESULTS_DIR}/traffic-${type}-greedy-agent.json" export GRAFANA_ADD_TAGS= if [[ ${status} != 0 ]]; then GRAFANA_ADD_TAGS=error fi annotate_grafana_end greedy_agent "${scenario}: Greedy agent traffic" return "${status}" } fi run_scenario_cmd() { local scenario=${1} shift local command=("$@") set +e if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]]; then annotate_grafana scenario "Load scenario: ${scenario}" fi "${command[@]}" status=${?} if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]]; then export GRAFANA_ADD_TAGS= if [[ ${status} != 0 ]]; then GRAFANA_ADD_TAGS=error fi annotate_grafana_end scenario "Load scenario: ${scenario}" fi exit "${status}" } declare -a pids=() declare -A pid_to_scenario=() declare -A failed=() target_start=0 target_end=-1 if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]]; then start_phase "Load scenarios: ${SCALETEST_PARAM_LOAD_SCENARIOS[*]}" fi for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then start_phase "Load scenario: ${scenario}" fi set +e status=0 case "${scenario}" in "SSH Traffic") greedy_agent_traffic "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION}" "${scenario}" & greedy_agent_traffic_pid=$! target_count=$(jq -n --argjson percentage "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_PERCENTAGE}" --argjson num_workspaces "${SCALETEST_PARAM_NUM_WORKSPACES}" '$percentage / 100 * $num_workspaces | floor') target_end=$((target_start + target_count)) if [[ ${target_end} -gt ${SCALETEST_PARAM_NUM_WORKSPACES} ]]; then log "WARNING: Target count ${target_end} exceeds number of workspaces ${SCALETEST_PARAM_NUM_WORKSPACES}, using ${SCALETEST_PARAM_NUM_WORKSPACES} instead." target_start=0 target_end=${target_count} fi run_scenario_cmd "${scenario}" coder exp scaletest workspace-traffic \ --template "${SCALETEST_PARAM_TEMPLATE}" \ --ssh \ --bytes-per-tick "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_BYTES_PER_TICK}" \ --tick-interval "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_TICK_INTERVAL}ms" \ --timeout "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION}m" \ --job-timeout "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION}m30s" \ --output json:"${SCALETEST_RESULTS_DIR}/traffic-ssh.json" \ --scaletest-prometheus-address "0.0.0.0:${SCALETEST_PROMETHEUS_START_PORT}" \ --target-workspaces "${target_start}:${target_end}" \ "${non_greedy_agent_traffic_args[@]}" & pids+=($!) if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then wait "${pids[-1]}" status=$? show_json "${SCALETEST_RESULTS_DIR}/traffic-ssh.json" else SCALETEST_PROMETHEUS_START_PORT=$((SCALETEST_PROMETHEUS_START_PORT + 1)) fi wait "${greedy_agent_traffic_pid}" status2=$? if [[ ${status} == 0 ]]; then status=${status2} fi ;; "Web Terminal Traffic") greedy_agent_traffic "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_DURATION}" "${scenario}" & greedy_agent_traffic_pid=$! target_count=$(jq -n --argjson percentage "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_PERCENTAGE}" --argjson num_workspaces "${SCALETEST_PARAM_NUM_WORKSPACES}" '$percentage / 100 * $num_workspaces | floor') target_end=$((target_start + target_count)) if [[ ${target_end} -gt ${SCALETEST_PARAM_NUM_WORKSPACES} ]]; then log "WARNING: Target count ${target_end} exceeds number of workspaces ${SCALETEST_PARAM_NUM_WORKSPACES}, using ${SCALETEST_PARAM_NUM_WORKSPACES} instead." target_start=0 target_end=${target_count} fi run_scenario_cmd "${scenario}" coder exp scaletest workspace-traffic \ --template "${SCALETEST_PARAM_TEMPLATE}" \ --bytes-per-tick "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_BYTES_PER_TICK}" \ --tick-interval "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_TICK_INTERVAL}ms" \ --timeout "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_DURATION}m" \ --job-timeout "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_DURATION}m30s" \ --output json:"${SCALETEST_RESULTS_DIR}/traffic-web-terminal.json" \ --scaletest-prometheus-address "0.0.0.0:${SCALETEST_PROMETHEUS_START_PORT}" \ --target-workspaces "${target_start}:${target_end}" \ "${non_greedy_agent_traffic_args[@]}" & pids+=($!) if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then wait "${pids[-1]}" status=$? show_json "${SCALETEST_RESULTS_DIR}/traffic-web-terminal.json" else SCALETEST_PROMETHEUS_START_PORT=$((SCALETEST_PROMETHEUS_START_PORT + 1)) fi wait "${greedy_agent_traffic_pid}" status2=$? if [[ ${status} == 0 ]]; then status=${status2} fi ;; "App Traffic") greedy_agent_traffic "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_DURATION}" "${scenario}" & greedy_agent_traffic_pid=$! target_count=$(jq -n --argjson percentage "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_PERCENTAGE}" --argjson num_workspaces "${SCALETEST_PARAM_NUM_WORKSPACES}" '$percentage / 100 * $num_workspaces | floor') target_end=$((target_start + target_count)) if [[ ${target_end} -gt ${SCALETEST_PARAM_NUM_WORKSPACES} ]]; then log "WARNING: Target count ${target_end} exceeds number of workspaces ${SCALETEST_PARAM_NUM_WORKSPACES}, using ${SCALETEST_PARAM_NUM_WORKSPACES} instead." target_start=0 target_end=${target_count} fi run_scenario_cmd "${scenario}" coder exp scaletest workspace-traffic \ --template "${SCALETEST_PARAM_TEMPLATE}" \ --bytes-per-tick "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_BYTES_PER_TICK}" \ --tick-interval "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_TICK_INTERVAL}ms" \ --timeout "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_DURATION}m" \ --job-timeout "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_DURATION}m30s" \ --output json:"${SCALETEST_RESULTS_DIR}/traffic-app.json" \ --scaletest-prometheus-address "0.0.0.0:${SCALETEST_PROMETHEUS_START_PORT}" \ --app "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_MODE}" \ --target-workspaces "${target_start}:${target_end}" \ "${non_greedy_agent_traffic_args[@]}" & pids+=($!) if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then wait "${pids[-1]}" status=$? show_json "${SCALETEST_RESULTS_DIR}/traffic-app.json" else SCALETEST_PROMETHEUS_START_PORT=$((SCALETEST_PROMETHEUS_START_PORT + 1)) fi wait "${greedy_agent_traffic_pid}" status2=$? if [[ ${status} == 0 ]]; then status=${status2} fi ;; "Dashboard Traffic") target_count=$(jq -n --argjson percentage "${SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_PERCENTAGE}" --argjson num_workspaces "${SCALETEST_PARAM_NUM_WORKSPACES}" '$percentage / 100 * $num_workspaces | floor') target_end=$((target_start + target_count)) if [[ ${target_end} -gt ${SCALETEST_PARAM_NUM_WORKSPACES} ]]; then log "WARNING: Target count ${target_end} exceeds number of workspaces ${SCALETEST_PARAM_NUM_WORKSPACES}, using ${SCALETEST_PARAM_NUM_WORKSPACES} instead." target_start=0 target_end=${target_count} fi # TODO: Remove this once the dashboard traffic command is fixed, # (i.e. once images are no longer dumped into PWD). mkdir -p dashboard pushd dashboard run_scenario_cmd "${scenario}" coder exp scaletest dashboard \ --timeout "${SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_DURATION}m" \ --job-timeout "${SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_DURATION}m30s" \ --output json:"${SCALETEST_RESULTS_DIR}/traffic-dashboard.json" \ --scaletest-prometheus-address "0.0.0.0:${SCALETEST_PROMETHEUS_START_PORT}" \ --target-users "${target_start}:${target_end}" \ >"${SCALETEST_RESULTS_DIR}/traffic-dashboard-output.log" & pids+=($!) popd if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then wait "${pids[-1]}" status=$? show_json "${SCALETEST_RESULTS_DIR}/traffic-dashboard.json" else SCALETEST_PROMETHEUS_START_PORT=$((SCALETEST_PROMETHEUS_START_PORT + 1)) fi ;; # Debug scenarios, for testing the runner. "debug:greedy_agent_traffic") greedy_agent_traffic 10 "${scenario}" & pids+=($!) if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then wait "${pids[-1]}" status=$? else SCALETEST_PROMETHEUS_START_PORT=$((SCALETEST_PROMETHEUS_START_PORT + 1)) fi ;; "debug:success") { maybedryrun "$DRY_RUN" sleep 10 true } & pids+=($!) if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then wait "${pids[-1]}" status=$? else SCALETEST_PROMETHEUS_START_PORT=$((SCALETEST_PROMETHEUS_START_PORT + 1)) fi ;; "debug:error") { maybedryrun "$DRY_RUN" sleep 10 false } & pids+=($!) if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then wait "${pids[-1]}" status=$? else SCALETEST_PROMETHEUS_START_PORT=$((SCALETEST_PROMETHEUS_START_PORT + 1)) fi ;; *) log "WARNING: Unknown load scenario: ${scenario}, skipping..." ;; esac set -e # Allow targeting to be distributed evenly across workspaces when each # scenario is run concurrently and all percentages add up to 100. target_start=${target_end} if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]]; then pid_to_scenario+=(["${pids[-1]}"]="${scenario}") # Stagger the start of each scenario to avoid a burst of load and deted # problematic scenarios. sleep $((SCALETEST_PARAM_LOAD_SCENARIO_CONCURRENCY_STAGGER_DELAY_MINS * 60)) continue fi if ((status > 0)); then log "Load scenario failed: ${scenario} (exit=${status})" failed+=(["${scenario}"]="${status}") PHASE_ADD_TAGS=error end_phase else end_phase fi wait_baseline "${SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION}" done if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]]; then wait "${pids[@]}" # Wait on all pids will wait until all have exited, but we need to # check their individual exit codes. for pid in "${pids[@]}"; do wait "${pid}" status=${?} scenario=${pid_to_scenario[${pid}]} if ((status > 0)); then log "Load scenario failed: ${scenario} (exit=${status})" failed+=(["${scenario}"]="${status}") fi done if ((${#failed[@]} > 0)); then PHASE_ADD_TAGS=error end_phase else end_phase fi fi if ((${#failed[@]} > 0)); then log "Load scenarios failed: ${!failed[*]}" for scenario in "${!failed[@]}"; do log " ${scenario}: exit=${failed[$scenario]}" done exit 1 fi log "Scaletest complete!" set_status Complete