Files
coder/scaletest/templates/scaletest-runner/scripts/run.sh

370 lines
13 KiB
Bash
Executable File

#!/bin/bash
set -euo pipefail

# Trace every command when VERBOSE=1. The default keeps an unset VERBOSE
# from aborting the script under `set -u`.
if [[ ${VERBOSE:-0} == 1 ]]; then
  set -x
fi

# Helper functions used throughout this script (log, set_status,
# start_phase, end_phase, show_json, wait_baseline, annotate_grafana, ...)
# are not defined in this file; presumably they come from lib.sh.
# shellcheck disable=SC2153 source=scaletest/templates/scaletest-runner/scripts/lib.sh
. "${SCRIPTS_DIR}/lib.sh"

# SCALETEST_PARAM_LOAD_SCENARIOS arrives as a JSON array; turn it into a
# bash array of scenario names. `.[]` prints one element per line and,
# unlike `join("\n")`, prints nothing at all for an empty array, so an
# empty list results in zero scenarios rather than one empty-named one.
mapfile -t scaletest_load_scenarios < <(jq -r '.[]' <<<"${SCALETEST_PARAM_LOAD_SCENARIOS}")
export SCALETEST_PARAM_LOAD_SCENARIOS=("${scaletest_load_scenarios[@]}")

log "Running scaletest..."
set_status Running

start_phase "Creating workspaces"
if [[ ${SCALETEST_PARAM_SKIP_CREATE_WORKSPACES} == 0 ]]; then
  # Note that we allow up to 5 failures to bring up the workspace, since
  # we're creating a lot of workspaces at once and some of them may fail
  # due to network issues or other transient errors.
  coder exp scaletest create-workspaces \
    --retry 5 \
    --count "${SCALETEST_PARAM_NUM_WORKSPACES}" \
    --template "${SCALETEST_PARAM_TEMPLATE}" \
    --concurrency "${SCALETEST_PARAM_CREATE_CONCURRENCY}" \
    --timeout 5h \
    --job-timeout 5h \
    --no-cleanup \
    --output json:"${SCALETEST_RESULTS_DIR}/create-workspaces.json"
  show_json "${SCALETEST_RESULTS_DIR}/create-workspaces.json"
fi
end_phase

# Let the deployment settle before any load scenario starts.
wait_baseline "${SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION}"
# Extra arguments appended to every regular (non-greedy) traffic command.
non_greedy_agent_traffic_args=()
if [[ ${SCALETEST_PARAM_GREEDY_AGENT} != 1 ]]; then
  # Greedy agent disabled: provide a no-op so the scenario code can call
  # greedy_agent_traffic unconditionally.
  greedy_agent_traffic() { :; }
else
  echo "WARNING: Greedy agent enabled, this may cause the load tests to fail." >&2
  non_greedy_agent_traffic_args=(
    # Let the greedy agent traffic command be scraped.
    # --scaletest-prometheus-address 0.0.0.0:21113
    # --trace=false
  )

  annotate_grafana greedy_agent "Create greedy agent"

  coder exp scaletest create-workspaces \
    --count 1 \
    --template "${SCALETEST_PARAM_GREEDY_AGENT_TEMPLATE}" \
    --concurrency 1 \
    --timeout 5h \
    --job-timeout 5h \
    --no-cleanup \
    --output json:"${SCALETEST_RESULTS_DIR}/create-workspaces-greedy-agent.json"

  wait_baseline "${SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION}"

  #######################################
  # Generate heavy traffic against the greedy agent workspace while a
  # regular load scenario is running.
  # Globals:
  #   SCALETEST_PARAM_GREEDY_AGENT_TEMPLATE (read)
  #   SCALETEST_RESULTS_DIR (read)
  #   GRAFANA_ADD_TAGS (exported/written)
  # Arguments:
  #   $1 - scenario timeout in minutes (converted to seconds below)
  #   $2 - scenario name; "SSH Traffic" selects SSH, anything else uses
  #        the web terminal
  # Returns:
  #   Exit status of the workspace-traffic command.
  # Note: disables errexit (`set +e`) in the calling shell; every call
  # site in this script runs it as a background job.
  #######################################
  greedy_agent_traffic() {
    local timeout=${1} scenario=${2}
    # Run the greedy test for ~1/3 of the timeout.
    local delay=$((timeout * 60 / 3))
    local type=web-terminal
    local -a args=()
    local status

    if [[ ${scenario} == "SSH Traffic" ]]; then
      type=ssh
      args+=(--ssh)
    fi

    # Idle for the first third, so the greedy traffic lands in the
    # middle third of the scenario.
    sleep "${delay}"
    annotate_grafana greedy_agent "${scenario}: Greedy agent traffic"

    # Produce load of 25MiB per 40ms tick (i.e. about 625MiB/s).
    set +e
    coder exp scaletest workspace-traffic \
      --template "${SCALETEST_PARAM_GREEDY_AGENT_TEMPLATE}" \
      --bytes-per-tick $((1024 * 1024 * 25)) \
      --tick-interval 40ms \
      --timeout "${delay}s" \
      --job-timeout "${delay}s" \
      --output json:"${SCALETEST_RESULTS_DIR}/traffic-${type}-greedy-agent.json" \
      --scaletest-prometheus-address 0.0.0.0:21113 \
      --trace=false \
      "${args[@]}"
    status=${?}
    show_json "${SCALETEST_RESULTS_DIR}/traffic-${type}-greedy-agent.json"

    # Tag the closing annotation with "error" if the traffic run failed.
    export GRAFANA_ADD_TAGS=
    if [[ ${status} != 0 ]]; then
      GRAFANA_ADD_TAGS=error
    fi
    annotate_grafana_end greedy_agent "${scenario}: Greedy agent traffic"

    return "${status}"
  }
fi
#######################################
# Run a load scenario command and terminate the current shell with the
# command's exit status. When scenarios run concurrently, the command is
# wrapped in a Grafana annotation that is tagged "error" on failure.
# Globals:
#   SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY (read)
#   GRAFANA_ADD_TAGS (exported/written in concurrent mode)
# Arguments:
#   $1  - scenario name (used in the annotation text)
#   $2+ - command and arguments to execute
# Note: this function calls `exit`, so it must run in a subshell or as a
# background job (every call in this script is backgrounded).
#######################################
run_scenario_cmd() {
  local scenario=${1}
  local -a cmd=("${@:2}")
  local annotation="Load scenario: ${scenario}"

  set +e
  [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} != 1 ]] ||
    annotate_grafana scenario "${annotation}"

  "${cmd[@]}"
  status=${?}

  if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]]; then
    export GRAFANA_ADD_TAGS=
    [[ ${status} == 0 ]] || GRAFANA_ADD_TAGS=error
    annotate_grafana_end scenario "${annotation}"
  fi

  exit "${status}"
}
# Bookkeeping shared by the scenario loop below:
#   pids            - pids of the background scenario jobs, in start order
#   pid_to_scenario - scenario name for each pid (filled in concurrent mode)
#   failed          - exit status of each failed scenario, keyed by name
#   target_start/target_end - bounds of the range handed to the most
#                     recent scenario; -1 means no range assigned yet
declare -a pids=()
declare -A pid_to_scenario=() failed=()
target_start=0 target_end=-1

# In concurrent mode a single phase spans all of the scenarios.
case "${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY}" in
1) start_phase "Load scenarios: ${SCALETEST_PARAM_LOAD_SCENARIOS[*]}" ;;
esac
#######################################
# Compute the target range for the next load scenario.
# The range starts where the previous one ended and covers the given
# percentage of SCALETEST_PARAM_NUM_WORKSPACES (rounded down). If it
# would run past the number of workspaces it wraps around to start at 0.
# Globals:
#   SCALETEST_PARAM_NUM_WORKSPACES (read)
#   target_start (read, reset to 0 on wrap-around)
#   target_end (written)
# Arguments:
#   $1 - percentage of workspaces (or users) to target
#######################################
scenario_next_target_range() {
  local percentage=${1}
  local target_count
  target_count=$(jq -n --argjson percentage "${percentage}" --argjson num_workspaces "${SCALETEST_PARAM_NUM_WORKSPACES}" '$percentage / 100 * $num_workspaces | floor')
  target_end=$((target_start + target_count))
  if [[ ${target_end} -gt ${SCALETEST_PARAM_NUM_WORKSPACES} ]]; then
    log "WARNING: Target range end ${target_end} exceeds number of workspaces ${SCALETEST_PARAM_NUM_WORKSPACES}, wrapping around to 0:${target_count} instead."
    target_start=0
    target_end=${target_count}
  fi
}

#######################################
# Finish launching the most recently started scenario job (pids[-1]).
# Sequential mode: wait for the job, store its exit code in `status` and
# show its JSON results (if a results file was given).
# Concurrent mode: leave the job running and move on to the next
# Prometheus port so the next scenario does not reuse this one.
# Must be called with errexit disabled (the scenario loop runs `set +e`).
# Globals:
#   SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY (read)
#   pids (read), status (written), SCALETEST_PROMETHEUS_START_PORT (written)
# Arguments:
#   $1 - optional path of the JSON results file to show
#######################################
scenario_await_or_defer() {
  local results_json=${1:-}
  if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then
    wait "${pids[-1]}"
    status=$?
    if [[ -n ${results_json} ]]; then
      show_json "${results_json}"
    fi
  else
    SCALETEST_PROMETHEUS_START_PORT=$((SCALETEST_PROMETHEUS_START_PORT + 1))
  fi
}

#######################################
# Run one `coder exp scaletest workspace-traffic` scenario together with
# the (possibly no-op) greedy agent traffic.
# Globals:
#   SCALETEST_PARAM_TEMPLATE, SCALETEST_RESULTS_DIR,
#   SCALETEST_PROMETHEUS_START_PORT, non_greedy_agent_traffic_args (read)
#   pids, status, target_start, target_end (written)
# Arguments:
#   $1  - scenario name
#   $2  - slug used in the results file name (traffic-<slug>.json)
#   $3  - duration in minutes
#   $4  - percentage of workspaces to target
#   $5  - bytes per tick
#   $6  - tick interval in milliseconds
#   $7+ - extra workspace-traffic flags (e.g. --ssh, --app <mode>)
#######################################
scenario_run_workspace_traffic() {
  local scenario=${1} slug=${2} duration=${3} percentage=${4}
  local bytes_per_tick=${5} tick_interval=${6}
  shift 6
  local -a extra_args=("$@")
  local results_json="${SCALETEST_RESULTS_DIR}/traffic-${slug}.json"
  local greedy_pid greedy_status

  greedy_agent_traffic "${duration}" "${scenario}" &
  greedy_pid=$!

  scenario_next_target_range "${percentage}"
  run_scenario_cmd "${scenario}" coder exp scaletest workspace-traffic \
    --template "${SCALETEST_PARAM_TEMPLATE}" \
    --bytes-per-tick "${bytes_per_tick}" \
    --tick-interval "${tick_interval}ms" \
    --timeout "${duration}m" \
    --job-timeout "${duration}m30s" \
    --output json:"${results_json}" \
    --scaletest-prometheus-address "0.0.0.0:${SCALETEST_PROMETHEUS_START_PORT}" \
    "${extra_args[@]}" \
    --target-workspaces "${target_start}:${target_end}" \
    "${non_greedy_agent_traffic_args[@]}" &
  pids+=($!)
  scenario_await_or_defer "${results_json}"

  # NOTE(review): this also blocks in concurrent mode until the greedy
  # agent traffic is done, and in that mode the resulting status is not
  # reported by the loop below. Behavior kept as it was; confirm intent.
  wait "${greedy_pid}"
  greedy_status=$?
  # The scenario's own failure takes precedence over the greedy agent's.
  if [[ ${status} == 0 ]]; then
    status=${greedy_status}
  fi
}

for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do
  if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then
    start_phase "Load scenario: ${scenario}"
  fi

  # Failures are collected in `status` instead of aborting the script.
  set +e
  status=0
  # Remember how many jobs existed so we can tell whether this scenario
  # actually started one (unknown scenarios do not).
  launched_before=${#pids[@]}
  case "${scenario}" in
  "SSH Traffic")
    scenario_run_workspace_traffic "${scenario}" ssh \
      "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION}" \
      "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_PERCENTAGE}" \
      "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_BYTES_PER_TICK}" \
      "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_TICK_INTERVAL}" \
      --ssh
    ;;
  "Web Terminal Traffic")
    scenario_run_workspace_traffic "${scenario}" web-terminal \
      "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_DURATION}" \
      "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_PERCENTAGE}" \
      "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_BYTES_PER_TICK}" \
      "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_TICK_INTERVAL}"
    ;;
  "App Traffic")
    scenario_run_workspace_traffic "${scenario}" app \
      "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_DURATION}" \
      "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_PERCENTAGE}" \
      "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_BYTES_PER_TICK}" \
      "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_TICK_INTERVAL}" \
      --app "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_MODE}"
    ;;
  "Dashboard Traffic")
    scenario_next_target_range "${SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_PERCENTAGE}"
    # TODO: Remove this once the dashboard traffic command is fixed,
    # (i.e. once images are no longer dumped into PWD).
    mkdir -p dashboard
    pushd dashboard
    run_scenario_cmd "${scenario}" coder exp scaletest dashboard \
      --timeout "${SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_DURATION}m" \
      --job-timeout "${SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_DURATION}m30s" \
      --output json:"${SCALETEST_RESULTS_DIR}/traffic-dashboard.json" \
      --scaletest-prometheus-address "0.0.0.0:${SCALETEST_PROMETHEUS_START_PORT}" \
      --target-users "${target_start}:${target_end}" \
      >"${SCALETEST_RESULTS_DIR}/traffic-dashboard-output.log" &
    pids+=($!)
    popd
    scenario_await_or_defer "${SCALETEST_RESULTS_DIR}/traffic-dashboard.json"
    ;;

  # Debug scenarios, for testing the runner.
  "debug:greedy_agent_traffic")
    greedy_agent_traffic 10 "${scenario}" &
    pids+=($!)
    scenario_await_or_defer
    ;;
  "debug:success")
    {
      maybedryrun "$DRY_RUN" sleep 10
      true
    } &
    pids+=($!)
    scenario_await_or_defer
    ;;
  "debug:error")
    {
      maybedryrun "$DRY_RUN" sleep 10
      false
    } &
    pids+=($!)
    scenario_await_or_defer
    ;;

  *)
    log "WARNING: Unknown load scenario: ${scenario}, skipping..."
    ;;
  esac
  set -e

  # Allow targeting to be distributed evenly across workspaces when each
  # scenario is run concurrently and all percentages add up to 100.
  # target_end stays at its -1 sentinel until a scenario has computed a
  # range; never carry that sentinel over into target_start.
  if ((target_end >= 0)); then
    target_start=${target_end}
  fi

  if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]]; then
    # Only record a pid (and stagger) if this scenario started a job;
    # otherwise pids[-1] would belong to the previous scenario, or not
    # exist at all.
    if ((${#pids[@]} > launched_before)); then
      pid_to_scenario+=(["${pids[-1]}"]="${scenario}")
      # Stagger the start of each scenario to avoid a burst of load and
      # detect problematic scenarios.
      sleep $((SCALETEST_PARAM_LOAD_SCENARIO_CONCURRENCY_STAGGER_DELAY_MINS * 60))
    fi
    continue
  fi

  if ((status > 0)); then
    log "Load scenario failed: ${scenario} (exit=${status})"
    failed+=(["${scenario}"]="${status}")
    PHASE_ADD_TAGS=error end_phase
  else
    end_phase
  fi

  wait_baseline "${SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION}"
done
#######################################
# Wait for every concurrently started scenario job and record the ones
# that failed.
# `wait` is guarded with `||` because errexit is active at this point:
# an unguarded `wait` on a failed job would abort the script before the
# failure could be logged and the phase closed.
# Globals:
#   pids, pid_to_scenario (read)
#   failed (written: scenario name -> exit status)
#######################################
collect_concurrent_scenario_failures() {
  local pid status scenario
  # Waiting on each pid in turn blocks until all jobs have exited while
  # still giving us their individual exit codes.
  for pid in "${pids[@]}"; do
    status=0
    wait "${pid}" || status=${?}
    scenario=${pid_to_scenario[${pid}]:-pid ${pid}}
    if ((status > 0)); then
      log "Load scenario failed: ${scenario} (exit=${status})"
      failed+=(["${scenario}"]="${status}")
    fi
  done
}

if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]]; then
  collect_concurrent_scenario_failures
  # Close the phase that spans all concurrent scenarios, tagging it as
  # an error if any of them failed.
  if ((${#failed[@]} > 0)); then
    PHASE_ADD_TAGS=error end_phase
  else
    end_phase
  fi
fi

# Summarize failures (from either mode) and fail the run if there were any.
if ((${#failed[@]} > 0)); then
  log "Load scenarios failed: ${!failed[*]}"
  for scenario in "${!failed[@]}"; do
    log "  ${scenario}: exit=${failed[$scenario]}"
  done
  exit 1
fi

log "Scaletest complete!"
set_status Complete