mirror of
https://github.com/coder/coder.git
synced 2025-07-09 11:45:56 +00:00
fix: reduce cost of prebuild failure (#17697)
Relates to https://github.com/coder/coder/issues/17432

### Part 1:

Notes:
- The `GetPresetsAtFailureLimit` SQL query is added. It is similar to `GetPresetsBackoff` and uses the same CTEs (`filtered_builds`, `time_sorted_builds`), but the two queries are still different.
- The query is executed on every loop iteration. As an optimization, we could mark a specific preset as permanently failed to avoid executing the query on every loop iteration, but I decided not to do that for now.
- By default `FailureHardLimit` is set to 3.
- `FailureHardLimit` is configurable. Setting it to zero means that the hard limit is disabled.

### Part 2

Notes:
- The `PrebuildFailureLimitReached` notification is added.
- The notification is sent to template admins.
- The notification is sent only the first time the hard limit is reached, but a `log.Warn` is emitted on every loop iteration.
- I introduced this enum:

```sql
CREATE TYPE prebuild_status AS ENUM (
  'healthy',           -- Prebuilds are working as expected; this is the default, healthy state.
  'hard_limited',      -- Prebuilds have failed repeatedly and hit the configured hard failure limit; won't be retried anymore.
  'validation_failed'  -- Prebuilds failed due to a non-retryable validation error (e.g. template misconfiguration); won't be retried.
);
```

`validation_failed` is not used in this PR, but I think it will be used in the next one, so I wanted to save us an extra migration.

- The notification looks like this:

<img width="472" alt="image" src="https://github.com/user-attachments/assets/e10efea0-1790-4e7f-a65c-f94c40fced27" />

### Latest notification views:

<img width="463" alt="image" src="https://github.com/user-attachments/assets/11310c58-68d1-4075-a497-f76d854633fe" />
<img width="725" alt="image" src="https://github.com/user-attachments/assets/6bbfe21a-91ac-47c3-a9d1-21807bb0c53a" />
This commit is contained in:
committed by
GitHub
parent
e1934fe119
commit
53e8e9c7cd
@ -0,0 +1 @@
|
||||
-- Down migration: remove the "Prebuild Failure Limit Reached" notification
-- template. The row is addressed by its fixed UUID, so no other template
-- rows are touched.
DELETE FROM
    notification_templates
WHERE
    id = '414d9331-c1fc-4761-b40c-d1f4702279eb';
|
@ -0,0 +1,25 @@
|
||||
-- Up migration: register the "Prebuild Failure Limit Reached" notification
-- template under a fixed UUID (the paired down migration deletes the row by
-- this same id).
--
-- title_template / body_template contain {{ ... }} placeholders referencing
-- .Labels.template and .Labels.preset; the action URLs additionally reference
-- base_url, .Labels.org and .Labels.template_version.
-- NOTE(review): the placeholders are presumably substituted by the
-- notification renderer at send time -- confirm against the notifier code.
INSERT INTO notification_templates (
    id,
    name,
    title_template,
    body_template,
    "group",
    actions
)
VALUES (
    '414d9331-c1fc-4761-b40c-d1f4702279eb',
    'Prebuild Failure Limit Reached',
    -- Plain literal: the title has no backslash escapes, so E'' is not needed.
    'There is a problem creating prebuilt workspaces',
    -- Dollar-quoted so the markdown body needs no quote escaping. The blank
    -- lines inside the body are significant (markdown paragraph breaks).
    $$
The number of failed prebuild attempts has reached the hard limit for template **{{ .Labels.template }}** and preset **{{ .Labels.preset }}**.

To resume prebuilds, fix the underlying issue and upload a new template version.

Refer to the documentation for more details:
- [Troubleshooting templates](https://coder.com/docs/admin/templates/troubleshooting)
- [Troubleshooting of prebuilt workspaces](https://coder.com/docs/admin/templates/extending-templates/prebuilt-workspaces#administration-and-troubleshooting)
$$,
    'Template Events',
    -- Cast to jsonb, so whitespace inside the literal is not preserved in the
    -- stored value; only the labels and URLs matter.
    '[
        {
            "label": "View failed prebuilt workspaces",
            "url": "{{base_url}}/workspaces?filter=owner:prebuilds+status:failed+template:{{.Labels.template}}"
        },
        {
            "label": "View template version",
            "url": "{{base_url}}/templates/{{.Labels.org}}/{{.Labels.template}}/versions/{{.Labels.template_version}}"
        }
    ]'::jsonb
);
|
@ -0,0 +1,5 @@
|
||||
-- Down migration: undo the prebuild_status column and its enum type.
--
-- Order matters: the column is the dependent object, so it must be removed
-- before the enum type it is declared with can be dropped.
-- IF EXISTS makes each step a no-op when the object is already gone, so the
-- rollback can be re-run after a partial failure.
ALTER TABLE template_version_presets
    DROP COLUMN IF EXISTS prebuild_status;

-- With no remaining column using it, the enum type can be removed.
DROP TYPE IF EXISTS prebuild_status;
|
@ -0,0 +1,7 @@
|
||||
-- Up migration: introduce the prebuild_status enum and attach it to
-- template_version_presets.
CREATE TYPE prebuild_status AS ENUM (
    -- Prebuilds are working as expected; this is the default, healthy state.
    'healthy',
    -- Prebuilds have failed repeatedly and hit the configured hard failure
    -- limit; won't be retried anymore.
    'hard_limited',
    -- Prebuilds failed due to a non-retryable validation error
    -- (e.g. template misconfiguration); won't be retried.
    'validation_failed'
);

-- NOT NULL with a DEFAULT: rows that already exist in
-- template_version_presets receive 'healthy' when the column is added.
ALTER TABLE template_version_presets
    ADD COLUMN prebuild_status prebuild_status
        NOT NULL
        DEFAULT 'healthy'::prebuild_status;
|
Reference in New Issue
Block a user