feat: add computed workspace and agent health fields to the api (#8280)

This commit is contained in:
Mathias Fredriksson
2023-07-10 12:40:11 +03:00
committed by GitHub
parent eabf929676
commit b73f9d8e86
16 changed files with 509 additions and 26 deletions

49
coderd/apidoc/docs.go generated
View File

@ -9592,6 +9592,14 @@ const docTemplate = `{
"type": "string",
"format": "date-time"
},
"health": {
"description": "Health shows the health of the workspace and information about\nwhat is causing an unhealthy status.",
"allOf": [
{
"$ref": "#/definitions/codersdk.WorkspaceHealth"
}
]
},
"id": {
"type": "string",
"format": "uuid"
@ -9689,6 +9697,14 @@ const docTemplate = `{
"type": "string",
"format": "date-time"
},
"health": {
"description": "Health reports the health of the agent.",
"allOf": [
{
"$ref": "#/definitions/codersdk.WorkspaceAgentHealth"
}
]
},
"id": {
"type": "string",
"format": "uuid"
@ -9783,6 +9799,21 @@ const docTemplate = `{
}
}
},
"codersdk.WorkspaceAgentHealth": {
"type": "object",
"properties": {
"healthy": {
"description": "Healthy is true if the agent is healthy.",
"type": "boolean",
"example": false
},
"reason": {
"description": "Reason is a human-readable explanation of the agent's health. It is empty if Healthy is true.",
"type": "string",
"example": "agent has lost connection"
}
}
},
"codersdk.WorkspaceAgentLifecycle": {
"type": "string",
"enum": [
@ -10149,6 +10180,24 @@ const docTemplate = `{
}
}
},
"codersdk.WorkspaceHealth": {
"type": "object",
"properties": {
"failing_agents": {
"description": "FailingAgents lists the IDs of the agents that are failing, if any.",
"type": "array",
"items": {
"type": "string",
"format": "uuid"
}
},
"healthy": {
"description": "Healthy is true if the workspace is healthy.",
"type": "boolean",
"example": false
}
}
},
"codersdk.WorkspaceProxy": {
"type": "object",
"properties": {

View File

@ -8659,6 +8659,14 @@
"type": "string",
"format": "date-time"
},
"health": {
"description": "Health shows the health of the workspace and information about\nwhat is causing an unhealthy status.",
"allOf": [
{
"$ref": "#/definitions/codersdk.WorkspaceHealth"
}
]
},
"id": {
"type": "string",
"format": "uuid"
@ -8756,6 +8764,14 @@
"type": "string",
"format": "date-time"
},
"health": {
"description": "Health reports the health of the agent.",
"allOf": [
{
"$ref": "#/definitions/codersdk.WorkspaceAgentHealth"
}
]
},
"id": {
"type": "string",
"format": "uuid"
@ -8850,6 +8866,21 @@
}
}
},
"codersdk.WorkspaceAgentHealth": {
"type": "object",
"properties": {
"healthy": {
"description": "Healthy is true if the agent is healthy.",
"type": "boolean",
"example": false
},
"reason": {
"description": "Reason is a human-readable explanation of the agent's health. It is empty if Healthy is true.",
"type": "string",
"example": "agent has lost connection"
}
}
},
"codersdk.WorkspaceAgentLifecycle": {
"type": "string",
"enum": [
@ -9187,6 +9218,24 @@
}
}
},
"codersdk.WorkspaceHealth": {
"type": "object",
"properties": {
"failing_agents": {
"description": "FailingAgents lists the IDs of the agents that are failing, if any.",
"type": "array",
"items": {
"type": "string",
"format": "uuid"
}
},
"healthy": {
"description": "Healthy is true if the workspace is healthy.",
"type": "boolean",
"example": false
}
}
},
"codersdk.WorkspaceProxy": {
"type": "object",
"properties": {

View File

@ -1262,6 +1262,24 @@ func convertWorkspaceAgent(derpMap *tailcfg.DERPMap, coordinator tailnet.Coordin
workspaceAgent.ReadyAt = &dbAgent.ReadyAt.Time
}
// Derive the agent's health from its connection status and lifecycle state.
// Cases are evaluated top to bottom and only the first match applies, so the
// order below determines which reason is reported when several conditions
// hold at once. Health.Healthy is only set (to true) in the default branch;
// every other branch just records a Reason and leaves Healthy untouched
// (presumably its zero value, false — confirm against the struct
// construction earlier in this function).
switch {
case workspaceAgent.Status != codersdk.WorkspaceAgentConnected && workspaceAgent.LifecycleState == codersdk.WorkspaceAgentLifecycleOff:
workspaceAgent.Health.Reason = "agent is not running"
case workspaceAgent.Status == codersdk.WorkspaceAgentTimeout:
workspaceAgent.Health.Reason = "agent is taking too long to connect"
case workspaceAgent.Status == codersdk.WorkspaceAgentDisconnected:
workspaceAgent.Health.Reason = "agent has lost connection"
// Note: We could also handle codersdk.WorkspaceAgentLifecycleStartTimeout
// here, but it's more of a soft issue, so we don't want to mark the agent
// as unhealthy.
case workspaceAgent.LifecycleState == codersdk.WorkspaceAgentLifecycleStartError:
workspaceAgent.Health.Reason = "agent startup script exited with an error"
case workspaceAgent.LifecycleState.ShuttingDown():
workspaceAgent.Health.Reason = "agent is shutting down"
default:
// No unhealthy condition matched: mark the agent healthy and leave Reason
// as it was.
workspaceAgent.Health.Healthy = true
}
return workspaceAgent, nil
}

View File

@ -72,6 +72,7 @@ func TestWorkspaceAgent(t *testing.T) {
require.Equal(t, tmpDir, workspace.LatestBuild.Resources[0].Agents[0].Directory)
_, err = client.WorkspaceAgent(ctx, workspace.LatestBuild.Resources[0].Agents[0].ID)
require.NoError(t, err)
require.True(t, workspace.LatestBuild.Resources[0].Agents[0].Health.Healthy)
})
t.Run("HasFallbackTroubleshootingURL", func(t *testing.T) {
t.Parallel()
@ -167,6 +168,8 @@ func TestWorkspaceAgent(t *testing.T) {
}, testutil.IntervalMedium, "agent status timeout")
require.Equal(t, wantTroubleshootingURL, workspace.LatestBuild.Resources[0].Agents[0].TroubleshootingURL)
require.False(t, workspace.LatestBuild.Resources[0].Agents[0].Health.Healthy)
require.NotEmpty(t, workspace.LatestBuild.Resources[0].Agents[0].Health.Reason)
})
}

View File

@ -1110,6 +1110,15 @@ func convertWorkspace(
lockedAt = &workspace.LockedAt.Time
}
// Collect the IDs of every agent, across all resources of the build, whose
// Health.Healthy flag is false. The slice is initialized as empty rather
// than nil — presumably so it serializes as [] instead of null in the API
// response; confirm before changing it to a nil slice.
failingAgents := []uuid.UUID{}
for _, resource := range workspaceBuild.Resources {
for _, agent := range resource.Agents {
if !agent.Health.Healthy {
failingAgents = append(failingAgents, agent.ID)
}
}
}
var (
ttlMillis = convertWorkspaceTTLMillis(workspace.Ttl)
deletingAt = calculateDeletingAt(workspace, template, workspaceBuild)
@ -1135,6 +1144,10 @@ func convertWorkspace(
LastUsedAt: workspace.LastUsedAt,
DeletingAt: deletingAt,
LockedAt: lockedAt,
Health: codersdk.WorkspaceHealth{
Healthy: len(failingAgents) == 0,
FailingAgents: failingAgents,
},
}
}

View File

@ -164,6 +164,148 @@ func TestWorkspace(t *testing.T) {
assert.Equal(t, templateDisplayName, ws.TemplateDisplayName)
assert.Equal(t, templateAllowUserCancelWorkspaceJobs, ws.TemplateAllowUserCancelWorkspaceJobs)
})
t.Run("Health", func(t *testing.T) {
t.Parallel()
// Healthy provisions a workspace with a single agent that has no connection
// timeout configured, fetches the workspace once after the build job
// completes, and expects both the workspace and the agent to be reported as
// healthy with no failing agents and no reason.
t.Run("Healthy", func(t *testing.T) {
t.Parallel()
client := coderdtest.New(t, &coderdtest.Options{IncludeProvisionerDaemon: true})
user := coderdtest.CreateFirstUser(t, client)
version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{
Parse: echo.ParseComplete,
ProvisionApply: []*proto.Provision_Response{{
Type: &proto.Provision_Response_Complete{
Complete: &proto.Provision_Complete{
Resources: []*proto.Resource{{
Name: "some",
Type: "example",
Agents: []*proto.Agent{{
Id: uuid.NewString(),
Auth: &proto.Agent_Token{},
}},
}},
},
},
}},
})
coderdtest.AwaitTemplateVersionJob(t, client, version.ID)
template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID)
workspace := coderdtest.CreateWorkspace(t, client, user.OrganizationID, template.ID)
coderdtest.AwaitWorkspaceBuildJob(t, client, workspace.LatestBuild.ID)
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong)
defer cancel()
workspace, err := client.Workspace(ctx, workspace.ID)
require.NoError(t, err)
// The template declares exactly one resource with one agent, so index 0 is
// the agent under test.
agent := workspace.LatestBuild.Resources[0].Agents[0]
assert.True(t, workspace.Health.Healthy)
// Compared against an empty (non-nil) slice, not nil.
assert.Equal(t, []uuid.UUID{}, workspace.Health.FailingAgents)
assert.True(t, agent.Health.Healthy)
assert.Empty(t, agent.Health.Reason)
})
// Unhealthy provisions a workspace with a single agent whose connection
// timeout is one second, polls the workspace until it is reported unhealthy,
// and expects that agent to be the only entry in FailingAgents with a
// non-empty reason.
t.Run("Unhealthy", func(t *testing.T) {
t.Parallel()
client := coderdtest.New(t, &coderdtest.Options{IncludeProvisionerDaemon: true})
user := coderdtest.CreateFirstUser(t, client)
version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{
Parse: echo.ParseComplete,
ProvisionApply: []*proto.Provision_Response{{
Type: &proto.Provision_Response_Complete{
Complete: &proto.Provision_Complete{
Resources: []*proto.Resource{{
Name: "some",
Type: "example",
Agents: []*proto.Agent{{
Id: uuid.NewString(),
Auth: &proto.Agent_Token{},
// NOTE(review): presumably no agent ever connects in this test, so the
// agent is flagged once this one-second timeout elapses — confirm.
ConnectionTimeoutSeconds: 1,
}},
}},
},
},
}},
})
coderdtest.AwaitTemplateVersionJob(t, client, version.ID)
template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID)
workspace := coderdtest.CreateWorkspace(t, client, user.OrganizationID, template.ID)
coderdtest.AwaitWorkspaceBuildJob(t, client, workspace.LatestBuild.ID)
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong)
defer cancel()
var err error
// Re-fetch the workspace until it reports unhealthy; the outer workspace
// variable is overwritten so the assertions below see the last response.
testutil.Eventually(ctx, t, func(ctx context.Context) bool {
workspace, err = client.Workspace(ctx, workspace.ID)
return assert.NoError(t, err) && !workspace.Health.Healthy
}, testutil.IntervalMedium)
agent := workspace.LatestBuild.Resources[0].Agents[0]
assert.False(t, workspace.Health.Healthy)
assert.Equal(t, []uuid.UUID{agent.ID}, workspace.Health.FailingAgents)
assert.False(t, agent.Health.Healthy)
assert.NotEmpty(t, agent.Health.Reason)
})
// Mixed health provisions a workspace with two agents on one resource: "a1"
// with no connection timeout configured and "a2" with a one-second timeout.
// It polls until the workspace is reported unhealthy and expects only the
// second agent to be listed as failing.
t.Run("Mixed health", func(t *testing.T) {
t.Parallel()
client := coderdtest.New(t, &coderdtest.Options{IncludeProvisionerDaemon: true})
user := coderdtest.CreateFirstUser(t, client)
version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{
Parse: echo.ParseComplete,
ProvisionApply: []*proto.Provision_Response{{
Type: &proto.Provision_Response_Complete{
Complete: &proto.Provision_Complete{
Resources: []*proto.Resource{{
Name: "some",
Type: "example",
Agents: []*proto.Agent{{
Id: uuid.NewString(),
Name: "a1",
Auth: &proto.Agent_Token{},
}, {
Id: uuid.NewString(),
Name: "a2",
Auth: &proto.Agent_Token{},
ConnectionTimeoutSeconds: 1,
}},
}},
},
},
}},
})
coderdtest.AwaitTemplateVersionJob(t, client, version.ID)
template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID)
workspace := coderdtest.CreateWorkspace(t, client, user.OrganizationID, template.ID)
coderdtest.AwaitWorkspaceBuildJob(t, client, workspace.LatestBuild.ID)
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong)
defer cancel()
var err error
// Re-fetch the workspace until it reports unhealthy; the outer workspace
// variable is overwritten so the assertions below see the last response.
testutil.Eventually(ctx, t, func(ctx context.Context) bool {
workspace, err = client.Workspace(ctx, workspace.ID)
return assert.NoError(t, err) && !workspace.Health.Healthy
}, testutil.IntervalMedium)
assert.False(t, workspace.Health.Healthy)
assert.Len(t, workspace.Health.FailingAgents, 1)
// NOTE(review): assumes the API returns agents in the declared order
// (a1 at index 0, a2 at index 1) — confirm the ordering guarantee.
agent1 := workspace.LatestBuild.Resources[0].Agents[0]
agent2 := workspace.LatestBuild.Resources[0].Agents[1]
assert.Equal(t, []uuid.UUID{agent2.ID}, workspace.Health.FailingAgents)
assert.True(t, agent1.Health.Healthy)
assert.Empty(t, agent1.Health.Reason)
assert.False(t, agent2.Health.Healthy)
assert.NotEmpty(t, agent2.Health.Reason)
})
})
}
func TestAdminViewAllWorkspaces(t *testing.T) {