feat: Add connection_timeout and troubleshooting_url to agent (#4937)

* feat: Add connection_timeout and troubleshooting_url to agent

This commit adds the connection timeout and troubleshooting url fields
to coder agents.

If the agent's initial connection is not established within
`connection_timeout` seconds, the agent status is marked as `"timeout"`.

If a troubleshooting URL is configured in the Terraform template, it is
included with the agent and can be presented to the user when the agent
status is either `"timeout"` or `"disconnected"`.

Fixes #4678
This commit is contained in:
Mathias Fredriksson
2022-11-09 17:27:05 +02:00
committed by GitHub
parent ed7de90a55
commit 90c34b74de
44 changed files with 857 additions and 423 deletions

View File

@ -154,6 +154,7 @@ func (q *fakeQuerier) AcquireProvisionerJob(_ context.Context, arg database.Acqu
}
return database.ProvisionerJob{}, sql.ErrNoRows
}
func (*fakeQuerier) DeleteOldAgentStats(_ context.Context) error {
// no-op
return nil
@ -2362,20 +2363,22 @@ func (q *fakeQuerier) InsertWorkspaceAgent(_ context.Context, arg database.Inser
defer q.mutex.Unlock()
agent := database.WorkspaceAgent{
ID: arg.ID,
CreatedAt: arg.CreatedAt,
UpdatedAt: arg.UpdatedAt,
ResourceID: arg.ResourceID,
AuthToken: arg.AuthToken,
AuthInstanceID: arg.AuthInstanceID,
EnvironmentVariables: arg.EnvironmentVariables,
Name: arg.Name,
Architecture: arg.Architecture,
OperatingSystem: arg.OperatingSystem,
Directory: arg.Directory,
StartupScript: arg.StartupScript,
InstanceMetadata: arg.InstanceMetadata,
ResourceMetadata: arg.ResourceMetadata,
ID: arg.ID,
CreatedAt: arg.CreatedAt,
UpdatedAt: arg.UpdatedAt,
ResourceID: arg.ResourceID,
AuthToken: arg.AuthToken,
AuthInstanceID: arg.AuthInstanceID,
EnvironmentVariables: arg.EnvironmentVariables,
Name: arg.Name,
Architecture: arg.Architecture,
OperatingSystem: arg.OperatingSystem,
Directory: arg.Directory,
StartupScript: arg.StartupScript,
InstanceMetadata: arg.InstanceMetadata,
ResourceMetadata: arg.ResourceMetadata,
ConnectionTimeoutSeconds: arg.ConnectionTimeoutSeconds,
TroubleshootingURL: arg.TroubleshootingURL,
}
q.provisionerJobAgents = append(q.provisionerJobAgents, agent)

View File

@ -400,11 +400,17 @@ CREATE TABLE workspace_agents (
resource_metadata jsonb,
directory character varying(4096) DEFAULT ''::character varying NOT NULL,
version text DEFAULT ''::text NOT NULL,
last_connected_replica_id uuid
last_connected_replica_id uuid,
connection_timeout_seconds integer DEFAULT 0 NOT NULL,
troubleshooting_url text DEFAULT ''::text NOT NULL
);
COMMENT ON COLUMN workspace_agents.version IS 'Version tracks the version of the currently running workspace agent. Workspace agents register their version upon start.';
COMMENT ON COLUMN workspace_agents.connection_timeout_seconds IS 'Connection timeout in seconds, 0 means disabled.';
COMMENT ON COLUMN workspace_agents.troubleshooting_url IS 'URL for troubleshooting the agent.';
CREATE TABLE workspace_apps (
id uuid NOT NULL,
created_at timestamp with time zone NOT NULL,

View File

@ -0,0 +1,9 @@
BEGIN;

-- Remove the connection_timeout_seconds and troubleshooting_url columns from
-- workspace_agents. Both drops are issued as actions of one ALTER TABLE.
ALTER TABLE workspace_agents
	DROP COLUMN connection_timeout_seconds,
	DROP COLUMN troubleshooting_url;

COMMIT;

View File

@ -0,0 +1,13 @@
BEGIN;

-- Add the agent connection timeout and troubleshooting URL columns to
-- workspace_agents. Both are NOT NULL with a default (0 and the empty
-- string respectively).
ALTER TABLE workspace_agents
	ADD COLUMN connection_timeout_seconds integer NOT NULL DEFAULT 0,
	ADD COLUMN troubleshooting_url text NOT NULL DEFAULT '';

-- Column descriptions stored alongside the schema.
COMMENT ON COLUMN workspace_agents.connection_timeout_seconds IS 'Connection timeout in seconds, 0 means disabled.';
COMMENT ON COLUMN workspace_agents.troubleshooting_url IS 'URL for troubleshooting the agent.';

COMMIT;

View File

@ -662,6 +662,10 @@ type WorkspaceAgent struct {
// Version tracks the version of the currently running workspace agent. Workspace agents register their version upon start.
Version string `db:"version" json:"version"`
LastConnectedReplicaID uuid.NullUUID `db:"last_connected_replica_id" json:"last_connected_replica_id"`
// Connection timeout in seconds, 0 means disabled.
ConnectionTimeoutSeconds int32 `db:"connection_timeout_seconds" json:"connection_timeout_seconds"`
// URL for troubleshooting the agent.
TroubleshootingURL string `db:"troubleshooting_url" json:"troubleshooting_url"`
}
type WorkspaceApp struct {

View File

@ -4520,7 +4520,7 @@ func (q *sqlQuerier) UpdateUserStatus(ctx context.Context, arg UpdateUserStatusP
const getWorkspaceAgentByAuthToken = `-- name: GetWorkspaceAgentByAuthToken :one
SELECT
id, created_at, updated_at, name, first_connected_at, last_connected_at, disconnected_at, resource_id, auth_token, auth_instance_id, architecture, environment_variables, operating_system, startup_script, instance_metadata, resource_metadata, directory, version, last_connected_replica_id
id, created_at, updated_at, name, first_connected_at, last_connected_at, disconnected_at, resource_id, auth_token, auth_instance_id, architecture, environment_variables, operating_system, startup_script, instance_metadata, resource_metadata, directory, version, last_connected_replica_id, connection_timeout_seconds, troubleshooting_url
FROM
workspace_agents
WHERE
@ -4552,13 +4552,15 @@ func (q *sqlQuerier) GetWorkspaceAgentByAuthToken(ctx context.Context, authToken
&i.Directory,
&i.Version,
&i.LastConnectedReplicaID,
&i.ConnectionTimeoutSeconds,
&i.TroubleshootingURL,
)
return i, err
}
const getWorkspaceAgentByID = `-- name: GetWorkspaceAgentByID :one
SELECT
id, created_at, updated_at, name, first_connected_at, last_connected_at, disconnected_at, resource_id, auth_token, auth_instance_id, architecture, environment_variables, operating_system, startup_script, instance_metadata, resource_metadata, directory, version, last_connected_replica_id
id, created_at, updated_at, name, first_connected_at, last_connected_at, disconnected_at, resource_id, auth_token, auth_instance_id, architecture, environment_variables, operating_system, startup_script, instance_metadata, resource_metadata, directory, version, last_connected_replica_id, connection_timeout_seconds, troubleshooting_url
FROM
workspace_agents
WHERE
@ -4588,13 +4590,15 @@ func (q *sqlQuerier) GetWorkspaceAgentByID(ctx context.Context, id uuid.UUID) (W
&i.Directory,
&i.Version,
&i.LastConnectedReplicaID,
&i.ConnectionTimeoutSeconds,
&i.TroubleshootingURL,
)
return i, err
}
const getWorkspaceAgentByInstanceID = `-- name: GetWorkspaceAgentByInstanceID :one
SELECT
id, created_at, updated_at, name, first_connected_at, last_connected_at, disconnected_at, resource_id, auth_token, auth_instance_id, architecture, environment_variables, operating_system, startup_script, instance_metadata, resource_metadata, directory, version, last_connected_replica_id
id, created_at, updated_at, name, first_connected_at, last_connected_at, disconnected_at, resource_id, auth_token, auth_instance_id, architecture, environment_variables, operating_system, startup_script, instance_metadata, resource_metadata, directory, version, last_connected_replica_id, connection_timeout_seconds, troubleshooting_url
FROM
workspace_agents
WHERE
@ -4626,13 +4630,15 @@ func (q *sqlQuerier) GetWorkspaceAgentByInstanceID(ctx context.Context, authInst
&i.Directory,
&i.Version,
&i.LastConnectedReplicaID,
&i.ConnectionTimeoutSeconds,
&i.TroubleshootingURL,
)
return i, err
}
const getWorkspaceAgentsByResourceIDs = `-- name: GetWorkspaceAgentsByResourceIDs :many
SELECT
id, created_at, updated_at, name, first_connected_at, last_connected_at, disconnected_at, resource_id, auth_token, auth_instance_id, architecture, environment_variables, operating_system, startup_script, instance_metadata, resource_metadata, directory, version, last_connected_replica_id
id, created_at, updated_at, name, first_connected_at, last_connected_at, disconnected_at, resource_id, auth_token, auth_instance_id, architecture, environment_variables, operating_system, startup_script, instance_metadata, resource_metadata, directory, version, last_connected_replica_id, connection_timeout_seconds, troubleshooting_url
FROM
workspace_agents
WHERE
@ -4668,6 +4674,8 @@ func (q *sqlQuerier) GetWorkspaceAgentsByResourceIDs(ctx context.Context, ids []
&i.Directory,
&i.Version,
&i.LastConnectedReplicaID,
&i.ConnectionTimeoutSeconds,
&i.TroubleshootingURL,
); err != nil {
return nil, err
}
@ -4683,7 +4691,7 @@ func (q *sqlQuerier) GetWorkspaceAgentsByResourceIDs(ctx context.Context, ids []
}
const getWorkspaceAgentsCreatedAfter = `-- name: GetWorkspaceAgentsCreatedAfter :many
SELECT id, created_at, updated_at, name, first_connected_at, last_connected_at, disconnected_at, resource_id, auth_token, auth_instance_id, architecture, environment_variables, operating_system, startup_script, instance_metadata, resource_metadata, directory, version, last_connected_replica_id FROM workspace_agents WHERE created_at > $1
SELECT id, created_at, updated_at, name, first_connected_at, last_connected_at, disconnected_at, resource_id, auth_token, auth_instance_id, architecture, environment_variables, operating_system, startup_script, instance_metadata, resource_metadata, directory, version, last_connected_replica_id, connection_timeout_seconds, troubleshooting_url FROM workspace_agents WHERE created_at > $1
`
func (q *sqlQuerier) GetWorkspaceAgentsCreatedAfter(ctx context.Context, createdAt time.Time) ([]WorkspaceAgent, error) {
@ -4715,6 +4723,8 @@ func (q *sqlQuerier) GetWorkspaceAgentsCreatedAfter(ctx context.Context, created
&i.Directory,
&i.Version,
&i.LastConnectedReplicaID,
&i.ConnectionTimeoutSeconds,
&i.TroubleshootingURL,
); err != nil {
return nil, err
}
@ -4745,27 +4755,31 @@ INSERT INTO
startup_script,
directory,
instance_metadata,
resource_metadata
resource_metadata,
connection_timeout_seconds,
troubleshooting_url
)
VALUES
($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14) RETURNING id, created_at, updated_at, name, first_connected_at, last_connected_at, disconnected_at, resource_id, auth_token, auth_instance_id, architecture, environment_variables, operating_system, startup_script, instance_metadata, resource_metadata, directory, version, last_connected_replica_id
($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16) RETURNING id, created_at, updated_at, name, first_connected_at, last_connected_at, disconnected_at, resource_id, auth_token, auth_instance_id, architecture, environment_variables, operating_system, startup_script, instance_metadata, resource_metadata, directory, version, last_connected_replica_id, connection_timeout_seconds, troubleshooting_url
`
type InsertWorkspaceAgentParams struct {
ID uuid.UUID `db:"id" json:"id"`
CreatedAt time.Time `db:"created_at" json:"created_at"`
UpdatedAt time.Time `db:"updated_at" json:"updated_at"`
Name string `db:"name" json:"name"`
ResourceID uuid.UUID `db:"resource_id" json:"resource_id"`
AuthToken uuid.UUID `db:"auth_token" json:"auth_token"`
AuthInstanceID sql.NullString `db:"auth_instance_id" json:"auth_instance_id"`
Architecture string `db:"architecture" json:"architecture"`
EnvironmentVariables pqtype.NullRawMessage `db:"environment_variables" json:"environment_variables"`
OperatingSystem string `db:"operating_system" json:"operating_system"`
StartupScript sql.NullString `db:"startup_script" json:"startup_script"`
Directory string `db:"directory" json:"directory"`
InstanceMetadata pqtype.NullRawMessage `db:"instance_metadata" json:"instance_metadata"`
ResourceMetadata pqtype.NullRawMessage `db:"resource_metadata" json:"resource_metadata"`
ID uuid.UUID `db:"id" json:"id"`
CreatedAt time.Time `db:"created_at" json:"created_at"`
UpdatedAt time.Time `db:"updated_at" json:"updated_at"`
Name string `db:"name" json:"name"`
ResourceID uuid.UUID `db:"resource_id" json:"resource_id"`
AuthToken uuid.UUID `db:"auth_token" json:"auth_token"`
AuthInstanceID sql.NullString `db:"auth_instance_id" json:"auth_instance_id"`
Architecture string `db:"architecture" json:"architecture"`
EnvironmentVariables pqtype.NullRawMessage `db:"environment_variables" json:"environment_variables"`
OperatingSystem string `db:"operating_system" json:"operating_system"`
StartupScript sql.NullString `db:"startup_script" json:"startup_script"`
Directory string `db:"directory" json:"directory"`
InstanceMetadata pqtype.NullRawMessage `db:"instance_metadata" json:"instance_metadata"`
ResourceMetadata pqtype.NullRawMessage `db:"resource_metadata" json:"resource_metadata"`
ConnectionTimeoutSeconds int32 `db:"connection_timeout_seconds" json:"connection_timeout_seconds"`
TroubleshootingURL string `db:"troubleshooting_url" json:"troubleshooting_url"`
}
func (q *sqlQuerier) InsertWorkspaceAgent(ctx context.Context, arg InsertWorkspaceAgentParams) (WorkspaceAgent, error) {
@ -4784,6 +4798,8 @@ func (q *sqlQuerier) InsertWorkspaceAgent(ctx context.Context, arg InsertWorkspa
arg.Directory,
arg.InstanceMetadata,
arg.ResourceMetadata,
arg.ConnectionTimeoutSeconds,
arg.TroubleshootingURL,
)
var i WorkspaceAgent
err := row.Scan(
@ -4806,6 +4822,8 @@ func (q *sqlQuerier) InsertWorkspaceAgent(ctx context.Context, arg InsertWorkspa
&i.Directory,
&i.Version,
&i.LastConnectedReplicaID,
&i.ConnectionTimeoutSeconds,
&i.TroubleshootingURL,
)
return i, err
}

View File

@ -53,10 +53,12 @@ INSERT INTO
startup_script,
directory,
instance_metadata,
resource_metadata
resource_metadata,
connection_timeout_seconds,
troubleshooting_url
)
VALUES
($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14) RETURNING *;
($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16) RETURNING *;
-- name: UpdateWorkspaceAgentConnectionByID :exec
UPDATE

View File

@ -47,3 +47,4 @@ rename:
jwt: JWT
user_acl: UserACL
group_acl: GroupACL
troubleshooting_url: TroubleshootingURL

View File

@ -678,7 +678,7 @@ func InsertWorkspaceResource(ctx context.Context, db database.Store, jobID uuid.
}
snapshot.WorkspaceResources = append(snapshot.WorkspaceResources, telemetry.ConvertWorkspaceResource(resource))
var appSlugs = make(map[string]struct{})
appSlugs := make(map[string]struct{})
for _, prAgent := range protoResource.Agents {
var instanceID sql.NullString
if prAgent.GetInstanceId() != "" {
@ -723,6 +723,8 @@ func InsertWorkspaceResource(ctx context.Context, db database.Store, jobID uuid.
String: prAgent.StartupScript,
Valid: prAgent.StartupScript != "",
},
ConnectionTimeoutSeconds: prAgent.GetConnectionTimeoutSeconds(),
TroubleshootingURL: prAgent.GetTroubleshootingUrl(),
})
if err != nil {
return xerrors.Errorf("insert agent: %w", err)

View File

@ -669,19 +669,21 @@ func convertWorkspaceAgent(derpMap *tailcfg.DERPMap, coordinator tailnet.Coordin
}
}
workspaceAgent := codersdk.WorkspaceAgent{
ID: dbAgent.ID,
CreatedAt: dbAgent.CreatedAt,
UpdatedAt: dbAgent.UpdatedAt,
ResourceID: dbAgent.ResourceID,
InstanceID: dbAgent.AuthInstanceID.String,
Name: dbAgent.Name,
Architecture: dbAgent.Architecture,
OperatingSystem: dbAgent.OperatingSystem,
StartupScript: dbAgent.StartupScript.String,
Version: dbAgent.Version,
EnvironmentVariables: envs,
Directory: dbAgent.Directory,
Apps: apps,
ID: dbAgent.ID,
CreatedAt: dbAgent.CreatedAt,
UpdatedAt: dbAgent.UpdatedAt,
ResourceID: dbAgent.ResourceID,
InstanceID: dbAgent.AuthInstanceID.String,
Name: dbAgent.Name,
Architecture: dbAgent.Architecture,
OperatingSystem: dbAgent.OperatingSystem,
StartupScript: dbAgent.StartupScript.String,
Version: dbAgent.Version,
EnvironmentVariables: envs,
Directory: dbAgent.Directory,
Apps: apps,
ConnectionTimeoutSeconds: dbAgent.ConnectionTimeoutSeconds,
TroubleshootingURL: dbAgent.TroubleshootingURL,
}
node := coordinator.Node(dbAgent.ID)
if node != nil {
@ -718,11 +720,20 @@ func convertWorkspaceAgent(derpMap *tailcfg.DERPMap, coordinator tailnet.Coordin
if dbAgent.DisconnectedAt.Valid {
workspaceAgent.DisconnectedAt = &dbAgent.DisconnectedAt.Time
}
connectionTimeout := time.Duration(dbAgent.ConnectionTimeoutSeconds) * time.Second
switch {
case !dbAgent.FirstConnectedAt.Valid:
// If the agent never connected, it's waiting for the compute
// to start up.
workspaceAgent.Status = codersdk.WorkspaceAgentConnecting
switch {
case connectionTimeout > 0 && database.Now().Sub(dbAgent.CreatedAt) > connectionTimeout:
// If the agent took too long to connect the first time,
// mark it as timed out.
workspaceAgent.Status = codersdk.WorkspaceAgentTimeout
default:
// If the agent never connected, it's waiting for the compute
// to start up.
workspaceAgent.Status = codersdk.WorkspaceAgentConnecting
}
case dbAgent.DisconnectedAt.Time.After(dbAgent.LastConnectedAt.Time):
// If we've disconnected after our last connection, we know the
// agent is no longer connected.

View File

@ -74,6 +74,53 @@ func TestWorkspaceAgent(t *testing.T) {
_, err = client.WorkspaceAgent(ctx, workspace.LatestBuild.Resources[0].Agents[0].ID)
require.NoError(t, err)
})
// Timeout provisions a workspace whose single agent has a one-second
// connection timeout and then polls the API until that agent is reported
// with status codersdk.WorkspaceAgentTimeout.
t.Run("Timeout", func(t *testing.T) {
t.Parallel()
client := coderdtest.New(t, &coderdtest.Options{
IncludeProvisionerDaemon: true,
})
user := coderdtest.CreateFirstUser(t, client)
authToken := uuid.NewString()
tmpDir := t.TempDir()
// The echo provisioner responses describe one resource with one agent.
// Nothing in this subtest starts an agent using authToken, so the agent
// is expected to remain unconnected.
version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{
Parse: echo.ParseComplete,
ProvisionDryRun: echo.ProvisionComplete,
Provision: []*proto.Provision_Response{{
Type: &proto.Provision_Response_Complete{
Complete: &proto.Provision_Complete{
Resources: []*proto.Resource{{
Name: "example",
Type: "aws_instance",
Agents: []*proto.Agent{{
Id: uuid.NewString(),
Directory: tmpDir,
Auth: &proto.Agent_Token{
Token: authToken,
},
// A one-second timeout keeps the wait for the timeout status short.
ConnectionTimeoutSeconds: 1,
TroubleshootingUrl: "https://example.com/troubleshoot",
}},
}},
},
},
}},
})
template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID)
coderdtest.AwaitTemplateVersionJob(t, client, version.ID)
workspace := coderdtest.CreateWorkspace(t, client, user.OrganizationID, template.ID)
coderdtest.AwaitWorkspaceBuildJob(t, client, workspace.LatestBuild.ID)
// Bound the polling below by testutil.WaitMedium.
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitMedium)
defer cancel()
// NOTE(review): testutil.Eventually presumably re-runs the callback every
// testutil.IntervalMedium until it returns true or ctx is done — confirm
// against the helper.
testutil.Eventually(ctx, t, func(ctx context.Context) (done bool) {
// Shadows the outer workspace with a freshly fetched copy so the
// current agent status is observed on every attempt.
workspace, err := client.Workspace(ctx, workspace.ID)
if !assert.NoError(t, err) {
return false
}
return workspace.LatestBuild.Resources[0].Agents[0].Status == codersdk.WorkspaceAgentTimeout
}, testutil.IntervalMedium, "agent status timeout")
})
}
func TestWorkspaceAgentListen(t *testing.T) {