feat: expose template insights as Prometheus metrics (#10325)

This commit is contained in:
Marcin Tojek
2023-10-19 10:45:12 +02:00
committed by GitHub
parent 997493d4ae
commit c4f590581e
11 changed files with 555 additions and 0 deletions

View File

@ -1327,6 +1327,13 @@ func (q *querier) GetTemplateInsightsByInterval(ctx context.Context, arg databas
return q.db.GetTemplateInsightsByInterval(ctx, arg)
}
func (q *querier) GetTemplateInsightsByTemplate(ctx context.Context, arg database.GetTemplateInsightsByTemplateParams) ([]database.GetTemplateInsightsByTemplateRow, error) {
if err := q.authorizeContext(ctx, rbac.ActionUpdate, rbac.ResourceTemplate.All()); err != nil {
return nil, err
}
return q.db.GetTemplateInsightsByTemplate(ctx, arg)
}
func (q *querier) GetTemplateParameterInsights(ctx context.Context, arg database.GetTemplateParameterInsightsParams) ([]database.GetTemplateParameterInsightsRow, error) {
for _, templateID := range arg.TemplateIDs {
template, err := q.db.GetTemplateByID(ctx, templateID)

View File

@ -2500,6 +2500,10 @@ func (q *FakeQuerier) GetTemplateInsights(_ context.Context, arg database.GetTem
templateIDSet := make(map[uuid.UUID]struct{})
appUsageIntervalsByUser := make(map[uuid.UUID]map[time.Time]*database.GetTemplateInsightsRow)
q.mutex.RLock()
defer q.mutex.RUnlock()
for _, s := range q.workspaceAgentStats {
if s.CreatedAt.Before(arg.StartTime) || s.CreatedAt.Equal(arg.EndTime) || s.CreatedAt.After(arg.EndTime) {
continue
@ -2648,6 +2652,101 @@ func (q *FakeQuerier) GetTemplateInsightsByInterval(ctx context.Context, arg dat
return result, nil
}
func (q *FakeQuerier) GetTemplateInsightsByTemplate(_ context.Context, arg database.GetTemplateInsightsByTemplateParams) ([]database.GetTemplateInsightsByTemplateRow, error) {
err := validateDatabaseType(arg)
if err != nil {
return nil, err
}
q.mutex.RLock()
defer q.mutex.RUnlock()
// map time.Time x TemplateID x UserID x <usage>
appUsageByTemplateAndUser := map[time.Time]map[uuid.UUID]map[uuid.UUID]database.GetTemplateInsightsByTemplateRow{}
// Review agent stats in terms of usage
templateIDSet := make(map[uuid.UUID]struct{})
for _, s := range q.workspaceAgentStats {
if s.CreatedAt.Before(arg.StartTime) || s.CreatedAt.Equal(arg.EndTime) || s.CreatedAt.After(arg.EndTime) {
continue
}
if s.ConnectionCount == 0 {
continue
}
t := s.CreatedAt.Truncate(time.Minute)
templateIDSet[s.TemplateID] = struct{}{}
if _, ok := appUsageByTemplateAndUser[t]; !ok {
appUsageByTemplateAndUser[t] = make(map[uuid.UUID]map[uuid.UUID]database.GetTemplateInsightsByTemplateRow)
}
if _, ok := appUsageByTemplateAndUser[t][s.TemplateID]; !ok {
appUsageByTemplateAndUser[t][s.TemplateID] = make(map[uuid.UUID]database.GetTemplateInsightsByTemplateRow)
}
if _, ok := appUsageByTemplateAndUser[t][s.TemplateID][s.UserID]; !ok {
appUsageByTemplateAndUser[t][s.TemplateID][s.UserID] = database.GetTemplateInsightsByTemplateRow{}
}
u := appUsageByTemplateAndUser[t][s.TemplateID][s.UserID]
if s.SessionCountJetBrains > 0 {
u.UsageJetbrainsSeconds = 60
}
if s.SessionCountVSCode > 0 {
u.UsageVscodeSeconds = 60
}
if s.SessionCountReconnectingPTY > 0 {
u.UsageReconnectingPtySeconds = 60
}
if s.SessionCountSSH > 0 {
u.UsageSshSeconds = 60
}
appUsageByTemplateAndUser[t][s.TemplateID][s.UserID] = u
}
// Sort used templates
templateIDs := make([]uuid.UUID, 0, len(templateIDSet))
for templateID := range templateIDSet {
templateIDs = append(templateIDs, templateID)
}
slices.SortFunc(templateIDs, func(a, b uuid.UUID) int {
return slice.Ascending(a.String(), b.String())
})
// Build result
var result []database.GetTemplateInsightsByTemplateRow
for _, templateID := range templateIDs {
r := database.GetTemplateInsightsByTemplateRow{
TemplateID: templateID,
}
uniqueUsers := map[uuid.UUID]struct{}{}
for _, mTemplateUserUsage := range appUsageByTemplateAndUser {
mUserUsage, ok := mTemplateUserUsage[templateID]
if !ok {
continue // template was not used in this time window
}
for userID, usage := range mUserUsage {
uniqueUsers[userID] = struct{}{}
r.UsageJetbrainsSeconds += usage.UsageJetbrainsSeconds
r.UsageVscodeSeconds += usage.UsageVscodeSeconds
r.UsageReconnectingPtySeconds += usage.UsageReconnectingPtySeconds
r.UsageSshSeconds += usage.UsageSshSeconds
}
}
r.ActiveUsers = int64(len(uniqueUsers))
result = append(result, r)
}
return result, nil
}
func (q *FakeQuerier) GetTemplateParameterInsights(ctx context.Context, arg database.GetTemplateParameterInsightsParams) ([]database.GetTemplateParameterInsightsRow, error) {
err := validateDatabaseType(arg)
if err != nil {

View File

@ -704,6 +704,13 @@ func (m metricsStore) GetTemplateInsightsByInterval(ctx context.Context, arg dat
return r0, r1
}
func (m metricsStore) GetTemplateInsightsByTemplate(ctx context.Context, arg database.GetTemplateInsightsByTemplateParams) ([]database.GetTemplateInsightsByTemplateRow, error) {
start := time.Now()
r0, r1 := m.s.GetTemplateInsightsByTemplate(ctx, arg)
m.queryLatencies.WithLabelValues("GetTemplateInsightsByTemplate").Observe(time.Since(start).Seconds())
return r0, r1
}
func (m metricsStore) GetTemplateParameterInsights(ctx context.Context, arg database.GetTemplateParameterInsightsParams) ([]database.GetTemplateParameterInsightsRow, error) {
start := time.Now()
r0, r1 := m.s.GetTemplateParameterInsights(ctx, arg)

View File

@ -1433,6 +1433,21 @@ func (mr *MockStoreMockRecorder) GetTemplateInsightsByInterval(arg0, arg1 interf
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetTemplateInsightsByInterval", reflect.TypeOf((*MockStore)(nil).GetTemplateInsightsByInterval), arg0, arg1)
}
// GetTemplateInsightsByTemplate mocks base method.
func (m *MockStore) GetTemplateInsightsByTemplate(arg0 context.Context, arg1 database.GetTemplateInsightsByTemplateParams) ([]database.GetTemplateInsightsByTemplateRow, error) {
m.ctrl.T.Helper()
ret := m.ctrl.Call(m, "GetTemplateInsightsByTemplate", arg0, arg1)
ret0, _ := ret[0].([]database.GetTemplateInsightsByTemplateRow)
ret1, _ := ret[1].(error)
return ret0, ret1
}
// GetTemplateInsightsByTemplate indicates an expected call of GetTemplateInsightsByTemplate.
func (mr *MockStoreMockRecorder) GetTemplateInsightsByTemplate(arg0, arg1 interface{}) *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetTemplateInsightsByTemplate", reflect.TypeOf((*MockStore)(nil).GetTemplateInsightsByTemplate), arg0, arg1)
}
// GetTemplateParameterInsights mocks base method.
func (m *MockStore) GetTemplateParameterInsights(arg0 context.Context, arg1 database.GetTemplateParameterInsightsParams) ([]database.GetTemplateParameterInsightsRow, error) {
m.ctrl.T.Helper()

View File

@ -145,6 +145,7 @@ type sqlcQuerier interface {
// that interval will be shorter than a full one. If there is no data for a selected
// interval/template, it will be included in the results with 0 active users.
GetTemplateInsightsByInterval(ctx context.Context, arg GetTemplateInsightsByIntervalParams) ([]GetTemplateInsightsByIntervalRow, error)
GetTemplateInsightsByTemplate(ctx context.Context, arg GetTemplateInsightsByTemplateParams) ([]GetTemplateInsightsByTemplateRow, error)
// GetTemplateParameterInsights does for each template in a given timeframe,
// look for the latest workspace build (for every workspace) that has been
// created in the timeframe and return the aggregate usage counts of parameter

View File

@ -1937,6 +1937,79 @@ func (q *sqlQuerier) GetTemplateInsightsByInterval(ctx context.Context, arg GetT
return items, nil
}
const getTemplateInsightsByTemplate = `-- name: GetTemplateInsightsByTemplate :many
WITH agent_stats_by_interval_and_user AS (
SELECT
date_trunc('minute', was.created_at) AS created_at_trunc,
was.template_id,
was.user_id,
CASE WHEN SUM(was.session_count_vscode) > 0 THEN 60 ELSE 0 END AS usage_vscode_seconds,
CASE WHEN SUM(was.session_count_jetbrains) > 0 THEN 60 ELSE 0 END AS usage_jetbrains_seconds,
CASE WHEN SUM(was.session_count_reconnecting_pty) > 0 THEN 60 ELSE 0 END AS usage_reconnecting_pty_seconds,
CASE WHEN SUM(was.session_count_ssh) > 0 THEN 60 ELSE 0 END AS usage_ssh_seconds
FROM workspace_agent_stats was
WHERE
was.created_at >= $1::timestamptz
AND was.created_at < $2::timestamptz
AND was.connection_count > 0
GROUP BY created_at_trunc, was.template_id, was.user_id
)
SELECT
template_id,
COALESCE(COUNT(DISTINCT user_id))::bigint AS active_users,
COALESCE(SUM(usage_vscode_seconds), 0)::bigint AS usage_vscode_seconds,
COALESCE(SUM(usage_jetbrains_seconds), 0)::bigint AS usage_jetbrains_seconds,
COALESCE(SUM(usage_reconnecting_pty_seconds), 0)::bigint AS usage_reconnecting_pty_seconds,
COALESCE(SUM(usage_ssh_seconds), 0)::bigint AS usage_ssh_seconds
FROM agent_stats_by_interval_and_user
GROUP BY template_id
`
type GetTemplateInsightsByTemplateParams struct {
StartTime time.Time `db:"start_time" json:"start_time"`
EndTime time.Time `db:"end_time" json:"end_time"`
}
type GetTemplateInsightsByTemplateRow struct {
TemplateID uuid.UUID `db:"template_id" json:"template_id"`
ActiveUsers int64 `db:"active_users" json:"active_users"`
UsageVscodeSeconds int64 `db:"usage_vscode_seconds" json:"usage_vscode_seconds"`
UsageJetbrainsSeconds int64 `db:"usage_jetbrains_seconds" json:"usage_jetbrains_seconds"`
UsageReconnectingPtySeconds int64 `db:"usage_reconnecting_pty_seconds" json:"usage_reconnecting_pty_seconds"`
UsageSshSeconds int64 `db:"usage_ssh_seconds" json:"usage_ssh_seconds"`
}
func (q *sqlQuerier) GetTemplateInsightsByTemplate(ctx context.Context, arg GetTemplateInsightsByTemplateParams) ([]GetTemplateInsightsByTemplateRow, error) {
rows, err := q.db.QueryContext(ctx, getTemplateInsightsByTemplate, arg.StartTime, arg.EndTime)
if err != nil {
return nil, err
}
defer rows.Close()
var items []GetTemplateInsightsByTemplateRow
for rows.Next() {
var i GetTemplateInsightsByTemplateRow
if err := rows.Scan(
&i.TemplateID,
&i.ActiveUsers,
&i.UsageVscodeSeconds,
&i.UsageJetbrainsSeconds,
&i.UsageReconnectingPtySeconds,
&i.UsageSshSeconds,
); err != nil {
return nil, err
}
items = append(items, i)
}
if err := rows.Close(); err != nil {
return nil, err
}
if err := rows.Err(); err != nil {
return nil, err
}
return items, nil
}
const getTemplateParameterInsights = `-- name: GetTemplateParameterInsights :many
WITH latest_workspace_builds AS (
SELECT

View File

@ -134,6 +134,34 @@ SELECT
COALESCE(SUM(usage_ssh_seconds), 0)::bigint AS usage_ssh_seconds
FROM agent_stats_by_interval_and_user;
-- name: GetTemplateInsightsByTemplate :many
WITH agent_stats_by_interval_and_user AS (
SELECT
date_trunc('minute', was.created_at) AS created_at_trunc,
was.template_id,
was.user_id,
CASE WHEN SUM(was.session_count_vscode) > 0 THEN 60 ELSE 0 END AS usage_vscode_seconds,
CASE WHEN SUM(was.session_count_jetbrains) > 0 THEN 60 ELSE 0 END AS usage_jetbrains_seconds,
CASE WHEN SUM(was.session_count_reconnecting_pty) > 0 THEN 60 ELSE 0 END AS usage_reconnecting_pty_seconds,
CASE WHEN SUM(was.session_count_ssh) > 0 THEN 60 ELSE 0 END AS usage_ssh_seconds
FROM workspace_agent_stats was
WHERE
was.created_at >= @start_time::timestamptz
AND was.created_at < @end_time::timestamptz
AND was.connection_count > 0
GROUP BY created_at_trunc, was.template_id, was.user_id
)
SELECT
template_id,
COALESCE(COUNT(DISTINCT user_id))::bigint AS active_users,
COALESCE(SUM(usage_vscode_seconds), 0)::bigint AS usage_vscode_seconds,
COALESCE(SUM(usage_jetbrains_seconds), 0)::bigint AS usage_jetbrains_seconds,
COALESCE(SUM(usage_reconnecting_pty_seconds), 0)::bigint AS usage_reconnecting_pty_seconds,
COALESCE(SUM(usage_ssh_seconds), 0)::bigint AS usage_ssh_seconds
FROM agent_stats_by_interval_and_user
GROUP BY template_id;
-- name: GetTemplateAppInsights :many
-- GetTemplateAppInsights returns the aggregate usage of each app in a given
-- timeframe. The result can be filtered on template_ids, meaning only user data

View File

@ -0,0 +1,174 @@
package insights
import (
"context"
"sync/atomic"
"time"
"github.com/google/uuid"
"github.com/prometheus/client_golang/prometheus"
"golang.org/x/sync/errgroup"
"golang.org/x/xerrors"
"cdr.dev/slog"
"github.com/coder/coder/v2/coderd/database"
)
var templatesActiveUsersDesc = prometheus.NewDesc("coderd_insights_templates_active_users", "The number of active users of the template.", []string{"template_name"}, nil)
type MetricsCollector struct {
database database.Store
logger slog.Logger
timeWindow time.Duration
tickInterval time.Duration
data atomic.Pointer[insightsData]
}
type insightsData struct {
templates []database.GetTemplateInsightsByTemplateRow
templateNames map[uuid.UUID]string
}
var _ prometheus.Collector = new(MetricsCollector)
func NewMetricsCollector(db database.Store, logger slog.Logger, timeWindow time.Duration, tickInterval time.Duration) (*MetricsCollector, error) {
if timeWindow == 0 {
timeWindow = 5 * time.Minute
}
if timeWindow < 5*time.Minute {
return nil, xerrors.Errorf("time window must be at least 5 mins")
}
if tickInterval == 0 {
tickInterval = timeWindow
}
return &MetricsCollector{
database: db,
logger: logger.Named("insights_metrics_collector"),
timeWindow: timeWindow,
tickInterval: tickInterval,
}, nil
}
func (mc *MetricsCollector) Run(ctx context.Context) (func(), error) {
ctx, closeFunc := context.WithCancel(ctx)
done := make(chan struct{})
// Use time.Nanosecond to force an initial tick. It will be reset to the
// correct duration after executing once.
ticker := time.NewTicker(time.Nanosecond)
doTick := func() {
defer ticker.Reset(mc.tickInterval)
now := time.Now()
startTime := now.Add(-mc.timeWindow)
endTime := now
// Phase 1: Fetch insights from database
// FIXME errorGroup will be used to fetch insights for apps and parameters
eg, egCtx := errgroup.WithContext(ctx)
eg.SetLimit(1)
var templateInsights []database.GetTemplateInsightsByTemplateRow
eg.Go(func() error {
var err error
templateInsights, err = mc.database.GetTemplateInsightsByTemplate(egCtx, database.GetTemplateInsightsByTemplateParams{
StartTime: startTime,
EndTime: endTime,
})
if err != nil {
mc.logger.Error(ctx, "unable to fetch template insights from database", slog.Error(err))
}
return err
})
err := eg.Wait()
if err != nil {
return
}
// Phase 2: Collect template IDs, and fetch relevant details
templateIDs := uniqueTemplateIDs(templateInsights)
templateNames := make(map[uuid.UUID]string, len(templateIDs))
if len(templateIDs) > 0 {
templates, err := mc.database.GetTemplatesWithFilter(ctx, database.GetTemplatesWithFilterParams{
IDs: templateIDs,
})
if err != nil {
mc.logger.Error(ctx, "unable to fetch template details from database", slog.Error(err))
return
}
templateNames = onlyTemplateNames(templates)
}
// Refresh the collector state
mc.data.Store(&insightsData{
templates: templateInsights,
templateNames: templateNames,
})
}
go func() {
defer close(done)
defer ticker.Stop()
for {
select {
case <-ctx.Done():
return
case <-ticker.C:
ticker.Stop()
doTick()
}
}
}()
return func() {
closeFunc()
<-done
}, nil
}
func (*MetricsCollector) Describe(descCh chan<- *prometheus.Desc) {
descCh <- templatesActiveUsersDesc
}
func (mc *MetricsCollector) Collect(metricsCh chan<- prometheus.Metric) {
// Phase 3: Collect metrics
data := mc.data.Load()
if data == nil {
return // insights data not loaded yet
}
for _, templateRow := range data.templates {
metricsCh <- prometheus.MustNewConstMetric(templatesActiveUsersDesc, prometheus.GaugeValue, float64(templateRow.ActiveUsers), data.templateNames[templateRow.TemplateID])
}
}
// Helper functions below.
func uniqueTemplateIDs(templateInsights []database.GetTemplateInsightsByTemplateRow) []uuid.UUID {
tids := map[uuid.UUID]bool{}
for _, t := range templateInsights {
tids[t.TemplateID] = true
}
uniqueUUIDs := make([]uuid.UUID, len(tids))
var i int
for t := range tids {
uniqueUUIDs[i] = t
i++
}
return uniqueUUIDs
}
func onlyTemplateNames(templates []database.Template) map[uuid.UUID]string {
m := map[uuid.UUID]string{}
for _, t := range templates {
m[t.ID] = t.Name
}
return m
}

View File

@ -0,0 +1,132 @@
package insights_test
import (
"context"
"encoding/json"
"io"
"os"
"testing"
"time"
"github.com/google/uuid"
"github.com/prometheus/client_golang/prometheus"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"cdr.dev/slog/sloggers/slogtest"
"github.com/coder/coder/v2/agent/agenttest"
"github.com/coder/coder/v2/coderd/coderdtest"
"github.com/coder/coder/v2/coderd/database/dbtestutil"
"github.com/coder/coder/v2/coderd/prometheusmetrics/insights"
"github.com/coder/coder/v2/codersdk"
"github.com/coder/coder/v2/provisioner/echo"
"github.com/coder/coder/v2/testutil"
)
func TestCollect_TemplateInsights(t *testing.T) {
t.Parallel()
logger := slogtest.Make(t, &slogtest.Options{IgnoreErrors: true})
db, ps := dbtestutil.NewDB(t)
options := &coderdtest.Options{
IncludeProvisionerDaemon: true,
AgentStatsRefreshInterval: time.Millisecond * 100,
Database: db,
Pubsub: ps,
}
client := coderdtest.New(t, options)
// Given
// Initialize metrics collector
mc, err := insights.NewMetricsCollector(db, logger, 0, time.Second)
require.NoError(t, err)
registry := prometheus.NewRegistry()
registry.Register(mc)
// Create two users, one that will appear in the report and another that
// won't (due to not having/using a workspace).
user := coderdtest.CreateFirstUser(t, client)
_, _ = coderdtest.CreateAnotherUser(t, client, user.OrganizationID)
authToken := uuid.NewString()
version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{
Parse: echo.ParseComplete,
ProvisionPlan: echo.PlanComplete,
ProvisionApply: echo.ProvisionApplyWithAgent(authToken),
})
template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID)
require.Empty(t, template.BuildTimeStats[codersdk.WorkspaceTransitionStart])
coderdtest.AwaitTemplateVersionJobCompleted(t, client, version.ID)
workspace := coderdtest.CreateWorkspace(t, client, user.OrganizationID, template.ID)
coderdtest.AwaitWorkspaceBuildJobCompleted(t, client, workspace.LatestBuild.ID)
// Start an agent so that we can generate stats.
_ = agenttest.New(t, client.URL, authToken)
resources := coderdtest.AwaitWorkspaceAgents(t, client, workspace.ID)
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong)
defer cancel()
// Run metrics collector
closeFunc, err := mc.Run(ctx)
require.NoError(t, err)
defer closeFunc()
// Connect to the agent to generate usage/latency stats.
conn, err := client.DialWorkspaceAgent(ctx, resources[0].Agents[0].ID, &codersdk.DialWorkspaceAgentOptions{
Logger: logger.Named("client"),
})
require.NoError(t, err)
defer conn.Close()
sshConn, err := conn.SSHClient(ctx)
require.NoError(t, err)
defer sshConn.Close()
sess, err := sshConn.NewSession()
require.NoError(t, err)
defer sess.Close()
r, w := io.Pipe()
defer r.Close()
defer w.Close()
sess.Stdin = r
sess.Stdout = io.Discard
err = sess.Start("cat")
require.NoError(t, err)
goldenFile, err := os.ReadFile("testdata/insights-metrics.json")
require.NoError(t, err)
golden := map[string]int{}
err = json.Unmarshal(goldenFile, &golden)
require.NoError(t, err)
collected := map[string]int{}
assert.Eventuallyf(t, func() bool {
// When
metrics, err := registry.Gather()
require.NoError(t, err)
// Then
for _, metric := range metrics {
switch metric.GetName() {
case "coderd_insights_templates_active_users":
for _, m := range metric.Metric {
collected[metric.GetName()] = int(m.Gauge.GetValue())
}
default:
require.FailNowf(t, "unexpected metric collected", "metric: %s", metric.GetName())
}
}
return assert.ObjectsAreEqualValues(golden, collected)
}, testutil.WaitMedium, testutil.IntervalFast, "template insights are missing")
// We got our latency metrics, close the connection.
_ = sess.Close()
_ = sshConn.Close()
require.EqualValues(t, golden, collected)
}

View File

@ -0,0 +1,3 @@
{
"coderd_insights_templates_active_users": 1
}