chore: increase fileCache hit rate in autobuilds lifecycle (#18507)

`wsbuilder` hits the file cache when running validation. This solution is imperfect, but by first sorting workspaces by their template version ID, the cache hit rate should improve.
This commit is contained in:
Steven Masley
2025-06-24 07:36:39 -05:00
committed by GitHub
parent 40667855b1
commit 7b152cdd91
5 changed files with 36 additions and 12 deletions

View File

@ -5,6 +5,8 @@ import (
"database/sql"
"fmt"
"net/http"
"slices"
"strings"
"sync"
"sync/atomic"
"time"
@ -155,6 +157,22 @@ func (e *Executor) runOnce(t time.Time) Stats {
return stats
}
// Sort the workspaces by build template version ID so that we can group
// identical template versions together. This is a slight (and imperfect)
// optimization.
//
// `wsbuilder` needs to load the terraform files for a given template version
// into memory. If 2 workspaces are using the same template version, they will
// share the same files in the FileCache. This only happens if the builds happen
// in parallel.
// TODO: Actually make sure the cache holds the files for the full
// set of identical template versions. Then unload the files when the builds
// are done. Right now, this relies on luck for the 10 goroutine workers to
// overlap and keep the file reference in the cache alive.
slices.SortFunc(workspaces, func(a, b database.GetWorkspacesEligibleForTransitionRow) int {
return strings.Compare(a.BuildTemplateVersionID.UUID.String(), b.BuildTemplateVersionID.UUID.String())
})
// We only use errgroup here for convenience of API, not for early
// cancellation. This means we only return nil errors in the eg.Go.
eg := errgroup.Group{}

View File

@ -19443,7 +19443,8 @@ func (q *sqlQuerier) GetWorkspacesByTemplateID(ctx context.Context, templateID u
const getWorkspacesEligibleForTransition = `-- name: GetWorkspacesEligibleForTransition :many
SELECT
workspaces.id,
workspaces.name
workspaces.name,
workspace_builds.template_version_id as build_template_version_id
FROM
workspaces
LEFT JOIN
@ -19562,8 +19563,9 @@ WHERE
`
type GetWorkspacesEligibleForTransitionRow struct {
ID uuid.UUID `db:"id" json:"id"`
Name string `db:"name" json:"name"`
ID uuid.UUID `db:"id" json:"id"`
Name string `db:"name" json:"name"`
BuildTemplateVersionID uuid.NullUUID `db:"build_template_version_id" json:"build_template_version_id"`
}
func (q *sqlQuerier) GetWorkspacesEligibleForTransition(ctx context.Context, now time.Time) ([]GetWorkspacesEligibleForTransitionRow, error) {
@ -19575,7 +19577,7 @@ func (q *sqlQuerier) GetWorkspacesEligibleForTransition(ctx context.Context, now
var items []GetWorkspacesEligibleForTransitionRow
for rows.Next() {
var i GetWorkspacesEligibleForTransitionRow
if err := rows.Scan(&i.ID, &i.Name); err != nil {
if err := rows.Scan(&i.ID, &i.Name, &i.BuildTemplateVersionID); err != nil {
return nil, err
}
items = append(items, i)

View File

@ -642,7 +642,8 @@ FROM pending_workspaces, building_workspaces, running_workspaces, failed_workspa
-- name: GetWorkspacesEligibleForTransition :many
SELECT
workspaces.id,
workspaces.name
workspaces.name,
workspace_builds.template_version_id as build_template_version_id
FROM
workspaces
LEFT JOIN

View File

@ -71,12 +71,12 @@ func (c *Cache) registerMetrics(registerer prometheus.Registerer) *Cache {
Help: "The count of file references currently open in the file cache. Multiple references can be held for the same file.",
})
c.totalOpenFileReferences = f.NewCounter(prometheus.CounterOpts{
c.totalOpenFileReferences = f.NewCounterVec(prometheus.CounterOpts{
Namespace: "coderd",
Subsystem: subsystem,
Name: "open_file_refs_total",
Help: "The total number of file references ever opened in the file cache.",
})
Help: "The total number of file references ever opened in the file cache. The 'hit' label indicates if the file was loaded from the cache.",
}, []string{"hit"})
return c
}
@ -97,7 +97,7 @@ type Cache struct {
type cacheMetrics struct {
currentOpenFileReferences prometheus.Gauge
totalOpenFileReferences prometheus.Counter
totalOpenFileReferences *prometheus.CounterVec
currentOpenFiles prometheus.Gauge
totalOpenedFiles prometheus.Counter
@ -173,6 +173,7 @@ func (c *Cache) prepare(ctx context.Context, db database.Store, fileID uuid.UUID
c.lock.Lock()
defer c.lock.Unlock()
hitLabel := "true"
entry, ok := c.data[fileID]
if !ok {
value := lazy.NewWithError(func() (CacheEntryValue, error) {
@ -194,10 +195,11 @@ func (c *Cache) prepare(ctx context.Context, db database.Store, fileID uuid.UUID
c.data[fileID] = entry
c.currentOpenFiles.Inc()
c.totalOpenedFiles.Inc()
hitLabel = "false"
}
c.currentOpenFileReferences.Inc()
c.totalOpenFileReferences.Inc()
c.totalOpenFileReferences.WithLabelValues(hitLabel).Inc()
entry.refCount++
return entry.value
}

View File

@ -161,7 +161,9 @@ func TestConcurrency(t *testing.T) {
require.Equal(t, batches, promhelp.GaugeValue(t, reg, cachePromMetricName("open_files_current"), nil))
require.Equal(t, batches, promhelp.CounterValue(t, reg, cachePromMetricName("open_files_total"), nil))
require.Equal(t, batches*batchSize, promhelp.GaugeValue(t, reg, cachePromMetricName("open_file_refs_current"), nil))
require.Equal(t, batches*batchSize, promhelp.CounterValue(t, reg, cachePromMetricName("open_file_refs_total"), nil))
hit, miss := promhelp.CounterValue(t, reg, cachePromMetricName("open_file_refs_total"), prometheus.Labels{"hit": "false"}),
promhelp.CounterValue(t, reg, cachePromMetricName("open_file_refs_total"), prometheus.Labels{"hit": "true"})
require.Equal(t, batches*batchSize, hit+miss)
}
func TestRelease(t *testing.T) {
@ -245,7 +247,6 @@ func TestRelease(t *testing.T) {
// Total counts remain
require.Equal(t, batches*fileSize, promhelp.CounterValue(t, reg, cachePromMetricName("open_files_size_bytes_total"), nil))
require.Equal(t, batches, promhelp.CounterValue(t, reg, cachePromMetricName("open_files_total"), nil))
require.Equal(t, batches*batchSize, promhelp.CounterValue(t, reg, cachePromMetricName("open_file_refs_total"), nil))
}
func cacheAuthzSetup(t *testing.T) (database.Store, *files.Cache, *coderdtest.RecordingAuthorizer) {