Skip to content

Commit 7e85efd

Browse files
Track bundle resource state sizes in telemetry (direct engine) (#5199)
Adds a `resources_metadata` field to the bundle deploy telemetry event with, per resource type, the `count` and the max/mean/median state size in bytes, plus the whole state file size.
1 parent 8348fd8 commit 7e85efd

11 files changed

Lines changed: 251 additions & 3 deletions

File tree

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
{
2+
"state_engine": "direct",
3+
"resources": [
4+
{
5+
"resource_type": "jobs",
6+
"count": 3,
7+
"state_size_max_bytes": 256,
8+
"state_size_mean_bytes": 254,
9+
"state_size_median_bytes": 254
10+
},
11+
{
12+
"resource_type": "pipelines",
13+
"count": 2,
14+
"state_size_max_bytes": 205,
15+
"state_size_mean_bytes": 205,
16+
"state_size_median_bytes": 205
17+
}
18+
]
19+
}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
null

acceptance/bundle/telemetry/deploy/script

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,19 @@ trace cat out.requests.txt | jq 'select(has("path") and .path == "/telemetry-ext
88
title "Assert that mutator execution times are being recorded"
99
trace cat telemetry.json | jq ' .entry.databricks_cli_log.bundle_deploy_event.experimental.bundle_mutator_execution_time_ms | length > 0'
1010

11+
# resources_metadata is only emitted for direct deploys (it is derived from the
12+
# direct engine's state), so it diverges across the DATABRICKS_BUNDLE_ENGINE
13+
# matrix. Capture it in a per-engine file (terraform produces "null") and omit
14+
# it from the engine-agnostic out.telemetry.txt below. Per-resource sizes are
15+
# deterministic for a fixed config and asserted exactly. state_file_size_bytes
16+
# is dropped from the golden because it is os.Stat of resources.json, whose
17+
# header embeds the CLI version string (0.0.0-dev+<sha> on linux/macos vs
18+
# 0.0.0-dev on windows) — it is still emitted in real telemetry.
19+
cat telemetry.json | jq '.entry.databricks_cli_log.bundle_deploy_event.resources_metadata | if . then del(.state_file_size_bytes) else . end' > out.resources_metadata.$DATABRICKS_BUNDLE_ENGINE.txt
20+
1121
# bundle_mutator_execution_time_ms can have variable number of entries depending upon the runtime of the mutators. Thus we omit it from
1222
# being asserted here.
13-
cat telemetry.json | jq 'del(.entry.databricks_cli_log.bundle_deploy_event.experimental.bundle_mutator_execution_time_ms)' > out.telemetry.txt
23+
cat telemetry.json | jq 'del(.entry.databricks_cli_log.bundle_deploy_event.experimental.bundle_mutator_execution_time_ms, .entry.databricks_cli_log.bundle_deploy_event.resources_metadata)' > out.telemetry.txt
1424

1525
cmd_exec_id=$(extract_command_exec_id.py)
1626
deployment_id=$(cat .databricks/bundle/default/deployment.json | jq -r .id)

bundle/bundle.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import (
1818
"github.com/databricks/cli/bundle/direct"
1919
"github.com/databricks/cli/bundle/env"
2020
"github.com/databricks/cli/bundle/metadata"
21+
"github.com/databricks/cli/bundle/statemgmt/resourcestate"
2122
"github.com/databricks/cli/libs/auth"
2223
"github.com/databricks/cli/libs/cache"
2324
"github.com/databricks/cli/libs/fileset"
@@ -55,6 +56,12 @@ type Metrics struct {
5556
PythonUpdatedResourcesCount int64
5657
ExecutionTimes []protos.IntMapEntry
5758
LocalCacheMeasurementsMs []protos.IntMapEntry // Local cache measurements stored as milliseconds
59+
60+
// ResourceState is the direct engine's per-resource deployment state
61+
// captured right after the deploy. It carries each resource's state-size in
62+
// bytes so deploy telemetry can be derived without re-reading or re-parsing
63+
// the state file. Nil for terraform deploys.
64+
ResourceState resourcestate.ExportedResourcesMap
5865
}
5966

6067
// SetBoolValue sets the value of a boolean metric.

bundle/direct/dstate/state.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -451,8 +451,9 @@ func ExportStateFromData(data Database) resourcestate.ExportedResourcesMap {
451451
}
452452

453453
result[key] = resourcestate.ResourceState{
454-
ID: entry.ID,
455-
ETag: etag,
454+
ID: entry.ID,
455+
ETag: etag,
456+
StateSizeBytes: len(entry.State),
456457
}
457458
}
458459
return result

bundle/phases/deploy.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,10 @@ func deployCore(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan, ta
8383
if targetEngine.IsDirect() {
8484
b.DeploymentBundle.Apply(ctx, b.WorkspaceClient(ctx), plan, direct.MigrateMode(false))
8585
state, err = b.DeploymentBundle.StateDB.Finalize(ctx)
86+
// Capture the finalized state for deploy telemetry. It carries each
87+
// resource's state-size in bytes (from the WAL replay Finalize just
88+
// did), so telemetry needs no extra read or parse of the state file.
89+
b.Metrics.ResourceState = state
8690
} else {
8791
bundle.ApplyContext(ctx, b, terraform.Apply())
8892
state, err = terraform.ParseResourcesState(ctx, b)
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
package phases
2+
3+
import (
4+
"context"
5+
"os"
6+
"slices"
7+
8+
"github.com/databricks/cli/bundle"
9+
"github.com/databricks/cli/bundle/config"
10+
"github.com/databricks/cli/bundle/statemgmt/resourcestate"
11+
"github.com/databricks/cli/libs/log"
12+
"github.com/databricks/cli/libs/telemetry/protos"
13+
)
14+
15+
// directEngine is the only engine for which resources_metadata is populated.
16+
const directEngine = "direct"
17+
18+
// collectResourcesMetadata builds a BundleResourcesMetadata from the per-resource
19+
// state sizes captured during the deploy.
20+
//
21+
// Only direct deploys are measured. b.Metrics.ResourceState is the direct
22+
// engine's finalized state, populated in deployCore from the WAL replay the
23+
// deploy already performs; each entry carries StateSizeBytes (len of the JSON
24+
// blob stored in resources.json). So no marshalling, file read, or JSON parsing
25+
// happens here — sizes are read straight off the in-memory map. The whole-file
26+
// size comes from a single os.Stat (no parse). Returns nil for terraform
27+
// deploys (ResourceState is nil) and when no resources are in state.
28+
func collectResourcesMetadata(ctx context.Context, b *bundle.Bundle) *protos.BundleResourcesMetadata {
29+
state := b.Metrics.ResourceState
30+
if len(state) == 0 {
31+
return nil
32+
}
33+
34+
resources := resourceMetadataFromState(state)
35+
if len(resources) == 0 {
36+
return nil
37+
}
38+
39+
return &protos.BundleResourcesMetadata{
40+
StateEngine: directEngine,
41+
StateFileSizeBytes: directStateFileSize(ctx, b),
42+
Resources: resources,
43+
}
44+
}
45+
46+
// resourceMetadataFromState groups the deployment state by resource type and
47+
// computes per-type count and max/mean/median state size. Sizes are sorted per
48+
// type (needed for the median).
49+
func resourceMetadataFromState(state resourcestate.ExportedResourcesMap) []protos.ResourceMetadata {
50+
sizesByType := make(map[string][]int64)
51+
for key, rs := range state {
52+
t := config.GetResourceTypeFromKey(key)
53+
if t == "" {
54+
continue
55+
}
56+
sizesByType[t] = append(sizesByType[t], int64(rs.StateSizeBytes))
57+
}
58+
59+
types := make([]string, 0, len(sizesByType))
60+
for t := range sizesByType {
61+
types = append(types, t)
62+
}
63+
slices.Sort(types)
64+
65+
resources := make([]protos.ResourceMetadata, 0, len(types))
66+
for _, t := range types {
67+
sizes := sizesByType[t]
68+
slices.Sort(sizes)
69+
resources = append(resources, protos.ResourceMetadata{
70+
ResourceType: t,
71+
Count: int64(len(sizes)),
72+
StateSizeMaxBytes: statMax(sizes),
73+
StateSizeMeanBytes: statMean(sizes),
74+
StateSizeMedianBytes: statMedian(sizes),
75+
})
76+
}
77+
return resources
78+
}
79+
80+
// statMax/statMean/statMedian operate on a slice already sorted ascending.
81+
func statMax(sortedSizes []int64) int64 {
82+
return sortedSizes[len(sortedSizes)-1]
83+
}
84+
85+
func statMean(sortedSizes []int64) int64 {
86+
var total int64
87+
for _, s := range sortedSizes {
88+
total += s
89+
}
90+
return total / int64(len(sortedSizes))
91+
}
92+
93+
func statMedian(sortedSizes []int64) int64 {
94+
return sortedSizes[(len(sortedSizes)-1)/2]
95+
}
96+
97+
// directStateFileSize returns the size in bytes of the direct engine's
98+
// resources.json via a single stat (no read/parse), or 0 if it can't be stat'd.
99+
func directStateFileSize(ctx context.Context, b *bundle.Bundle) int64 {
100+
_, localPath := b.StateFilenameDirect(ctx)
101+
info, err := os.Stat(localPath)
102+
if err != nil {
103+
log.Debugf(ctx, "resources-metadata telemetry: cannot stat direct state at %s: %s", localPath, err)
104+
return 0
105+
}
106+
return info.Size()
107+
}
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
package phases
2+
3+
import (
4+
"testing"
5+
6+
"github.com/databricks/cli/bundle"
7+
"github.com/databricks/cli/bundle/statemgmt/resourcestate"
8+
"github.com/databricks/cli/libs/telemetry/protos"
9+
"github.com/stretchr/testify/assert"
10+
)
11+
12+
func TestResourceMetadataFromState_GroupsByType(t *testing.T) {
13+
state := resourcestate.ExportedResourcesMap{
14+
"resources.jobs.foo": {StateSizeBytes: 20},
15+
"resources.jobs.bar": {StateSizeBytes: 10},
16+
"resources.jobs.foo.permissions": {StateSizeBytes: 2},
17+
"resources.pipelines.qux": {StateSizeBytes: 14},
18+
}
19+
20+
got := resourceMetadataFromState(state)
21+
22+
// Sorted by resource type. Sub-resources (permissions) group under
23+
// "<parent>.permissions" per config.GetResourceTypeFromKey. jobs median is
24+
// the lower-middle of sorted [10,20] -> index (2-1)/2 = 0 -> 10.
25+
assert.Equal(t, []protos.ResourceMetadata{
26+
{ResourceType: "jobs", Count: 2, StateSizeMaxBytes: 20, StateSizeMeanBytes: 15, StateSizeMedianBytes: 10},
27+
{ResourceType: "jobs.permissions", Count: 1, StateSizeMaxBytes: 2, StateSizeMeanBytes: 2, StateSizeMedianBytes: 2},
28+
{ResourceType: "pipelines", Count: 1, StateSizeMaxBytes: 14, StateSizeMeanBytes: 14, StateSizeMedianBytes: 14},
29+
}, got)
30+
}
31+
32+
func TestStatHelpers(t *testing.T) {
33+
assert.Equal(t, int64(3), statMax([]int64{1, 2, 3}))
34+
assert.Equal(t, int64(2), statMean([]int64{1, 2, 3}))
35+
assert.Equal(t, int64(2), statMedian([]int64{1, 2, 3}))
36+
// Lower-middle for even count: sorted [1,2,3,4] -> index (4-1)/2 = 1 -> 2.
37+
assert.Equal(t, int64(2), statMedian([]int64{1, 2, 3, 4}))
38+
}
39+
40+
func TestResourceMetadataFromState_SkipsNonResourceKeys(t *testing.T) {
41+
state := resourcestate.ExportedResourcesMap{
42+
"resources.jobs.foo": {StateSizeBytes: 5},
43+
"bogus": {StateSizeBytes: 99},
44+
}
45+
got := resourceMetadataFromState(state)
46+
assert.Equal(t, []protos.ResourceMetadata{
47+
{ResourceType: "jobs", Count: 1, StateSizeMaxBytes: 5, StateSizeMeanBytes: 5, StateSizeMedianBytes: 5},
48+
}, got)
49+
}
50+
51+
func TestCollectResourcesMetadata_NilWhenNoState(t *testing.T) {
52+
// Terraform deploys leave Metrics.ResourceState nil.
53+
b := &bundle.Bundle{}
54+
assert.Nil(t, collectResourcesMetadata(t.Context(), b))
55+
}

bundle/phases/telemetry.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,8 @@ func LogDeployTelemetry(ctx context.Context, b *bundle.Bundle, errMsg string) {
202202
ResourceClusterIDs: clusterIds,
203203
ResourceDashboardIDs: dashboardIds,
204204

205+
ResourcesMetadata: collectResourcesMetadata(ctx, b),
206+
205207
Experimental: &protos.BundleDeployExperimental{
206208
BundleMode: mode,
207209
ConfigurationFileCount: b.Metrics.ConfigurationFileCount,

bundle/statemgmt/resourcestate/resourcestate.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,11 @@ type ResourceState struct {
77

88
// For dashboards
99
ETag string
10+
11+
// Size in bytes of the resource's serialized state blob. Populated by the
12+
// direct engine (len of the JSON stored in resources.json) for deploy
13+
// telemetry; left zero by the terraform path.
14+
StateSizeBytes int
1015
}
1116

1217
// ExportedResourcesMap stores relevant attributes from terraform/direct state for all resources

0 commit comments

Comments
 (0)