Skip to content

Commit d0b2183

Browse files
Track bundle resource state sizes in telemetry (direct engine)
Adds a `resources_metadata` field to the bundle deploy telemetry event with, per resource type, the count and the max/mean/median state size in bytes, plus the whole state file size. Only direct deploys are measured, and collection does no marshalling, file read, or JSON parsing of its own. The direct engine already serializes each resource's state during the deploy and reconstructs it via WAL replay in Finalize; ExportStateFromData now records each entry's len(state) on the ResourceState it returns. deployCore stashes that finalized state on b.Metrics, and telemetry reads the per-resource sizes straight off the in-memory map (grouping by config.GetResourceTypeFromKey). The whole-file size comes from a single os.Stat (no read/parse). Terraform stores state differently and is not collected (the field is absent there). Because the metadata is direct-only it diverges across the DATABRICKS_BUNDLE_ENGINE test matrix, so the telemetry/deploy test captures it in a per-engine out.resources_metadata.$DATABRICKS_BUNDLE_ENGINE.txt (terraform: null) and omits it from the engine-agnostic out.telemetry.txt. Per-resource sizes are deterministic for a fixed config and asserted exactly (no redaction). Only state_file_size_bytes is dropped from the golden: it is os.Stat of resources.json whose header embeds the CLI version string, which differs in length between linux/macos (0.0.0-dev+<sha>) and windows (0.0.0-dev). It is still emitted in real telemetry. The universe proto (resources_metadata, BundleResourcesMetadata, ResourceMetadata) is already merged, so this is ingested rather than dropped. Co-authored-by: Isaac
1 parent f725fb5 commit d0b2183

11 files changed

Lines changed: 251 additions & 3 deletions

File tree

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
{
2+
"state_engine": "direct",
3+
"resources": [
4+
{
5+
"resource_type": "jobs",
6+
"count": 3,
7+
"state_size_max_bytes": 256,
8+
"state_size_mean_bytes": 254,
9+
"state_size_median_bytes": 254
10+
},
11+
{
12+
"resource_type": "pipelines",
13+
"count": 2,
14+
"state_size_max_bytes": 205,
15+
"state_size_mean_bytes": 205,
16+
"state_size_median_bytes": 205
17+
}
18+
]
19+
}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
null

acceptance/bundle/telemetry/deploy/script

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,19 @@ trace cat out.requests.txt | jq 'select(has("path") and .path == "/telemetry-ext
88
title "Assert that mutator execution times are being recorded"
99
trace cat telemetry.json | jq ' .entry.databricks_cli_log.bundle_deploy_event.experimental.bundle_mutator_execution_time_ms | length > 0'
1010

11+
# resources_metadata is only emitted for direct deploys (it is derived from the
12+
# direct engine's state), so it diverges across the DATABRICKS_BUNDLE_ENGINE
13+
# matrix. Capture it in a per-engine file (terraform produces "null") and omit
14+
# it from the engine-agnostic out.telemetry.txt below. Per-resource sizes are
15+
# deterministic for a fixed config and asserted exactly. state_file_size_bytes
16+
# is dropped from the golden because it is os.Stat of resources.json, whose
17+
# header embeds the CLI version string (0.0.0-dev+<sha> on linux/macos vs
18+
# 0.0.0-dev on windows) — it is still emitted in real telemetry.
19+
cat telemetry.json | jq '.entry.databricks_cli_log.bundle_deploy_event.resources_metadata | if . then del(.state_file_size_bytes) else . end' > out.resources_metadata.$DATABRICKS_BUNDLE_ENGINE.txt
20+
1121
# bundle_mutator_execution_time_ms can have variable number of entries depending upon the runtime of the mutators. Thus we omit it from
1222
# being asserted here.
13-
cat telemetry.json | jq 'del(.entry.databricks_cli_log.bundle_deploy_event.experimental.bundle_mutator_execution_time_ms)' > out.telemetry.txt
23+
cat telemetry.json | jq 'del(.entry.databricks_cli_log.bundle_deploy_event.experimental.bundle_mutator_execution_time_ms, .entry.databricks_cli_log.bundle_deploy_event.resources_metadata)' > out.telemetry.txt
1424

1525
cmd_exec_id=$(extract_command_exec_id.py)
1626
deployment_id=$(cat .databricks/bundle/default/deployment.json | jq -r .id)

bundle/bundle.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import (
1818
"github.com/databricks/cli/bundle/direct"
1919
"github.com/databricks/cli/bundle/env"
2020
"github.com/databricks/cli/bundle/metadata"
21+
"github.com/databricks/cli/bundle/statemgmt/resourcestate"
2122
"github.com/databricks/cli/libs/auth"
2223
"github.com/databricks/cli/libs/cache"
2324
"github.com/databricks/cli/libs/fileset"
@@ -55,6 +56,12 @@ type Metrics struct {
5556
PythonUpdatedResourcesCount int64
5657
ExecutionTimes []protos.IntMapEntry
5758
LocalCacheMeasurementsMs []protos.IntMapEntry // Local cache measurements stored as milliseconds
59+
60+
// ResourceState is the direct engine's per-resource deployment state
61+
// captured right after the deploy. It carries each resource's state-size in
62+
// bytes so deploy telemetry can be derived without re-reading or re-parsing
63+
// the state file. Nil for terraform deploys.
64+
ResourceState resourcestate.ExportedResourcesMap
5865
}
5966

6067
// SetBoolValue sets the value of a boolean metric.

bundle/direct/dstate/state.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -451,8 +451,9 @@ func ExportStateFromData(data Database) resourcestate.ExportedResourcesMap {
451451
}
452452

453453
result[key] = resourcestate.ResourceState{
454-
ID: entry.ID,
455-
ETag: etag,
454+
ID: entry.ID,
455+
ETag: etag,
456+
StateSizeBytes: len(entry.State),
456457
}
457458
}
458459
return result

bundle/phases/deploy.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,10 @@ func deployCore(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan, ta
8383
if targetEngine.IsDirect() {
8484
b.DeploymentBundle.Apply(ctx, b.WorkspaceClient(ctx), plan, direct.MigrateMode(false))
8585
state, err = b.DeploymentBundle.StateDB.Finalize(ctx)
86+
// Capture the finalized state for deploy telemetry. It carries each
87+
// resource's state-size in bytes (from the WAL replay Finalize just
88+
// did), so telemetry needs no extra read or parse of the state file.
89+
b.Metrics.ResourceState = state
8690
} else {
8791
bundle.ApplyContext(ctx, b, terraform.Apply())
8892
state, err = terraform.ParseResourcesState(ctx, b)
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
package phases
2+
3+
import (
4+
"context"
5+
"os"
6+
"slices"
7+
8+
"github.com/databricks/cli/bundle"
9+
"github.com/databricks/cli/bundle/config"
10+
"github.com/databricks/cli/bundle/statemgmt/resourcestate"
11+
"github.com/databricks/cli/libs/log"
12+
"github.com/databricks/cli/libs/telemetry/protos"
13+
)
14+
15+
// directEngine is the only engine for which resources_metadata is populated.
16+
const directEngine = "direct"
17+
18+
// collectResourcesMetadata builds a BundleResourcesMetadata from the per-resource
19+
// state sizes captured during the deploy.
20+
//
21+
// Only direct deploys are measured. b.Metrics.ResourceState is the direct
22+
// engine's finalized state, populated in deployCore from the WAL replay the
23+
// deploy already performs; each entry carries StateSizeBytes (len of the JSON
24+
// blob stored in resources.json). So no marshalling, file read, or JSON parsing
25+
// happens here — sizes are read straight off the in-memory map. The whole-file
26+
// size comes from a single os.Stat (no parse). Returns nil for terraform
27+
// deploys (ResourceState is nil) and when no resources are in state.
28+
func collectResourcesMetadata(ctx context.Context, b *bundle.Bundle) *protos.BundleResourcesMetadata {
29+
state := b.Metrics.ResourceState
30+
if len(state) == 0 {
31+
return nil
32+
}
33+
34+
resources := resourceMetadataFromState(state)
35+
if len(resources) == 0 {
36+
return nil
37+
}
38+
39+
return &protos.BundleResourcesMetadata{
40+
StateEngine: directEngine,
41+
StateFileSizeBytes: directStateFileSize(ctx, b),
42+
Resources: resources,
43+
}
44+
}
45+
46+
// resourceMetadataFromState groups the deployment state by resource type and
47+
// computes per-type count and max/mean/median state size. Sizes are sorted per
48+
// type (needed for the median).
49+
func resourceMetadataFromState(state resourcestate.ExportedResourcesMap) []protos.ResourceMetadata {
50+
sizesByType := make(map[string][]int64)
51+
for key, rs := range state {
52+
t := config.GetResourceTypeFromKey(key)
53+
if t == "" {
54+
continue
55+
}
56+
sizesByType[t] = append(sizesByType[t], int64(rs.StateSizeBytes))
57+
}
58+
59+
types := make([]string, 0, len(sizesByType))
60+
for t := range sizesByType {
61+
types = append(types, t)
62+
}
63+
slices.Sort(types)
64+
65+
resources := make([]protos.ResourceMetadata, 0, len(types))
66+
for _, t := range types {
67+
sizes := sizesByType[t]
68+
slices.Sort(sizes)
69+
resources = append(resources, protos.ResourceMetadata{
70+
ResourceType: t,
71+
Count: int64(len(sizes)),
72+
StateSizeMaxBytes: statMax(sizes),
73+
StateSizeMeanBytes: statMean(sizes),
74+
StateSizeMedianBytes: statMedian(sizes),
75+
})
76+
}
77+
return resources
78+
}
79+
80+
// statMax/statMean/statMedian operate on a slice already sorted ascending.
81+
func statMax(sortedSizes []int64) int64 {
82+
return sortedSizes[len(sortedSizes)-1]
83+
}
84+
85+
func statMean(sortedSizes []int64) int64 {
86+
var total int64
87+
for _, s := range sortedSizes {
88+
total += s
89+
}
90+
return total / int64(len(sortedSizes))
91+
}
92+
93+
func statMedian(sortedSizes []int64) int64 {
94+
return sortedSizes[(len(sortedSizes)-1)/2]
95+
}
96+
97+
// directStateFileSize returns the size in bytes of the direct engine's
98+
// resources.json via a single stat (no read/parse), or 0 if it can't be stat'd.
99+
func directStateFileSize(ctx context.Context, b *bundle.Bundle) int64 {
100+
_, localPath := b.StateFilenameDirect(ctx)
101+
info, err := os.Stat(localPath)
102+
if err != nil {
103+
log.Debugf(ctx, "resources-metadata telemetry: cannot stat direct state at %s: %s", localPath, err)
104+
return 0
105+
}
106+
return info.Size()
107+
}
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
package phases
2+
3+
import (
4+
"testing"
5+
6+
"github.com/databricks/cli/bundle"
7+
"github.com/databricks/cli/bundle/statemgmt/resourcestate"
8+
"github.com/databricks/cli/libs/telemetry/protos"
9+
"github.com/stretchr/testify/assert"
10+
)
11+
12+
func TestResourceMetadataFromState_GroupsByType(t *testing.T) {
13+
state := resourcestate.ExportedResourcesMap{
14+
"resources.jobs.foo": {StateSizeBytes: 20},
15+
"resources.jobs.bar": {StateSizeBytes: 10},
16+
"resources.jobs.foo.permissions": {StateSizeBytes: 2},
17+
"resources.pipelines.qux": {StateSizeBytes: 14},
18+
}
19+
20+
got := resourceMetadataFromState(state)
21+
22+
// Sorted by resource type. Sub-resources (permissions) group under
23+
// "<parent>.permissions" per config.GetResourceTypeFromKey. jobs median is
24+
// the lower-middle of sorted [10,20] -> index (2-1)/2 = 0 -> 10.
25+
assert.Equal(t, []protos.ResourceMetadata{
26+
{ResourceType: "jobs", Count: 2, StateSizeMaxBytes: 20, StateSizeMeanBytes: 15, StateSizeMedianBytes: 10},
27+
{ResourceType: "jobs.permissions", Count: 1, StateSizeMaxBytes: 2, StateSizeMeanBytes: 2, StateSizeMedianBytes: 2},
28+
{ResourceType: "pipelines", Count: 1, StateSizeMaxBytes: 14, StateSizeMeanBytes: 14, StateSizeMedianBytes: 14},
29+
}, got)
30+
}
31+
32+
func TestStatHelpers(t *testing.T) {
33+
assert.Equal(t, int64(3), statMax([]int64{1, 2, 3}))
34+
assert.Equal(t, int64(2), statMean([]int64{1, 2, 3}))
35+
assert.Equal(t, int64(2), statMedian([]int64{1, 2, 3}))
36+
// Lower-middle for even count: sorted [1,2,3,4] -> index (4-1)/2 = 1 -> 2.
37+
assert.Equal(t, int64(2), statMedian([]int64{1, 2, 3, 4}))
38+
}
39+
40+
func TestResourceMetadataFromState_SkipsNonResourceKeys(t *testing.T) {
41+
state := resourcestate.ExportedResourcesMap{
42+
"resources.jobs.foo": {StateSizeBytes: 5},
43+
"bogus": {StateSizeBytes: 99},
44+
}
45+
got := resourceMetadataFromState(state)
46+
assert.Equal(t, []protos.ResourceMetadata{
47+
{ResourceType: "jobs", Count: 1, StateSizeMaxBytes: 5, StateSizeMeanBytes: 5, StateSizeMedianBytes: 5},
48+
}, got)
49+
}
50+
51+
func TestCollectResourcesMetadata_NilWhenNoState(t *testing.T) {
52+
// Terraform deploys leave Metrics.ResourceState nil.
53+
b := &bundle.Bundle{}
54+
assert.Nil(t, collectResourcesMetadata(t.Context(), b))
55+
}

bundle/phases/telemetry.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,8 @@ func LogDeployTelemetry(ctx context.Context, b *bundle.Bundle, errMsg string) {
202202
ResourceClusterIDs: clusterIds,
203203
ResourceDashboardIDs: dashboardIds,
204204

205+
ResourcesMetadata: collectResourcesMetadata(ctx, b),
206+
205207
Experimental: &protos.BundleDeployExperimental{
206208
BundleMode: mode,
207209
ConfigurationFileCount: b.Metrics.ConfigurationFileCount,

bundle/statemgmt/resourcestate/resourcestate.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,11 @@ type ResourceState struct {
77

88
// For dashboards
99
ETag string
10+
11+
// Size in bytes of the resource's serialized state blob. Populated by the
12+
// direct engine (len of the JSON stored in resources.json) for deploy
13+
// telemetry; left zero by the terraform path.
14+
StateSizeBytes int
1015
}
1116

1217
// ExportedResourcesMap stores relevant attributes from terraform/direct state for all resources

0 commit comments

Comments
 (0)