Skip to content

Commit 41cc73a

Browse files
Track bundle resource state sizes in telemetry (direct engine)
Adds a `resources_metadata` field to the bundle deploy telemetry event with, per resource type, the count and the max/mean/median state size in bytes, plus the whole state file size. Only direct deploys are measured, and collection does no marshalling, file read, or JSON parsing of its own. The direct engine already serializes each resource's state during the deploy and reconstructs it via WAL replay in Finalize; ExportStateFromData now records each entry's len(state) on the ResourceState it returns. deployCore stashes that finalized state on b.Metrics, and telemetry reads the per-resource sizes straight off the in-memory map. The whole-file size comes from a single os.Stat (no read/parse). Terraform stores state differently and is not collected (the field is absent there). Because the metadata is direct-only it diverges across the DATABRICKS_BUNDLE_ENGINE test matrix, so the telemetry/deploy test captures it in a per-engine out.resources_metadata.$DATABRICKS_BUNDLE_ENGINE.txt (terraform: null) and omits it from the engine-agnostic out.telemetry.txt. Byte sizes are redacted to SMALL_INT (they shift with the SDK struct shape); counts are asserted exactly. The grouping logic is also unit-tested. The universe proto (resources_metadata, BundleResourcesMetadata, ResourceMetadata) is already merged, so this is ingested rather than dropped. Co-authored-by: Isaac
1 parent e92f999 commit 41cc73a

12 files changed

Lines changed: 311 additions & 3 deletions

File tree

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
{
2+
"state_engine": "direct",
3+
"state_file_size_bytes": SMALL_INT,
4+
"resources": [
5+
{
6+
"resource_type": "jobs",
7+
"count": 3,
8+
"state_size_max_bytes": SMALL_INT,
9+
"state_size_mean_bytes": SMALL_INT,
10+
"state_size_median_bytes": SMALL_INT
11+
},
12+
{
13+
"resource_type": "pipelines",
14+
"count": 2,
15+
"state_size_max_bytes": SMALL_INT,
16+
"state_size_mean_bytes": SMALL_INT,
17+
"state_size_median_bytes": SMALL_INT
18+
}
19+
]
20+
}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
null

acceptance/bundle/telemetry/deploy/script

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,16 @@ trace cat out.requests.txt | jq 'select(has("path") and .path == "/telemetry-ext
88
title "Assert that mutator execution times are being recorded"
99
trace cat telemetry.json | jq ' .entry.databricks_cli_log.bundle_deploy_event.experimental.bundle_mutator_execution_time_ms | length > 0'
1010

11+
# resources_metadata is only emitted for direct deploys (it is derived from the
12+
# direct engine's state), so it diverges across the DATABRICKS_BUNDLE_ENGINE
13+
# matrix. Capture it in a per-engine file (terraform produces "null") and omit
14+
# it from the engine-agnostic out.telemetry.txt below. Byte sizes are redacted
15+
# to SMALL_INT via test.toml since they shift with the SDK struct shape.
16+
cat telemetry.json | jq '.entry.databricks_cli_log.bundle_deploy_event.resources_metadata' > out.resources_metadata.$DATABRICKS_BUNDLE_ENGINE.txt
17+
1118
# bundle_mutator_execution_time_ms can have variable number of entries depending upon the runtime of the mutators. Thus we omit it from
1219
# being asserted here.
13-
cat telemetry.json | jq 'del(.entry.databricks_cli_log.bundle_deploy_event.experimental.bundle_mutator_execution_time_ms)' > out.telemetry.txt
20+
cat telemetry.json | jq 'del(.entry.databricks_cli_log.bundle_deploy_event.experimental.bundle_mutator_execution_time_ms, .entry.databricks_cli_log.bundle_deploy_event.resources_metadata)' > out.telemetry.txt
1421

1522
cmd_exec_id=$(extract_command_exec_id.py)
1623
deployment_id=$(cat .databricks/bundle/default/deployment.json | jq -r .id)
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# Redact the resources_metadata byte sizes captured in
2+
# out.resources_metadata.$DATABRICKS_BUNDLE_ENGINE.txt. Exact counts depend on
3+
# the SDK struct shape and would make the golden brittle; counts and the
4+
# per-type structure are still asserted exactly.
5+
[[Repls]]
6+
Old = '"state_file_size_bytes": \d+'
7+
New = '"state_file_size_bytes": SMALL_INT'
8+
9+
[[Repls]]
10+
Old = '"state_size_max_bytes": \d+'
11+
New = '"state_size_max_bytes": SMALL_INT'
12+
13+
[[Repls]]
14+
Old = '"state_size_mean_bytes": \d+'
15+
New = '"state_size_mean_bytes": SMALL_INT'
16+
17+
[[Repls]]
18+
Old = '"state_size_median_bytes": \d+'
19+
New = '"state_size_median_bytes": SMALL_INT'

bundle/bundle.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import (
1818
"github.com/databricks/cli/bundle/direct"
1919
"github.com/databricks/cli/bundle/env"
2020
"github.com/databricks/cli/bundle/metadata"
21+
"github.com/databricks/cli/bundle/statemgmt/resourcestate"
2122
"github.com/databricks/cli/libs/auth"
2223
"github.com/databricks/cli/libs/cache"
2324
"github.com/databricks/cli/libs/fileset"
@@ -55,6 +56,12 @@ type Metrics struct {
5556
PythonUpdatedResourcesCount int64
5657
ExecutionTimes []protos.IntMapEntry
5758
LocalCacheMeasurementsMs []protos.IntMapEntry // Local cache measurements stored as milliseconds
59+
60+
// ResourceState is the direct engine's per-resource deployment state
61+
// captured right after the deploy. It carries each resource's state-size in
62+
// bytes so deploy telemetry can be derived without re-reading or re-parsing
63+
// the state file. Nil for terraform deploys.
64+
ResourceState resourcestate.ExportedResourcesMap
5865
}
5966

6067
// SetBoolValue sets the value of a boolean metric.

bundle/direct/dstate/state.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -451,8 +451,9 @@ func ExportStateFromData(data Database) resourcestate.ExportedResourcesMap {
451451
}
452452

453453
result[key] = resourcestate.ResourceState{
454-
ID: entry.ID,
455-
ETag: etag,
454+
ID: entry.ID,
455+
ETag: etag,
456+
StateSizeBytes: len(entry.State),
456457
}
457458
}
458459
return result

bundle/phases/deploy.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,10 @@ func deployCore(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan, ta
8383
if targetEngine.IsDirect() {
8484
b.DeploymentBundle.Apply(ctx, b.WorkspaceClient(ctx), plan, direct.MigrateMode(false))
8585
state, err = b.DeploymentBundle.StateDB.Finalize(ctx)
86+
// Capture the finalized state for deploy telemetry. It carries each
87+
// resource's state-size in bytes (from the WAL replay Finalize just
88+
// did), so telemetry needs no extra read or parse of the state file.
89+
b.Metrics.ResourceState = state
8690
} else {
8791
bundle.ApplyContext(ctx, b, terraform.Apply())
8892
state, err = terraform.ParseResourcesState(ctx, b)
Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
package phases
2+
3+
import (
4+
"context"
5+
"os"
6+
"slices"
7+
"strings"
8+
9+
"github.com/databricks/cli/bundle"
10+
"github.com/databricks/cli/bundle/statemgmt/resourcestate"
11+
"github.com/databricks/cli/libs/log"
12+
"github.com/databricks/cli/libs/telemetry/protos"
13+
)
14+
15+
// directEngine is the only engine for which resources_metadata is populated.
16+
const directEngine = "direct"
17+
18+
// collectResourcesMetadata builds a BundleResourcesMetadata from the per-resource
19+
// state sizes captured during the deploy.
20+
//
21+
// Only direct deploys are measured. b.Metrics.ResourceState is the direct
22+
// engine's finalized state, populated in deployCore from the WAL replay the
23+
// deploy already performs; each entry carries StateSizeBytes (len of the JSON
24+
// blob stored in resources.json). So no marshalling, file read, or JSON parsing
25+
// happens here — sizes are read straight off the in-memory map. The whole-file
26+
// size comes from a single os.Stat (no parse). Returns nil for terraform
27+
// deploys (ResourceState is nil) and when no resources are in state.
28+
func collectResourcesMetadata(ctx context.Context, b *bundle.Bundle) *protos.BundleResourcesMetadata {
29+
state := b.Metrics.ResourceState
30+
if len(state) == 0 {
31+
return nil
32+
}
33+
34+
resources := resourceMetadataFromState(state)
35+
if len(resources) == 0 {
36+
return nil
37+
}
38+
39+
return &protos.BundleResourcesMetadata{
40+
StateEngine: directEngine,
41+
StateFileSizeBytes: directStateFileSize(ctx, b),
42+
Resources: resources,
43+
}
44+
}
45+
46+
// resourceMetadataFromState groups the deployment state by resource type and
47+
// computes per-type count and size statistics from each entry's StateSizeBytes.
48+
func resourceMetadataFromState(state resourcestate.ExportedResourcesMap) []protos.ResourceMetadata {
49+
counts := make(map[string]int64)
50+
sizesByType := make(map[string][]int64)
51+
for key, rs := range state {
52+
t := resourceTypeFromKey(key)
53+
if t == "" {
54+
continue
55+
}
56+
counts[t]++
57+
sizesByType[t] = append(sizesByType[t], int64(rs.StateSizeBytes))
58+
}
59+
60+
types := make([]string, 0, len(counts))
61+
for t := range counts {
62+
types = append(types, t)
63+
}
64+
slices.Sort(types)
65+
66+
resources := make([]protos.ResourceMetadata, 0, len(types))
67+
for _, t := range types {
68+
sizes := sizesByType[t]
69+
slices.Sort(sizes)
70+
resources = append(resources, protos.ResourceMetadata{
71+
ResourceType: t,
72+
Count: counts[t],
73+
StateSizeMaxBytes: statMax(sizes),
74+
StateSizeMeanBytes: statMean(sizes),
75+
StateSizeMedianBytes: statMedian(sizes),
76+
})
77+
}
78+
return resources
79+
}
80+
81+
// directStateFileSize returns the size in bytes of the direct engine's
82+
// resources.json via a single stat (no read/parse), or 0 if it can't be stat'd.
83+
func directStateFileSize(ctx context.Context, b *bundle.Bundle) int64 {
84+
_, localPath := b.StateFilenameDirect(ctx)
85+
info, err := os.Stat(localPath)
86+
if err != nil {
87+
log.Debugf(ctx, "resources-metadata telemetry: cannot stat direct state at %s: %s", localPath, err)
88+
return 0
89+
}
90+
return info.Size()
91+
}
92+
93+
// resourceTypeFromKey extracts the resource type from a direct-engine state
94+
// key. Keys are "resources.<type>.<name>", or "resources.<type>.<name>.<sub>"
95+
// for sub-resources like permissions / grants / secret_acls. Sub-resources are
96+
// tracked under the sub-resource type so they aggregate across resource
97+
// families. Returns "" for keys that don't match.
98+
func resourceTypeFromKey(key string) string {
99+
parts := strings.SplitN(key, ".", 4)
100+
if len(parts) < 3 || parts[0] != "resources" {
101+
return ""
102+
}
103+
if len(parts) == 4 {
104+
return parts[3]
105+
}
106+
return parts[1]
107+
}
108+
109+
func statMax(sortedSizes []int64) int64 {
110+
if len(sortedSizes) == 0 {
111+
return 0
112+
}
113+
return sortedSizes[len(sortedSizes)-1]
114+
}
115+
116+
func statMean(sortedSizes []int64) int64 {
117+
if len(sortedSizes) == 0 {
118+
return 0
119+
}
120+
var total int64
121+
for _, s := range sortedSizes {
122+
total += s
123+
}
124+
return total / int64(len(sortedSizes))
125+
}
126+
127+
func statMedian(sortedSizes []int64) int64 {
128+
if len(sortedSizes) == 0 {
129+
return 0
130+
}
131+
return sortedSizes[(len(sortedSizes)-1)/2]
132+
}
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
package phases
2+
3+
import (
4+
"testing"
5+
6+
"github.com/databricks/cli/bundle"
7+
"github.com/databricks/cli/bundle/statemgmt/resourcestate"
8+
"github.com/databricks/cli/libs/telemetry/protos"
9+
"github.com/stretchr/testify/assert"
10+
)
11+
12+
func TestResourceTypeFromKey(t *testing.T) {
13+
cases := []struct {
14+
in string
15+
want string
16+
}{
17+
{"resources.jobs.foo", "jobs"},
18+
{"resources.pipelines.bar", "pipelines"},
19+
{"resources.jobs.foo.permissions", "permissions"},
20+
{"resources.secret_scopes.s.permissions", "permissions"},
21+
{"not-a-state-key", ""},
22+
{"resources.jobs", ""},
23+
}
24+
for _, c := range cases {
25+
assert.Equal(t, c.want, resourceTypeFromKey(c.in), "key=%q", c.in)
26+
}
27+
}
28+
29+
func TestStatHelpers(t *testing.T) {
30+
assert.Equal(t, int64(3), statMax([]int64{1, 2, 3}))
31+
assert.Equal(t, int64(2), statMean([]int64{1, 2, 3}))
32+
assert.Equal(t, int64(2), statMedian([]int64{1, 2, 3}))
33+
// Lower-middle for even count: sorted [1,2,3,4] -> index (4-1)/2 = 1 -> 2.
34+
assert.Equal(t, int64(2), statMedian([]int64{1, 2, 3, 4}))
35+
assert.Equal(t, int64(0), statMax(nil))
36+
assert.Equal(t, int64(0), statMean(nil))
37+
assert.Equal(t, int64(0), statMedian(nil))
38+
}
39+
40+
func TestResourceMetadataFromState_GroupsByType(t *testing.T) {
41+
state := resourcestate.ExportedResourcesMap{
42+
"resources.jobs.foo": {StateSizeBytes: 20},
43+
"resources.jobs.bar": {StateSizeBytes: 10},
44+
"resources.jobs.foo.permissions": {StateSizeBytes: 2},
45+
"resources.pipelines.qux": {StateSizeBytes: 14},
46+
}
47+
48+
got := resourceMetadataFromState(state)
49+
50+
// Sorted by resource type: jobs, permissions, pipelines.
51+
assert.Equal(t, []protos.ResourceMetadata{
52+
{ResourceType: "jobs", Count: 2, StateSizeMaxBytes: 20, StateSizeMeanBytes: 15, StateSizeMedianBytes: 10},
53+
{ResourceType: "permissions", Count: 1, StateSizeMaxBytes: 2, StateSizeMeanBytes: 2, StateSizeMedianBytes: 2},
54+
{ResourceType: "pipelines", Count: 1, StateSizeMaxBytes: 14, StateSizeMeanBytes: 14, StateSizeMedianBytes: 14},
55+
}, got)
56+
}
57+
58+
func TestResourceMetadataFromState_SkipsNonResourceKeys(t *testing.T) {
59+
state := resourcestate.ExportedResourcesMap{
60+
"resources.jobs.foo": {StateSizeBytes: 5},
61+
"bogus": {StateSizeBytes: 99},
62+
}
63+
got := resourceMetadataFromState(state)
64+
assert.Equal(t, []protos.ResourceMetadata{
65+
{ResourceType: "jobs", Count: 1, StateSizeMaxBytes: 5, StateSizeMeanBytes: 5, StateSizeMedianBytes: 5},
66+
}, got)
67+
}
68+
69+
func TestCollectResourcesMetadata_NilWhenNoState(t *testing.T) {
70+
// Terraform deploys leave Metrics.ResourceState nil.
71+
b := &bundle.Bundle{}
72+
assert.Nil(t, collectResourcesMetadata(t.Context(), b))
73+
}

bundle/phases/telemetry.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,8 @@ func LogDeployTelemetry(ctx context.Context, b *bundle.Bundle, errMsg string) {
202202
ResourceClusterIDs: clusterIds,
203203
ResourceDashboardIDs: dashboardIds,
204204

205+
ResourcesMetadata: collectResourcesMetadata(ctx, b),
206+
205207
Experimental: &protos.BundleDeployExperimental{
206208
BundleMode: mode,
207209
ConfigurationFileCount: b.Metrics.ConfigurationFileCount,

0 commit comments

Comments
 (0)