Skip to content

Commit d16c208

Browse files
Track bundle resource state sizes in telemetry (direct engine)
Adds a `resources_metadata` field to the bundle deploy telemetry event with, per resource type, the count and the max/mean/median state size in bytes, plus the whole state file size. Only direct deploys are measured. The direct engine persists each resource's state as a JSON blob in resources.json, so sizes are read off directly as len(entry.State) after the deploy — no serialization or config-walking happens at telemetry time. Terraform stores state differently and is not collected (the field is absent there). This keeps collection trivial and cheap. The feature is one isolated module (bundle/phases/resources_metadata.go) plus one line at the telemetry-emission call site. Telemetry never fails a deploy: a missing or unparseable state file is logged at debug level and treated as no data. The universe proto (resources_metadata, BundleResourcesMetadata, ResourceMetadata) is already merged, so this is ingested rather than dropped. Co-authored-by: Isaac
1 parent e92f999 commit d16c208

5 files changed

Lines changed: 341 additions & 1 deletion

File tree

acceptance/bundle/telemetry/deploy/script

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,10 @@ trace cat telemetry.json | jq ' .entry.databricks_cli_log.bundle_deploy_event.ex
1010

1111
# bundle_mutator_execution_time_ms can have variable number of entries depending upon the runtime of the mutators. Thus we omit it from
1212
# being asserted here.
13-
cat telemetry.json | jq 'del(.entry.databricks_cli_log.bundle_deploy_event.experimental.bundle_mutator_execution_time_ms)' > out.telemetry.txt
13+
# resources_metadata is only emitted for direct deploys (it reads the direct
14+
# engine's resources.json), so it diverges across the DATABRICKS_BUNDLE_ENGINE
15+
# matrix; omit it here to keep this golden engine-agnostic.
16+
cat telemetry.json | jq 'del(.entry.databricks_cli_log.bundle_deploy_event.experimental.bundle_mutator_execution_time_ms, .entry.databricks_cli_log.bundle_deploy_event.resources_metadata)' > out.telemetry.txt
1417

1518
cmd_exec_id=$(extract_command_exec_id.py)
1619
deployment_id=$(cat .databricks/bundle/default/deployment.json | jq -r .id)
Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
package phases
2+
3+
import (
4+
"context"
5+
"encoding/json"
6+
"errors"
7+
"io/fs"
8+
"os"
9+
"slices"
10+
"strings"
11+
12+
"github.com/databricks/cli/bundle"
13+
"github.com/databricks/cli/bundle/config/engine"
14+
"github.com/databricks/cli/bundle/direct/dstate"
15+
"github.com/databricks/cli/libs/log"
16+
"github.com/databricks/cli/libs/telemetry/protos"
17+
)
18+
19+
// collectResourcesMetadata builds a BundleResourcesMetadata from the direct
20+
// engine's deployment state file.
21+
//
22+
// Only direct deploys are measured. The direct engine persists each resource's
23+
// state as a JSON blob in resources.json, so per-resource sizes are read off
24+
// directly as len(entry.State) — no serialization or config-walking is done at
25+
// telemetry time. Terraform stores state in a different shape and is not
26+
// collected; for those deploys this returns nil.
27+
//
28+
// Telemetry must never fail a deploy, so a missing or unparseable state file is
29+
// logged at debug level and treated as no data (returns nil).
30+
func collectResourcesMetadata(ctx context.Context, b *bundle.Bundle) *protos.BundleResourcesMetadata {
31+
// resource_*_count covers both engines; this state-size metadata only
32+
// applies to the direct engine's resources.json.
33+
if resolveDeployEngine(ctx, b) != string(engine.EngineDirect) {
34+
return nil
35+
}
36+
37+
// A target must be selected to resolve the local state path.
38+
if b.Target == nil {
39+
return nil
40+
}
41+
42+
_, localPath := b.StateFilenameDirect(ctx)
43+
raw, err := os.ReadFile(localPath)
44+
if err != nil {
45+
if !errors.Is(err, fs.ErrNotExist) {
46+
log.Debugf(ctx, "resources-metadata telemetry: skipping direct state at %s: %s", localPath, err)
47+
}
48+
return nil
49+
}
50+
51+
var db dstate.Database
52+
if err := json.Unmarshal(raw, &db); err != nil {
53+
log.Debugf(ctx, "resources-metadata telemetry: failed to parse direct state: %s", err)
54+
return nil
55+
}
56+
57+
counts := make(map[string]int64)
58+
sizesByType := make(map[string][]int64)
59+
for key, entry := range db.State {
60+
t := resourceTypeFromKey(key)
61+
if t == "" {
62+
continue
63+
}
64+
counts[t]++
65+
sizesByType[t] = append(sizesByType[t], int64(len(entry.State)))
66+
}
67+
if len(counts) == 0 {
68+
return nil
69+
}
70+
71+
types := make([]string, 0, len(counts))
72+
for t := range counts {
73+
types = append(types, t)
74+
}
75+
slices.Sort(types)
76+
77+
resources := make([]protos.ResourceMetadata, 0, len(types))
78+
for _, t := range types {
79+
sizes := sizesByType[t]
80+
slices.Sort(sizes)
81+
resources = append(resources, protos.ResourceMetadata{
82+
ResourceType: t,
83+
Count: counts[t],
84+
StateSizeMaxBytes: statMax(sizes),
85+
StateSizeMeanBytes: statMean(sizes),
86+
StateSizeMedianBytes: statMedian(sizes),
87+
})
88+
}
89+
90+
return &protos.BundleResourcesMetadata{
91+
StateEngine: string(engine.EngineDirect),
92+
StateFileSizeBytes: int64(len(raw)),
93+
Resources: resources,
94+
}
95+
}
96+
97+
// resourceTypeFromKey extracts the resource type from a direct-engine state
98+
// key. Keys are "resources.<type>.<name>", or "resources.<type>.<name>.<sub>"
99+
// for sub-resources like permissions / grants / secret_acls. Sub-resources are
100+
// tracked under the sub-resource type so they aggregate across resource
101+
// families. Returns "" for keys that don't match.
102+
func resourceTypeFromKey(key string) string {
103+
parts := strings.SplitN(key, ".", 4)
104+
if len(parts) < 3 || parts[0] != "resources" {
105+
return ""
106+
}
107+
if len(parts) == 4 {
108+
return parts[3]
109+
}
110+
return parts[1]
111+
}
112+
113+
// resolveDeployEngine returns the effective deploy engine ("direct" or
114+
// "terraform"). Mirrors cmd/bundle/utils.ResolveEngineSetting but is inlined
115+
// here to avoid a layering import (bundle/phases must not depend on cmd/).
116+
func resolveDeployEngine(ctx context.Context, b *bundle.Bundle) string {
117+
if b.Config.Bundle.Engine != engine.EngineNotSet {
118+
return string(b.Config.Bundle.Engine.ThisOrDefault())
119+
}
120+
envEngine, _ := engine.FromEnv(ctx)
121+
return string(envEngine.ThisOrDefault())
122+
}
123+
124+
func statMax(sortedSizes []int64) int64 {
125+
if len(sortedSizes) == 0 {
126+
return 0
127+
}
128+
return sortedSizes[len(sortedSizes)-1]
129+
}
130+
131+
func statMean(sortedSizes []int64) int64 {
132+
if len(sortedSizes) == 0 {
133+
return 0
134+
}
135+
var total int64
136+
for _, s := range sortedSizes {
137+
total += s
138+
}
139+
return total / int64(len(sortedSizes))
140+
}
141+
142+
func statMedian(sortedSizes []int64) int64 {
143+
if len(sortedSizes) == 0 {
144+
return 0
145+
}
146+
return sortedSizes[(len(sortedSizes)-1)/2]
147+
}
Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
package phases
2+
3+
import (
4+
"encoding/json"
5+
"os"
6+
"path/filepath"
7+
"testing"
8+
9+
"github.com/databricks/cli/bundle"
10+
"github.com/databricks/cli/bundle/config"
11+
"github.com/databricks/cli/bundle/config/engine"
12+
"github.com/databricks/cli/bundle/direct/dstate"
13+
"github.com/databricks/cli/libs/env"
14+
"github.com/databricks/cli/libs/telemetry/protos"
15+
"github.com/stretchr/testify/assert"
16+
"github.com/stretchr/testify/require"
17+
)
18+
19+
func TestResourceTypeFromKey(t *testing.T) {
20+
cases := []struct {
21+
in string
22+
want string
23+
}{
24+
{"resources.jobs.foo", "jobs"},
25+
{"resources.pipelines.bar", "pipelines"},
26+
{"resources.jobs.foo.permissions", "permissions"},
27+
{"resources.secret_scopes.s.permissions", "permissions"},
28+
{"not-a-state-key", ""},
29+
{"resources.jobs", ""},
30+
}
31+
for _, c := range cases {
32+
assert.Equal(t, c.want, resourceTypeFromKey(c.in), "key=%q", c.in)
33+
}
34+
}
35+
36+
func TestResolveDeployEngine(t *testing.T) {
37+
cases := []struct {
38+
name string
39+
configEng engine.EngineType
40+
envEng string
41+
want string
42+
}{
43+
{"config wins over env", engine.EngineDirect, "terraform", "direct"},
44+
{"env used when config unset", engine.EngineNotSet, "direct", "direct"},
45+
{"default when neither set", engine.EngineNotSet, "", "terraform"},
46+
}
47+
for _, c := range cases {
48+
t.Run(c.name, func(t *testing.T) {
49+
b := &bundle.Bundle{}
50+
b.Config.Bundle.Engine = c.configEng
51+
ctx := env.Set(t.Context(), engine.EnvVar, c.envEng)
52+
assert.Equal(t, c.want, resolveDeployEngine(ctx, b))
53+
})
54+
}
55+
}
56+
57+
func TestStatHelpers(t *testing.T) {
58+
assert.Equal(t, int64(3), statMax([]int64{1, 2, 3}))
59+
assert.Equal(t, int64(2), statMean([]int64{1, 2, 3}))
60+
assert.Equal(t, int64(2), statMedian([]int64{1, 2, 3}))
61+
// Lower-middle for even count: sorted [1,2,3,4] -> index (4-1)/2 = 1 -> 2.
62+
assert.Equal(t, int64(2), statMedian([]int64{1, 2, 3, 4}))
63+
assert.Equal(t, int64(0), statMax(nil))
64+
assert.Equal(t, int64(0), statMean(nil))
65+
assert.Equal(t, int64(0), statMedian(nil))
66+
}
67+
68+
// directStateBundle writes a resources.json with the given per-key state blobs
69+
// and returns a bundle wired to read it via StateFilenameDirect.
70+
func directStateBundle(t *testing.T, state map[string]dstate.ResourceEntry) *bundle.Bundle {
71+
t.Helper()
72+
b := &bundle.Bundle{
73+
BundleRootPath: t.TempDir(),
74+
Config: config.Root{
75+
Bundle: config.Bundle{
76+
Engine: engine.EngineDirect,
77+
Target: "default",
78+
},
79+
Workspace: config.Workspace{
80+
StatePath: "/Workspace/state",
81+
},
82+
},
83+
Target: &config.Target{},
84+
}
85+
_, localPath := b.StateFilenameDirect(t.Context())
86+
require.NoError(t, os.MkdirAll(filepath.Dir(localPath), 0o755))
87+
raw, err := json.Marshal(dstate.Database{State: state})
88+
require.NoError(t, err)
89+
require.NoError(t, os.WriteFile(localPath, raw, 0o600))
90+
return b
91+
}
92+
93+
func TestCollectResourcesMetadata_GroupsByTypeFromState(t *testing.T) {
94+
b := directStateBundle(t, map[string]dstate.ResourceEntry{
95+
"resources.jobs.foo": {State: json.RawMessage(`{"name":"foo","x":1}`)}, // 20
96+
"resources.jobs.bar": {State: json.RawMessage(`{"n":"bar"}`)}, // 11
97+
"resources.jobs.foo.permissions": {State: json.RawMessage(`[]`)}, // 2
98+
"resources.pipelines.qux": {State: json.RawMessage(`{"name":"qux"}`)}, // 14
99+
})
100+
101+
md := collectResourcesMetadata(t.Context(), b)
102+
require.NotNil(t, md)
103+
assert.Equal(t, "direct", md.StateEngine)
104+
assert.Positive(t, md.StateFileSizeBytes)
105+
106+
byType := make(map[string]protos.ResourceMetadata)
107+
for _, r := range md.Resources {
108+
byType[r.ResourceType] = r
109+
}
110+
assert.Equal(t, int64(2), byType["jobs"].Count)
111+
assert.Equal(t, int64(20), byType["jobs"].StateSizeMaxBytes)
112+
assert.Equal(t, int64(15), byType["jobs"].StateSizeMeanBytes) // (20+11)/2
113+
assert.Equal(t, int64(11), byType["jobs"].StateSizeMedianBytes)
114+
assert.Equal(t, int64(1), byType["pipelines"].Count)
115+
assert.Equal(t, int64(1), byType["permissions"].Count)
116+
}
117+
118+
func TestCollectResourcesMetadata_NilForTerraform(t *testing.T) {
119+
b := directStateBundle(t, map[string]dstate.ResourceEntry{
120+
"resources.jobs.foo": {State: json.RawMessage(`{}`)},
121+
})
122+
b.Config.Bundle.Engine = engine.EngineTerraform
123+
assert.Nil(t, collectResourcesMetadata(t.Context(), b))
124+
}
125+
126+
func TestCollectResourcesMetadata_NilWhenNoStateFile(t *testing.T) {
127+
b := &bundle.Bundle{
128+
BundleRootPath: t.TempDir(),
129+
Config: config.Root{
130+
Bundle: config.Bundle{Engine: engine.EngineDirect, Target: "default"},
131+
Workspace: config.Workspace{StatePath: "/Workspace/state"},
132+
},
133+
Target: &config.Target{},
134+
}
135+
assert.Nil(t, collectResourcesMetadata(t.Context(), b))
136+
}
137+
138+
func TestCollectResourcesMetadata_NilOnMalformedState(t *testing.T) {
139+
b := &bundle.Bundle{
140+
BundleRootPath: t.TempDir(),
141+
Config: config.Root{
142+
Bundle: config.Bundle{Engine: engine.EngineDirect, Target: "default"},
143+
Workspace: config.Workspace{StatePath: "/Workspace/state"},
144+
},
145+
Target: &config.Target{},
146+
}
147+
_, localPath := b.StateFilenameDirect(t.Context())
148+
require.NoError(t, os.MkdirAll(filepath.Dir(localPath), 0o755))
149+
require.NoError(t, os.WriteFile(localPath, []byte("not json"), 0o600))
150+
assert.Nil(t, collectResourcesMetadata(t.Context(), b))
151+
}

bundle/phases/telemetry.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,8 @@ func LogDeployTelemetry(ctx context.Context, b *bundle.Bundle, errMsg string) {
202202
ResourceClusterIDs: clusterIds,
203203
ResourceDashboardIDs: dashboardIds,
204204

205+
ResourcesMetadata: collectResourcesMetadata(ctx, b),
206+
205207
Experimental: &protos.BundleDeployExperimental{
206208
BundleMode: mode,
207209
ConfigurationFileCount: b.Metrics.ConfigurationFileCount,

libs/telemetry/protos/bundle_deploy.go

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,9 @@ type BundleDeployEvent struct {
3232
ResourceClusterIDs []string `json:"resource_cluster_ids,omitempty"`
3333
ResourceDashboardIDs []string `json:"resource_dashboard_ids,omitempty"`
3434

35+
// Per-resource-type metadata (counts and state-size statistics).
36+
ResourcesMetadata *BundleResourcesMetadata `json:"resources_metadata,omitempty"`
37+
3538
Experimental *BundleDeployExperimental `json:"experimental,omitempty"`
3639
}
3740

@@ -88,6 +91,40 @@ type BundleDeployExperimental struct {
8891
LocalCacheMeasurementsMs []IntMapEntry `json:"local_cache_measurements_ms,omitempty"`
8992
}
9093

94+
// BundleResourcesMetadata mirrors the universe proto. Per-resource-type
95+
// state-size metadata for one bundle deployment.
96+
//
97+
// Only direct deploys are measured: the direct engine persists each resource's
98+
// state as a JSON blob in resources.json, so sizes are read off directly as
99+
// len(state) — no serialization happens at telemetry time. Terraform stores
100+
// state in a different shape and is not collected (the field is absent there).
101+
type BundleResourcesMetadata struct {
102+
// Always "direct"; terraform deploys do not populate this message.
103+
StateEngine string `json:"state_engine,omitempty"`
104+
105+
// Size in bytes of the direct engine's resources.json state file on disk.
106+
StateFileSizeBytes int64 `json:"state_file_size_bytes,omitempty"`
107+
108+
// One entry per resource type present in the deployment state.
109+
Resources []ResourceMetadata `json:"resources,omitempty"`
110+
}
111+
112+
// ResourceMetadata holds metadata about resources of a single type within one
113+
// bundle deployment.
114+
type ResourceMetadata struct {
115+
// Resource type name: "jobs", "pipelines", "schemas", ...
116+
ResourceType string `json:"resource_type,omitempty"`
117+
118+
// Number of resources of this type tracked in the deployment state.
119+
Count int64 `json:"count,omitempty"`
120+
121+
// State-size statistics across resources of this type, each measured as
122+
// len(state) of the JSON blob stored in resources.json.
123+
StateSizeMaxBytes int64 `json:"state_size_max_bytes,omitempty"`
124+
StateSizeMeanBytes int64 `json:"state_size_mean_bytes,omitempty"`
125+
StateSizeMedianBytes int64 `json:"state_size_median_bytes,omitempty"`
126+
}
127+
91128
type BoolMapEntry struct {
92129
Key string `json:"key,omitempty"`
93130
Value bool `json:"value"`

0 commit comments

Comments
 (0)