Skip to content

Commit cf5c230

Browse files
committed
internal/manifest: add tiering info to blob references
Previously, blob references did not record which storage tier (hot or cold) their blob file was written to. This patch adds a Tier field to both BlobReference and PhysicalBlobFile. The tier is persisted in version edits via two new tags, tagNewBlobFile2 and customTagBlobReferences3, whose encodings include the tier; the encoder now always writes these new tags, while the decoder still accepts the older tags and defaults their tier to HotTier for backward compatibility. WriterOptions.BlobReferenceTierGetter is replaced with BlobReferenceTiers, a slice indexed by blob reference ID that lives in sstableinternal.WriterOptions.
1 parent 62a7388 commit cf5c230

12 files changed

Lines changed: 123 additions & 63 deletions

File tree

internal/manifest/blob_metadata.go

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,9 @@ type BlobReference struct {
4646
// size according to the ValueSize of the blob reference relative to the
4747
// total ValueSize of the blob file.
4848
EstimatedPhysicalSize uint64
49+
// Tier indicates the storage tier (hot or cold) where the referenced blob
50+
// file was written.
51+
Tier base.StorageTier
4952
}
5053

5154
// MakeBlobReference creates a BlobReference from the given file ID, value size,
@@ -74,6 +77,7 @@ func MakeBlobReference(
7477
//
7578
// We perform the multiplication first to avoid floating point arithmetic.
7679
EstimatedPhysicalSize: (valueSize * phys.Size) / phys.ValueSize,
80+
Tier: phys.Tier,
7781
}
7882
}
7983

@@ -130,6 +134,9 @@ type PhysicalBlobFile struct {
130134
// File creation time in seconds since the epoch (1970-01-01 00:00:00
131135
// UTC).
132136
CreationTime uint64
137+
// Tier indicates the storage tier (hot or cold) where this blob file was
138+
// written.
139+
Tier base.StorageTier
133140

134141
// Mutable state
135142

@@ -151,6 +158,14 @@ func (m *PhysicalBlobFile) SafeFormat(w redact.SafePrinter, _ rune) {
151158
w.Printf("%s size:[%d (%s)] vals:[%d (%s)]",
152159
m.FileNum, redact.Safe(m.Size), humanize.Bytes.Uint64(m.Size),
153160
redact.Safe(m.ValueSize), humanize.Bytes.Uint64(m.ValueSize))
161+
// Only show tier if it's not the default (HotTier) for backward compatibility.
162+
if m.Tier != base.HotTier {
163+
tierStr := "cold"
164+
if m.Tier == base.HotTier {
165+
tierStr = "hot"
166+
}
167+
w.Printf(" tier:%s", tierStr)
168+
}
154169
}
155170

156171
// String implements fmt.Stringer.
@@ -294,6 +309,16 @@ func parsePhysicalBlobFileDebug(p *strparse.Parser) (*PhysicalBlobFile, error) {
294309
p.Expect("]")
295310
case "creationTime":
296311
m.CreationTime = p.Uint64()
312+
case "tier":
313+
tierStr := p.Next()
314+
switch tierStr {
315+
case "hot":
316+
m.Tier = base.HotTier
317+
case "cold":
318+
m.Tier = base.ColdTier
319+
default:
320+
p.Errf("unknown tier %q", tierStr)
321+
}
297322
default:
298323
p.Errf("unknown field %q", field)
299324
}

internal/manifest/testdata/version_edit_decode

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -83,8 +83,8 @@ encode
8383
excise-op: [b, c] #3
8484
----
8585
67061d000b626172000e0000000000000b666f6f010d0000000000000000
86-
45010129d8a301016baf0729d8a301f9a006006c21210a0162016301030b
87-
0537
86+
47010129d8a3010000016daf0729d8a301f9a00600006c21210a01620163
87+
01030b0537
8888
add-table: L6 000029:[bar#14,DEL-foo#13,SET] seqnums:[#0-#0] points:[bar#14,DEL-foo#13,SET] blobrefs:[(B000041: 20952); depth:1]
8989
add-blob-file: B000943 physical:{000041 size:[20952 (20KB)] vals:[102521 (100KB)]}
9090
del-blob-file: B000033 000033
@@ -110,7 +110,7 @@ encode
110110
add-backing: 000009
111111
----
112112
69096467031d000b626172000e0000000000000b666f6f010d0000000000
113-
000000420946010129d8a301e8f10101
113+
000000420947010129d8a301e8f1010001
114114
add-table: L3 000029(000009):[bar#14,DEL-foo#13,SET] seqnums:[#0-#0] points:[bar#14,DEL-foo#13,SET] blobrefs:[(B000041: 20952/30952); depth:1]
115115
add-backing: 000009
116116

@@ -128,7 +128,7 @@ encode
128128
add-table: L3 000029:[bar#14,DEL-foo#13,SET] blobrefs:[(B000041: 20952 / 30952); depth:1]
129129
----
130130
67031d000b626172000e0000000000000b666f6f010d0000000000000000
131-
45010129d8a30101
131+
47010129d8a301e8f1010001
132132
add-table: L3 000029:[bar#14,DEL-foo#13,SET] seqnums:[#0-#0] points:[bar#14,DEL-foo#13,SET] blobrefs:[(B000041: 20952/30952); depth:1]
133133

134134

internal/manifest/version_edit.go

Lines changed: 35 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ const (
6262
tagRemovedBackingTable = 106
6363
tagNewBlobFile = 107
6464
tagDeletedBlobFile = 108
65+
tagNewBlobFile2 = 109
6566

6667
// The custom tags sub-format used by tagNewFile4 and above. All tags less
6768
// than customTagNonSafeIgnoreMask are safe to ignore and their format must be
@@ -77,6 +78,8 @@ const (
7778
customTagBlobReferences = 69
7879
// customTagBlobReferences2 contains BackingValueSize for each BlobReference.
7980
customTagBlobReferences2 = 70
81+
// customTagBlobReferences3 contains BackingValueSize and Tier for each BlobReference.
82+
customTagBlobReferences3 = 71
8083
)
8184

8285
// DeletedTableEntry holds the state for a sstable deletion from a level. The
@@ -467,7 +470,7 @@ func (v *VersionEdit) Decode(r io.Reader) error {
467470
return err
468471
}
469472

470-
case customTagBlobReferences, customTagBlobReferences2:
473+
case customTagBlobReferences, customTagBlobReferences2, customTagBlobReferences3:
471474
// The first varint encodes the 'blob reference depth'
472475
// of the table.
473476
v, err := d.readUvarint()
@@ -490,16 +493,28 @@ func (v *VersionEdit) Decode(r io.Reader) error {
490493
return err
491494
}
492495
var backingValueSize uint64
493-
if customTag == customTagBlobReferences2 {
496+
if customTag == customTagBlobReferences2 || customTag == customTagBlobReferences3 {
494497
backingValueSize, err = d.readUvarint()
495498
if err != nil {
496499
return err
497500
}
498501
}
502+
var tier base.StorageTier
503+
if customTag == customTagBlobReferences3 {
504+
tierValue, err := d.readUvarint()
505+
if err != nil {
506+
return err
507+
}
508+
tier = base.StorageTier(tierValue)
509+
} else {
510+
// For backward compatibility, default to HotTier.
511+
tier = base.HotTier
512+
}
499513
blobReferences[i] = BlobReference{
500514
FileID: base.BlobFileID(fileID),
501515
ValueSize: valueSize,
502516
BackingValueSize: backingValueSize,
517+
Tier: tier,
503518
}
504519
}
505520
continue
@@ -566,7 +581,7 @@ func (v *VersionEdit) Decode(r io.Reader) error {
566581
}
567582
v.NewTables = append(v.NewTables, nfe)
568583

569-
case tagNewBlobFile:
584+
case tagNewBlobFile, tagNewBlobFile2:
570585
fileID, err := d.readUvarint()
571586
if err != nil {
572587
return err
@@ -587,13 +602,25 @@ func (v *VersionEdit) Decode(r io.Reader) error {
587602
if err != nil {
588603
return err
589604
}
605+
var tier base.StorageTier
606+
if tag == tagNewBlobFile2 {
607+
tierValue, err := d.readUvarint()
608+
if err != nil {
609+
return err
610+
}
611+
tier = base.StorageTier(tierValue)
612+
} else {
613+
// For backward compatibility with tagNewBlobFile, default to HotTier.
614+
tier = base.HotTier
615+
}
590616
v.NewBlobFiles = append(v.NewBlobFiles, BlobFileMetadata{
591617
FileID: base.BlobFileID(fileID),
592618
Physical: &PhysicalBlobFile{
593619
FileNum: base.DiskFileNum(diskFileNum),
594620
Size: size,
595621
ValueSize: valueSize,
596622
CreationTime: creationTime,
623+
Tier: tier,
597624
},
598625
})
599626

@@ -948,40 +975,27 @@ func (v *VersionEdit) Encode(w io.Writer) error {
948975
e.writeBytes(x.Meta.SyntheticPrefixAndSuffix.Suffix())
949976
}
950977
if len(x.Meta.BlobReferences) > 0 {
951-
writeBackingValueSize := false
952-
if x.Meta.Virtual {
953-
for _, ref := range x.Meta.BlobReferences {
954-
if ref.BackingValueSize > 0 && ref.BackingValueSize != ref.ValueSize {
955-
writeBackingValueSize = true
956-
break
957-
}
958-
}
959-
}
960-
if writeBackingValueSize {
961-
e.writeUvarint(customTagBlobReferences2)
962-
} else {
963-
e.writeUvarint(customTagBlobReferences)
964-
}
978+
e.writeUvarint(customTagBlobReferences3)
965979
e.writeUvarint(uint64(x.Meta.BlobReferenceDepth))
966980
e.writeUvarint(uint64(len(x.Meta.BlobReferences)))
967981
for _, ref := range x.Meta.BlobReferences {
968982
e.writeUvarint(uint64(ref.FileID))
969983
e.writeUvarint(ref.ValueSize)
970-
if writeBackingValueSize {
971-
e.writeUvarint(ref.BackingValueSize)
972-
}
984+
e.writeUvarint(ref.BackingValueSize)
985+
e.writeUvarint(uint64(ref.Tier))
973986
}
974987
}
975988
e.writeUvarint(customTagTerminate)
976989
}
977990
}
978991
for _, x := range v.NewBlobFiles {
979-
e.writeUvarint(tagNewBlobFile)
992+
e.writeUvarint(tagNewBlobFile2)
980993
e.writeUvarint(uint64(x.FileID))
981994
e.writeUvarint(uint64(x.Physical.FileNum))
982995
e.writeUvarint(x.Physical.Size)
983996
e.writeUvarint(x.Physical.ValueSize)
984997
e.writeUvarint(x.Physical.CreationTime)
998+
e.writeUvarint(uint64(x.Physical.Tier))
985999
}
9861000
for x := range v.DeletedBlobFiles {
9871001
e.writeUvarint(tagDeletedBlobFile)

internal/sstableinternal/options.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,4 +33,11 @@ type WriterOptions struct {
3333
// in order. It is intended for use only in the construction of invalid
3434
// sstables for testing. See tool/make_test_sstables.go.
3535
DisableKeyOrderChecks bool
36+
37+
// BlobReferenceTiers provides the storage tier (hot or cold) for each blob
38+
// reference ID. Used when WriteTieringHistograms is true to categorize blob
39+
// references by tier. The tier for reference ID i is BlobReferenceTiers[i].
40+
// When a value exists in both tiers (hot-and-cold), there are two separate
41+
// blob references with different IDs and tiers.
42+
BlobReferenceTiers []base.StorageTier
3643
}

replay/testdata/corpus/simple_val_sep

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ simple_val_sep:
101101
stat simple_val_sep/MANIFEST-000013 simple_val_sep/000015.sst simple_val_sep/000016.blob
102102
----
103103
simple_val_sep/MANIFEST-000013:
104-
size: 250
104+
size: 259
105105
simple_val_sep/000015.sst:
106106
size: 792
107107
simple_val_sep/000016.blob:

replay/testdata/replay_val_sep

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,23 +15,23 @@ tree
1515
792 000015.sst
1616
97 000016.blob
1717
0 LOCK
18-
152 MANIFEST-000010
19-
250 MANIFEST-000013
18+
158 MANIFEST-000010
19+
259 MANIFEST-000013
2020
2942 OPTIONS-000002
2121
0 marker.format-version.000011.024
2222
0 marker.manifest.000003.MANIFEST-000013
2323
simple_val_sep/
2424
792 000015.sst
2525
97 000016.blob
26-
250 MANIFEST-000013
26+
259 MANIFEST-000013
2727
checkpoint/
2828
819 000005.sst
2929
101 000006.blob
3030
798 000008.sst
3131
97 000009.blob
3232
11 000011.log
3333
687 000012.sst
34-
187 MANIFEST-000013
34+
193 MANIFEST-000013
3535
2942 OPTIONS-000002
3636
0 marker.format-version.000001.024
3737
0 marker.manifest.000001.MANIFEST-000013

sstable/colblk_writer.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -549,16 +549,17 @@ func (w *RawColumnWriter) addWithBlobHandleInternal(
549549
if secondaryHandle != nil {
550550
// Dual-tier case: track hot-and-cold blob reference bytes.
551551
w.tieringHistogramBlock.AddHotAndColdBlobRefBytes(uint64(hotHandle.ValueLen))
552-
} else if w.opts.BlobReferenceTierGetter != nil {
552+
} else if w.opts.internal.BlobReferenceTiers != nil {
553553
// Single-tier case: track by tier.
554554
var kindAndTier tieredmeta.KindAndTier
555-
switch w.opts.BlobReferenceTierGetter(hotHandle.ReferenceID) {
555+
tier := w.opts.internal.BlobReferenceTiers[hotHandle.ReferenceID]
556+
switch tier {
556557
case base.HotTier:
557558
kindAndTier = tieredmeta.SSTableBlobReferenceHotBytes
558559
case base.ColdTier:
559560
kindAndTier = tieredmeta.SSTableBlobReferenceColdBytes
560561
default:
561-
panic(errors.AssertionFailedf("unexpected tier %s", w.opts.BlobReferenceTierGetter(hotHandle.ReferenceID)))
562+
panic(errors.AssertionFailedf("unexpected tier %s", tier))
562563
}
563564
w.tieringHistogramBlock.Add(kindAndTier, meta.TieringSpanID, meta.TieringAttribute, uint64(hotHandle.ValueLen))
564565
}

sstable/options.go

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -245,18 +245,6 @@ type WriterOptions struct {
245245
// sstable. Requires TieringSpanIDGetter and TieringAttributeExtractor to be set.
246246
WriteTieringHistograms bool
247247

248-
// BlobReferenceTierGetter returns the storage tier (hot or cold) for a blob
249-
// reference ID. Used when WriteTieringHistograms is true to categorize blob
250-
// references by tier. When a value exists in both tiers (hot-and-cold), there
251-
// are two separate blob references with different IDs, each returning its own
252-
// tier.
253-
//
254-
// TODO(annie): This is temporary. Once each BlobReference in sstable metadata
255-
// stores the tier (hot or cold) it was written to, we won't need this getter.
256-
// The tier should be set based on which writer (hot or cold) created the blob
257-
// file, and stored in both the BlobReference and PhysicalBlobFile metadata.
258-
BlobReferenceTierGetter func(base.BlobReferenceID) base.StorageTier
259-
260248
// TieringThreshold is the tiering attribute threshold used to categorize keys
261249
// as below or above threshold in the histogram summary.
262250
TieringThreshold base.TieringAttribute

sstable/test_utils.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ type ParsedKVOrSpan struct {
109109
//
110110
// For blob values:
111111
// - Single-tier: Only BlobHandle is set. The tier (hot/cold) is determined
112-
// by BlobReferenceTierGetter based on the blob file's reference ID.
112+
// by sstableinternal.WriterOptions.BlobReferenceTiers[BlobHandle.ReferenceID].
113113
// - Dual-tier: Both BlobHandle and SecondaryBlobHandle are set, representing
114114
// a value that exists in both tiers simultaneously (typically hot + cold).
115115
Value []byte

sstable/writer_test.go

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -657,15 +657,6 @@ func TestWriterWithTieringHistogram(t *testing.T) {
657657
return 0, nil
658658
}
659659

660-
// BlobReferenceTierGetter determines which tier a blob reference is in.
661-
// For testing, odd reference IDs are hot, even are cold.
662-
blobReferenceTierGetter := func(refID base.BlobReferenceID) base.StorageTier {
663-
if refID%2 == 1 {
664-
return base.HotTier
665-
}
666-
return base.ColdTier
667-
}
668-
669660
opts := &WriterOptions{
670661
BlockSize: blockSize,
671662
Comparer: testkeys.Comparer,
@@ -676,7 +667,20 @@ func TestWriterWithTieringHistogram(t *testing.T) {
676667
DisableValueBlocks: true,
677668
}
678669
if hasBlobHandles {
679-
opts.BlobReferenceTierGetter = blobReferenceTierGetter
670+
// Build BlobReferenceTiers for testing. For test simplicity, we
671+
// use the pattern: odd reference IDs are hot, even are cold.
672+
// We allocate enough space for up to 100 references.
673+
tiers := make([]base.StorageTier, 100)
674+
for i := range tiers {
675+
if i%2 == 1 {
676+
tiers[i] = base.HotTier
677+
} else {
678+
tiers[i] = base.ColdTier
679+
}
680+
}
681+
opts.SetInternal(sstableinternal.WriterOptions{
682+
BlobReferenceTiers: tiers,
683+
})
680684
}
681685

682686
meta, r, err = runBuildCmd(td, opts, nil /* cacheHandle */)
@@ -703,10 +707,6 @@ func TestWriterWithTieringHistogram(t *testing.T) {
703707
})
704708
}
705709

706-
// TODO(annie): Once the read path is ready, add a unit test that retrieves
707-
// values using the secondary blob handle (for dual-tier blob values) and
708-
// validates that we get the same result as using the primary handle.
709-
710710
func asciiOrHex(b []byte) string {
711711
if bytes.ContainsFunc(b, func(r rune) bool { return r < ' ' || r > '~' }) {
712712
return fmt.Sprintf("hex:%x", b)

0 commit comments

Comments
 (0)