|
| 1 | +package stointegrityverifier |
| 2 | + |
| 3 | +// With sampling we can select only a subset of the blobs to visit. |
| 4 | +// https://en.wikipedia.org/wiki/Sampling_(statistics) |
| 5 | + |
| 6 | +import ( |
| 7 | + "encoding/binary" |
| 8 | + "fmt" |
| 9 | + "strconv" |
| 10 | + |
| 11 | + "github.com/function61/varasto/pkg/stotypes" |
| 12 | +) |
| 13 | + |
| 14 | +// answers whether we should visit a blob |
| 15 | +type batchSampler func(stotypes.BlobRef) bool |
| 16 | + |
| 17 | +func CreateSampler(sampleSpecificationMaybe *string) (batchSampler, error) { |
| 18 | + if sampleSpecification := sampleSpecificationMaybe; sampleSpecification != nil { |
| 19 | + // bit string like `1111` to number (`15`) |
| 20 | + num, err := strconv.ParseUint(*sampleSpecification, 2, 32) |
| 21 | + if err != nil { |
| 22 | + return nil, fmt.Errorf("invalid sampling spec. expected binary string like 01; got '%s'", *sampleSpecification) |
| 23 | + } |
| 24 | + bitCount := len(*sampleSpecification) |
| 25 | + |
| 26 | + return prefixSampler(uint32(num), uint8(bitCount)), nil |
| 27 | + } else { |
| 28 | + return func(_ stotypes.BlobRef) bool { return true }, nil |
| 29 | + } |
| 30 | +} |
| 31 | + |
| 32 | +// only accepts blob refs that start with a specific bit pattern. that means that we can accept blob refs starting with: |
| 33 | +// - `0b0` => accepts 1/2 of refs |
| 34 | +// - `0b00` => accepts 1/4 of refs |
| 35 | +// - `0b000` => accepts 1/8 of refs |
| 36 | +// - and so on... |
| 37 | +// |
| 38 | +// the interesting property of this, as opposed to something like using random sampling for acceptance is that |
| 39 | +// this is deterministic based on blob ref and thus we can integrity verify first batch today and next back next week and |
| 40 | +// we are guaranteed that the next batch won't re-visit blobs from first batch. |
| 41 | +// |
| 42 | +// let's take the 1/4 acceptance as example. we have four batches: 1) `0b00` 2) `0b01` 3) `0b10` 4) `0b11` |
| 43 | +// |
| 44 | +// we could visit those four batches in four different weeks to guaranteed visit all blobs (except those added later to earlier batches' "partitions") |
| 45 | +func prefixSampler(value uint32, bitCount uint8) batchSampler { |
| 46 | + return func(blobRef stotypes.BlobRef) bool { |
| 47 | + // counter-intuitively: use big-endian encoding to *not* filter on the very first bits of the blob ref, because |
| 48 | + // the verifier is iterating the blobs ordered on the blob ref. we don't want us to consecutively ignore a large |
| 49 | + // portion of the scan but instead we want to accept blobs uniformly over time |
| 50 | + blobRefUint32 := binary.BigEndian.Uint32(blobRef) |
| 51 | + return blobRefUint32&bitmask(bitCount) == value |
| 52 | + } |
| 53 | +} |
| 54 | + |
// bitmask returns a uint32 with the lowest n bits set:
// `1` => `0b1`
// `3` => `0b111`
// `8` => `0b11111111`
//
// n of 0 yields 0 and n of 32 or more yields all 32 bits set.
func bitmask(n uint8) uint32 {
	// shift n zero bits into the bottom of an all-ones word, then invert so exactly those n bits are ones
	return ^(^uint32(0) << n)
}
0 commit comments