|
16 | 16 | #include "google/cloud/bigtable/emulator/column_family.h" |
17 | 17 | #include "google/cloud/bigtable/emulator/filter.h" |
18 | 18 | #include "google/cloud/bigtable/emulator/range_set.h" |
19 | | -#include "google/cloud/bigtable/internal/google_bytes_traits.h" |
| 19 | +#include "google/cloud/bigtable/emulator/row_streamer.h" |
20 | 20 | #include "google/cloud/internal/big_endian.h" |
21 | 21 | #include "google/cloud/internal/make_status.h" |
22 | 22 | #include "google/cloud/status.h" |
| 23 | +#include "google/cloud/status_or.h" |
23 | 24 | #include "google/protobuf/util/field_mask_util.h" |
24 | | -#include <grpc/grpc_security_constants.h> |
| 25 | +#include <google/bigtable/admin/v2/bigtable_table_admin.pb.h> |
| 26 | +#include <google/bigtable/admin/v2/table.pb.h> |
25 | 27 | #include <google/bigtable/admin/v2/types.pb.h> |
26 | 28 | #include <google/bigtable/v2/bigtable.pb.h> |
27 | 29 | #include <google/bigtable/v2/data.pb.h> |
| 30 | +#include <google/protobuf/field_mask.pb.h> |
28 | 31 | #include <absl/strings/match.h> |
29 | 32 | #include <absl/strings/str_format.h> |
30 | 33 | #include <absl/types/optional.h> |
| 34 | +#include <absl/types/variant.h> |
| 35 | +#include <grpcpp/support/sync_stream.h> |
31 | 36 | #include <re2/re2.h> |
| 37 | +#include <cassert> |
32 | 38 | #include <chrono> |
| 39 | +#include <climits> |
| 40 | +#include <cmath> |
| 41 | +#include <cstddef> |
33 | 42 | #include <cstdint> |
34 | 43 | #include <cstdlib> |
| 44 | +#include <functional> |
| 45 | +#include <iostream> |
| 46 | +#include <map> |
35 | 47 | #include <memory> |
36 | 48 | #include <mutex> |
37 | | -#include <optional> |
| 49 | +#include <ostream> |
| 50 | +#include <stack> |
38 | 51 | #include <string> |
39 | 52 | #include <type_traits> |
| 53 | +#include <utility> |
| 54 | +#include <vector> |
40 | 55 |
|
41 | 56 | namespace google { |
42 | 57 | namespace cloud { |
@@ -602,6 +617,134 @@ bool Table::IsDeleteProtectedNoLock() const { |
602 | 617 | return schema_.deletion_protection(); |
603 | 618 | } |
604 | 619 |
|
| 620 | +Status Table::SampleRowKeys( |
| 621 | + double pass_probability, |
| 622 | + grpc::ServerWriter<google::bigtable::v2::SampleRowKeysResponse>* writer) { |
| 623 | + if (pass_probability <= 0.0) { |
| 624 | + return InvalidArgumentError( |
| 625 | + "The sampling probabality must be positive", |
| 626 | + GCP_ERROR_INFO().WithMetadata("provided sampling probability", |
| 627 | + absl::StrFormat("%f", pass_probability))); |
| 628 | + } |
| 629 | + |
| 630 | + auto sample_every = |
| 631 | + static_cast<std::uint64_t>(std::ceil(1.0 / pass_probability)); |
| 632 | + |
| 633 | + std::lock_guard<std::mutex> lock(mu_); |
| 634 | + |
| 635 | + // First, stream all rows and cells and compute the offsets. |
| 636 | + auto all_rows_set = std::make_shared<StringRangeSet>(StringRangeSet::All()); |
| 637 | + auto maybe_all_rows_stream = CreateCellStream(all_rows_set, absl::nullopt); |
| 638 | + if (!maybe_all_rows_stream) { |
| 639 | + return maybe_all_rows_stream.status(); |
| 640 | + } |
| 641 | + |
| 642 | + auto& stream = *maybe_all_rows_stream; |
| 643 | + |
| 644 | + absl::optional<std::string> first_row_key; |
| 645 | + // The first row read will be used as a constant estimate of row |
| 646 | + // sizes. If we are sampling 1/n rows, the value added to the offset |
| 647 | + // (which is to be regarded as the size of all the rows before the |
| 648 | + // sampled one) will be (n * row_size_estimate). |
| 649 | + // |
| 650 | + // That is every time a row is sampled, we do: offset += (n * |
| 651 | + // row_size_estimate). |
| 652 | + std::size_t row_size_estimate = 0; |
| 653 | + |
| 654 | + for (; stream; ++stream) { |
| 655 | + if (first_row_key.has_value() && |
| 656 | + stream->row_key() != first_row_key.value()) { |
| 657 | + break; |
| 658 | + } |
| 659 | + |
| 660 | + first_row_key = stream->row_key(); |
| 661 | + |
| 662 | + row_size_estimate += stream->row_key().size(); |
| 663 | + row_size_estimate += stream->column_qualifier().size(); |
| 664 | + row_size_estimate += stream->value().size(); |
| 665 | + row_size_estimate += sizeof(stream->timestamp()); |
| 666 | + } |
| 667 | + |
| 668 | + if (!first_row_key.has_value()) { |
| 669 | + // No rows in the table |
| 670 | + google::bigtable::v2::SampleRowKeysResponse resp; |
| 671 | + resp.set_row_key(""); |
| 672 | + resp.set_offset_bytes(0); |
| 673 | + |
| 674 | + auto opts = grpc::WriteOptions(); |
| 675 | + opts.set_last_message(); |
| 676 | + |
| 677 | + writer->WriteLast(std::move(resp), opts); |
| 678 | + return Status(); |
| 679 | + } |
| 680 | + |
| 681 | + std::int64_t offset_delta = sample_every * row_size_estimate; |
| 682 | + |
| 683 | + google::bigtable::v2::RowFilter sample_filter; |
| 684 | + sample_filter.set_row_sample_filter(pass_probability); |
| 685 | + |
| 686 | + auto maybe_stream = CreateCellStream(all_rows_set, sample_filter); |
| 687 | + if (!maybe_stream) { |
| 688 | + return maybe_stream.status(); |
| 689 | + } |
| 690 | + |
| 691 | + auto& sampled_stream = *maybe_stream; |
| 692 | + |
| 693 | + std::int64_t offset = 0; |
| 694 | + |
| 695 | + bool wrote_a_sample; |
| 696 | + |
| 697 | + for (; sampled_stream; sampled_stream.Next(NextMode::kRow)) { |
| 698 | + google::bigtable::v2::SampleRowKeysResponse resp; |
| 699 | + offset += offset_delta; |
| 700 | + resp.set_row_key(sampled_stream->row_key()); |
| 701 | + resp.set_offset_bytes(offset); |
| 702 | + |
| 703 | + writer->Write(std::move(resp)); |
| 704 | + |
| 705 | + wrote_a_sample = true; |
| 706 | + } |
| 707 | + |
| 708 | + // Cloud bigtable client tests expect that, if they populated the |
| 709 | + // table with at least one row, then at least one row sample is |
| 710 | + // returned. |
| 711 | + // |
| 712 | + // In such a case, return any string that represents the last key, |
| 713 | + // and an offset that is the estimated row size * the number of rows |
| 714 | + // in the largest column family. We can return any string because |
| 715 | + // the keys returned need not be in the table. See the proto |
| 716 | + // specification. |
| 717 | + if (!wrote_a_sample) { |
| 718 | + std::size_t row_count_estimate = 0; |
| 719 | + |
| 720 | + for (auto const& cf : *get()) { |
| 721 | + if (cf.second->size() > row_count_estimate) { |
| 722 | + row_count_estimate = cf.second->size(); |
| 723 | + } |
| 724 | + } |
| 725 | + |
| 726 | + std::int64_t this_offset = row_count_estimate * row_size_estimate; |
| 727 | + |
| 728 | + google::bigtable::v2::SampleRowKeysResponse resp; |
| 729 | + resp.set_row_key("last_key"); |
| 730 | + resp.set_offset_bytes(this_offset); |
| 731 | + writer->Write(std::move(resp)); |
| 732 | + |
| 733 | + offset += this_offset; |
| 734 | + } |
| 735 | + |
| 736 | + google::bigtable::v2::SampleRowKeysResponse resp; |
| 737 | + resp.set_row_key(""); |
| 738 | + // Client test code expects offset_bytes to be strictly |
| 739 | + // increasing. |
| 740 | + resp.set_offset_bytes(offset + 1); |
| 741 | + auto opts = grpc::WriteOptions(); |
| 742 | + opts.set_last_message(); |
| 743 | + writer->WriteLast(std::move(resp), opts); |
| 744 | + |
| 745 | + return Status(); |
| 746 | +} |
| 747 | + |
605 | 748 | Status Table::DropRowRange( |
606 | 749 | ::google::bigtable::admin::v2::DropRowRangeRequest const& request) { |
607 | 750 | std::lock_guard<std::mutex> lock(mu_); |
|
0 commit comments