From b4d302f28767dda4f350b831aa1a578d91d454cb Mon Sep 17 00:00:00 2001 From: aibin Date: Thu, 23 Oct 2025 17:11:02 +0800 Subject: [PATCH 01/18] Add: finish top_k reserve and info implementation --- src/commands/cmd_topk.cc | 214 +++++++++++++++++++ src/commands/commander.h | 4 +- src/storage/redis_metadata.cc | 26 ++- src/storage/redis_metadata.h | 24 ++- src/types/redis_topk.cc | 113 ++++++++++ src/types/redis_topk.h | 65 ++++++ src/types/topk.cc | 380 ++++++++++++++++++++++++++++++++++ src/types/topk.h | 91 ++++++++ 8 files changed, 912 insertions(+), 5 deletions(-) create mode 100644 src/commands/cmd_topk.cc create mode 100644 src/types/redis_topk.cc create mode 100644 src/types/redis_topk.h create mode 100644 src/types/topk.cc create mode 100644 src/types/topk.h diff --git a/src/commands/cmd_topk.cc b/src/commands/cmd_topk.cc new file mode 100644 index 00000000000..5c138c66d3c --- /dev/null +++ b/src/commands/cmd_topk.cc @@ -0,0 +1,214 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + */ + +#include "commander.h" +#include "command_parser.h" +#include "error_constants.h" +#include "server/server.h" +#include "types/redis_topk.h" + +namespace { +constexpr const char *errBadK = "Bad K"; +constexpr const char *errBadWidth = "Bad width"; +constexpr const char *errBadDepth = "Bad depth"; +constexpr const char *errBadDecay = "Bad decay"; +constexpr const char *errInvalidDecay = "Decay must be between 0 and 1"; +} + +namespace redis { + +class CommandTopKReserve final : public Commander { + public: + Status Parse(const std::vector &args) override { + if (args.size() < 3) { + return {Status::RedisParseErr, errWrongNumOfArguments}; + } + auto parse_k = ParseInt(args[2], 10); + if (!parse_k) { + return {Status::RedisParseErr, errBadK}; + } + k_ = *parse_k; + if (args_.size() >= 4) { + auto parse_width = ParseInt(args[3], 10); + if (!parse_width) { + return {Status::RedisParseErr, errBadWidth}; + } + width_ = *parse_width; + } + if (args_.size() >= 5) { + auto parse_depth = ParseInt(args[4], 10); + if (!parse_depth) { + return {Status::RedisParseErr, errBadDepth}; + } + depth_ = *parse_depth; + } + if (args_.size() >= 6) { + auto parse_decay = ParseFloat(args[5]); + if (!parse_decay) { + return {Status::RedisParseErr, errBadDecay}; + } + decay_ = *parse_decay; + if (decay_ <= 0.0 || decay_ >= 1.0) { + return {Status::RedisParseErr, errInvalidDecay}; + } + } + if (args_.size() > 6) { + return {Status::RedisParseErr, errWrongNumOfArguments}; + } + return Status::OK(); + } + + Status Execute(engine::Context &ctx, Server *srv, Connection *conn, std::string *output) override { + redis::TopK topk(srv->storage, conn->GetNamespace()); + + auto s = topk.Reserve(ctx, args_[1], k_, width_, depth_, decay_); + if (!s.ok()) return {Status::RedisExecErr, s.ToString()}; + + *output = redis::RESP_OK; + return Status::OK(); + } + private: + uint32_t k_; + uint32_t width_ = 7; + uint32_t depth_ = 8; + double decay_ = 0.9; +}; + +class CommandTopKAdd final : public Commander { + public: + Status Execute(engine::Context &ctx, Server *srv, Connection *conn, std::string *output) override { + redis::TopK topk(srv->storage, conn->GetNamespace()); + CHECK(args_.size() == 3); + + auto s = topk.Add(ctx, args_[1], args_[2]); + if (!s.ok()) { + return {Status::RedisExecErr, s.ToString()}; + } + *output = redis::RESP_OK; + return Status::OK(); + } +}; + +class CommandTopKList final : public Commander { + public: + Status Execute(engine::Context &ctx, Server *srv, Connection *conn, std::string *output) override { + redis::TopK topk(srv->storage, conn->GetNamespace()); + CHECK(args_.size() == 2); + + std::vector items; + auto s = topk.List(ctx, args_[1], items); + if (!s.ok()) { + return {Status::RedisExecErr, s.ToString()}; + } + *output = MultiBulkString(redis::RESP::v2, items); + return Status::OK(); + } +}; + +class CommandTopKInfo final : public Commander { + public: + Status Parse(const std::vector &args) override { + if (args.size() > 3) { + return {Status::InvalidArgument, errWrongNumOfArguments}; + } + + CommandParser parser(args, 2); + if (parser.Good()) { + if (args.size() == 3) { + std::string type_str = args[2]; + if (type_str == "topk") { + type_ = TopKInfoType::kTopK; + } else if (type_str == "width") { + type_ = TopKInfoType::kWidth; + } else if (type_str == "depth") { + type_ = TopKInfoType::kDepth; + } else if (type_str == "decay") { + type_ = TopKInfoType::kDecay; + } else { + return {Status::InvalidArgument, "Invalid info type"}; + } + } + } + + return Commander::Parse(args); + } + + Status Execute(engine::Context &ctx, Server *srv, Connection *conn, [[maybe_unused]]std::string *output) override { + redis::TopK topk_db(srv->storage, conn->GetNamespace()); + TopKInfo info; + + auto s = topk_db.Info(ctx, args_[1], &info); + if (s.IsNotFound()) return {Status::RedisExecErr, "key does not exist"}; + if (!s.ok()) return {Status::RedisExecErr, s.ToString()}; + + switch (type_) { + case TopKInfoType::kAll: + *output = redis::MultiLen(2 * 4); + *output += redis::SimpleString("K"); + *output += redis::Integer(info.k); + *output += redis::SimpleString("Width"); + *output += redis::Integer(info.width); + *output += redis::SimpleString("Depth"); + *output += redis::Integer(info.depth); + *output += redis::SimpleString("Decay"); + *output += redis::Double(redis::RESP::v2, info.decay); + break; + case TopKInfoType::kTopK: + *output = redis::Integer(info.k); + break; + case TopKInfoType::kWidth: + *output = redis::Integer(info.width); + break; + case TopKInfoType::kDepth: + *output = redis::Integer(info.depth); + break; + case TopKInfoType::kDecay: + *output = redis::Double(redis::RESP::v2, info.decay); + break; + } + return Status::OK(); + } + private: + TopKInfoType type_ = TopKInfoType::kAll; +}; + +class CommandTopKQuery final : public Commander { + public: + Status Execute(engine::Context &ctx, Server *srv, Connection *conn, std::string *output) override { + redis::TopK topk(srv->storage, conn->GetNamespace()); + CHECK(args_.size() == 3); + + bool is_exists_; + auto s = topk.Query(ctx, args_[1], args_[2], &is_exists_); + if (!s.ok()) { + return {Status::RedisExecErr, s.ToString()}; + } + *output = redis::Bool(redis::RESP::v2, is_exists_); + return Status::OK(); + } +}; + +REDIS_REGISTER_COMMANDS(TopK, MakeCmdAttr("topk.add", 3, "write", 1, 1, 1), + MakeCmdAttr("topk.list", 2, "read-only", 1, 1, 1), + MakeCmdAttr("topk.info", 2, "read-only", 1, 1, 1), + MakeCmdAttr("topk.query", 3, "read-only", 1, 1, 1), + MakeCmdAttr("topk.reserve", -3, "write", 1, 1, 1)); + +} // namespace redis \ No newline at end of file diff --git a/src/commands/commander.h b/src/commands/commander.h index 3f38db02580..bf24bbac67d 100644 --- a/src/commands/commander.h +++ b/src/commands/commander.h @@ -116,9 +116,7 @@ enum class CommandCategory : uint8_t { Txn, ZSet, Timeseries, - // this is a special category for disabling commands, - // basically can be used for version releasing or debugging - Disabled, + TopK, }; class Commander { diff --git a/src/storage/redis_metadata.cc b/src/storage/redis_metadata.cc index 692f8804db1..7040776f876 100644 --- a/src/storage/redis_metadata.cc +++ b/src/storage/redis_metadata.cc @@ -334,7 +334,7 @@ bool Metadata::IsSingleKVType() const { return Type() == kRedisString || Type() bool Metadata::IsEmptyableType() const { return IsSingleKVType() || Type() == kRedisStream || Type() == kRedisBloomFilter || Type() == kRedisHyperLogLog || - Type() == kRedisTDigest || Type() == kRedisTimeSeries; + Type() == kRedisTDigest || Type() == kRedisTimeSeries || Type() == kRedisTopK; } bool Metadata::Expired() const { return ExpireAt(util::GetTimeStampMS()); } @@ -569,3 +569,27 @@ rocksdb::Status TimeSeriesMetadata::Decode(Slice *input) { return rocksdb::Status::OK(); } + +void TopKMetadata::Encode(std::string *dst) const { + Metadata::Encode(dst); + PutFixed32(dst, top_k); + PutFixed16(dst, width); + PutFixed32(dst, depth); + PutDouble(dst, decay); +} + +rocksdb::Status TopKMetadata::Decode(Slice *input) { + if (auto s = Metadata::Decode(input); !s.ok()) { + return s; + } + if (input->size() < sizeof(double) + sizeof(uint32_t) * 2 + sizeof(uint16_t)) { + return rocksdb::Status::InvalidArgument(kErrMetadataTooShort); + } + + GetFixed32(input, &top_k); + GetFixed16(input, &width); + GetFixed32(input, &depth); + GetDouble(input, &decay); + + return rocksdb::Status::OK(); +} \ No newline at end of file diff --git a/src/storage/redis_metadata.h b/src/storage/redis_metadata.h index fd80e5a5ba0..0b1ff777f8a 100644 --- a/src/storage/redis_metadata.h +++ b/src/storage/redis_metadata.h @@ -54,12 +54,14 @@ enum RedisType : uint8_t { kRedisHyperLogLog = 11, kRedisTDigest = 12, kRedisTimeSeries = 13, + kRedisTopK = 14, kRedisTypeMax }; inline constexpr const std::array RedisTypeNames = { "none", "string", "hash", "list", "set", "zset", "bitmap", - "sortedint", "stream", "MBbloom--", "ReJSON-RL", "hyperloglog", "TDIS-TYPE", "timeseries"}; + "sortedint", "stream", "MBbloom--", "ReJSON-RL", "hyperloglog", "TDIS-TYPE", "timeseries", + "topk"}; struct RedisTypes { RedisTypes(std::initializer_list list) { @@ -409,3 +411,23 @@ class TimeSeriesMetadata : public Metadata { void Encode(std::string *dst) const override; rocksdb::Status Decode(Slice *input) override; }; + +class TopKMetadata : public Metadata { +public: + uint32_t top_k; + uint16_t width; + uint32_t depth; + double decay; + + explicit TopKMetadata(bool generate_version = true) : Metadata(kRedisTopK, generate_version) {} + + TopKMetadata(uint64_t top_k, uint64_t width = 7, uint64_t depth = 8, double decay = 0.9, bool generate_version = true) + : Metadata(kRedisTopK, generate_version), + top_k(top_k), + width(width), + depth(depth), + decay(decay) {} + + void Encode(std::string *dst) const override; + rocksdb::Status Decode(Slice *input) override; +}; \ No newline at end of file diff --git a/src/types/redis_topk.cc b/src/types/redis_topk.cc new file mode 100644 index 00000000000..9d563dd96e8 --- /dev/null +++ b/src/types/redis_topk.cc @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + */ + +#include "redis_topk.h" +#include "topk.h" +#include "commands/ttl_util.h" + +namespace redis { + +rocksdb::Status TopK::Reserve(engine::Context &ctx, const Slice& user_key, + uint32_t k, uint32_t width, uint32_t depth, double decay) { + std::string ns_key = AppendNamespacePrefix(user_key); + + TopKMetadata topk_metadata; + rocksdb::Status s = getTopKMetadata(ctx, ns_key, &topk_metadata); + if (!s.ok() && !s.IsNotFound()) return s; + if (!s.IsNotFound()) { + return rocksdb::Status::InvalidArgument("TopK already exists"); + } + + return createTopK(ctx, ns_key, k, width, depth, decay, &topk_metadata); +} + +/* TODO: implemention */ +rocksdb::Status TopK::Add([[maybe_unused]] engine::Context &ctx, [[maybe_unused]] const Slice &user_key, + [[maybe_unused]] const Slice &items) { + return rocksdb::Status::OK(); +} + +/* TODO: implemention */ +rocksdb::Status TopK::Query([[maybe_unused]] engine::Context &ctx, [[maybe_unused]] const Slice& user_key, + [[maybe_unused]] const Slice &items, [[maybe_unused]] bool *exists) { + return rocksdb::Status::OK(); +} + +/* TODO: implemention */ +rocksdb::Status TopK::List([[maybe_unused]] engine::Context &ctx, [[maybe_unused]] const Slice& user_key, + [[maybe_unused]] std::vector &items) { + return rocksdb::Status::OK(); +} + +rocksdb::Status TopK::Info(engine::Context &ctx, const Slice& user_key, TopKInfo *info) { + std::string ns_key = AppendNamespacePrefix(user_key); + + TopKMetadata metadata; + auto s = getTopKMetadata(ctx, ns_key, &metadata); + if (!s.ok()) return s; + + info->k = metadata.top_k; + info->width = metadata.width; + info->depth = metadata.depth; + info->decay = metadata.decay; + + return rocksdb::Status::OK(); +} + +rocksdb::Status TopK::getTopKMetadata(engine::Context &ctx, const Slice &ns_key, TopKMetadata *metadata) { + return Database::GetMetadata(ctx, {kRedisTopK}, ns_key, metadata); +} + +rocksdb::Status TopK::createTopK(engine::Context &ctx, const Slice &ns_key, + uint32_t k, uint32_t width, uint32_t depth, double decay, + TopKMetadata *metadata) { + metadata->top_k = k; + metadata->width = width; + metadata->depth = depth; + metadata->decay = decay; + + auto block_split_top_k = CreateBlockSplitTopK(k, width, depth, decay); + + auto batch = storage_->GetWriteBatchBase(); + WriteBatchLogData log_data(kRedisTopK, {"createTopK"}); + auto s = batch->PutLogData(log_data.Encode()); + if (!s.ok()) return s; + + std::string top_k_meta_bytes; + metadata->Encode(&top_k_meta_bytes); + s = batch->Put(metadata_cf_handle_, ns_key, top_k_meta_bytes); + if (!s.ok()) return s; + + std::string tk_key = getTKKey(ns_key, *metadata); + // TODO: how to save data structure---block split topk + s = batch->Put(ns_key, block_split_top_k.GetData()); + if (!s.ok()) return s; + + return storage_->Write(ctx, storage_->DefaultWriteOptions(), batch->GetWriteBatch()); +} + +std::string TopK::getTKKey(const Slice &ns_key, const TopKMetadata &metadata) { + std::string sub_key; + + return InternalKey(ns_key, sub_key, metadata.version, storage_->IsSlotIdEncoded()).Encode(); +} + + +} // namespace redis \ No newline at end of file diff --git a/src/types/redis_topk.h b/src/types/redis_topk.h new file mode 100644 index 00000000000..ad8cca2280f --- /dev/null +++ b/src/types/redis_topk.h @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + */ + +#pragma once + +#include "storage/redis_db.h" + +namespace redis { + +enum class TopKInfoType { + kAll, + kTopK, + kWidth, + kDepth, + kDecay +}; + +struct TopKInfo { + uint32_t k; + uint32_t width; + uint32_t depth; + double decay; +}; + +class TopK : public SubKeyScanner { + public: + using Slice = rocksdb::Slice; + + explicit TopK(engine::Storage* storage, const std::string& ns) + : SubKeyScanner(storage, ns) {} + + rocksdb::Status Reserve(engine::Context &ctx, const Slice& user_key, uint32_t k, uint32_t width, uint32_t depth, double decay); + rocksdb::Status Query(engine::Context &ctx, const Slice& user_key, + const Slice &items, + bool *exists); + rocksdb::Status Add(engine::Context &ctx, const Slice &user_key, + const Slice &items); + rocksdb::Status List(engine::Context &ctx, const Slice& user_key, std::vector &items); + rocksdb::Status Info(engine::Context &ctx, const Slice& user_key, TopKInfo *info); + private: + rocksdb::Status getTopKMetadata(engine::Context &ctx, const Slice &ns_key, TopKMetadata *metadata); + rocksdb::Status createTopK(engine::Context &ctx, const Slice &ns_key, + uint32_t k, uint32_t width, uint32_t depth, double decay, + TopKMetadata *metadata); + std::string getTKKey(const Slice &ns_key, const TopKMetadata &metadata); +}; + +} // namespace redis \ No newline at end of file diff --git a/src/types/topk.cc b/src/types/topk.cc new file mode 100644 index 00000000000..1cf07a3c1f2 --- /dev/null +++ b/src/types/topk.cc @@ -0,0 +1,380 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + */ + +#include "topk.h" + +#include +#include +#include +#include +#include + +#define TOPK_HASH(item, itemlen, i) MurmurHash2(item, itemlen, i) +#define GA 1919 + +//----------------------------------------------------------------------------- +// MurmurHash2 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. + +// Note - This code makes a few assumptions about how your machine behaves - + +// 1. We can read a 4-byte value from any address without crashing +// 2. sizeof(int) == 4 + +// And it has a few limitations - + +// 1. It will not work incrementally. +// 2. It will not produce the same results on little-endian and big-endian +// machines. +#define BIG_CONSTANT(x) (x##LLU) + +//----------------------------------------------------------------------------- + +static uint32_t MurmurHash2(const void *key, int len, uint32_t seed) { + // 'm' and 'r' are mixing constants generated offline. + // They're not really 'magic', they just happen to work well. + + const uint32_t m = 0x5bd1e995; + const int r = 24; + + // Initialize the hash to a 'random' value + + uint32_t h = seed ^ len; + + // Mix 4 bytes at a time into the hash + + const unsigned char *data = (const unsigned char *)key; + + while (len >= 4) { + uint32_t k = *(uint32_t *)data; + + k *= m; + k ^= k >> r; + k *= m; + + h *= m; + h ^= k; + + data += 4; + len -= 4; + } + + // Handle the last few bytes of the input array + + switch (len) { + case 3: + h ^= data[2] << 16; + case 2: + h ^= data[1] << 8; + case 1: + h ^= data[0]; + h *= m; + }; + + // Do a few final mixes of the hash to ensure the last few + // bytes are well-incorporated. + + h ^= h >> 13; + h *= m; + h ^= h >> 15; + + return h; +} + +//----------------------------------------------------------------------------- +// MurmurHash2, 64-bit versions, by Austin Appleby + +// The same caveats as 32-bit MurmurHash2 apply here - beware of alignment +// and endian-ness issues if used across multiple platforms. + +// 64-bit hash for 64-bit platforms + +[[maybe_unused]]static uint64_t MurmurHash64A_Bloom(const void *key, int len, uint64_t seed) { + const uint64_t m = BIG_CONSTANT(0xc6a4a7935bd1e995); + const int r = 47; + + uint64_t h = seed ^ (len * m); + + const uint64_t *data = (const uint64_t *)key; + const uint64_t *end = data + (len / 8); + + while (data != end) { + uint64_t k = *data++; + + k *= m; + k ^= k >> r; + k *= m; + + h ^= k; + h *= m; + } + + const unsigned char *data2 = (const unsigned char *)data; + + switch (len & 7) { + case 7: + h ^= ((uint64_t)data2[6]) << 48; + case 6: + h ^= ((uint64_t)data2[5]) << 40; + case 5: + h ^= ((uint64_t)data2[4]) << 32; + case 4: + h ^= ((uint64_t)data2[3]) << 24; + case 3: + h ^= ((uint64_t)data2[2]) << 16; + case 2: + h ^= ((uint64_t)data2[1]) << 8; + case 1: + h ^= ((uint64_t)data2[0]); + h *= m; + }; + + h ^= h >> r; + h *= m; + h ^= h >> r; + + return h; +} + +// 64-bit hash for 32-bit platforms + +[[maybe_unused]]static uint64_t MurmurHash64B(const void *key, int len, uint64_t seed) { + const uint32_t m = 0x5bd1e995; + const int r = 24; + + uint32_t h1 = (uint32_t)(seed ^ len); + uint32_t h2 = (uint32_t)(seed >> 32); + + const uint32_t *data = (const uint32_t *)key; + + while (len >= 8) { + uint32_t k1 = *data++; + k1 *= m; + k1 ^= k1 >> r; + k1 *= m; + h1 *= m; + h1 ^= k1; + len -= 4; + + uint32_t k2 = *data++; + k2 *= m; + k2 ^= k2 >> r; + k2 *= m; + h2 *= m; + h2 ^= k2; + len -= 4; + } + + if (len >= 4) { + uint32_t k1 = *data++; + k1 *= m; + k1 ^= k1 >> r; + k1 *= m; + h1 *= m; + h1 ^= k1; + len -= 4; + } + + switch (len) { + case 3: + h2 ^= ((unsigned char *)data)[2] << 16; + case 2: + h2 ^= ((unsigned char *)data)[1] << 8; + case 1: + h2 ^= ((unsigned char *)data)[0]; + h2 *= m; + }; + + h1 ^= h2 >> 18; + h1 *= m; + h2 ^= h1 >> 22; + h2 *= m; + h1 ^= h2 >> 17; + h1 *= m; + h2 ^= h1 >> 19; + h2 *= m; + + uint64_t h = h1; + + h = (h << 32) | h2; + + return h; +} + +/* ---------------------------------------------------------------------- */ +void BlockSplitTopK::swapHeapBucket(HeapBucket& a, HeapBucket& b) { + HeapBucket temp = a; + a = b; + b = temp; +} + +void BlockSplitTopK::heapifyDown(int start) { + size_t parent = start; + + // check whether larger than children + if (heap_size < 2 || (heap_size - 2) / 2 < parent) { + return; + } + + while (parent < heap_size) { + size_t left_child = 2 * parent + 1; + size_t right_child = left_child + 1; + + // check whether left child is larger than parent + if (left_child < heap_size && heap[left_child].count < heap[parent].count) { + parent = left_child; + } + // check whether right child is larger than parent + if (right_child < heap_size && heap[right_child].count < heap[parent].count) { + parent = right_child; + } + if (parent == left_child || parent == right_child) { + swapHeapBucket(heap[(parent-1)/2], heap[parent]); + } else { + break; + } + } +} + +void BlockSplitTopK::heapifyUp(int start) { + size_t child = start; + + // check whether smaller than parent + if (heap_size < 2 || child == 0) { + return; + } + + while (child > 0) { + size_t parent = (child - 1) / 2; + + if (parent >= 0 && heap[child].count < heap[parent].count) { + swapHeapBucket(heap[parent], heap[child]); + child = parent; + } else { + break; + } + } +} + +int BlockSplitTopK::checkExistInHeap(const std::string &item) { + uint32_t itemlen = item.size(); + const char *data = item.c_str(); + for (int32_t i = heap_size - 1; i >= 0; --i) { + if (heap[i].itemlen == itemlen && memcmp(heap[i].item, data, itemlen) == 0) { + return i; + } + } + return -1; +} + +int BlockSplitTopK::cmpHeapBucketCount(const HeapBucket& a, const HeapBucket& b) { + return a.count < b.count ? 1 : a.count > b.count ? -1 : 0; +} + +bool BlockSplitTopK::cmpHeapBucketItem(const HeapBucket& a, const HeapBucket& b) { + return a.itemlen == b.itemlen && (memcmp(a.item, b.item, a.itemlen) == 0); +} + +void BlockSplitTopK::TopkAdd(const std::string &item, uint32_t increment) { + uint32_t itemlen = item.size(); + const char *data = item.c_str(); + counter_t maxCount = 0; + uint32_t fp = TOPK_HASH(data, itemlen, GA); + + int location = checkExistInHeap(item); + + for (size_t i = 0; i < depth; ++i) { + uint32_t loc = TOPK_HASH(data, itemlen, i) % width; + if (buckets[i][loc].fp == 0) { + buckets[i][loc].fp = fp; + buckets[i][loc].count = increment; + } else if (buckets[i][loc].fp == fp) { + buckets[i][loc].count += increment; + } else { + // decay + uint32_t local_incr = increment; + for (; local_incr > 0; --local_incr) { + double decay; + if (buckets[i][loc].count < TOPK_DECAY_LOOKUP_TABLE) { + decay = lookupTable[buckets[i][loc].count]; + } else { + decay = pow(lookupTable[TOPK_DECAY_LOOKUP_TABLE - 1], + (buckets[i][loc].count / (TOPK_DECAY_LOOKUP_TABLE - 1))) * + lookupTable[buckets[i][loc].count % (TOPK_DECAY_LOOKUP_TABLE - 1)]; + } + double chance = rand() / (double)RAND_MAX; + if (chance < decay) { + -- buckets[i][loc].count; + if (buckets[i][loc].count == 0) { + buckets[i][loc].fp = fp; + buckets[i][loc].count = local_incr; + break; + } + } + } + } + maxCount = std::max(maxCount, buckets[i][loc].count); + } + + if (heap.size() == heap_size) { + if (location == -1) { + if (heap[0].count == maxCount || heap[0].count + 1 == maxCount) { + heap[0].fp = fp; + heap[0].itemlen = itemlen; + heap[0].item = const_cast(data); + heap[0].count = maxCount; + + heapifyDown(0); + } + } else { + heap[location].count += 1; + heapifyDown(location); + } + } else { + heap[heap_size].fp = fp; + heap[heap_size].itemlen = itemlen; + heap[heap_size].item = const_cast(data); + heap[heap_size].count = maxCount; + + heapifyUp(heap_size); + heap_size ++; + } +} + +bool BlockSplitTopK::TopkQuery(const std::string &item) { + return checkExistInHeap(item) != -1; +} + +std::vector BlockSplitTopK::TopkList() { + std::vector result(heap.begin(), heap.begin() + heap_size); + std::sort(result.begin(), result.end(), [this] (const HeapBucket& a, const HeapBucket& b) { + return cmpHeapBucketCount(a, b) > 0; + }); + return result; +} + +std::string_view BlockSplitTopK::GetData() const { + // Serialization not implemented yet + return std::string_view(); +} + +BlockSplitTopK CreateBlockSplitTopK(uint32_t k, uint32_t width, uint32_t depth, double decay) { + return BlockSplitTopK(k, width, depth, decay); +} \ No newline at end of file diff --git a/src/types/topk.h b/src/types/topk.h new file mode 100644 index 00000000000..24b83d4e06e --- /dev/null +++ b/src/types/topk.h @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + */ + +#pragma once + +#include +#include +#include +#include +#include + +static constexpr int TOPK_DECAY_LOOKUP_TABLE = 256; + +using counter_t = uint32_t; + +struct HeapBucket { + uint32_t fp; + uint32_t itemlen; + char* item; + counter_t count; + + HeapBucket& operator=(const HeapBucket& other) { + if (this != &other) { + fp = other.fp; + itemlen = other.itemlen; + item = other.item; + count = other.count; + } + return *this; + } +}; + +struct Bucket { + uint32_t fp; + counter_t count; +}; + +class BlockSplitTopK { +public: + explicit BlockSplitTopK(uint32_t k, uint32_t width, uint32_t depth, double decay) : + k(k), width(width), depth(depth), decay(decay), heap_size(0) { + buckets.resize(depth, std::vector(width, Bucket{0, 0})); + heap.resize(k, HeapBucket{0, 0, nullptr, 0}); + for (int i = 0; i < TOPK_DECAY_LOOKUP_TABLE; ++i) { + lookupTable[i] = pow(decay, i); + } + } + + void TopkAdd(const std::string &item, uint32_t increment); + bool TopkQuery(const std::string &item); + std::vector TopkList(); + + std::string_view GetData() const; +private: + void heapifyDown(int start); + void heapifyUp(int start); + int checkExistInHeap(const std::string &item); + int cmpHeapBucketCount(const HeapBucket& a, const HeapBucket& b); + bool cmpHeapBucketItem(const HeapBucket& a, const HeapBucket& b); + void swapHeapBucket(HeapBucket& a, HeapBucket& b); + + uint32_t k; + uint32_t width; + uint32_t depth; + double decay; + + size_t heap_size; + + std::vector> buckets; + std::vector heap; + double lookupTable[TOPK_DECAY_LOOKUP_TABLE]; +}; + +BlockSplitTopK CreateBlockSplitTopK(uint32_t k, uint32_t width, uint32_t depth, double decay); \ No newline at end of file From 88048f8666262d832dd693398c4ab9d45b8c8eb9 Mon Sep 17 00:00:00 2001 From: aibin <2573214643@qq.com> Date: Wed, 29 Oct 2025 18:49:42 +0800 Subject: [PATCH 02/18] fix: topk data structure --- src/types/redis_topk.cc | 146 +++++++++++++++++++++++++++++++++++----- src/types/redis_topk.h | 15 ++++- src/types/topk.cc | 89 ++++++++++++------------ src/types/topk.h | 46 ++++++------- 4 files changed, 210 insertions(+), 86 deletions(-) diff --git a/src/types/redis_topk.cc b/src/types/redis_topk.cc index 9d563dd96e8..9569641fffc 100644 --- a/src/types/redis_topk.cc +++ b/src/types/redis_topk.cc @@ -38,21 +38,65 @@ rocksdb::Status TopK::Reserve(engine::Context &ctx, const Slice& user_key, return createTopK(ctx, ns_key, k, width, depth, decay, &topk_metadata); } -/* TODO: implemention */ -rocksdb::Status TopK::Add([[maybe_unused]] engine::Context &ctx, [[maybe_unused]] const Slice &user_key, - [[maybe_unused]] const Slice &items) { - return rocksdb::Status::OK(); +rocksdb::Status TopK::Add(engine::Context &ctx, const Slice &user_key, + const Slice &items) { + std::string ns_key = AppendNamespacePrefix(user_key); + + TopKMetadata topk_metadata; + rocksdb::Status s = getTopKMetadata(ctx, ns_key, &topk_metadata); + if (!s.ok()) return s; + + auto batch = storage_->GetWriteBatchBase(); + WriteBatchLogData log_data(kRedisTopK, {"Add"}); + s = batch->PutLogData(log_data.Encode()); + if (!s.ok()) return s; + + BlockSplitTopK topk(topk_metadata.top_k, topk_metadata.width, topk_metadata.depth, topk_metadata.decay); + s = getTopKData(ctx, ns_key, topk_metadata, &topk); + if (!s.ok()) return s; + + topk.Add(items.data_, 1); + + s = setTopkData(ctx, ns_key, topk_metadata, topk); + if (!s.ok()) return s; + + return storage_->Write(ctx, storage_->DefaultWriteOptions(), batch->GetWriteBatch()); } -/* TODO: implemention */ -rocksdb::Status TopK::Query([[maybe_unused]] engine::Context &ctx, [[maybe_unused]] const Slice& user_key, - [[maybe_unused]] const Slice &items, [[maybe_unused]] bool *exists) { +rocksdb::Status TopK::Query(engine::Context &ctx, const Slice& user_key, + const Slice &items, bool *exists) { + std::string ns_key = AppendNamespacePrefix(user_key); + + TopKMetadata topk_metadata; + rocksdb::Status s = getTopKMetadata(ctx, ns_key, &topk_metadata); + if (!s.ok()) return s; + + BlockSplitTopK topk(topk_metadata.top_k, topk_metadata.width, topk_metadata.depth, topk_metadata.decay); + s = getTopKData(ctx, ns_key, topk_metadata, &topk); + if (!s.ok()) return s; + + *exists = topk.Query(items.data_); + return rocksdb::Status::OK(); } -/* TODO: implemention */ -rocksdb::Status TopK::List([[maybe_unused]] engine::Context &ctx, [[maybe_unused]] const Slice& user_key, - [[maybe_unused]] std::vector &items) { +rocksdb::Status TopK::List(engine::Context &ctx, const Slice& user_key, + std::vector &items) { + std::string ns_key = AppendNamespacePrefix(user_key); + + TopKMetadata topk_metadata; + rocksdb::Status s = getTopKMetadata(ctx, ns_key, &topk_metadata); + if (!s.ok()) return s; + + BlockSplitTopK topk(topk_metadata.top_k, topk_metadata.width, topk_metadata.depth, topk_metadata.decay); + s = getTopKData(ctx, ns_key, topk_metadata, &topk); + if (!s.ok()) return s; + + auto heap_buckets = topk.List(); + for (auto &bucket : heap_buckets) { + items.emplace_back(bucket.item, bucket.itemlen); + } + return rocksdb::Status::OK(); } @@ -83,7 +127,7 @@ rocksdb::Status TopK::createTopK(engine::Context &ctx, const Slice &ns_key, metadata->depth = depth; metadata->decay = decay; - auto block_split_top_k = CreateBlockSplitTopK(k, width, depth, decay); + BlockSplitTopK block_split_top_k(k, width, depth, decay); auto batch = storage_->GetWriteBatchBase(); WriteBatchLogData log_data(kRedisTopK, {"createTopK"}); @@ -95,19 +139,89 @@ rocksdb::Status TopK::createTopK(engine::Context &ctx, const Slice &ns_key, s = batch->Put(metadata_cf_handle_, ns_key, top_k_meta_bytes); if (!s.ok()) return s; - std::string tk_key = getTKKey(ns_key, *metadata); - // TODO: how to save data structure---block split topk - s = batch->Put(ns_key, block_split_top_k.GetData()); + s = setTopkData(ctx, ns_key, *metadata, block_split_top_k); if (!s.ok()) return s; return storage_->Write(ctx, storage_->DefaultWriteOptions(), batch->GetWriteBatch()); } -std::string TopK::getTKKey(const Slice &ns_key, const TopKMetadata &metadata) { +rocksdb::Status TopK::getTopKData(engine::Context &ctx, const Slice& ns_key, const TopKMetadata &metadata, + BlockSplitTopK *topk) { + for (uint8_t i = 0; i < 3; i++) { + std::string tk_key = getTKKey(ns_key, metadata, i); + rocksdb::PinnableSlice pinnable_value; + rocksdb::Status s = storage_->Get(ctx, ctx.GetReadOptions(), tk_key, &pinnable_value); + if (!s.ok()) return s; + if (i == 0) { + if (pinnable_value.size() != metadata.width * metadata.depth * sizeof(Bucket)) { + return rocksdb::Status::Corruption("TopK data corrupted: buckets size mismatch"); + } + memcpy(topk->buckets, pinnable_value.data(), pinnable_value.size()); + } else if (i == 1) { + if (pinnable_value.size() != metadata.top_k * sizeof(HeapBucket)) { + return rocksdb::Status::Corruption("TopK data corrupted: heap size mismatch"); + } + memcpy(topk->heap, pinnable_value.data(), pinnable_value.size()); + for (uint32_t j = 0; j < metadata.top_k; j++) { + std::string hb_key = getHBKey(ns_key, metadata, i, j); + rocksdb::PinnableSlice hb_value; + rocksdb::Status s = storage_->Get(ctx, ctx.GetReadOptions(), hb_key, &hb_value); + if (!s.ok()) return s; + if (hb_value.size() != topk->heap[j].itemlen) { + return rocksdb::Status::Corruption("TopK data corrupted: heap bucket size mismatch"); + } + topk->heap[j].item = new char[topk->heap[j].itemlen]; + memcpy(topk->heap[j].item, hb_value.data(), hb_value.size()); + } + } else { + topk->heap_size = static_cast(std::stoul(pinnable_value.data())); + } + } + return rocksdb::Status::OK(); +} + +rocksdb::Status TopK::setTopkData(engine::Context &ctx, const Slice& ns_key, const TopKMetadata &metadata, + const BlockSplitTopK &topk) { + auto batch = storage_->GetWriteBatchBase(); + WriteBatchLogData log_data(kRedisTopK, {"setTopkData"}); + rocksdb::Status s = batch->PutLogData(log_data.Encode()); + if (!s.ok()) return s; + + for (uint8_t i = 0; i < 3; i++) { + std::string tk_key = getTKKey(ns_key, metadata, i); + std::string tk_value; + if (i == 0) { + tk_value.assign(reinterpret_cast(topk.buckets), metadata.width * metadata.depth * sizeof(Bucket)); + } else if (i == 1) { + tk_value.assign(reinterpret_cast(topk.heap), metadata.top_k * sizeof(HeapBucket)); + for (uint32_t j = 0; j < metadata.top_k; j++) { + std::string hb_key = getHBKey(ns_key, metadata, i, j); + std::string hb_value(topk.heap[j].item, topk.heap[j].itemlen); + s = batch->Put(hb_key, hb_value); + if (!s.ok()) return s; + } + } else { + tk_value = std::to_string(topk.heap_size); + } + rocksdb::Status s = batch->Put(tk_key, tk_value); + if (!s.ok()) return s; + } + + return storage_->Write(ctx, storage_->DefaultWriteOptions(), batch->GetWriteBatch()); +} + +std::string TopK::getTKKey(const Slice &ns_key, const TopKMetadata &metadata, uint8_t index) { std::string sub_key; + PutFixed8(&sub_key, index); + std::string bf_key = InternalKey(ns_key, sub_key, metadata.version, storage_->IsSlotIdEncoded()).Encode(); + return bf_key; +} +std::string TopK::getHBKey(const Slice &ns_key, const TopKMetadata &metadata, uint8_t topk_index, uint32_t hp_index) { + std::string sub_key; + PutFixed8(&sub_key, topk_index); + PutFixed32(&sub_key, hp_index); return InternalKey(ns_key, sub_key, metadata.version, storage_->IsSlotIdEncoded()).Encode(); } - } // namespace redis \ No newline at end of file diff --git a/src/types/redis_topk.h b/src/types/redis_topk.h index ad8cca2280f..2759a0436d6 100644 --- a/src/types/redis_topk.h +++ b/src/types/redis_topk.h @@ -21,6 +21,7 @@ #pragma once #include "storage/redis_db.h" +#include "topk.h" namespace redis { @@ -46,7 +47,8 @@ class TopK : public SubKeyScanner { explicit TopK(engine::Storage* storage, const std::string& ns) : SubKeyScanner(storage, ns) {} - rocksdb::Status Reserve(engine::Context &ctx, const Slice& user_key, uint32_t k, uint32_t width, uint32_t depth, double decay); + rocksdb::Status Reserve(engine::Context &ctx, const Slice& user_key, uint32_t k, + uint32_t width, uint32_t depth, double decay); rocksdb::Status Query(engine::Context &ctx, const Slice& user_key, const Slice &items, bool *exists); @@ -54,12 +56,21 @@ class TopK : public SubKeyScanner { const Slice &items); rocksdb::Status List(engine::Context &ctx, const Slice& user_key, std::vector &items); rocksdb::Status Info(engine::Context &ctx, const Slice& user_key, TopKInfo *info); + private: rocksdb::Status getTopKMetadata(engine::Context &ctx, const Slice &ns_key, TopKMetadata *metadata); rocksdb::Status createTopK(engine::Context &ctx, const Slice &ns_key, uint32_t k, uint32_t width, uint32_t depth, double decay, TopKMetadata *metadata); - std::string getTKKey(const Slice &ns_key, const TopKMetadata &metadata); + + rocksdb::Status getTopKData(engine::Context &ctx, const Slice& ns_key, const TopKMetadata &metadata, + BlockSplitTopK *topk); + rocksdb::Status setTopkData(engine::Context &ctx, const Slice& ns_key, const TopKMetadata &metadata, + const BlockSplitTopK &topk); + + std::string getTKKey(const Slice &ns_key, const TopKMetadata &metadata, uint8_t index); + + std::string getHBKey(const Slice &ns_key, const TopKMetadata &metadata, uint8_t topk_index, uint32_t hp_index); }; } // namespace redis \ No newline at end of file diff --git a/src/types/topk.cc b/src/types/topk.cc index 1cf07a3c1f2..54daeaec51c 100644 --- a/src/types/topk.cc +++ b/src/types/topk.cc @@ -219,10 +219,17 @@ static uint32_t MurmurHash2(const void *key, int len, uint32_t seed) { } /* ---------------------------------------------------------------------- */ -void BlockSplitTopK::swapHeapBucket(HeapBucket& a, HeapBucket& b) { - HeapBucket temp = a; - a = b; - b = temp; +void BlockSplitTopK::swapHeapBucket(HeapBucket *a, HeapBucket* b) { + HeapBucket tmp = *a; + a->count = b->count; + a->fp = b->fp; + a->itemlen = b->itemlen; + a->item = b->item; + + b->count = tmp.count; + b->fp = tmp.fp; + b->itemlen = tmp.itemlen; + b->item = tmp.item; } void BlockSplitTopK::heapifyDown(int start) { @@ -246,7 +253,7 @@ void BlockSplitTopK::heapifyDown(int start) { parent = right_child; } if (parent == left_child || parent == right_child) { - swapHeapBucket(heap[(parent-1)/2], heap[parent]); + swapHeapBucket(&heap[(parent-1)/2], &heap[parent]); } else { break; } @@ -265,7 +272,7 @@ void BlockSplitTopK::heapifyUp(int start) { size_t parent = (child - 1) / 2; if (parent >= 0 && heap[child].count < heap[parent].count) { - swapHeapBucket(heap[parent], heap[child]); + swapHeapBucket(&heap[parent], &heap[child]); child = parent; } else { break; @@ -284,15 +291,11 @@ int BlockSplitTopK::checkExistInHeap(const std::string &item) { return -1; } -int BlockSplitTopK::cmpHeapBucketCount(const HeapBucket& a, const HeapBucket& b) { +int BlockSplitTopK::cmpHeapBucketCount(const HeapBucket &a, const HeapBucket &b) { return a.count < b.count ? 1 : a.count > b.count ? -1 : 0; } -bool BlockSplitTopK::cmpHeapBucketItem(const HeapBucket& a, const HeapBucket& b) { - return a.itemlen == b.itemlen && (memcmp(a.item, b.item, a.itemlen) == 0); -} - -void BlockSplitTopK::TopkAdd(const std::string &item, uint32_t increment) { +void BlockSplitTopK::Add(const std::string &item, uint32_t increment) { uint32_t itemlen = item.size(); const char *data = item.c_str(); counter_t maxCount = 0; @@ -302,55 +305,61 @@ void BlockSplitTopK::TopkAdd(const std::string &item, uint32_t increment) { for (size_t i = 0; i < depth; ++i) { uint32_t loc = TOPK_HASH(data, itemlen, i) % width; - if (buckets[i][loc].fp == 0) { - buckets[i][loc].fp = fp; - buckets[i][loc].count = increment; - } else if (buckets[i][loc].fp == fp) { - buckets[i][loc].count += increment; + + loc += i * width; + if (buckets[loc].count == 0) { + buckets[loc].fp = fp; + buckets[loc].count = increment; + } else if (buckets[loc].fp == fp && location != -1) { + buckets[loc].count += increment; } else { // decay uint32_t local_incr = increment; for (; local_incr > 0; --local_incr) { double decay; - if (buckets[i][loc].count < TOPK_DECAY_LOOKUP_TABLE) { - decay = lookupTable[buckets[i][loc].count]; + if (buckets[loc].count < TOPK_DECAY_LOOKUP_TABLE) { + decay = lookupTable[buckets[loc].count]; } else { decay = pow(lookupTable[TOPK_DECAY_LOOKUP_TABLE - 1], - (buckets[i][loc].count / (TOPK_DECAY_LOOKUP_TABLE - 1))) * - lookupTable[buckets[i][loc].count % (TOPK_DECAY_LOOKUP_TABLE - 1)]; + (buckets[loc].count / (TOPK_DECAY_LOOKUP_TABLE - 1))) * + lookupTable[buckets[loc].count % (TOPK_DECAY_LOOKUP_TABLE - 1)]; } double chance = rand() / (double)RAND_MAX; if (chance < decay) { - -- buckets[i][loc].count; - if (buckets[i][loc].count == 0) { - buckets[i][loc].fp = fp; - buckets[i][loc].count = local_incr; + -- buckets[loc].count; + if (buckets[loc].count == 0) { + buckets[loc].fp = fp; + buckets[loc].count = 1; break; } } } } - maxCount = std::max(maxCount, buckets[i][loc].count); + maxCount = std::max(maxCount, buckets[loc].count); } - if (heap.size() == heap_size) { + if (k == heap_size) { if (location == -1) { if (heap[0].count == maxCount || heap[0].count + 1 == maxCount) { heap[0].fp = fp; heap[0].itemlen = itemlen; - heap[0].item = const_cast(data); + delete heap[0].item; + heap[0].item = new char[itemlen]; + memcpy(heap[0].item, data, itemlen); + heap[0].count = maxCount; heapifyDown(0); } } else { - heap[location].count += 1; + heap[location].count += increment; heapifyDown(location); } } else { heap[heap_size].fp = fp; heap[heap_size].itemlen = itemlen; - heap[heap_size].item = const_cast(data); + heap[heap_size].item = new char[itemlen]; + memcpy(heap[heap_size].item, data, itemlen); heap[heap_size].count = maxCount; heapifyUp(heap_size); @@ -358,23 +367,17 @@ void BlockSplitTopK::TopkAdd(const std::string &item, uint32_t increment) { } } -bool BlockSplitTopK::TopkQuery(const std::string &item) { +bool BlockSplitTopK::Query(const std::string &item) { return checkExistInHeap(item) != -1; } -std::vector BlockSplitTopK::TopkList() { - std::vector result(heap.begin(), heap.begin() + heap_size); - std::sort(result.begin(), result.end(), [this] (const HeapBucket& a, const HeapBucket& b) { +std::vector BlockSplitTopK::List() { + std::vector result(heap_size); + for (uint32_t i = 0; i < heap_size; i ++) { + result[i] = heap[i]; + } + std::sort(result.begin(), result.end(), [this] (const HeapBucket &a, const HeapBucket &b) { return cmpHeapBucketCount(a, b) > 0; }); return result; -} - -std::string_view BlockSplitTopK::GetData() const { - // Serialization not implemented yet - return std::string_view(); -} - -BlockSplitTopK CreateBlockSplitTopK(uint32_t k, uint32_t width, uint32_t depth, double decay) { - return BlockSplitTopK(k, width, depth, decay); } \ No newline at end of file diff --git a/src/types/topk.h b/src/types/topk.h index 24b83d4e06e..c0e3374e21e 100644 --- a/src/types/topk.h +++ b/src/types/topk.h @@ -35,16 +35,6 @@ struct HeapBucket { uint32_t itemlen; char* item; counter_t count; - - HeapBucket& operator=(const HeapBucket& other) { - if (this != &other) { - fp = other.fp; - itemlen = other.itemlen; - item = other.item; - count = other.count; - } - return *this; - } }; struct Bucket { @@ -54,27 +44,35 @@ struct Bucket { class BlockSplitTopK { public: + BlockSplitTopK() = delete; explicit BlockSplitTopK(uint32_t k, uint32_t width, uint32_t depth, double decay) : k(k), width(width), depth(depth), decay(decay), heap_size(0) { - buckets.resize(depth, std::vector(width, Bucket{0, 0})); - heap.resize(k, HeapBucket{0, 0, nullptr, 0}); + buckets = new Bucket[width * depth]; + heap = new HeapBucket[k]; + std::fill_n(buckets, width * depth, Bucket{0, 0}); + std::fill_n(heap, k, HeapBucket{0, 0, nullptr, 0}); for (int i = 0; i < TOPK_DECAY_LOOKUP_TABLE; ++i) { lookupTable[i] = pow(decay, i); } } - void TopkAdd(const std::string &item, uint32_t increment); - bool TopkQuery(const std::string &item); - std::vector TopkList(); + ~BlockSplitTopK() { + for (size_t i = 0; i < k; ++i) { + delete[] heap[i].item; + } + delete[] buckets; + delete[] heap; + } + + void Add(const std::string &item, uint32_t increment); + bool Query(const std::string &item); + std::vector List(); - std::string_view GetData() const; -private: void heapifyDown(int start); void heapifyUp(int start); int checkExistInHeap(const std::string &item); - int cmpHeapBucketCount(const HeapBucket& a, const HeapBucket& b); - bool cmpHeapBucketItem(const HeapBucket& a, const HeapBucket& b); - void swapHeapBucket(HeapBucket& a, HeapBucket& b); + int cmpHeapBucketCount(const HeapBucket &a, const HeapBucket &b); + void swapHeapBucket(HeapBucket *a, HeapBucket *b); uint32_t k; uint32_t width; @@ -83,9 +81,7 @@ class BlockSplitTopK { size_t heap_size; - std::vector> buckets; - std::vector heap; + Bucket *buckets; + HeapBucket *heap; double lookupTable[TOPK_DECAY_LOOKUP_TABLE]; -}; - -BlockSplitTopK CreateBlockSplitTopK(uint32_t k, uint32_t width, uint32_t depth, double decay); \ No newline at end of file +}; \ No newline at end of file From ed39779d8d30a7a98ecbdafc833c42f4dc2593a4 Mon Sep 17 00:00:00 2001 From: aibin <2573214643@qq.com> Date: Thu, 30 Oct 2025 18:30:38 +0800 Subject: [PATCH 03/18] add: cpp test of top_k data structure. --- src/types/topk.cc | 84 +++++++++---------- src/types/topk.h | 1 - tests/cppunit/types/topk_test.cc | 136 +++++++++++++++++++++++++++++++ 3 files changed, 179 insertions(+), 42 deletions(-) create mode 100644 tests/cppunit/types/topk_test.cc diff --git a/src/types/topk.cc b/src/types/topk.cc index 54daeaec51c..ff34822cbcf 100644 --- a/src/types/topk.cc +++ b/src/types/topk.cc @@ -76,7 +76,7 @@ static uint32_t MurmurHash2(const void *key, int len, uint32_t seed) { len -= 4; } - // Handle the last few bytes of the input array + // Handle the last few bytes of the input heap switch (len) { case 3: @@ -219,65 +219,65 @@ static uint32_t MurmurHash2(const void *key, int len, uint32_t seed) { } /* ---------------------------------------------------------------------- */ -void BlockSplitTopK::swapHeapBucket(HeapBucket *a, HeapBucket* b) { - HeapBucket tmp = *a; - a->count = b->count; - a->fp = b->fp; - a->itemlen = b->itemlen; - a->item = b->item; - - b->count = tmp.count; - b->fp = tmp.fp; - b->itemlen = tmp.itemlen; - b->item = tmp.item; -} - void BlockSplitTopK::heapifyDown(int start) { - size_t parent = start; + size_t child = start; // check whether larger than children - if (heap_size < 2 || (heap_size - 2) / 2 < parent) { + if (heap_size < 2 || (heap_size - 2) / 2 < child) { return; } - while (parent < heap_size) { - size_t left_child = 2 * parent + 1; - size_t right_child = left_child + 1; + child = 2 * child + 1; + if ((child + 1) < heap_size && (heap[child].count > heap[child + 1].count)) { + ++child; + } + if (heap[child].count > heap[start].count) { + return; + } - // check whether left child is larger than parent - if (left_child < heap_size && heap[left_child].count < heap[parent].count) { - parent = left_child; - } - // check whether right child is larger than parent - if (right_child < heap_size && heap[right_child].count < heap[parent].count) { - parent = right_child; - } - if (parent == left_child || parent == right_child) { - swapHeapBucket(&heap[(parent-1)/2], &heap[parent]); - } else { + HeapBucket top; + memcpy(&top, &heap[start], sizeof(HeapBucket)); + do { + memcpy(&heap[start], &heap[child], sizeof(HeapBucket)); + start = child; + + if ((heap_size - 2) / 2 < child) { break; } - } + child = 2 * child + 1; + + if ((child + 1) < heap_size && (heap[child].count > heap[child + 1].count)) { + ++child; + } + } while (heap[child].count < top.count); + memcpy(&heap[start], &top, sizeof(HeapBucket)); } void BlockSplitTopK::heapifyUp(int start) { - size_t child = start; + size_t parent = start; // check whether smaller than parent - if (heap_size < 2 || child == 0) { + if (heap_size < 2 || parent == 0) { return; } - while (child > 0) { - size_t parent = (child - 1) / 2; + parent = (parent - 1) / 2; + if (heap[parent].count > heap[start].count) { + return; + } - if (parent >= 0 && heap[child].count < heap[parent].count) { - swapHeapBucket(&heap[parent], &heap[child]); - child = parent; - } else { + HeapBucket bottom; + memcpy(&bottom, &heap[start], sizeof(HeapBucket)); + do { + memcpy(&heap[start], &heap[parent], sizeof(HeapBucket)); + start = parent; + + if (start == 0) { break; } - } + parent = (parent - 1) / 2; + } while (heap[parent].count > bottom.count); + memcpy(&heap[start], &bottom, sizeof(HeapBucket)); } int BlockSplitTopK::checkExistInHeap(const std::string &item) { @@ -310,8 +310,10 @@ void BlockSplitTopK::Add(const std::string &item, uint32_t increment) { if (buckets[loc].count == 0) { buckets[loc].fp = fp; buckets[loc].count = increment; + maxCount = std::max(maxCount, buckets[loc].count); } else if (buckets[loc].fp == fp && location != -1) { buckets[loc].count += increment; + maxCount = std::max(maxCount, buckets[loc].count); } else { // decay uint32_t local_incr = increment; @@ -330,12 +332,12 @@ void BlockSplitTopK::Add(const std::string &item, uint32_t increment) { if (buckets[loc].count == 0) { buckets[loc].fp = fp; buckets[loc].count = 1; + maxCount = std::max(maxCount, buckets[loc].count); break; } } } } - maxCount = std::max(maxCount, buckets[loc].count); } if (k == heap_size) { diff --git a/src/types/topk.h b/src/types/topk.h index c0e3374e21e..559d4c010e3 100644 --- a/src/types/topk.h +++ b/src/types/topk.h @@ -72,7 +72,6 @@ class BlockSplitTopK { void heapifyUp(int start); int checkExistInHeap(const std::string &item); int cmpHeapBucketCount(const HeapBucket &a, const HeapBucket &b); - void swapHeapBucket(HeapBucket *a, HeapBucket *b); uint32_t k; uint32_t width; diff --git a/tests/cppunit/types/topk_test.cc b/tests/cppunit/types/topk_test.cc new file mode 100644 index 00000000000..21af3171480 --- /dev/null +++ b/tests/cppunit/types/topk_test.cc @@ -0,0 +1,136 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + */ + +#include + +#include + +#include "test_base.h" +#include "types/redis_topk.h" + +static constexpr uint32_t k = 5; +static constexpr uint32_t width = 7; +static constexpr uint32_t depth = 8; +static constexpr double decay = 0.9; + +class RedisTopKTest : public TestBase { + protected: + explicit RedisTopKTest() : TestBase() { + top_k_ = std::make_unique(storage_.get(), "topk_ns"); + } + ~RedisTopKTest() override = default; + + void SetUp() override { + key_ = "test_topk->key"; + top_k_->Reserve(*ctx_, key_, k, width, depth, decay); + } + + void TearDown() override {} + + std::unique_ptr top_k_; +}; + +TEST_F(RedisTopKTest, TestTopKInfo) { + // test exist key + redis::TopKInfo info1; + top_k_->Info(*ctx_, key_, &info1); + ASSERT_EQ(info1.k, k); + ASSERT_EQ(info1.width, width); + ASSERT_EQ(info1.depth, depth); + ASSERT_EQ(info1.decay, decay); + + // test not exist key + redis::TopKInfo info2; + auto s = top_k_->Info(*ctx_, "not_exist_key", &info2); + ASSERT_FALSE(s.ok()); +} + +TEST_F(RedisTopKTest, TestTopKAddAndQuery) { + // test not exist key + std::string no_exist_key = "no_exist_key"; + auto s = top_k_->Add(*ctx_, no_exist_key, "1"); + ASSERT_FALSE(s.ok()); + + bool exist; + s = top_k_->Query(*ctx_, no_exist_key, "1", &exist); + ASSERT_FALSE(s.ok()); + + std::vector list; + s = top_k_->List(*ctx_, no_exist_key, list); + ASSERT_FALSE(s.ok()); + + // test exist key + std::vector values1 = {"1", "2", "3", "4", "5"}; + std::vector values2 = {"6", "7", "8", "9", "10"}; + std::unordered_set values_set1(values1.begin(), values1.end()); + std::unordered_set values_set2(values2.begin(), values2.end()); + + // found not exist values1 + for (size_t i = 0; i < values1.size(); ++i) { + bool found = true; + top_k_->Query(*ctx_, key_, values1[i], &found); + ASSERT_FALSE(found); + } + // add values1, and query values1. + for (size_t i = 0; i < values1.size(); ++i) { + top_k_->Add(*ctx_, key_, values1[i]); + bool found = false; + top_k_->Query(*ctx_, key_, values1[i], &found); + ASSERT_TRUE(found); + } + for (size_t i = 0; i < values1.size(); ++i) { + bool found = false; + top_k_->Query(*ctx_, key_, values1[i], &found); + ASSERT_TRUE(found); + } + + // found topk list. + std::vector top_k_list; + top_k_->List(*ctx_, key_, top_k_list); + ASSERT_EQ(top_k_list.size(), values1.size()); + for (size_t i = 0; i < k; ++i) { + ASSERT_TRUE(values_set1.find(top_k_list[i]) != values_set1.end()); + } + + // heap is full, need remove values1. + for (size_t i = 0; i < values2.size(); ++i) { + bool found = false; + // due to decay, topk is possiable to remove values1. + while (!found) { + top_k_->Add(*ctx_, key_, values2[i]); + top_k_->Query(*ctx_, key_, values2[i], &found); + } + top_k_->Add(*ctx_, key_, values2[i]); + } + + // values1 is removed. + for (size_t i = 0; i < values1.size(); ++i) { + bool found = true; + top_k_->Query(*ctx_, key_, values1[i], &found); + ASSERT_FALSE(found); + } + + // found topk list. + top_k_list.clear(); + top_k_->List(*ctx_, key_, top_k_list); + for (size_t i = 0; i < top_k_list.size(); ++i) { + ASSERT_TRUE(values_set2.find(top_k_list[i]) != values_set2.end()); + } +} \ No newline at end of file From 0838c6d7f0a4a7dafca762c7410cade080a9ac79 Mon Sep 17 00:00:00 2001 From: aibin <2573214643@qq.com> Date: Fri, 31 Oct 2025 10:30:57 +0800 Subject: [PATCH 04/18] fix: clang-format code --- src/commands/cmd_topk.cc | 12 +- src/storage/redis_metadata.h | 15 +- src/types/redis_topk.cc | 303 ++++++++++--------- src/types/redis_topk.h | 37 +-- src/types/topk.cc | 488 +++++++++++++++---------------- src/types/topk.h | 81 ++--- tests/cppunit/types/topk_test.cc | 6 +- 7 files changed, 460 insertions(+), 482 deletions(-) diff --git a/src/commands/cmd_topk.cc b/src/commands/cmd_topk.cc index 5c138c66d3c..d1f2db3295b 100644 --- a/src/commands/cmd_topk.cc +++ b/src/commands/cmd_topk.cc @@ -18,8 +18,8 @@ * */ -#include "commander.h" #include "command_parser.h" +#include "commander.h" #include "error_constants.h" #include "server/server.h" #include "types/redis_topk.h" @@ -30,7 +30,7 @@ constexpr const char *errBadWidth = "Bad width"; constexpr const char *errBadDepth = "Bad depth"; constexpr const char *errBadDecay = "Bad decay"; constexpr const char *errInvalidDecay = "Decay must be between 0 and 1"; -} +} // namespace namespace redis { @@ -84,6 +84,7 @@ class CommandTopKReserve final : public Commander { *output = redis::RESP_OK; return Status::OK(); } + private: uint32_t k_; uint32_t width_ = 7; @@ -150,7 +151,7 @@ class CommandTopKInfo final : public Commander { return Commander::Parse(args); } - Status Execute(engine::Context &ctx, Server *srv, Connection *conn, [[maybe_unused]]std::string *output) override { + Status Execute(engine::Context &ctx, Server *srv, Connection *conn, std::string *output) override { redis::TopK topk_db(srv->storage, conn->GetNamespace()); TopKInfo info; @@ -185,6 +186,7 @@ class CommandTopKInfo final : public Commander { } return Status::OK(); } + private: TopKInfoType type_ = TopKInfoType::kAll; }; @@ -199,7 +201,7 @@ class CommandTopKQuery final : public Commander { auto s = topk.Query(ctx, args_[1], args_[2], &is_exists_); if (!s.ok()) { return {Status::RedisExecErr, s.ToString()}; - } + } *output = redis::Bool(redis::RESP::v2, is_exists_); return Status::OK(); } @@ -211,4 +213,4 @@ REDIS_REGISTER_COMMANDS(TopK, MakeCmdAttr("topk.add", 3, "write" MakeCmdAttr("topk.query", 3, "read-only", 1, 1, 1), MakeCmdAttr("topk.reserve", -3, "write", 1, 1, 1)); -} // namespace redis \ No newline at end of file +} // namespace redis \ No newline at end of file diff --git a/src/storage/redis_metadata.h b/src/storage/redis_metadata.h index 0b1ff777f8a..516fde6ec05 100644 --- a/src/storage/redis_metadata.h +++ b/src/storage/redis_metadata.h @@ -59,9 +59,8 @@ enum RedisType : uint8_t { }; inline constexpr const std::array RedisTypeNames = { - "none", "string", "hash", "list", "set", "zset", "bitmap", - "sortedint", "stream", "MBbloom--", "ReJSON-RL", "hyperloglog", "TDIS-TYPE", "timeseries", - "topk"}; + "none", "string", "hash", "list", "set", "zset", "bitmap", "sortedint", + "stream", "MBbloom--", "ReJSON-RL", "hyperloglog", "TDIS-TYPE", "timeseries", "topk"}; struct RedisTypes { RedisTypes(std::initializer_list list) { @@ -413,7 +412,7 @@ class TimeSeriesMetadata : public Metadata { }; class TopKMetadata : public Metadata { -public: + public: uint32_t top_k; uint16_t width; uint32_t depth; @@ -421,12 +420,8 @@ class TopKMetadata : public Metadata { explicit TopKMetadata(bool generate_version = true) : Metadata(kRedisTopK, generate_version) {} - TopKMetadata(uint64_t top_k, uint64_t width = 7, uint64_t depth = 8, double decay = 0.9, bool generate_version = true) - : Metadata(kRedisTopK, generate_version), - top_k(top_k), - width(width), - depth(depth), - decay(decay) {} + TopKMetadata(uint64_t top_k, uint64_t width = 7, uint64_t depth = 8, double decay = 0.9, bool generate_version = true) + : Metadata(kRedisTopK, generate_version), top_k(top_k), width(width), depth(depth), decay(decay) {} void Encode(std::string *dst) const override; rocksdb::Status Decode(Slice *input) override; diff --git a/src/types/redis_topk.cc b/src/types/redis_topk.cc index 9569641fffc..498758971e8 100644 --- a/src/types/redis_topk.cc +++ b/src/types/redis_topk.cc @@ -19,209 +19,206 @@ */ #include "redis_topk.h" -#include "topk.h" + #include "commands/ttl_util.h" +#include "topk.h" namespace redis { - -rocksdb::Status TopK::Reserve(engine::Context &ctx, const Slice& user_key, - uint32_t k, uint32_t width, uint32_t depth, double decay) { - std::string ns_key = AppendNamespacePrefix(user_key); - - TopKMetadata topk_metadata; - rocksdb::Status s = getTopKMetadata(ctx, ns_key, &topk_metadata); - if (!s.ok() && !s.IsNotFound()) return s; - if (!s.IsNotFound()) { - return rocksdb::Status::InvalidArgument("TopK already exists"); - } - return createTopK(ctx, ns_key, k, width, depth, decay, &topk_metadata); +rocksdb::Status TopK::Reserve(engine::Context &ctx, const Slice &user_key, uint32_t k, uint32_t width, uint32_t depth, + double decay) { + std::string ns_key = AppendNamespacePrefix(user_key); + + TopKMetadata topk_metadata; + rocksdb::Status s = getTopKMetadata(ctx, ns_key, &topk_metadata); + if (!s.ok() && !s.IsNotFound()) return s; + if (!s.IsNotFound()) { + return rocksdb::Status::InvalidArgument("TopK already exists"); + } + + return createTopK(ctx, ns_key, k, width, depth, decay, &topk_metadata); } -rocksdb::Status TopK::Add(engine::Context &ctx, const Slice &user_key, - const Slice &items) { - std::string ns_key = AppendNamespacePrefix(user_key); +rocksdb::Status TopK::Add(engine::Context &ctx, const Slice &user_key, const Slice &items) { + std::string ns_key = AppendNamespacePrefix(user_key); - TopKMetadata topk_metadata; - rocksdb::Status s = getTopKMetadata(ctx, ns_key, &topk_metadata); - if (!s.ok()) return s; - - auto batch = storage_->GetWriteBatchBase(); - WriteBatchLogData log_data(kRedisTopK, {"Add"}); - s = batch->PutLogData(log_data.Encode()); - if (!s.ok()) return s; + TopKMetadata topk_metadata; + rocksdb::Status s = getTopKMetadata(ctx, ns_key, &topk_metadata); + if (!s.ok()) return s; - BlockSplitTopK topk(topk_metadata.top_k, topk_metadata.width, topk_metadata.depth, topk_metadata.decay); - s = getTopKData(ctx, ns_key, topk_metadata, &topk); - if (!s.ok()) return s; + auto batch = storage_->GetWriteBatchBase(); + WriteBatchLogData log_data(kRedisTopK, {"Add"}); + s = batch->PutLogData(log_data.Encode()); + if (!s.ok()) return s; - topk.Add(items.data_, 1); + BlockSplitTopK topk(topk_metadata.top_k, topk_metadata.width, topk_metadata.depth, topk_metadata.decay); + s = getTopKData(ctx, ns_key, topk_metadata, &topk); + if (!s.ok()) return s; - s = setTopkData(ctx, ns_key, topk_metadata, topk); - if (!s.ok()) return s; + topk.Add(items.data_, 1); - return storage_->Write(ctx, storage_->DefaultWriteOptions(), batch->GetWriteBatch()); + s = setTopkData(ctx, ns_key, topk_metadata, topk); + if (!s.ok()) return s; + + return storage_->Write(ctx, storage_->DefaultWriteOptions(), batch->GetWriteBatch()); } -rocksdb::Status TopK::Query(engine::Context &ctx, const Slice& user_key, - const Slice &items, bool *exists) { - std::string ns_key = AppendNamespacePrefix(user_key); +rocksdb::Status TopK::Query(engine::Context &ctx, const Slice &user_key, const Slice &items, bool *exists) { + std::string ns_key = AppendNamespacePrefix(user_key); - TopKMetadata topk_metadata; - rocksdb::Status s = getTopKMetadata(ctx, ns_key, &topk_metadata); - if (!s.ok()) return s; + TopKMetadata topk_metadata; + rocksdb::Status s = getTopKMetadata(ctx, ns_key, &topk_metadata); + if (!s.ok()) return s; - BlockSplitTopK topk(topk_metadata.top_k, topk_metadata.width, topk_metadata.depth, topk_metadata.decay); - s = getTopKData(ctx, ns_key, topk_metadata, &topk); - if (!s.ok()) return s; + BlockSplitTopK topk(topk_metadata.top_k, topk_metadata.width, topk_metadata.depth, topk_metadata.decay); + s = getTopKData(ctx, ns_key, topk_metadata, &topk); + if (!s.ok()) return s; - *exists = topk.Query(items.data_); + *exists = topk.Query(items.data_); - return rocksdb::Status::OK(); + return rocksdb::Status::OK(); } -rocksdb::Status TopK::List(engine::Context &ctx, const Slice& user_key, - std::vector &items) { - std::string ns_key = AppendNamespacePrefix(user_key); +rocksdb::Status TopK::List(engine::Context &ctx, const Slice &user_key, std::vector &items) { + std::string ns_key = AppendNamespacePrefix(user_key); - TopKMetadata topk_metadata; - rocksdb::Status s = getTopKMetadata(ctx, ns_key, &topk_metadata); - if (!s.ok()) return s; - - BlockSplitTopK topk(topk_metadata.top_k, topk_metadata.width, topk_metadata.depth, topk_metadata.decay); - s = getTopKData(ctx, ns_key, topk_metadata, &topk); - if (!s.ok()) return s; + TopKMetadata topk_metadata; + rocksdb::Status s = getTopKMetadata(ctx, ns_key, &topk_metadata); + if (!s.ok()) return s; - auto heap_buckets = topk.List(); - for (auto &bucket : heap_buckets) { - items.emplace_back(bucket.item, bucket.itemlen); - } + BlockSplitTopK topk(topk_metadata.top_k, topk_metadata.width, topk_metadata.depth, topk_metadata.decay); + s = getTopKData(ctx, ns_key, topk_metadata, &topk); + if (!s.ok()) return s; + + auto heap_buckets = topk.List(); + for (auto &bucket : heap_buckets) { + items.emplace_back(bucket.item, bucket.itemlen); + } - return rocksdb::Status::OK(); + return rocksdb::Status::OK(); } -rocksdb::Status TopK::Info(engine::Context &ctx, const Slice& user_key, TopKInfo *info) { - std::string ns_key = AppendNamespacePrefix(user_key); - - TopKMetadata metadata; - auto s = getTopKMetadata(ctx, ns_key, &metadata); - if (!s.ok()) return s; +rocksdb::Status TopK::Info(engine::Context &ctx, const Slice &user_key, TopKInfo *info) { + std::string ns_key = AppendNamespacePrefix(user_key); + + TopKMetadata metadata; + auto s = getTopKMetadata(ctx, ns_key, &metadata); + if (!s.ok()) return s; - info->k = metadata.top_k; - info->width = metadata.width; - info->depth = metadata.depth; - info->decay = metadata.decay; + info->k = metadata.top_k; + info->width = metadata.width; + info->depth = metadata.depth; + info->decay = metadata.decay; - return rocksdb::Status::OK(); + return rocksdb::Status::OK(); } rocksdb::Status TopK::getTopKMetadata(engine::Context &ctx, const Slice &ns_key, TopKMetadata *metadata) { - return Database::GetMetadata(ctx, {kRedisTopK}, ns_key, metadata); + return Database::GetMetadata(ctx, {kRedisTopK}, ns_key, metadata); } -rocksdb::Status TopK::createTopK(engine::Context &ctx, const Slice &ns_key, - uint32_t k, uint32_t width, uint32_t depth, double decay, - TopKMetadata *metadata) { - metadata->top_k = k; - metadata->width = width; - metadata->depth = depth; - metadata->decay = decay; - - BlockSplitTopK block_split_top_k(k, width, depth, decay); - - auto batch = storage_->GetWriteBatchBase(); - WriteBatchLogData log_data(kRedisTopK, {"createTopK"}); - auto s = batch->PutLogData(log_data.Encode()); - if (!s.ok()) return s; - - std::string top_k_meta_bytes; - metadata->Encode(&top_k_meta_bytes); - s = batch->Put(metadata_cf_handle_, ns_key, top_k_meta_bytes); - if (!s.ok()) return s; +rocksdb::Status TopK::createTopK(engine::Context &ctx, const Slice &ns_key, uint32_t k, uint32_t width, uint32_t depth, + double decay, TopKMetadata *metadata) { + metadata->top_k = k; + metadata->width = width; + metadata->depth = depth; + metadata->decay = decay; - s = setTopkData(ctx, ns_key, *metadata, block_split_top_k); - if (!s.ok()) return s; + BlockSplitTopK block_split_top_k(k, width, depth, decay); + + auto batch = storage_->GetWriteBatchBase(); + WriteBatchLogData log_data(kRedisTopK, {"createTopK"}); + auto s = batch->PutLogData(log_data.Encode()); + if (!s.ok()) return s; - return storage_->Write(ctx, storage_->DefaultWriteOptions(), batch->GetWriteBatch()); + std::string top_k_meta_bytes; + metadata->Encode(&top_k_meta_bytes); + s = batch->Put(metadata_cf_handle_, ns_key, top_k_meta_bytes); + if (!s.ok()) return s; + + s = setTopkData(ctx, ns_key, *metadata, block_split_top_k); + if (!s.ok()) return s; + + return storage_->Write(ctx, storage_->DefaultWriteOptions(), batch->GetWriteBatch()); } -rocksdb::Status TopK::getTopKData(engine::Context &ctx, const Slice& ns_key, const TopKMetadata &metadata, +rocksdb::Status TopK::getTopKData(engine::Context &ctx, const Slice &ns_key, const TopKMetadata &metadata, BlockSplitTopK *topk) { - for (uint8_t i = 0; i < 3; i++) { - std::string tk_key = getTKKey(ns_key, metadata, i); - rocksdb::PinnableSlice pinnable_value; - rocksdb::Status s = storage_->Get(ctx, ctx.GetReadOptions(), tk_key, &pinnable_value); + for (uint8_t i = 0; i < 3; i++) { + std::string tk_key = getTKKey(ns_key, metadata, i); + rocksdb::PinnableSlice pinnable_value; + rocksdb::Status s = storage_->Get(ctx, ctx.GetReadOptions(), tk_key, &pinnable_value); + if (!s.ok()) return s; + if (i == 0) { + if (pinnable_value.size() != metadata.width * metadata.depth * sizeof(Bucket)) { + return rocksdb::Status::Corruption("TopK data corrupted: buckets size mismatch"); + } + memcpy(topk->buckets, pinnable_value.data(), pinnable_value.size()); + } else if (i == 1) { + if (pinnable_value.size() != metadata.top_k * sizeof(HeapBucket)) { + return rocksdb::Status::Corruption("TopK data corrupted: heap size mismatch"); + } + memcpy(topk->heap, pinnable_value.data(), pinnable_value.size()); + for (uint32_t j = 0; j < metadata.top_k; j++) { + std::string hb_key = getHBKey(ns_key, metadata, i, j); + rocksdb::PinnableSlice hb_value; + rocksdb::Status s = storage_->Get(ctx, ctx.GetReadOptions(), hb_key, &hb_value); if (!s.ok()) return s; - if (i == 0) { - if (pinnable_value.size() != metadata.width * metadata.depth * sizeof(Bucket)) { - return rocksdb::Status::Corruption("TopK data corrupted: buckets size mismatch"); - } - memcpy(topk->buckets, pinnable_value.data(), pinnable_value.size()); - } else if (i == 1) { - if (pinnable_value.size() != metadata.top_k * sizeof(HeapBucket)) { - return rocksdb::Status::Corruption("TopK data corrupted: heap size mismatch"); - } - memcpy(topk->heap, pinnable_value.data(), pinnable_value.size()); - for (uint32_t j = 0; j < metadata.top_k; j++) { - std::string hb_key = getHBKey(ns_key, metadata, i, j); - rocksdb::PinnableSlice hb_value; - rocksdb::Status s = storage_->Get(ctx, ctx.GetReadOptions(), hb_key, &hb_value); - if (!s.ok()) return s; - if (hb_value.size() != topk->heap[j].itemlen) { - return rocksdb::Status::Corruption("TopK data corrupted: heap bucket size mismatch"); - } - topk->heap[j].item = new char[topk->heap[j].itemlen]; - memcpy(topk->heap[j].item, hb_value.data(), hb_value.size()); - } - } else { - topk->heap_size = static_cast(std::stoul(pinnable_value.data())); + if (hb_value.size() != topk->heap[j].itemlen) { + return rocksdb::Status::Corruption("TopK data corrupted: heap bucket size mismatch"); } + topk->heap[j].item = new char[topk->heap[j].itemlen]; + memcpy(topk->heap[j].item, hb_value.data(), hb_value.size()); + } + } else { + topk->heap_size = static_cast(std::stoul(pinnable_value.data())); } - return rocksdb::Status::OK(); + } + return rocksdb::Status::OK(); } -rocksdb::Status TopK::setTopkData(engine::Context &ctx, const Slice& ns_key, const TopKMetadata &metadata, +rocksdb::Status TopK::setTopkData(engine::Context &ctx, const Slice &ns_key, const TopKMetadata &metadata, const BlockSplitTopK &topk) { - auto batch = storage_->GetWriteBatchBase(); - WriteBatchLogData log_data(kRedisTopK, {"setTopkData"}); - rocksdb::Status s = batch->PutLogData(log_data.Encode()); - if (!s.ok()) return s; - - for (uint8_t i = 0; i < 3; i++) { - std::string tk_key = getTKKey(ns_key, metadata, i); - std::string tk_value; - if (i == 0) { - tk_value.assign(reinterpret_cast(topk.buckets), metadata.width * metadata.depth * sizeof(Bucket)); - } else if (i == 1) { - tk_value.assign(reinterpret_cast(topk.heap), metadata.top_k * sizeof(HeapBucket)); - for (uint32_t j = 0; j < metadata.top_k; j++) { - std::string hb_key = getHBKey(ns_key, metadata, i, j); - std::string hb_value(topk.heap[j].item, topk.heap[j].itemlen); - s = batch->Put(hb_key, hb_value); - if (!s.ok()) return s; - } - } else { - tk_value = std::to_string(topk.heap_size); - } - rocksdb::Status s = batch->Put(tk_key, tk_value); + auto batch = storage_->GetWriteBatchBase(); + WriteBatchLogData log_data(kRedisTopK, {"setTopkData"}); + rocksdb::Status s = batch->PutLogData(log_data.Encode()); + if (!s.ok()) return s; + + for (uint8_t i = 0; i < 3; i++) { + std::string tk_key = getTKKey(ns_key, metadata, i); + std::string tk_value; + if (i == 0) { + tk_value.assign(reinterpret_cast(topk.buckets), metadata.width * metadata.depth * sizeof(Bucket)); + } else if (i == 1) { + tk_value.assign(reinterpret_cast(topk.heap), metadata.top_k * sizeof(HeapBucket)); + for (uint32_t j = 0; j < metadata.top_k; j++) { + std::string hb_key = getHBKey(ns_key, metadata, i, j); + std::string hb_value(topk.heap[j].item, topk.heap[j].itemlen); + s = batch->Put(hb_key, hb_value); if (!s.ok()) return s; + } + } else { + tk_value = std::to_string(topk.heap_size); } + rocksdb::Status s = batch->Put(tk_key, tk_value); + if (!s.ok()) return s; + } - return storage_->Write(ctx, storage_->DefaultWriteOptions(), batch->GetWriteBatch()); + return storage_->Write(ctx, storage_->DefaultWriteOptions(), batch->GetWriteBatch()); } std::string TopK::getTKKey(const Slice &ns_key, const TopKMetadata &metadata, uint8_t index) { - std::string sub_key; - PutFixed8(&sub_key, index); - std::string bf_key = InternalKey(ns_key, sub_key, metadata.version, storage_->IsSlotIdEncoded()).Encode(); - return bf_key; + std::string sub_key; + PutFixed8(&sub_key, index); + std::string bf_key = InternalKey(ns_key, sub_key, metadata.version, storage_->IsSlotIdEncoded()).Encode(); + return bf_key; } std::string TopK::getHBKey(const Slice &ns_key, const TopKMetadata &metadata, uint8_t topk_index, uint32_t hp_index) { - std::string sub_key; - PutFixed8(&sub_key, topk_index); - PutFixed32(&sub_key, hp_index); - return InternalKey(ns_key, sub_key, metadata.version, storage_->IsSlotIdEncoded()).Encode(); + std::string sub_key; + PutFixed8(&sub_key, topk_index); + PutFixed32(&sub_key, hp_index); + return InternalKey(ns_key, sub_key, metadata.version, storage_->IsSlotIdEncoded()).Encode(); } -} // namespace redis \ No newline at end of file +} // namespace redis \ No newline at end of file diff --git a/src/types/redis_topk.h b/src/types/redis_topk.h index 2759a0436d6..fd3f67e70e6 100644 --- a/src/types/redis_topk.h +++ b/src/types/redis_topk.h @@ -25,13 +25,7 @@ namespace redis { -enum class TopKInfoType { - kAll, - kTopK, - kWidth, - kDepth, - kDecay -}; +enum class TopKInfoType { kAll, kTopK, kWidth, kDepth, kDecay }; struct TopKInfo { uint32_t k; @@ -43,29 +37,24 @@ struct TopKInfo { class TopK : public SubKeyScanner { public: using Slice = rocksdb::Slice; - - explicit TopK(engine::Storage* storage, const std::string& ns) - : SubKeyScanner(storage, ns) {} - rocksdb::Status Reserve(engine::Context &ctx, const Slice& user_key, uint32_t k, - uint32_t width, uint32_t depth, double decay); - rocksdb::Status Query(engine::Context &ctx, const Slice& user_key, - const Slice &items, - bool *exists); - rocksdb::Status Add(engine::Context &ctx, const Slice &user_key, - const Slice &items); - rocksdb::Status List(engine::Context &ctx, const Slice& user_key, std::vector &items); - rocksdb::Status Info(engine::Context &ctx, const Slice& user_key, TopKInfo *info); + explicit TopK(engine::Storage *storage, const std::string &ns) : SubKeyScanner(storage, ns) {} + + rocksdb::Status Reserve(engine::Context &ctx, const Slice &user_key, uint32_t k, uint32_t width, uint32_t depth, + double decay); + rocksdb::Status Query(engine::Context &ctx, const Slice &user_key, const Slice &items, bool *exists); + rocksdb::Status Add(engine::Context &ctx, const Slice &user_key, const Slice &items); + rocksdb::Status List(engine::Context &ctx, const Slice &user_key, std::vector &items); + rocksdb::Status Info(engine::Context &ctx, const Slice &user_key, TopKInfo *info); private: rocksdb::Status getTopKMetadata(engine::Context &ctx, const Slice &ns_key, TopKMetadata *metadata); - rocksdb::Status createTopK(engine::Context &ctx, const Slice &ns_key, - uint32_t k, uint32_t width, uint32_t depth, double decay, - TopKMetadata *metadata); + rocksdb::Status createTopK(engine::Context &ctx, const Slice &ns_key, uint32_t k, uint32_t width, uint32_t depth, + double decay, TopKMetadata *metadata); - rocksdb::Status getTopKData(engine::Context &ctx, const Slice& ns_key, const TopKMetadata &metadata, + rocksdb::Status getTopKData(engine::Context &ctx, const Slice &ns_key, const TopKMetadata &metadata, BlockSplitTopK *topk); - rocksdb::Status setTopkData(engine::Context &ctx, const Slice& ns_key, const TopKMetadata &metadata, + rocksdb::Status setTopkData(engine::Context &ctx, const Slice &ns_key, const TopKMetadata &metadata, const BlockSplitTopK &topk); std::string getTKKey(const Slice &ns_key, const TopKMetadata &metadata, uint8_t index); diff --git a/src/types/topk.cc b/src/types/topk.cc index ff34822cbcf..19b0981fcbd 100644 --- a/src/types/topk.cc +++ b/src/types/topk.cc @@ -20,10 +20,10 @@ #include "topk.h" -#include +#include #include +#include #include -#include #include #define TOPK_HASH(item, itemlen, i) MurmurHash2(item, itemlen, i) @@ -48,54 +48,54 @@ //----------------------------------------------------------------------------- static uint32_t MurmurHash2(const void *key, int len, uint32_t seed) { - // 'm' and 'r' are mixing constants generated offline. - // They're not really 'magic', they just happen to work well. + // 'm' and 'r' are mixing constants generated offline. + // They're not really 'magic', they just happen to work well. - const uint32_t m = 0x5bd1e995; - const int r = 24; + const uint32_t m = 0x5bd1e995; + const int r = 24; - // Initialize the hash to a 'random' value + // Initialize the hash to a 'random' value - uint32_t h = seed ^ len; + uint32_t h = seed ^ len; - // Mix 4 bytes at a time into the hash + // Mix 4 bytes at a time into the hash - const unsigned char *data = (const unsigned char *)key; + const unsigned char *data = (const unsigned char *)key; - while (len >= 4) { - uint32_t k = *(uint32_t *)data; + while (len >= 4) { + uint32_t k = *(uint32_t *)data; - k *= m; - k ^= k >> r; - k *= m; + k *= m; + k ^= k >> r; + k *= m; - h *= m; - h ^= k; + h *= m; + h ^= k; - data += 4; - len -= 4; - } + data += 4; + len -= 4; + } - // Handle the last few bytes of the input heap + // Handle the last few bytes of the input heap - switch (len) { + switch (len) { case 3: - h ^= data[2] << 16; + h ^= data[2] << 16; case 2: - h ^= data[1] << 8; + h ^= data[1] << 8; case 1: - h ^= data[0]; - h *= m; - }; + h ^= data[0]; + h *= m; + }; - // Do a few final mixes of the hash to ensure the last few - // bytes are well-incorporated. + // Do a few final mixes of the hash to ensure the last few + // bytes are well-incorporated. - h ^= h >> 13; - h *= m; - h ^= h >> 15; + h ^= h >> 13; + h *= m; + h ^= h >> 15; - return h; + return h; } //----------------------------------------------------------------------------- @@ -106,280 +106,276 @@ static uint32_t MurmurHash2(const void *key, int len, uint32_t seed) { // 64-bit hash for 64-bit platforms -[[maybe_unused]]static uint64_t MurmurHash64A_Bloom(const void *key, int len, uint64_t seed) { - const uint64_t m = BIG_CONSTANT(0xc6a4a7935bd1e995); - const int r = 47; +[[maybe_unused]] static uint64_t MurmurHash64A_Bloom(const void *key, int len, uint64_t seed) { + const uint64_t m = BIG_CONSTANT(0xc6a4a7935bd1e995); + const int r = 47; - uint64_t h = seed ^ (len * m); + uint64_t h = seed ^ (len * m); - const uint64_t *data = (const uint64_t *)key; - const uint64_t *end = data + (len / 8); + const uint64_t *data = (const uint64_t *)key; + const uint64_t *end = data + (len / 8); - while (data != end) { - uint64_t k = *data++; + while (data != end) { + uint64_t k = *data++; - k *= m; - k ^= k >> r; - k *= m; + k *= m; + k ^= k >> r; + k *= m; - h ^= k; - h *= m; - } + h ^= k; + h *= m; + } - const unsigned char *data2 = (const unsigned char *)data; + const unsigned char *data2 = (const unsigned char *)data; - switch (len & 7) { + switch (len & 7) { case 7: - h ^= ((uint64_t)data2[6]) << 48; + h ^= ((uint64_t)data2[6]) << 48; case 6: - h ^= ((uint64_t)data2[5]) << 40; + h ^= ((uint64_t)data2[5]) << 40; case 5: - h ^= ((uint64_t)data2[4]) << 32; + h ^= ((uint64_t)data2[4]) << 32; case 4: - h ^= ((uint64_t)data2[3]) << 24; + h ^= ((uint64_t)data2[3]) << 24; case 3: - h ^= ((uint64_t)data2[2]) << 16; + h ^= ((uint64_t)data2[2]) << 16; case 2: - h ^= ((uint64_t)data2[1]) << 8; + h ^= ((uint64_t)data2[1]) << 8; case 1: - h ^= ((uint64_t)data2[0]); - h *= m; - }; + h ^= ((uint64_t)data2[0]); + h *= m; + }; - h ^= h >> r; - h *= m; - h ^= h >> r; + h ^= h >> r; + h *= m; + h ^= h >> r; - return h; + return h; } // 64-bit hash for 32-bit platforms -[[maybe_unused]]static uint64_t MurmurHash64B(const void *key, int len, uint64_t seed) { - const uint32_t m = 0x5bd1e995; - const int r = 24; - - uint32_t h1 = (uint32_t)(seed ^ len); - uint32_t h2 = (uint32_t)(seed >> 32); - - const uint32_t *data = (const uint32_t *)key; - - while (len >= 8) { - uint32_t k1 = *data++; - k1 *= m; - k1 ^= k1 >> r; - k1 *= m; - h1 *= m; - h1 ^= k1; - len -= 4; - - uint32_t k2 = *data++; - k2 *= m; - k2 ^= k2 >> r; - k2 *= m; - h2 *= m; - h2 ^= k2; - len -= 4; - } +[[maybe_unused]] static uint64_t MurmurHash64B(const void *key, int len, uint64_t seed) { + const uint32_t m = 0x5bd1e995; + const int r = 24; - if (len >= 4) { - uint32_t k1 = *data++; - k1 *= m; - k1 ^= k1 >> r; - k1 *= m; - h1 *= m; - h1 ^= k1; - len -= 4; - } + uint32_t h1 = (uint32_t)(seed ^ len); + uint32_t h2 = (uint32_t)(seed >> 32); - switch (len) { - case 3: - h2 ^= ((unsigned char *)data)[2] << 16; - case 2: - h2 ^= ((unsigned char *)data)[1] << 8; - case 1: - h2 ^= ((unsigned char *)data)[0]; - h2 *= m; - }; + const uint32_t *data = (const uint32_t *)key; - h1 ^= h2 >> 18; + while (len >= 8) { + uint32_t k1 = *data++; + k1 *= m; + k1 ^= k1 >> r; + k1 *= m; h1 *= m; - h2 ^= h1 >> 22; + h1 ^= k1; + len -= 4; + + uint32_t k2 = *data++; + k2 *= m; + k2 ^= k2 >> r; + k2 *= m; h2 *= m; - h1 ^= h2 >> 17; + h2 ^= k2; + len -= 4; + } + + if (len >= 4) { + uint32_t k1 = *data++; + k1 *= m; + k1 ^= k1 >> r; + k1 *= m; h1 *= m; - h2 ^= h1 >> 19; - h2 *= m; + h1 ^= k1; + len -= 4; + } + + switch (len) { + case 3: + h2 ^= ((unsigned char *)data)[2] << 16; + case 2: + h2 ^= ((unsigned char *)data)[1] << 8; + case 1: + h2 ^= ((unsigned char *)data)[0]; + h2 *= m; + }; - uint64_t h = h1; + h1 ^= h2 >> 18; + h1 *= m; + h2 ^= h1 >> 22; + h2 *= m; + h1 ^= h2 >> 17; + h1 *= m; + h2 ^= h1 >> 19; + h2 *= m; - h = (h << 32) | h2; + uint64_t h = h1; - return h; + h = (h << 32) | h2; + + return h; } /* ---------------------------------------------------------------------- */ void BlockSplitTopK::heapifyDown(int start) { - size_t child = start; - - // check whether larger than children - if (heap_size < 2 || (heap_size - 2) / 2 < child) { - return; + size_t child = start; + + // check whether larger than children + if (heap_size < 2 || (heap_size - 2) / 2 < child) { + return; + } + + child = 2 * child + 1; + if ((child + 1) < heap_size && (heap[child].count > heap[child + 1].count)) { + ++child; + } + if (heap[child].count > heap[start].count) { + return; + } + + HeapBucket top; + memcpy(&top, &heap[start], sizeof(HeapBucket)); + do { + memcpy(&heap[start], &heap[child], sizeof(HeapBucket)); + start = child; + + if ((heap_size - 2) / 2 < child) { + break; } - child = 2 * child + 1; + if ((child + 1) < heap_size && (heap[child].count > heap[child + 1].count)) { - ++child; + ++child; } - if (heap[child].count > heap[start].count) { - return; - } - - HeapBucket top; - memcpy(&top, &heap[start], sizeof(HeapBucket)); - do { - memcpy(&heap[start], &heap[child], sizeof(HeapBucket)); - start = child; - - if ((heap_size - 2) / 2 < child) { - break; - } - child = 2 * child + 1; - - if ((child + 1) < heap_size && (heap[child].count > heap[child + 1].count)) { - ++child; - } - } while (heap[child].count < top.count); - memcpy(&heap[start], &top, sizeof(HeapBucket)); + } while (heap[child].count < top.count); + memcpy(&heap[start], &top, sizeof(HeapBucket)); } void BlockSplitTopK::heapifyUp(int start) { - size_t parent = start; - - // check whether smaller than parent - if (heap_size < 2 || parent == 0) { - return; + size_t parent = start; + + // check whether smaller than parent + if (heap_size < 2 || parent == 0) { + return; + } + + parent = (parent - 1) / 2; + if (heap[parent].count > heap[start].count) { + return; + } + + HeapBucket bottom; + memcpy(&bottom, &heap[start], sizeof(HeapBucket)); + do { + memcpy(&heap[start], &heap[parent], sizeof(HeapBucket)); + start = parent; + + if (start == 0) { + break; } - parent = (parent - 1) / 2; - if (heap[parent].count > heap[start].count) { - return; - } - - HeapBucket bottom; - memcpy(&bottom, &heap[start], sizeof(HeapBucket)); - do { - memcpy(&heap[start], &heap[parent], sizeof(HeapBucket)); - start = parent; - - if (start == 0) { - break; - } - parent = (parent - 1) / 2; - } while (heap[parent].count > bottom.count); - memcpy(&heap[start], &bottom, sizeof(HeapBucket)); + } while (heap[parent].count > bottom.count); + memcpy(&heap[start], &bottom, sizeof(HeapBucket)); } int BlockSplitTopK::checkExistInHeap(const std::string &item) { - uint32_t itemlen = item.size(); - const char *data = item.c_str(); - for (int32_t i = heap_size - 1; i >= 0; --i) { - if (heap[i].itemlen == itemlen && memcmp(heap[i].item, data, itemlen) == 0) { - return i; - } + uint32_t itemlen = item.size(); + const char *data = item.c_str(); + for (int32_t i = heap_size - 1; i >= 0; --i) { + if (heap[i].itemlen == itemlen && memcmp(heap[i].item, data, itemlen) == 0) { + return i; } - return -1; + } + return -1; } int BlockSplitTopK::cmpHeapBucketCount(const HeapBucket &a, const HeapBucket &b) { - return a.count < b.count ? 1 : a.count > b.count ? -1 : 0; + return a.count < b.count ? 1 : a.count > b.count ? -1 : 0; } void BlockSplitTopK::Add(const std::string &item, uint32_t increment) { - uint32_t itemlen = item.size(); - const char *data = item.c_str(); - counter_t maxCount = 0; - uint32_t fp = TOPK_HASH(data, itemlen, GA); - - int location = checkExistInHeap(item); - - for (size_t i = 0; i < depth; ++i) { - uint32_t loc = TOPK_HASH(data, itemlen, i) % width; - - loc += i * width; - if (buckets[loc].count == 0) { + uint32_t itemlen = item.size(); + const char *data = item.c_str(); + counter_t maxCount = 0; + uint32_t fp = TOPK_HASH(data, itemlen, GA); + + int location = checkExistInHeap(item); + + for (size_t i = 0; i < depth; ++i) { + uint32_t loc = TOPK_HASH(data, itemlen, i) % width; + + loc += i * width; + if (buckets[loc].count == 0) { + buckets[loc].fp = fp; + buckets[loc].count = increment; + maxCount = std::max(maxCount, buckets[loc].count); + } else if (buckets[loc].fp == fp && location != -1) { + buckets[loc].count += increment; + maxCount = std::max(maxCount, buckets[loc].count); + } else { + // decay + uint32_t local_incr = increment; + for (; local_incr > 0; --local_incr) { + double decay; + if (buckets[loc].count < TOPK_DECAY_LOOKUP_TABLE) { + decay = lookupTable[buckets[loc].count]; + } else { + decay = pow(lookupTable[TOPK_DECAY_LOOKUP_TABLE - 1], (buckets[loc].count / (TOPK_DECAY_LOOKUP_TABLE - 1))) * + lookupTable[buckets[loc].count % (TOPK_DECAY_LOOKUP_TABLE - 1)]; + } + double chance = rand() / (double)RAND_MAX; + if (chance < decay) { + --buckets[loc].count; + if (buckets[loc].count == 0) { buckets[loc].fp = fp; - buckets[loc].count = increment; + buckets[loc].count = 1; maxCount = std::max(maxCount, buckets[loc].count); - } else if (buckets[loc].fp == fp && location != -1) { - buckets[loc].count += increment; - maxCount = std::max(maxCount, buckets[loc].count); - } else { - // decay - uint32_t local_incr = increment; - for (; local_incr > 0; --local_incr) { - double decay; - if (buckets[loc].count < TOPK_DECAY_LOOKUP_TABLE) { - decay = lookupTable[buckets[loc].count]; - } else { - decay = pow(lookupTable[TOPK_DECAY_LOOKUP_TABLE - 1], - (buckets[loc].count / (TOPK_DECAY_LOOKUP_TABLE - 1))) * - lookupTable[buckets[loc].count % (TOPK_DECAY_LOOKUP_TABLE - 1)]; - } - double chance = rand() / (double)RAND_MAX; - if (chance < decay) { - -- buckets[loc].count; - if (buckets[loc].count == 0) { - buckets[loc].fp = fp; - buckets[loc].count = 1; - maxCount = std::max(maxCount, buckets[loc].count); - break; - } - } - } + break; + } } + } } + } - if (k == heap_size) { - if (location == -1) { - if (heap[0].count == maxCount || heap[0].count + 1 == maxCount) { - heap[0].fp = fp; - heap[0].itemlen = itemlen; - delete heap[0].item; - heap[0].item = new char[itemlen]; - memcpy(heap[0].item, data, itemlen); - - heap[0].count = maxCount; - - heapifyDown(0); - } - } else { - heap[location].count += increment; - heapifyDown(location); - } + if (k == heap_size) { + if (location == -1) { + if (heap[0].count == maxCount || heap[0].count + 1 == maxCount) { + heap[0].fp = fp; + heap[0].itemlen = itemlen; + delete heap[0].item; + heap[0].item = new char[itemlen]; + memcpy(heap[0].item, data, itemlen); + + heap[0].count = maxCount; + + heapifyDown(0); + } } else { - heap[heap_size].fp = fp; - heap[heap_size].itemlen = itemlen; - heap[heap_size].item = new char[itemlen]; - memcpy(heap[heap_size].item, data, itemlen); - heap[heap_size].count = maxCount; - - heapifyUp(heap_size); - heap_size ++; + heap[location].count += increment; + heapifyDown(location); } + } else { + heap[heap_size].fp = fp; + heap[heap_size].itemlen = itemlen; + heap[heap_size].item = new char[itemlen]; + memcpy(heap[heap_size].item, data, itemlen); + heap[heap_size].count = maxCount; + + heapifyUp(heap_size); + heap_size++; + } } -bool BlockSplitTopK::Query(const std::string &item) { - return checkExistInHeap(item) != -1; -} +bool BlockSplitTopK::Query(const std::string &item) { return checkExistInHeap(item) != -1; } std::vector BlockSplitTopK::List() { - std::vector result(heap_size); - for (uint32_t i = 0; i < heap_size; i ++) { - result[i] = heap[i]; - } - std::sort(result.begin(), result.end(), [this] (const HeapBucket &a, const HeapBucket &b) { - return cmpHeapBucketCount(a, b) > 0; - }); - return result; + std::vector result(heap_size); + for (uint32_t i = 0; i < heap_size; i++) { + result[i] = heap[i]; + } + std::sort(result.begin(), result.end(), + [this](const HeapBucket &a, const HeapBucket &b) { return cmpHeapBucketCount(a, b) > 0; }); + return result; } \ No newline at end of file diff --git a/src/types/topk.h b/src/types/topk.h index 559d4c010e3..34fbc684bb0 100644 --- a/src/types/topk.h +++ b/src/types/topk.h @@ -21,66 +21,67 @@ #pragma once #include -#include -#include + #include +#include #include +#include static constexpr int TOPK_DECAY_LOOKUP_TABLE = 256; using counter_t = uint32_t; struct HeapBucket { - uint32_t fp; - uint32_t itemlen; - char* item; - counter_t count; + uint32_t fp; + uint32_t itemlen; + char *item; + counter_t count; }; struct Bucket { - uint32_t fp; - counter_t count; + uint32_t fp; + counter_t count; }; class BlockSplitTopK { -public: - BlockSplitTopK() = delete; - explicit BlockSplitTopK(uint32_t k, uint32_t width, uint32_t depth, double decay) : - k(k), width(width), depth(depth), decay(decay), heap_size(0) { - buckets = new Bucket[width * depth]; - heap = new HeapBucket[k]; - std::fill_n(buckets, width * depth, Bucket{0, 0}); - std::fill_n(heap, k, HeapBucket{0, 0, nullptr, 0}); - for (int i = 0; i < TOPK_DECAY_LOOKUP_TABLE; ++i) { - lookupTable[i] = pow(decay, i); - } + public: + BlockSplitTopK() = delete; + explicit BlockSplitTopK(uint32_t k, uint32_t width, uint32_t depth, double decay) + : k(k), width(width), depth(depth), decay(decay), heap_size(0) { + buckets = new Bucket[width * depth]; + heap = new HeapBucket[k]; + std::fill_n(buckets, width * depth, Bucket{0, 0}); + std::fill_n(heap, k, HeapBucket{0, 0, nullptr, 0}); + for (int i = 0; i < TOPK_DECAY_LOOKUP_TABLE; ++i) { + lookupTable[i] = pow(decay, i); } + } - ~BlockSplitTopK() { - for (size_t i = 0; i < k; ++i) { - delete[] heap[i].item; - } - delete[] buckets; - delete[] heap; + ~BlockSplitTopK() { + for (size_t i = 0; i < k; ++i) { + delete[] heap[i].item; } + delete[] buckets; + delete[] heap; + } - void Add(const std::string &item, uint32_t increment); - bool Query(const std::string &item); - std::vector List(); + void Add(const std::string &item, uint32_t increment); + bool Query(const std::string &item); + std::vector List(); - void heapifyDown(int start); - void heapifyUp(int start); - int checkExistInHeap(const std::string &item); - int cmpHeapBucketCount(const HeapBucket &a, const HeapBucket &b); + void heapifyDown(int start); + void heapifyUp(int start); + int checkExistInHeap(const std::string &item); + int cmpHeapBucketCount(const HeapBucket &a, const HeapBucket &b); - uint32_t k; - uint32_t width; - uint32_t depth; - double decay; + uint32_t k; + uint32_t width; + uint32_t depth; + double decay; - size_t heap_size; + size_t heap_size; - Bucket *buckets; - HeapBucket *heap; - double lookupTable[TOPK_DECAY_LOOKUP_TABLE]; + Bucket *buckets; + HeapBucket *heap; + double lookupTable[TOPK_DECAY_LOOKUP_TABLE]; }; \ No newline at end of file diff --git a/tests/cppunit/types/topk_test.cc b/tests/cppunit/types/topk_test.cc index 21af3171480..d22ccb9d5c7 100644 --- a/tests/cppunit/types/topk_test.cc +++ b/tests/cppunit/types/topk_test.cc @@ -32,9 +32,7 @@ static constexpr double decay = 0.9; class RedisTopKTest : public TestBase { protected: - explicit RedisTopKTest() : TestBase() { - top_k_ = std::make_unique(storage_.get(), "topk_ns"); - } + explicit RedisTopKTest() : TestBase() { top_k_ = std::make_unique(storage_.get(), "topk_ns"); } ~RedisTopKTest() override = default; void SetUp() override { @@ -112,7 +110,7 @@ TEST_F(RedisTopKTest, TestTopKAddAndQuery) { // heap is full, need remove values1. for (size_t i = 0; i < values2.size(); ++i) { bool found = false; - // due to decay, topk is possiable to remove values1. + // due to decay, topk is possible to remove values1. while (!found) { top_k_->Add(*ctx_, key_, values2[i]); top_k_->Query(*ctx_, key_, values2[i], &found); From 01861cff51eeec10b6779f872008496ec925b32d Mon Sep 17 00:00:00 2001 From: aibin <2573214643@qq.com> Date: Fri, 31 Oct 2025 10:42:59 +0800 Subject: [PATCH 05/18] Revert "fix: clang-format code" This reverts commit 915580d121fb674b1f096aa66515faf4c6357d03. --- src/commands/cmd_topk.cc | 12 +- src/storage/redis_metadata.h | 15 +- src/types/redis_topk.cc | 303 +++++++++---------- src/types/redis_topk.h | 37 ++- src/types/topk.cc | 488 ++++++++++++++++--------------- src/types/topk.h | 81 +++-- tests/cppunit/types/topk_test.cc | 6 +- 7 files changed, 482 insertions(+), 460 deletions(-) diff --git a/src/commands/cmd_topk.cc b/src/commands/cmd_topk.cc index d1f2db3295b..5c138c66d3c 100644 --- a/src/commands/cmd_topk.cc +++ b/src/commands/cmd_topk.cc @@ -18,8 +18,8 @@ * */ -#include "command_parser.h" #include "commander.h" +#include "command_parser.h" #include "error_constants.h" #include "server/server.h" #include "types/redis_topk.h" @@ -30,7 +30,7 @@ constexpr const char *errBadWidth = "Bad width"; constexpr const char *errBadDepth = "Bad depth"; constexpr const char *errBadDecay = "Bad decay"; constexpr const char *errInvalidDecay = "Decay must be between 0 and 1"; -} // namespace +} namespace redis { @@ -84,7 +84,6 @@ class CommandTopKReserve final : public Commander { *output = redis::RESP_OK; return Status::OK(); } - private: uint32_t k_; uint32_t width_ = 7; @@ -151,7 +150,7 @@ class CommandTopKInfo final : public Commander { return Commander::Parse(args); } - Status Execute(engine::Context &ctx, Server *srv, Connection *conn, std::string *output) override { + Status Execute(engine::Context &ctx, Server *srv, Connection *conn, [[maybe_unused]]std::string *output) override { redis::TopK topk_db(srv->storage, conn->GetNamespace()); TopKInfo info; @@ -186,7 +185,6 @@ class CommandTopKInfo final : public Commander { } return Status::OK(); } - private: TopKInfoType type_ = TopKInfoType::kAll; }; @@ -201,7 +199,7 @@ class CommandTopKQuery final : public Commander { auto s = topk.Query(ctx, args_[1], args_[2], &is_exists_); if (!s.ok()) { return {Status::RedisExecErr, s.ToString()}; - } + } *output = redis::Bool(redis::RESP::v2, is_exists_); return Status::OK(); } @@ -213,4 +211,4 @@ REDIS_REGISTER_COMMANDS(TopK, MakeCmdAttr("topk.add", 3, "write" MakeCmdAttr("topk.query", 3, "read-only", 1, 1, 1), MakeCmdAttr("topk.reserve", -3, "write", 1, 1, 1)); -} // namespace redis \ No newline at end of file +} // namespace redis \ No newline at end of file diff --git a/src/storage/redis_metadata.h b/src/storage/redis_metadata.h index 516fde6ec05..0b1ff777f8a 100644 --- a/src/storage/redis_metadata.h +++ b/src/storage/redis_metadata.h @@ -59,8 +59,9 @@ enum RedisType : uint8_t { }; inline constexpr const std::array RedisTypeNames = { - "none", "string", "hash", "list", "set", "zset", "bitmap", "sortedint", - "stream", "MBbloom--", "ReJSON-RL", "hyperloglog", "TDIS-TYPE", "timeseries", "topk"}; + "none", "string", "hash", "list", "set", "zset", "bitmap", + "sortedint", "stream", "MBbloom--", "ReJSON-RL", "hyperloglog", "TDIS-TYPE", "timeseries", + "topk"}; struct RedisTypes { RedisTypes(std::initializer_list list) { @@ -412,7 +413,7 @@ class TimeSeriesMetadata : public Metadata { }; class TopKMetadata : public Metadata { - public: +public: uint32_t top_k; uint16_t width; uint32_t depth; @@ -420,8 +421,12 @@ class TopKMetadata : public Metadata { explicit TopKMetadata(bool generate_version = true) : Metadata(kRedisTopK, generate_version) {} - TopKMetadata(uint64_t top_k, uint64_t width = 7, uint64_t depth = 8, double decay = 0.9, bool generate_version = true) - : Metadata(kRedisTopK, generate_version), top_k(top_k), width(width), depth(depth), decay(decay) {} + TopKMetadata(uint64_t top_k, uint64_t width = 7, uint64_t depth = 8, double decay = 0.9, bool generate_version = true) + : Metadata(kRedisTopK, generate_version), + top_k(top_k), + width(width), + depth(depth), + decay(decay) {} void Encode(std::string *dst) const override; rocksdb::Status Decode(Slice *input) override; diff --git a/src/types/redis_topk.cc b/src/types/redis_topk.cc index 498758971e8..9569641fffc 100644 --- a/src/types/redis_topk.cc +++ b/src/types/redis_topk.cc @@ -19,206 +19,209 @@ */ #include "redis_topk.h" - -#include "commands/ttl_util.h" #include "topk.h" +#include "commands/ttl_util.h" namespace redis { + +rocksdb::Status TopK::Reserve(engine::Context &ctx, const Slice& user_key, + uint32_t k, uint32_t width, uint32_t depth, double decay) { + std::string ns_key = AppendNamespacePrefix(user_key); + + TopKMetadata topk_metadata; + rocksdb::Status s = getTopKMetadata(ctx, ns_key, &topk_metadata); + if (!s.ok() && !s.IsNotFound()) return s; + if (!s.IsNotFound()) { + return rocksdb::Status::InvalidArgument("TopK already exists"); + } -rocksdb::Status TopK::Reserve(engine::Context &ctx, const Slice &user_key, uint32_t k, uint32_t width, uint32_t depth, - double decay) { - std::string ns_key = AppendNamespacePrefix(user_key); - - TopKMetadata topk_metadata; - rocksdb::Status s = getTopKMetadata(ctx, ns_key, &topk_metadata); - if (!s.ok() && !s.IsNotFound()) return s; - if (!s.IsNotFound()) { - return rocksdb::Status::InvalidArgument("TopK already exists"); - } - - return createTopK(ctx, ns_key, k, width, depth, decay, &topk_metadata); + return createTopK(ctx, ns_key, k, width, depth, decay, &topk_metadata); } -rocksdb::Status TopK::Add(engine::Context &ctx, const Slice &user_key, const Slice &items) { - std::string ns_key = AppendNamespacePrefix(user_key); +rocksdb::Status TopK::Add(engine::Context &ctx, const Slice &user_key, + const Slice &items) { + std::string ns_key = AppendNamespacePrefix(user_key); - TopKMetadata topk_metadata; - rocksdb::Status s = getTopKMetadata(ctx, ns_key, &topk_metadata); - if (!s.ok()) return s; - - auto batch = storage_->GetWriteBatchBase(); - WriteBatchLogData log_data(kRedisTopK, {"Add"}); - s = batch->PutLogData(log_data.Encode()); - if (!s.ok()) return s; + TopKMetadata topk_metadata; + rocksdb::Status s = getTopKMetadata(ctx, ns_key, &topk_metadata); + if (!s.ok()) return s; + + auto batch = storage_->GetWriteBatchBase(); + WriteBatchLogData log_data(kRedisTopK, {"Add"}); + s = batch->PutLogData(log_data.Encode()); + if (!s.ok()) return s; - BlockSplitTopK topk(topk_metadata.top_k, topk_metadata.width, topk_metadata.depth, topk_metadata.decay); - s = getTopKData(ctx, ns_key, topk_metadata, &topk); - if (!s.ok()) return s; + BlockSplitTopK topk(topk_metadata.top_k, topk_metadata.width, topk_metadata.depth, topk_metadata.decay); + s = getTopKData(ctx, ns_key, topk_metadata, &topk); + if (!s.ok()) return s; - topk.Add(items.data_, 1); + topk.Add(items.data_, 1); - s = setTopkData(ctx, ns_key, topk_metadata, topk); - if (!s.ok()) return s; + s = setTopkData(ctx, ns_key, topk_metadata, topk); + if (!s.ok()) return s; - return storage_->Write(ctx, storage_->DefaultWriteOptions(), batch->GetWriteBatch()); + return storage_->Write(ctx, storage_->DefaultWriteOptions(), batch->GetWriteBatch()); } -rocksdb::Status TopK::Query(engine::Context &ctx, const Slice &user_key, const Slice &items, bool *exists) { - std::string ns_key = AppendNamespacePrefix(user_key); +rocksdb::Status TopK::Query(engine::Context &ctx, const Slice& user_key, + const Slice &items, bool *exists) { + std::string ns_key = AppendNamespacePrefix(user_key); - TopKMetadata topk_metadata; - rocksdb::Status s = getTopKMetadata(ctx, ns_key, &topk_metadata); - if (!s.ok()) return s; + TopKMetadata topk_metadata; + rocksdb::Status s = getTopKMetadata(ctx, ns_key, &topk_metadata); + if (!s.ok()) return s; - BlockSplitTopK topk(topk_metadata.top_k, topk_metadata.width, topk_metadata.depth, topk_metadata.decay); - s = getTopKData(ctx, ns_key, topk_metadata, &topk); - if (!s.ok()) return s; + BlockSplitTopK topk(topk_metadata.top_k, topk_metadata.width, topk_metadata.depth, topk_metadata.decay); + s = getTopKData(ctx, ns_key, topk_metadata, &topk); + if (!s.ok()) return s; - *exists = topk.Query(items.data_); + *exists = topk.Query(items.data_); - return rocksdb::Status::OK(); + return rocksdb::Status::OK(); } -rocksdb::Status TopK::List(engine::Context &ctx, const Slice &user_key, std::vector &items) { - std::string ns_key = AppendNamespacePrefix(user_key); - - TopKMetadata topk_metadata; - rocksdb::Status s = getTopKMetadata(ctx, ns_key, &topk_metadata); - if (!s.ok()) return s; +rocksdb::Status TopK::List(engine::Context &ctx, const Slice& user_key, + std::vector &items) { + std::string ns_key = AppendNamespacePrefix(user_key); - BlockSplitTopK topk(topk_metadata.top_k, topk_metadata.width, topk_metadata.depth, topk_metadata.decay); - s = getTopKData(ctx, ns_key, topk_metadata, &topk); - if (!s.ok()) return s; + TopKMetadata topk_metadata; + rocksdb::Status s = getTopKMetadata(ctx, ns_key, &topk_metadata); + if (!s.ok()) return s; + + BlockSplitTopK topk(topk_metadata.top_k, topk_metadata.width, topk_metadata.depth, topk_metadata.decay); + s = getTopKData(ctx, ns_key, topk_metadata, &topk); + if (!s.ok()) return s; - auto heap_buckets = topk.List(); - for (auto &bucket : heap_buckets) { - items.emplace_back(bucket.item, bucket.itemlen); - } + auto heap_buckets = topk.List(); + for (auto &bucket : heap_buckets) { + items.emplace_back(bucket.item, bucket.itemlen); + } - return rocksdb::Status::OK(); + return rocksdb::Status::OK(); } -rocksdb::Status TopK::Info(engine::Context &ctx, const Slice &user_key, TopKInfo *info) { - std::string ns_key = AppendNamespacePrefix(user_key); - - TopKMetadata metadata; - auto s = getTopKMetadata(ctx, ns_key, &metadata); - if (!s.ok()) return s; +rocksdb::Status TopK::Info(engine::Context &ctx, const Slice& user_key, TopKInfo *info) { + std::string ns_key = AppendNamespacePrefix(user_key); + + TopKMetadata metadata; + auto s = getTopKMetadata(ctx, ns_key, &metadata); + if (!s.ok()) return s; - info->k = metadata.top_k; - info->width = metadata.width; - info->depth = metadata.depth; - info->decay = metadata.decay; + info->k = metadata.top_k; + info->width = metadata.width; + info->depth = metadata.depth; + info->decay = metadata.decay; - return rocksdb::Status::OK(); + return rocksdb::Status::OK(); } rocksdb::Status TopK::getTopKMetadata(engine::Context &ctx, const Slice &ns_key, TopKMetadata *metadata) { - return Database::GetMetadata(ctx, {kRedisTopK}, ns_key, metadata); + return Database::GetMetadata(ctx, {kRedisTopK}, ns_key, metadata); } -rocksdb::Status TopK::createTopK(engine::Context &ctx, const Slice &ns_key, uint32_t k, uint32_t width, uint32_t depth, - double decay, TopKMetadata *metadata) { - metadata->top_k = k; - metadata->width = width; - metadata->depth = depth; - metadata->decay = decay; - - BlockSplitTopK block_split_top_k(k, width, depth, decay); - - auto batch = storage_->GetWriteBatchBase(); - WriteBatchLogData log_data(kRedisTopK, {"createTopK"}); - auto s = batch->PutLogData(log_data.Encode()); - if (!s.ok()) return s; - - std::string top_k_meta_bytes; - metadata->Encode(&top_k_meta_bytes); - s = batch->Put(metadata_cf_handle_, ns_key, top_k_meta_bytes); - if (!s.ok()) return s; +rocksdb::Status TopK::createTopK(engine::Context &ctx, const Slice &ns_key, + uint32_t k, uint32_t width, uint32_t depth, double decay, + TopKMetadata *metadata) { + metadata->top_k = k; + metadata->width = width; + metadata->depth = depth; + metadata->decay = decay; + + BlockSplitTopK block_split_top_k(k, width, depth, decay); + + auto batch = storage_->GetWriteBatchBase(); + WriteBatchLogData log_data(kRedisTopK, {"createTopK"}); + auto s = batch->PutLogData(log_data.Encode()); + if (!s.ok()) return s; + + std::string top_k_meta_bytes; + metadata->Encode(&top_k_meta_bytes); + s = batch->Put(metadata_cf_handle_, ns_key, top_k_meta_bytes); + if (!s.ok()) return s; - s = setTopkData(ctx, ns_key, *metadata, block_split_top_k); - if (!s.ok()) return s; + s = setTopkData(ctx, ns_key, *metadata, block_split_top_k); + if (!s.ok()) return s; - return storage_->Write(ctx, storage_->DefaultWriteOptions(), batch->GetWriteBatch()); + return storage_->Write(ctx, storage_->DefaultWriteOptions(), batch->GetWriteBatch()); } -rocksdb::Status TopK::getTopKData(engine::Context &ctx, const Slice &ns_key, const TopKMetadata &metadata, +rocksdb::Status TopK::getTopKData(engine::Context &ctx, const Slice& ns_key, const TopKMetadata &metadata, BlockSplitTopK *topk) { - for (uint8_t i = 0; i < 3; i++) { - std::string tk_key = getTKKey(ns_key, metadata, i); - rocksdb::PinnableSlice pinnable_value; - rocksdb::Status s = storage_->Get(ctx, ctx.GetReadOptions(), tk_key, &pinnable_value); - if (!s.ok()) return s; - if (i == 0) { - if (pinnable_value.size() != metadata.width * metadata.depth * sizeof(Bucket)) { - return rocksdb::Status::Corruption("TopK data corrupted: buckets size mismatch"); - } - memcpy(topk->buckets, pinnable_value.data(), pinnable_value.size()); - } else if (i == 1) { - if (pinnable_value.size() != metadata.top_k * sizeof(HeapBucket)) { - return rocksdb::Status::Corruption("TopK data corrupted: heap size mismatch"); - } - memcpy(topk->heap, pinnable_value.data(), pinnable_value.size()); - for (uint32_t j = 0; j < metadata.top_k; j++) { - std::string hb_key = getHBKey(ns_key, metadata, i, j); - rocksdb::PinnableSlice hb_value; - rocksdb::Status s = storage_->Get(ctx, ctx.GetReadOptions(), hb_key, &hb_value); + for (uint8_t i = 0; i < 3; i++) { + std::string tk_key = getTKKey(ns_key, metadata, i); + rocksdb::PinnableSlice pinnable_value; + rocksdb::Status s = storage_->Get(ctx, ctx.GetReadOptions(), tk_key, &pinnable_value); if (!s.ok()) return s; - if (hb_value.size() != topk->heap[j].itemlen) { - return rocksdb::Status::Corruption("TopK data corrupted: heap bucket size mismatch"); + if (i == 0) { + if (pinnable_value.size() != metadata.width * metadata.depth * sizeof(Bucket)) { + return rocksdb::Status::Corruption("TopK data corrupted: buckets size mismatch"); + } + memcpy(topk->buckets, pinnable_value.data(), pinnable_value.size()); + } else if (i == 1) { + if (pinnable_value.size() != metadata.top_k * sizeof(HeapBucket)) { + return rocksdb::Status::Corruption("TopK data corrupted: heap size mismatch"); + } + memcpy(topk->heap, pinnable_value.data(), pinnable_value.size()); + for (uint32_t j = 0; j < metadata.top_k; j++) { + std::string hb_key = getHBKey(ns_key, metadata, i, j); + rocksdb::PinnableSlice hb_value; + rocksdb::Status s = storage_->Get(ctx, ctx.GetReadOptions(), hb_key, &hb_value); + if (!s.ok()) return s; + if (hb_value.size() != topk->heap[j].itemlen) { + return rocksdb::Status::Corruption("TopK data corrupted: heap bucket size mismatch"); + } + topk->heap[j].item = new char[topk->heap[j].itemlen]; + memcpy(topk->heap[j].item, hb_value.data(), hb_value.size()); + } + } else { + topk->heap_size = static_cast(std::stoul(pinnable_value.data())); } - topk->heap[j].item = new char[topk->heap[j].itemlen]; - memcpy(topk->heap[j].item, hb_value.data(), hb_value.size()); - } - } else { - topk->heap_size = static_cast(std::stoul(pinnable_value.data())); } - } - return rocksdb::Status::OK(); + return rocksdb::Status::OK(); } -rocksdb::Status TopK::setTopkData(engine::Context &ctx, const Slice &ns_key, const TopKMetadata &metadata, +rocksdb::Status TopK::setTopkData(engine::Context &ctx, const Slice& ns_key, const TopKMetadata &metadata, const BlockSplitTopK &topk) { - auto batch = storage_->GetWriteBatchBase(); - WriteBatchLogData log_data(kRedisTopK, {"setTopkData"}); - rocksdb::Status s = batch->PutLogData(log_data.Encode()); - if (!s.ok()) return s; - - for (uint8_t i = 0; i < 3; i++) { - std::string tk_key = getTKKey(ns_key, metadata, i); - std::string tk_value; - if (i == 0) { - tk_value.assign(reinterpret_cast(topk.buckets), metadata.width * metadata.depth * sizeof(Bucket)); - } else if (i == 1) { - tk_value.assign(reinterpret_cast(topk.heap), metadata.top_k * sizeof(HeapBucket)); - for (uint32_t j = 0; j < metadata.top_k; j++) { - std::string hb_key = getHBKey(ns_key, metadata, i, j); - std::string hb_value(topk.heap[j].item, topk.heap[j].itemlen); - s = batch->Put(hb_key, hb_value); + auto batch = storage_->GetWriteBatchBase(); + WriteBatchLogData log_data(kRedisTopK, {"setTopkData"}); + rocksdb::Status s = batch->PutLogData(log_data.Encode()); + if (!s.ok()) return s; + + for (uint8_t i = 0; i < 3; i++) { + std::string tk_key = getTKKey(ns_key, metadata, i); + std::string tk_value; + if (i == 0) { + tk_value.assign(reinterpret_cast(topk.buckets), metadata.width * metadata.depth * sizeof(Bucket)); + } else if (i == 1) { + tk_value.assign(reinterpret_cast(topk.heap), metadata.top_k * sizeof(HeapBucket)); + for (uint32_t j = 0; j < metadata.top_k; j++) { + std::string hb_key = getHBKey(ns_key, metadata, i, j); + std::string hb_value(topk.heap[j].item, topk.heap[j].itemlen); + s = batch->Put(hb_key, hb_value); + if (!s.ok()) return s; + } + } else { + tk_value = std::to_string(topk.heap_size); + } + rocksdb::Status s = batch->Put(tk_key, tk_value); if (!s.ok()) return s; - } - } else { - tk_value = std::to_string(topk.heap_size); } - rocksdb::Status s = batch->Put(tk_key, tk_value); - if (!s.ok()) return s; - } - return storage_->Write(ctx, storage_->DefaultWriteOptions(), batch->GetWriteBatch()); + return storage_->Write(ctx, storage_->DefaultWriteOptions(), batch->GetWriteBatch()); } std::string TopK::getTKKey(const Slice &ns_key, const TopKMetadata &metadata, uint8_t index) { - std::string sub_key; - PutFixed8(&sub_key, index); - std::string bf_key = InternalKey(ns_key, sub_key, metadata.version, storage_->IsSlotIdEncoded()).Encode(); - return bf_key; + std::string sub_key; + PutFixed8(&sub_key, index); + std::string bf_key = InternalKey(ns_key, sub_key, metadata.version, storage_->IsSlotIdEncoded()).Encode(); + return bf_key; } std::string TopK::getHBKey(const Slice &ns_key, const TopKMetadata &metadata, uint8_t topk_index, uint32_t hp_index) { - std::string sub_key; - PutFixed8(&sub_key, topk_index); - PutFixed32(&sub_key, hp_index); - return InternalKey(ns_key, sub_key, metadata.version, storage_->IsSlotIdEncoded()).Encode(); + std::string sub_key; + PutFixed8(&sub_key, topk_index); + PutFixed32(&sub_key, hp_index); + return InternalKey(ns_key, sub_key, metadata.version, storage_->IsSlotIdEncoded()).Encode(); } -} // namespace redis \ No newline at end of file +} // namespace redis \ No newline at end of file diff --git a/src/types/redis_topk.h b/src/types/redis_topk.h index fd3f67e70e6..2759a0436d6 100644 --- a/src/types/redis_topk.h +++ b/src/types/redis_topk.h @@ -25,7 +25,13 @@ namespace redis { -enum class TopKInfoType { kAll, kTopK, kWidth, kDepth, kDecay }; +enum class TopKInfoType { + kAll, + kTopK, + kWidth, + kDepth, + kDecay +}; struct TopKInfo { uint32_t k; @@ -37,24 +43,29 @@ struct TopKInfo { class TopK : public SubKeyScanner { public: using Slice = rocksdb::Slice; + + explicit TopK(engine::Storage* storage, const std::string& ns) + : SubKeyScanner(storage, ns) {} - explicit TopK(engine::Storage *storage, const std::string &ns) : SubKeyScanner(storage, ns) {} - - rocksdb::Status Reserve(engine::Context &ctx, const Slice &user_key, uint32_t k, uint32_t width, uint32_t depth, - double decay); - rocksdb::Status Query(engine::Context &ctx, const Slice &user_key, const Slice &items, bool *exists); - rocksdb::Status Add(engine::Context &ctx, const Slice &user_key, const Slice &items); - rocksdb::Status List(engine::Context &ctx, const Slice &user_key, std::vector &items); - rocksdb::Status Info(engine::Context &ctx, const Slice &user_key, TopKInfo *info); + rocksdb::Status Reserve(engine::Context &ctx, const Slice& user_key, uint32_t k, + uint32_t width, uint32_t depth, double decay); + rocksdb::Status Query(engine::Context &ctx, const Slice& user_key, + const Slice &items, + bool *exists); + rocksdb::Status Add(engine::Context &ctx, const Slice &user_key, + const Slice &items); + rocksdb::Status List(engine::Context &ctx, const Slice& user_key, std::vector &items); + rocksdb::Status Info(engine::Context &ctx, const Slice& user_key, TopKInfo *info); private: rocksdb::Status getTopKMetadata(engine::Context &ctx, const Slice &ns_key, TopKMetadata *metadata); - rocksdb::Status createTopK(engine::Context &ctx, const Slice &ns_key, uint32_t k, uint32_t width, uint32_t depth, - double decay, TopKMetadata *metadata); + rocksdb::Status createTopK(engine::Context &ctx, const Slice &ns_key, + uint32_t k, uint32_t width, uint32_t depth, double decay, + TopKMetadata *metadata); - rocksdb::Status getTopKData(engine::Context &ctx, const Slice &ns_key, const TopKMetadata &metadata, + rocksdb::Status getTopKData(engine::Context &ctx, const Slice& ns_key, const TopKMetadata &metadata, BlockSplitTopK *topk); - rocksdb::Status setTopkData(engine::Context &ctx, const Slice &ns_key, const TopKMetadata &metadata, + rocksdb::Status setTopkData(engine::Context &ctx, const Slice& ns_key, const TopKMetadata &metadata, const BlockSplitTopK &topk); std::string getTKKey(const Slice &ns_key, const TopKMetadata &metadata, uint8_t index); diff --git a/src/types/topk.cc b/src/types/topk.cc index 19b0981fcbd..ff34822cbcf 100644 --- a/src/types/topk.cc +++ b/src/types/topk.cc @@ -20,10 +20,10 @@ #include "topk.h" -#include -#include #include +#include #include +#include #include #define TOPK_HASH(item, itemlen, i) MurmurHash2(item, itemlen, i) @@ -48,54 +48,54 @@ //----------------------------------------------------------------------------- static uint32_t MurmurHash2(const void *key, int len, uint32_t seed) { - // 'm' and 'r' are mixing constants generated offline. - // They're not really 'magic', they just happen to work well. + // 'm' and 'r' are mixing constants generated offline. + // They're not really 'magic', they just happen to work well. - const uint32_t m = 0x5bd1e995; - const int r = 24; + const uint32_t m = 0x5bd1e995; + const int r = 24; - // Initialize the hash to a 'random' value + // Initialize the hash to a 'random' value - uint32_t h = seed ^ len; + uint32_t h = seed ^ len; - // Mix 4 bytes at a time into the hash + // Mix 4 bytes at a time into the hash - const unsigned char *data = (const unsigned char *)key; + const unsigned char *data = (const unsigned char *)key; - while (len >= 4) { - uint32_t k = *(uint32_t *)data; + while (len >= 4) { + uint32_t k = *(uint32_t *)data; - k *= m; - k ^= k >> r; - k *= m; + k *= m; + k ^= k >> r; + k *= m; - h *= m; - h ^= k; + h *= m; + h ^= k; - data += 4; - len -= 4; - } + data += 4; + len -= 4; + } - // Handle the last few bytes of the input heap + // Handle the last few bytes of the input heap - switch (len) { + switch (len) { case 3: - h ^= data[2] << 16; + h ^= data[2] << 16; case 2: - h ^= data[1] << 8; + h ^= data[1] << 8; case 1: - h ^= data[0]; - h *= m; - }; + h ^= data[0]; + h *= m; + }; - // Do a few final mixes of the hash to ensure the last few - // bytes are well-incorporated. + // Do a few final mixes of the hash to ensure the last few + // bytes are well-incorporated. - h ^= h >> 13; - h *= m; - h ^= h >> 15; + h ^= h >> 13; + h *= m; + h ^= h >> 15; - return h; + return h; } //----------------------------------------------------------------------------- @@ -106,276 +106,280 @@ static uint32_t MurmurHash2(const void *key, int len, uint32_t seed) { // 64-bit hash for 64-bit platforms -[[maybe_unused]] static uint64_t MurmurHash64A_Bloom(const void *key, int len, uint64_t seed) { - const uint64_t m = BIG_CONSTANT(0xc6a4a7935bd1e995); - const int r = 47; +[[maybe_unused]]static uint64_t MurmurHash64A_Bloom(const void *key, int len, uint64_t seed) { + const uint64_t m = BIG_CONSTANT(0xc6a4a7935bd1e995); + const int r = 47; - uint64_t h = seed ^ (len * m); + uint64_t h = seed ^ (len * m); - const uint64_t *data = (const uint64_t *)key; - const uint64_t *end = data + (len / 8); + const uint64_t *data = (const uint64_t *)key; + const uint64_t *end = data + (len / 8); - while (data != end) { - uint64_t k = *data++; + while (data != end) { + uint64_t k = *data++; - k *= m; - k ^= k >> r; - k *= m; + k *= m; + k ^= k >> r; + k *= m; - h ^= k; - h *= m; - } + h ^= k; + h *= m; + } - const unsigned char *data2 = (const unsigned char *)data; + const unsigned char *data2 = (const unsigned char *)data; - switch (len & 7) { + switch (len & 7) { case 7: - h ^= ((uint64_t)data2[6]) << 48; + h ^= ((uint64_t)data2[6]) << 48; case 6: - h ^= ((uint64_t)data2[5]) << 40; + h ^= ((uint64_t)data2[5]) << 40; case 5: - h ^= ((uint64_t)data2[4]) << 32; + h ^= ((uint64_t)data2[4]) << 32; case 4: - h ^= ((uint64_t)data2[3]) << 24; + h ^= ((uint64_t)data2[3]) << 24; case 3: - h ^= ((uint64_t)data2[2]) << 16; + h ^= ((uint64_t)data2[2]) << 16; case 2: - h ^= ((uint64_t)data2[1]) << 8; + h ^= ((uint64_t)data2[1]) << 8; case 1: - h ^= ((uint64_t)data2[0]); - h *= m; - }; + h ^= ((uint64_t)data2[0]); + h *= m; + }; - h ^= h >> r; - h *= m; - h ^= h >> r; + h ^= h >> r; + h *= m; + h ^= h >> r; - return h; + return h; } // 64-bit hash for 32-bit platforms -[[maybe_unused]] static uint64_t MurmurHash64B(const void *key, int len, uint64_t seed) { - const uint32_t m = 0x5bd1e995; - const int r = 24; - - uint32_t h1 = (uint32_t)(seed ^ len); - uint32_t h2 = (uint32_t)(seed >> 32); - - const uint32_t *data = (const uint32_t *)key; - - while (len >= 8) { - uint32_t k1 = *data++; - k1 *= m; - k1 ^= k1 >> r; - k1 *= m; - h1 *= m; - h1 ^= k1; - len -= 4; +[[maybe_unused]]static uint64_t MurmurHash64B(const void *key, int len, uint64_t seed) { + const uint32_t m = 0x5bd1e995; + const int r = 24; + + uint32_t h1 = (uint32_t)(seed ^ len); + uint32_t h2 = (uint32_t)(seed >> 32); + + const uint32_t *data = (const uint32_t *)key; + + while (len >= 8) { + uint32_t k1 = *data++; + k1 *= m; + k1 ^= k1 >> r; + k1 *= m; + h1 *= m; + h1 ^= k1; + len -= 4; + + uint32_t k2 = *data++; + k2 *= m; + k2 ^= k2 >> r; + k2 *= m; + h2 *= m; + h2 ^= k2; + len -= 4; + } - uint32_t k2 = *data++; - k2 *= m; - k2 ^= k2 >> r; - k2 *= m; - h2 *= m; - h2 ^= k2; - len -= 4; - } - - if (len >= 4) { - uint32_t k1 = *data++; - k1 *= m; - k1 ^= k1 >> r; - k1 *= m; - h1 *= m; - h1 ^= k1; - len -= 4; - } + if (len >= 4) { + uint32_t k1 = *data++; + k1 *= m; + k1 ^= k1 >> r; + k1 *= m; + h1 *= m; + h1 ^= k1; + len -= 4; + } - switch (len) { + switch (len) { case 3: - h2 ^= ((unsigned char *)data)[2] << 16; + h2 ^= ((unsigned char *)data)[2] << 16; case 2: - h2 ^= ((unsigned char *)data)[1] << 8; + h2 ^= ((unsigned char *)data)[1] << 8; case 1: - h2 ^= ((unsigned char *)data)[0]; - h2 *= m; - }; + h2 ^= ((unsigned char *)data)[0]; + h2 *= m; + }; - h1 ^= h2 >> 18; - h1 *= m; - h2 ^= h1 >> 22; - h2 *= m; - h1 ^= h2 >> 17; - h1 *= m; - h2 ^= h1 >> 19; - h2 *= m; + h1 ^= h2 >> 18; + h1 *= m; + h2 ^= h1 >> 22; + h2 *= m; + h1 ^= h2 >> 17; + h1 *= m; + h2 ^= h1 >> 19; + h2 *= m; - uint64_t h = h1; + uint64_t h = h1; - h = (h << 32) | h2; + h = (h << 32) | h2; - return h; + return h; } /* ---------------------------------------------------------------------- */ void BlockSplitTopK::heapifyDown(int start) { - size_t child = start; - - // check whether larger than children - if (heap_size < 2 || (heap_size - 2) / 2 < child) { - return; - } - - child = 2 * child + 1; - if ((child + 1) < heap_size && (heap[child].count > heap[child + 1].count)) { - ++child; - } - if (heap[child].count > heap[start].count) { - return; - } - - HeapBucket top; - memcpy(&top, &heap[start], sizeof(HeapBucket)); - do { - memcpy(&heap[start], &heap[child], sizeof(HeapBucket)); - start = child; - - if ((heap_size - 2) / 2 < child) { - break; + size_t child = start; + + // check whether larger than children + if (heap_size < 2 || (heap_size - 2) / 2 < child) { + return; } - child = 2 * child + 1; + child = 2 * child + 1; if ((child + 1) < heap_size && (heap[child].count > heap[child + 1].count)) { - ++child; + ++child; } - } while (heap[child].count < top.count); - memcpy(&heap[start], &top, sizeof(HeapBucket)); + if (heap[child].count > heap[start].count) { + return; + } + + HeapBucket top; + memcpy(&top, &heap[start], sizeof(HeapBucket)); + do { + memcpy(&heap[start], &heap[child], sizeof(HeapBucket)); + start = child; + + if ((heap_size - 2) / 2 < child) { + break; + } + child = 2 * child + 1; + + if ((child + 1) < heap_size && (heap[child].count > heap[child + 1].count)) { + ++child; + } + } while (heap[child].count < top.count); + memcpy(&heap[start], &top, sizeof(HeapBucket)); } void BlockSplitTopK::heapifyUp(int start) { - size_t parent = start; - - // check whether smaller than parent - if (heap_size < 2 || parent == 0) { - return; - } - - parent = (parent - 1) / 2; - if (heap[parent].count > heap[start].count) { - return; - } - - HeapBucket bottom; - memcpy(&bottom, &heap[start], sizeof(HeapBucket)); - do { - memcpy(&heap[start], &heap[parent], sizeof(HeapBucket)); - start = parent; - - if (start == 0) { - break; + size_t parent = start; + + // check whether smaller than parent + if (heap_size < 2 || parent == 0) { + return; } + parent = (parent - 1) / 2; - } while (heap[parent].count > bottom.count); - memcpy(&heap[start], &bottom, sizeof(HeapBucket)); + if (heap[parent].count > heap[start].count) { + return; + } + + HeapBucket bottom; + memcpy(&bottom, &heap[start], sizeof(HeapBucket)); + do { + memcpy(&heap[start], &heap[parent], sizeof(HeapBucket)); + start = parent; + + if (start == 0) { + break; + } + parent = (parent - 1) / 2; + } while (heap[parent].count > bottom.count); + memcpy(&heap[start], &bottom, sizeof(HeapBucket)); } int BlockSplitTopK::checkExistInHeap(const std::string &item) { - uint32_t itemlen = item.size(); - const char *data = item.c_str(); - for (int32_t i = heap_size - 1; i >= 0; --i) { - if (heap[i].itemlen == itemlen && memcmp(heap[i].item, data, itemlen) == 0) { - return i; + uint32_t itemlen = item.size(); + const char *data = item.c_str(); + for (int32_t i = heap_size - 1; i >= 0; --i) { + if (heap[i].itemlen == itemlen && memcmp(heap[i].item, data, itemlen) == 0) { + return i; + } } - } - return -1; + return -1; } int BlockSplitTopK::cmpHeapBucketCount(const HeapBucket &a, const HeapBucket &b) { - return a.count < b.count ? 1 : a.count > b.count ? -1 : 0; + return a.count < b.count ? 1 : a.count > b.count ? -1 : 0; } void BlockSplitTopK::Add(const std::string &item, uint32_t increment) { - uint32_t itemlen = item.size(); - const char *data = item.c_str(); - counter_t maxCount = 0; - uint32_t fp = TOPK_HASH(data, itemlen, GA); - - int location = checkExistInHeap(item); - - for (size_t i = 0; i < depth; ++i) { - uint32_t loc = TOPK_HASH(data, itemlen, i) % width; - - loc += i * width; - if (buckets[loc].count == 0) { - buckets[loc].fp = fp; - buckets[loc].count = increment; - maxCount = std::max(maxCount, buckets[loc].count); - } else if (buckets[loc].fp == fp && location != -1) { - buckets[loc].count += increment; - maxCount = std::max(maxCount, buckets[loc].count); - } else { - // decay - uint32_t local_incr = increment; - for (; local_incr > 0; --local_incr) { - double decay; - if (buckets[loc].count < TOPK_DECAY_LOOKUP_TABLE) { - decay = lookupTable[buckets[loc].count]; - } else { - decay = pow(lookupTable[TOPK_DECAY_LOOKUP_TABLE - 1], (buckets[loc].count / (TOPK_DECAY_LOOKUP_TABLE - 1))) * - lookupTable[buckets[loc].count % (TOPK_DECAY_LOOKUP_TABLE - 1)]; - } - double chance = rand() / (double)RAND_MAX; - if (chance < decay) { - --buckets[loc].count; - if (buckets[loc].count == 0) { + uint32_t itemlen = item.size(); + const char *data = item.c_str(); + counter_t maxCount = 0; + uint32_t fp = TOPK_HASH(data, itemlen, GA); + + int location = checkExistInHeap(item); + + for (size_t i = 0; i < depth; ++i) { + uint32_t loc = TOPK_HASH(data, itemlen, i) % width; + + loc += i * width; + if (buckets[loc].count == 0) { buckets[loc].fp = fp; - buckets[loc].count = 1; + buckets[loc].count = increment; maxCount = std::max(maxCount, buckets[loc].count); - break; - } + } else if (buckets[loc].fp == fp && location != -1) { + buckets[loc].count += increment; + maxCount = std::max(maxCount, buckets[loc].count); + } else { + // decay + uint32_t local_incr = increment; + for (; local_incr > 0; --local_incr) { + double decay; + if (buckets[loc].count < TOPK_DECAY_LOOKUP_TABLE) { + decay = lookupTable[buckets[loc].count]; + } else { + decay = pow(lookupTable[TOPK_DECAY_LOOKUP_TABLE - 1], + (buckets[loc].count / (TOPK_DECAY_LOOKUP_TABLE - 1))) * + lookupTable[buckets[loc].count % (TOPK_DECAY_LOOKUP_TABLE - 1)]; + } + double chance = rand() / (double)RAND_MAX; + if (chance < decay) { + -- buckets[loc].count; + if (buckets[loc].count == 0) { + buckets[loc].fp = fp; + buckets[loc].count = 1; + maxCount = std::max(maxCount, buckets[loc].count); + break; + } + } + } } - } } - } - if (k == heap_size) { - if (location == -1) { - if (heap[0].count == maxCount || heap[0].count + 1 == maxCount) { - heap[0].fp = fp; - heap[0].itemlen = itemlen; - delete heap[0].item; - heap[0].item = new char[itemlen]; - memcpy(heap[0].item, data, itemlen); - - heap[0].count = maxCount; - - heapifyDown(0); - } + if (k == heap_size) { + if (location == -1) { + if (heap[0].count == maxCount || heap[0].count + 1 == maxCount) { + heap[0].fp = fp; + heap[0].itemlen = itemlen; + delete heap[0].item; + heap[0].item = new char[itemlen]; + memcpy(heap[0].item, data, itemlen); + + heap[0].count = maxCount; + + heapifyDown(0); + } + } else { + heap[location].count += increment; + heapifyDown(location); + } } else { - heap[location].count += increment; - heapifyDown(location); + heap[heap_size].fp = fp; + heap[heap_size].itemlen = itemlen; + heap[heap_size].item = new char[itemlen]; + memcpy(heap[heap_size].item, data, itemlen); + heap[heap_size].count = maxCount; + + heapifyUp(heap_size); + heap_size ++; } - } else { - heap[heap_size].fp = fp; - heap[heap_size].itemlen = itemlen; - heap[heap_size].item = new char[itemlen]; - memcpy(heap[heap_size].item, data, itemlen); - heap[heap_size].count = maxCount; - - heapifyUp(heap_size); - heap_size++; - } } -bool BlockSplitTopK::Query(const std::string &item) { return checkExistInHeap(item) != -1; } +bool BlockSplitTopK::Query(const std::string &item) { + return checkExistInHeap(item) != -1; +} std::vector BlockSplitTopK::List() { - std::vector result(heap_size); - for (uint32_t i = 0; i < heap_size; i++) { - result[i] = heap[i]; - } - std::sort(result.begin(), result.end(), - [this](const HeapBucket &a, const HeapBucket &b) { return cmpHeapBucketCount(a, b) > 0; }); - return result; + std::vector result(heap_size); + for (uint32_t i = 0; i < heap_size; i ++) { + result[i] = heap[i]; + } + std::sort(result.begin(), result.end(), [this] (const HeapBucket &a, const HeapBucket &b) { + return cmpHeapBucketCount(a, b) > 0; + }); + return result; } \ No newline at end of file diff --git a/src/types/topk.h b/src/types/topk.h index 34fbc684bb0..559d4c010e3 100644 --- a/src/types/topk.h +++ b/src/types/topk.h @@ -21,67 +21,66 @@ #pragma once #include - -#include +#include #include +#include #include -#include static constexpr int TOPK_DECAY_LOOKUP_TABLE = 256; using counter_t = uint32_t; struct HeapBucket { - uint32_t fp; - uint32_t itemlen; - char *item; - counter_t count; + uint32_t fp; + uint32_t itemlen; + char* item; + counter_t count; }; struct Bucket { - uint32_t fp; - counter_t count; + uint32_t fp; + counter_t count; }; class BlockSplitTopK { - public: - BlockSplitTopK() = delete; - explicit BlockSplitTopK(uint32_t k, uint32_t width, uint32_t depth, double decay) - : k(k), width(width), depth(depth), decay(decay), heap_size(0) { - buckets = new Bucket[width * depth]; - heap = new HeapBucket[k]; - std::fill_n(buckets, width * depth, Bucket{0, 0}); - std::fill_n(heap, k, HeapBucket{0, 0, nullptr, 0}); - for (int i = 0; i < TOPK_DECAY_LOOKUP_TABLE; ++i) { - lookupTable[i] = pow(decay, i); +public: + BlockSplitTopK() = delete; + explicit BlockSplitTopK(uint32_t k, uint32_t width, uint32_t depth, double decay) : + k(k), width(width), depth(depth), decay(decay), heap_size(0) { + buckets = new Bucket[width * depth]; + heap = new HeapBucket[k]; + std::fill_n(buckets, width * depth, Bucket{0, 0}); + std::fill_n(heap, k, HeapBucket{0, 0, nullptr, 0}); + for (int i = 0; i < TOPK_DECAY_LOOKUP_TABLE; ++i) { + lookupTable[i] = pow(decay, i); + } } - } - ~BlockSplitTopK() { - for (size_t i = 0; i < k; ++i) { - delete[] heap[i].item; + ~BlockSplitTopK() { + for (size_t i = 0; i < k; ++i) { + delete[] heap[i].item; + } + delete[] buckets; + delete[] heap; } - delete[] buckets; - delete[] heap; - } - void Add(const std::string &item, uint32_t increment); - bool Query(const std::string &item); - std::vector List(); + void Add(const std::string &item, uint32_t increment); + bool Query(const std::string &item); + std::vector List(); - void heapifyDown(int start); - void heapifyUp(int start); - int checkExistInHeap(const std::string &item); - int cmpHeapBucketCount(const HeapBucket &a, const HeapBucket &b); + void heapifyDown(int start); + void heapifyUp(int start); + int checkExistInHeap(const std::string &item); + int cmpHeapBucketCount(const HeapBucket &a, const HeapBucket &b); - uint32_t k; - uint32_t width; - uint32_t depth; - double decay; + uint32_t k; + uint32_t width; + uint32_t depth; + double decay; - size_t heap_size; + size_t heap_size; - Bucket *buckets; - HeapBucket *heap; - double lookupTable[TOPK_DECAY_LOOKUP_TABLE]; + Bucket *buckets; + HeapBucket *heap; + double lookupTable[TOPK_DECAY_LOOKUP_TABLE]; }; \ No newline at end of file diff --git a/tests/cppunit/types/topk_test.cc b/tests/cppunit/types/topk_test.cc index d22ccb9d5c7..21af3171480 100644 --- a/tests/cppunit/types/topk_test.cc +++ b/tests/cppunit/types/topk_test.cc @@ -32,7 +32,9 @@ static constexpr double decay = 0.9; class RedisTopKTest : public TestBase { protected: - explicit RedisTopKTest() : TestBase() { top_k_ = std::make_unique(storage_.get(), "topk_ns"); } + explicit RedisTopKTest() : TestBase() { + top_k_ = std::make_unique(storage_.get(), "topk_ns"); + } ~RedisTopKTest() override = default; void SetUp() override { @@ -110,7 +112,7 @@ TEST_F(RedisTopKTest, TestTopKAddAndQuery) { // heap is full, need remove values1. for (size_t i = 0; i < values2.size(); ++i) { bool found = false; - // due to decay, topk is possible to remove values1. + // due to decay, topk is possiable to remove values1. while (!found) { top_k_->Add(*ctx_, key_, values2[i]); top_k_->Query(*ctx_, key_, values2[i], &found); From 875b9749e5ec65eba1500e91c367339be68c1942 Mon Sep 17 00:00:00 2001 From: aibin <2573214643@qq.com> Date: Thu, 13 Nov 2025 15:19:10 +0800 Subject: [PATCH 06/18] feat: support topk.incrby command --- src/commands/cmd_topk.cc | 32 +++++++++++++++++++++++++++++++- src/types/redis_topk.cc | 8 ++++++-- src/types/redis_topk.h | 3 ++- 3 files changed, 39 insertions(+), 4 deletions(-) diff --git a/src/commands/cmd_topk.cc b/src/commands/cmd_topk.cc index 5c138c66d3c..fc0965e3abf 100644 --- a/src/commands/cmd_topk.cc +++ b/src/commands/cmd_topk.cc @@ -106,6 +106,35 @@ class CommandTopKAdd final : public Commander { } }; +class CommandTopKIncrBy final : public Commander { + public: + Status Parse(const std::vector &args) override { + if (args_.size() != 4) { + return {Status::InvalidArgument, "invalid argument"}; + } + auto parse_incr = ParseInt(args[3], 10); + if (!parse_incr) { + return {Status::InvalidArgument, "invalid argument"}; + } + incr_ = *parse_incr; + return Status::OK(); + } + + Status Execute(engine::Context &ctx, Server *srv, Connection *conn, std::string *output) override { + redis::TopK topk(srv->storage, conn->GetNamespace()); + CHECK(args_.size() == 4); + + auto s = topk.IncrBy(ctx, args_[1], args_[2], incr_); + if (!s.ok()) { + return {Status::RedisExecErr, s.ToString()}; + } + *output = redis::RESP_OK; + return Status::OK(); + } + private: + uint32_t incr_; +}; + class CommandTopKList final : public Commander { public: Status Execute(engine::Context &ctx, Server *srv, Connection *conn, std::string *output) override { @@ -209,6 +238,7 @@ REDIS_REGISTER_COMMANDS(TopK, MakeCmdAttr("topk.add", 3, "write" MakeCmdAttr("topk.list", 2, "read-only", 1, 1, 1), MakeCmdAttr("topk.info", 2, "read-only", 1, 1, 1), MakeCmdAttr("topk.query", 3, "read-only", 1, 1, 1), - MakeCmdAttr("topk.reserve", -3, "write", 1, 1, 1)); + MakeCmdAttr("topk.reserve", -3, "write", 1, 1, 1), + MakeCmdAttr("topk.incrby", 4, "write", 1, 1, 1)); } // namespace redis \ No newline at end of file diff --git a/src/types/redis_topk.cc b/src/types/redis_topk.cc index 9569641fffc..6c6682bd595 100644 --- a/src/types/redis_topk.cc +++ b/src/types/redis_topk.cc @@ -40,6 +40,10 @@ rocksdb::Status TopK::Reserve(engine::Context &ctx, const Slice& user_key, rocksdb::Status TopK::Add(engine::Context &ctx, const Slice &user_key, const Slice &items) { + return IncrBy(ctx, user_key, items, 1); +} + +rocksdb::Status TopK::IncrBy(engine::Context &ctx, const Slice &user_key, const Slice &items, uint32_t incr) { std::string ns_key = AppendNamespacePrefix(user_key); TopKMetadata topk_metadata; @@ -47,7 +51,7 @@ rocksdb::Status TopK::Add(engine::Context &ctx, const Slice &user_key, if (!s.ok()) return s; auto batch = storage_->GetWriteBatchBase(); - WriteBatchLogData log_data(kRedisTopK, {"Add"}); + WriteBatchLogData log_data(kRedisTopK, {"IncrBy"}); s = batch->PutLogData(log_data.Encode()); if (!s.ok()) return s; @@ -55,7 +59,7 @@ rocksdb::Status TopK::Add(engine::Context &ctx, const Slice &user_key, s = getTopKData(ctx, ns_key, topk_metadata, &topk); if (!s.ok()) return s; - topk.Add(items.data_, 1); + topk.Add(items.data_, incr); s = setTopkData(ctx, ns_key, topk_metadata, topk); if (!s.ok()) return s; diff --git a/src/types/redis_topk.h b/src/types/redis_topk.h index 2759a0436d6..a98420ad4b9 100644 --- a/src/types/redis_topk.h +++ b/src/types/redis_topk.h @@ -56,7 +56,8 @@ class TopK : public SubKeyScanner { const Slice &items); rocksdb::Status List(engine::Context &ctx, const Slice& user_key, std::vector &items); rocksdb::Status Info(engine::Context &ctx, const Slice& user_key, TopKInfo *info); - + rocksdb::Status IncrBy(engine::Context &ctx, const Slice &user_key, + const Slice &items, uint32_t incr); private: rocksdb::Status getTopKMetadata(engine::Context &ctx, const Slice &ns_key, TopKMetadata *metadata); rocksdb::Status createTopK(engine::Context &ctx, const Slice &ns_key, From 9a4c8858e00d7cd27c0dc910f7e6864b31973f0b Mon Sep 17 00:00:00 2001 From: aibin <2573214643@qq.com> Date: Thu, 13 Nov 2025 17:38:05 +0800 Subject: [PATCH 07/18] fix: clang-format code --- src/commands/cmd_topk.cc | 13 +- src/storage/redis_metadata.h | 15 +- src/types/redis_topk.cc | 305 ++++++++++--------- src/types/redis_topk.h | 41 +-- src/types/topk.cc | 488 +++++++++++++++---------------- src/types/topk.h | 81 ++--- tests/cppunit/types/topk_test.cc | 4 +- 7 files changed, 463 insertions(+), 484 deletions(-) diff --git a/src/commands/cmd_topk.cc b/src/commands/cmd_topk.cc index fc0965e3abf..24404870eba 100644 --- a/src/commands/cmd_topk.cc +++ b/src/commands/cmd_topk.cc @@ -18,8 +18,8 @@ * */ -#include "commander.h" #include "command_parser.h" +#include "commander.h" #include "error_constants.h" #include "server/server.h" #include "types/redis_topk.h" @@ -30,7 +30,7 @@ constexpr const char *errBadWidth = "Bad width"; constexpr const char *errBadDepth = "Bad depth"; constexpr const char *errBadDecay = "Bad decay"; constexpr const char *errInvalidDecay = "Decay must be between 0 and 1"; -} +} // namespace namespace redis { @@ -84,6 +84,7 @@ class CommandTopKReserve final : public Commander { *output = redis::RESP_OK; return Status::OK(); } + private: uint32_t k_; uint32_t width_ = 7; @@ -131,6 +132,7 @@ class CommandTopKIncrBy final : public Commander { *output = redis::RESP_OK; return Status::OK(); } + private: uint32_t incr_; }; @@ -179,7 +181,7 @@ class CommandTopKInfo final : public Commander { return Commander::Parse(args); } - Status Execute(engine::Context &ctx, Server *srv, Connection *conn, [[maybe_unused]]std::string *output) override { + Status Execute(engine::Context &ctx, Server *srv, Connection *conn, [[maybe_unused]] std::string *output) override { redis::TopK topk_db(srv->storage, conn->GetNamespace()); TopKInfo info; @@ -214,6 +216,7 @@ class CommandTopKInfo final : public Commander { } return Status::OK(); } + private: TopKInfoType type_ = TopKInfoType::kAll; }; @@ -228,7 +231,7 @@ class CommandTopKQuery final : public Commander { auto s = topk.Query(ctx, args_[1], args_[2], &is_exists_); if (!s.ok()) { return {Status::RedisExecErr, s.ToString()}; - } + } *output = redis::Bool(redis::RESP::v2, is_exists_); return Status::OK(); } @@ -241,4 +244,4 @@ REDIS_REGISTER_COMMANDS(TopK, MakeCmdAttr("topk.add", 3, "write" MakeCmdAttr("topk.reserve", -3, "write", 1, 1, 1), MakeCmdAttr("topk.incrby", 4, "write", 1, 1, 1)); -} // namespace redis \ No newline at end of file +} // namespace redis \ No newline at end of file diff --git a/src/storage/redis_metadata.h b/src/storage/redis_metadata.h index 0b1ff777f8a..516fde6ec05 100644 --- a/src/storage/redis_metadata.h +++ b/src/storage/redis_metadata.h @@ -59,9 +59,8 @@ enum RedisType : uint8_t { }; inline constexpr const std::array RedisTypeNames = { - "none", "string", "hash", "list", "set", "zset", "bitmap", - "sortedint", "stream", "MBbloom--", "ReJSON-RL", "hyperloglog", "TDIS-TYPE", "timeseries", - "topk"}; + "none", "string", "hash", "list", "set", "zset", "bitmap", "sortedint", + "stream", "MBbloom--", "ReJSON-RL", "hyperloglog", "TDIS-TYPE", "timeseries", "topk"}; struct RedisTypes { RedisTypes(std::initializer_list list) { @@ -413,7 +412,7 @@ class TimeSeriesMetadata : public Metadata { }; class TopKMetadata : public Metadata { -public: + public: uint32_t top_k; uint16_t width; uint32_t depth; @@ -421,12 +420,8 @@ class TopKMetadata : public Metadata { explicit TopKMetadata(bool generate_version = true) : Metadata(kRedisTopK, generate_version) {} - TopKMetadata(uint64_t top_k, uint64_t width = 7, uint64_t depth = 8, double decay = 0.9, bool generate_version = true) - : Metadata(kRedisTopK, generate_version), - top_k(top_k), - width(width), - depth(depth), - decay(decay) {} + TopKMetadata(uint64_t top_k, uint64_t width = 7, uint64_t depth = 8, double decay = 0.9, bool generate_version = true) + : Metadata(kRedisTopK, generate_version), top_k(top_k), width(width), depth(depth), decay(decay) {} void Encode(std::string *dst) const override; rocksdb::Status Decode(Slice *input) override; diff --git a/src/types/redis_topk.cc b/src/types/redis_topk.cc index 6c6682bd595..a547cc5bb82 100644 --- a/src/types/redis_topk.cc +++ b/src/types/redis_topk.cc @@ -19,213 +19,210 @@ */ #include "redis_topk.h" -#include "topk.h" + #include "commands/ttl_util.h" +#include "topk.h" namespace redis { - -rocksdb::Status TopK::Reserve(engine::Context &ctx, const Slice& user_key, - uint32_t k, uint32_t width, uint32_t depth, double decay) { - std::string ns_key = AppendNamespacePrefix(user_key); - - TopKMetadata topk_metadata; - rocksdb::Status s = getTopKMetadata(ctx, ns_key, &topk_metadata); - if (!s.ok() && !s.IsNotFound()) return s; - if (!s.IsNotFound()) { - return rocksdb::Status::InvalidArgument("TopK already exists"); - } - return createTopK(ctx, ns_key, k, width, depth, decay, &topk_metadata); +rocksdb::Status TopK::Reserve(engine::Context &ctx, const Slice &user_key, uint32_t k, uint32_t width, uint32_t depth, + double decay) { + std::string ns_key = AppendNamespacePrefix(user_key); + + TopKMetadata topk_metadata; + rocksdb::Status s = getTopKMetadata(ctx, ns_key, &topk_metadata); + if (!s.ok() && !s.IsNotFound()) return s; + if (!s.IsNotFound()) { + return rocksdb::Status::InvalidArgument("TopK already exists"); + } + + return createTopK(ctx, ns_key, k, width, depth, decay, &topk_metadata); } -rocksdb::Status TopK::Add(engine::Context &ctx, const Slice &user_key, - const Slice &items) { - return IncrBy(ctx, user_key, items, 1); +rocksdb::Status TopK::Add(engine::Context &ctx, const Slice &user_key, const Slice &items) { + return IncrBy(ctx, user_key, items, 1); } rocksdb::Status TopK::IncrBy(engine::Context &ctx, const Slice &user_key, const Slice &items, uint32_t incr) { - std::string ns_key = AppendNamespacePrefix(user_key); + std::string ns_key = AppendNamespacePrefix(user_key); - TopKMetadata topk_metadata; - rocksdb::Status s = getTopKMetadata(ctx, ns_key, &topk_metadata); - if (!s.ok()) return s; - - auto batch = storage_->GetWriteBatchBase(); - WriteBatchLogData log_data(kRedisTopK, {"IncrBy"}); - s = batch->PutLogData(log_data.Encode()); - if (!s.ok()) return s; + TopKMetadata topk_metadata; + rocksdb::Status s = getTopKMetadata(ctx, ns_key, &topk_metadata); + if (!s.ok()) return s; - BlockSplitTopK topk(topk_metadata.top_k, topk_metadata.width, topk_metadata.depth, topk_metadata.decay); - s = getTopKData(ctx, ns_key, topk_metadata, &topk); - if (!s.ok()) return s; + auto batch = storage_->GetWriteBatchBase(); + WriteBatchLogData log_data(kRedisTopK, {"IncrBy"}); + s = batch->PutLogData(log_data.Encode()); + if (!s.ok()) return s; - topk.Add(items.data_, incr); + BlockSplitTopK topk(topk_metadata.top_k, topk_metadata.width, topk_metadata.depth, topk_metadata.decay); + s = getTopKData(ctx, ns_key, topk_metadata, &topk); + if (!s.ok()) return s; - s = setTopkData(ctx, ns_key, topk_metadata, topk); - if (!s.ok()) return s; + topk.Add(items.data_, incr); - return storage_->Write(ctx, storage_->DefaultWriteOptions(), batch->GetWriteBatch()); + s = setTopkData(ctx, ns_key, topk_metadata, topk); + if (!s.ok()) return s; + + return storage_->Write(ctx, storage_->DefaultWriteOptions(), batch->GetWriteBatch()); } -rocksdb::Status TopK::Query(engine::Context &ctx, const Slice& user_key, - const Slice &items, bool *exists) { - std::string ns_key = AppendNamespacePrefix(user_key); +rocksdb::Status TopK::Query(engine::Context &ctx, const Slice &user_key, const Slice &items, bool *exists) { + std::string ns_key = AppendNamespacePrefix(user_key); - TopKMetadata topk_metadata; - rocksdb::Status s = getTopKMetadata(ctx, ns_key, &topk_metadata); - if (!s.ok()) return s; + TopKMetadata topk_metadata; + rocksdb::Status s = getTopKMetadata(ctx, ns_key, &topk_metadata); + if (!s.ok()) return s; - BlockSplitTopK topk(topk_metadata.top_k, topk_metadata.width, topk_metadata.depth, topk_metadata.decay); - s = getTopKData(ctx, ns_key, topk_metadata, &topk); - if (!s.ok()) return s; + BlockSplitTopK topk(topk_metadata.top_k, topk_metadata.width, topk_metadata.depth, topk_metadata.decay); + s = getTopKData(ctx, ns_key, topk_metadata, &topk); + if (!s.ok()) return s; - *exists = topk.Query(items.data_); + *exists = topk.Query(items.data_); - return rocksdb::Status::OK(); + return rocksdb::Status::OK(); } -rocksdb::Status TopK::List(engine::Context &ctx, const Slice& user_key, - std::vector &items) { - std::string ns_key = AppendNamespacePrefix(user_key); +rocksdb::Status TopK::List(engine::Context &ctx, const Slice &user_key, std::vector &items) { + std::string ns_key = AppendNamespacePrefix(user_key); - TopKMetadata topk_metadata; - rocksdb::Status s = getTopKMetadata(ctx, ns_key, &topk_metadata); - if (!s.ok()) return s; - - BlockSplitTopK topk(topk_metadata.top_k, topk_metadata.width, topk_metadata.depth, topk_metadata.decay); - s = getTopKData(ctx, ns_key, topk_metadata, &topk); - if (!s.ok()) return s; + TopKMetadata topk_metadata; + rocksdb::Status s = getTopKMetadata(ctx, ns_key, &topk_metadata); + if (!s.ok()) return s; - auto heap_buckets = topk.List(); - for (auto &bucket : heap_buckets) { - items.emplace_back(bucket.item, bucket.itemlen); - } + BlockSplitTopK topk(topk_metadata.top_k, topk_metadata.width, topk_metadata.depth, topk_metadata.decay); + s = getTopKData(ctx, ns_key, topk_metadata, &topk); + if (!s.ok()) return s; + + auto heap_buckets = topk.List(); + for (auto &bucket : heap_buckets) { + items.emplace_back(bucket.item, bucket.itemlen); + } - return rocksdb::Status::OK(); + return rocksdb::Status::OK(); } -rocksdb::Status TopK::Info(engine::Context &ctx, const Slice& user_key, TopKInfo *info) { - std::string ns_key = AppendNamespacePrefix(user_key); - - TopKMetadata metadata; - auto s = getTopKMetadata(ctx, ns_key, &metadata); - if (!s.ok()) return s; +rocksdb::Status TopK::Info(engine::Context &ctx, const Slice &user_key, TopKInfo *info) { + std::string ns_key = AppendNamespacePrefix(user_key); + + TopKMetadata metadata; + auto s = getTopKMetadata(ctx, ns_key, &metadata); + if (!s.ok()) return s; - info->k = metadata.top_k; - info->width = metadata.width; - info->depth = metadata.depth; - info->decay = metadata.decay; + info->k = metadata.top_k; + info->width = metadata.width; + info->depth = metadata.depth; + info->decay = metadata.decay; - return rocksdb::Status::OK(); + return rocksdb::Status::OK(); } rocksdb::Status TopK::getTopKMetadata(engine::Context &ctx, const Slice &ns_key, TopKMetadata *metadata) { - return Database::GetMetadata(ctx, {kRedisTopK}, ns_key, metadata); + return Database::GetMetadata(ctx, {kRedisTopK}, ns_key, metadata); } -rocksdb::Status TopK::createTopK(engine::Context &ctx, const Slice &ns_key, - uint32_t k, uint32_t width, uint32_t depth, double decay, - TopKMetadata *metadata) { - metadata->top_k = k; - metadata->width = width; - metadata->depth = depth; - metadata->decay = decay; - - BlockSplitTopK block_split_top_k(k, width, depth, decay); - - auto batch = storage_->GetWriteBatchBase(); - WriteBatchLogData log_data(kRedisTopK, {"createTopK"}); - auto s = batch->PutLogData(log_data.Encode()); - if (!s.ok()) return s; - - std::string top_k_meta_bytes; - metadata->Encode(&top_k_meta_bytes); - s = batch->Put(metadata_cf_handle_, ns_key, top_k_meta_bytes); - if (!s.ok()) return s; +rocksdb::Status TopK::createTopK(engine::Context &ctx, const Slice &ns_key, uint32_t k, uint32_t width, uint32_t depth, + double decay, TopKMetadata *metadata) { + metadata->top_k = k; + metadata->width = width; + metadata->depth = depth; + metadata->decay = decay; - s = setTopkData(ctx, ns_key, *metadata, block_split_top_k); - if (!s.ok()) return s; + BlockSplitTopK block_split_top_k(k, width, depth, decay); + + auto batch = storage_->GetWriteBatchBase(); + WriteBatchLogData log_data(kRedisTopK, {"createTopK"}); + auto s = batch->PutLogData(log_data.Encode()); + if (!s.ok()) return s; - return storage_->Write(ctx, storage_->DefaultWriteOptions(), batch->GetWriteBatch()); + std::string top_k_meta_bytes; + metadata->Encode(&top_k_meta_bytes); + s = batch->Put(metadata_cf_handle_, ns_key, top_k_meta_bytes); + if (!s.ok()) return s; + + s = setTopkData(ctx, ns_key, *metadata, block_split_top_k); + if (!s.ok()) return s; + + return storage_->Write(ctx, storage_->DefaultWriteOptions(), batch->GetWriteBatch()); } -rocksdb::Status TopK::getTopKData(engine::Context &ctx, const Slice& ns_key, const TopKMetadata &metadata, +rocksdb::Status TopK::getTopKData(engine::Context &ctx, const Slice &ns_key, const TopKMetadata &metadata, BlockSplitTopK *topk) { - for (uint8_t i = 0; i < 3; i++) { - std::string tk_key = getTKKey(ns_key, metadata, i); - rocksdb::PinnableSlice pinnable_value; - rocksdb::Status s = storage_->Get(ctx, ctx.GetReadOptions(), tk_key, &pinnable_value); + for (uint8_t i = 0; i < 3; i++) { + std::string tk_key = getTKKey(ns_key, metadata, i); + rocksdb::PinnableSlice pinnable_value; + rocksdb::Status s = storage_->Get(ctx, ctx.GetReadOptions(), tk_key, &pinnable_value); + if (!s.ok()) return s; + if (i == 0) { + if (pinnable_value.size() != metadata.width * metadata.depth * sizeof(Bucket)) { + return rocksdb::Status::Corruption("TopK data corrupted: buckets size mismatch"); + } + memcpy(topk->buckets, pinnable_value.data(), pinnable_value.size()); + } else if (i == 1) { + if (pinnable_value.size() != metadata.top_k * sizeof(HeapBucket)) { + return rocksdb::Status::Corruption("TopK data corrupted: heap size mismatch"); + } + memcpy(topk->heap, pinnable_value.data(), pinnable_value.size()); + for (uint32_t j = 0; j < metadata.top_k; j++) { + std::string hb_key = getHBKey(ns_key, metadata, i, j); + rocksdb::PinnableSlice hb_value; + rocksdb::Status s = storage_->Get(ctx, ctx.GetReadOptions(), hb_key, &hb_value); if (!s.ok()) return s; - if (i == 0) { - if (pinnable_value.size() != metadata.width * metadata.depth * sizeof(Bucket)) { - return rocksdb::Status::Corruption("TopK data corrupted: buckets size mismatch"); - } - memcpy(topk->buckets, pinnable_value.data(), pinnable_value.size()); - } else if (i == 1) { - if (pinnable_value.size() != metadata.top_k * sizeof(HeapBucket)) { - return rocksdb::Status::Corruption("TopK data corrupted: heap size mismatch"); - } - memcpy(topk->heap, pinnable_value.data(), pinnable_value.size()); - for (uint32_t j = 0; j < metadata.top_k; j++) { - std::string hb_key = getHBKey(ns_key, metadata, i, j); - rocksdb::PinnableSlice hb_value; - rocksdb::Status s = storage_->Get(ctx, ctx.GetReadOptions(), hb_key, &hb_value); - if (!s.ok()) return s; - if (hb_value.size() != topk->heap[j].itemlen) { - return rocksdb::Status::Corruption("TopK data corrupted: heap bucket size mismatch"); - } - topk->heap[j].item = new char[topk->heap[j].itemlen]; - memcpy(topk->heap[j].item, hb_value.data(), hb_value.size()); - } - } else { - topk->heap_size = static_cast(std::stoul(pinnable_value.data())); + if (hb_value.size() != topk->heap[j].itemlen) { + return rocksdb::Status::Corruption("TopK data corrupted: heap bucket size mismatch"); } + topk->heap[j].item = new char[topk->heap[j].itemlen]; + memcpy(topk->heap[j].item, hb_value.data(), hb_value.size()); + } + } else { + topk->heap_size = static_cast(std::stoul(pinnable_value.data())); } - return rocksdb::Status::OK(); + } + return rocksdb::Status::OK(); } -rocksdb::Status TopK::setTopkData(engine::Context &ctx, const Slice& ns_key, const TopKMetadata &metadata, +rocksdb::Status TopK::setTopkData(engine::Context &ctx, const Slice &ns_key, const TopKMetadata &metadata, const BlockSplitTopK &topk) { - auto batch = storage_->GetWriteBatchBase(); - WriteBatchLogData log_data(kRedisTopK, {"setTopkData"}); - rocksdb::Status s = batch->PutLogData(log_data.Encode()); - if (!s.ok()) return s; - - for (uint8_t i = 0; i < 3; i++) { - std::string tk_key = getTKKey(ns_key, metadata, i); - std::string tk_value; - if (i == 0) { - tk_value.assign(reinterpret_cast(topk.buckets), metadata.width * metadata.depth * sizeof(Bucket)); - } else if (i == 1) { - tk_value.assign(reinterpret_cast(topk.heap), metadata.top_k * sizeof(HeapBucket)); - for (uint32_t j = 0; j < metadata.top_k; j++) { - std::string hb_key = getHBKey(ns_key, metadata, i, j); - std::string hb_value(topk.heap[j].item, topk.heap[j].itemlen); - s = batch->Put(hb_key, hb_value); - if (!s.ok()) return s; - } - } else { - tk_value = std::to_string(topk.heap_size); - } - rocksdb::Status s = batch->Put(tk_key, tk_value); + auto batch = storage_->GetWriteBatchBase(); + WriteBatchLogData log_data(kRedisTopK, {"setTopkData"}); + rocksdb::Status s = batch->PutLogData(log_data.Encode()); + if (!s.ok()) return s; + + for (uint8_t i = 0; i < 3; i++) { + std::string tk_key = getTKKey(ns_key, metadata, i); + std::string tk_value; + if (i == 0) { + tk_value.assign(reinterpret_cast(topk.buckets), metadata.width * metadata.depth * sizeof(Bucket)); + } else if (i == 1) { + tk_value.assign(reinterpret_cast(topk.heap), metadata.top_k * sizeof(HeapBucket)); + for (uint32_t j = 0; j < metadata.top_k; j++) { + std::string hb_key = getHBKey(ns_key, metadata, i, j); + std::string hb_value(topk.heap[j].item, topk.heap[j].itemlen); + s = batch->Put(hb_key, hb_value); if (!s.ok()) return s; + } + } else { + tk_value = std::to_string(topk.heap_size); } + rocksdb::Status s = batch->Put(tk_key, tk_value); + if (!s.ok()) return s; + } - return storage_->Write(ctx, storage_->DefaultWriteOptions(), batch->GetWriteBatch()); + return storage_->Write(ctx, storage_->DefaultWriteOptions(), batch->GetWriteBatch()); } std::string TopK::getTKKey(const Slice &ns_key, const TopKMetadata &metadata, uint8_t index) { - std::string sub_key; - PutFixed8(&sub_key, index); - std::string bf_key = InternalKey(ns_key, sub_key, metadata.version, storage_->IsSlotIdEncoded()).Encode(); - return bf_key; + std::string sub_key; + PutFixed8(&sub_key, index); + std::string bf_key = InternalKey(ns_key, sub_key, metadata.version, storage_->IsSlotIdEncoded()).Encode(); + return bf_key; } std::string TopK::getHBKey(const Slice &ns_key, const TopKMetadata &metadata, uint8_t topk_index, uint32_t hp_index) { - std::string sub_key; - PutFixed8(&sub_key, topk_index); - PutFixed32(&sub_key, hp_index); - return InternalKey(ns_key, sub_key, metadata.version, storage_->IsSlotIdEncoded()).Encode(); + std::string sub_key; + PutFixed8(&sub_key, topk_index); + PutFixed32(&sub_key, hp_index); + return InternalKey(ns_key, sub_key, metadata.version, storage_->IsSlotIdEncoded()).Encode(); } -} // namespace redis \ No newline at end of file +} // namespace redis \ No newline at end of file diff --git a/src/types/redis_topk.h b/src/types/redis_topk.h index a98420ad4b9..48f0e5a4759 100644 --- a/src/types/redis_topk.h +++ b/src/types/redis_topk.h @@ -25,13 +25,7 @@ namespace redis { -enum class TopKInfoType { - kAll, - kTopK, - kWidth, - kDepth, - kDecay -}; +enum class TopKInfoType { kAll, kTopK, kWidth, kDepth, kDecay }; struct TopKInfo { uint32_t k; @@ -43,30 +37,25 @@ struct TopKInfo { class TopK : public SubKeyScanner { public: using Slice = rocksdb::Slice; - - explicit TopK(engine::Storage* storage, const std::string& ns) - : SubKeyScanner(storage, ns) {} - rocksdb::Status Reserve(engine::Context &ctx, const Slice& user_key, uint32_t k, - uint32_t width, uint32_t depth, double decay); - rocksdb::Status Query(engine::Context &ctx, const Slice& user_key, - const Slice &items, - bool *exists); - rocksdb::Status Add(engine::Context &ctx, const Slice &user_key, - const Slice &items); - rocksdb::Status List(engine::Context &ctx, const Slice& user_key, std::vector &items); - rocksdb::Status Info(engine::Context &ctx, const Slice& user_key, TopKInfo *info); - rocksdb::Status IncrBy(engine::Context &ctx, const Slice &user_key, - const Slice &items, uint32_t incr); + explicit TopK(engine::Storage *storage, const std::string &ns) : SubKeyScanner(storage, ns) {} + + rocksdb::Status Reserve(engine::Context &ctx, const Slice &user_key, uint32_t k, uint32_t width, uint32_t depth, + double decay); + rocksdb::Status Query(engine::Context &ctx, const Slice &user_key, const Slice &items, bool *exists); + rocksdb::Status Add(engine::Context &ctx, const Slice &user_key, const Slice &items); + rocksdb::Status List(engine::Context &ctx, const Slice &user_key, std::vector &items); + rocksdb::Status Info(engine::Context &ctx, const Slice &user_key, TopKInfo *info); + rocksdb::Status IncrBy(engine::Context &ctx, const Slice &user_key, const Slice &items, uint32_t incr); + private: rocksdb::Status getTopKMetadata(engine::Context &ctx, const Slice &ns_key, TopKMetadata *metadata); - rocksdb::Status createTopK(engine::Context &ctx, const Slice &ns_key, - uint32_t k, uint32_t width, uint32_t depth, double decay, - TopKMetadata *metadata); + rocksdb::Status createTopK(engine::Context &ctx, const Slice &ns_key, uint32_t k, uint32_t width, uint32_t depth, + double decay, TopKMetadata *metadata); - rocksdb::Status getTopKData(engine::Context &ctx, const Slice& ns_key, const TopKMetadata &metadata, + rocksdb::Status getTopKData(engine::Context &ctx, const Slice &ns_key, const TopKMetadata &metadata, BlockSplitTopK *topk); - rocksdb::Status setTopkData(engine::Context &ctx, const Slice& ns_key, const TopKMetadata &metadata, + rocksdb::Status setTopkData(engine::Context &ctx, const Slice &ns_key, const TopKMetadata &metadata, const BlockSplitTopK &topk); std::string getTKKey(const Slice &ns_key, const TopKMetadata &metadata, uint8_t index); diff --git a/src/types/topk.cc b/src/types/topk.cc index ff34822cbcf..19b0981fcbd 100644 --- a/src/types/topk.cc +++ b/src/types/topk.cc @@ -20,10 +20,10 @@ #include "topk.h" -#include +#include #include +#include #include -#include #include #define TOPK_HASH(item, itemlen, i) MurmurHash2(item, itemlen, i) @@ -48,54 +48,54 @@ //----------------------------------------------------------------------------- static uint32_t MurmurHash2(const void *key, int len, uint32_t seed) { - // 'm' and 'r' are mixing constants generated offline. - // They're not really 'magic', they just happen to work well. + // 'm' and 'r' are mixing constants generated offline. + // They're not really 'magic', they just happen to work well. - const uint32_t m = 0x5bd1e995; - const int r = 24; + const uint32_t m = 0x5bd1e995; + const int r = 24; - // Initialize the hash to a 'random' value + // Initialize the hash to a 'random' value - uint32_t h = seed ^ len; + uint32_t h = seed ^ len; - // Mix 4 bytes at a time into the hash + // Mix 4 bytes at a time into the hash - const unsigned char *data = (const unsigned char *)key; + const unsigned char *data = (const unsigned char *)key; - while (len >= 4) { - uint32_t k = *(uint32_t *)data; + while (len >= 4) { + uint32_t k = *(uint32_t *)data; - k *= m; - k ^= k >> r; - k *= m; + k *= m; + k ^= k >> r; + k *= m; - h *= m; - h ^= k; + h *= m; + h ^= k; - data += 4; - len -= 4; - } + data += 4; + len -= 4; + } - // Handle the last few bytes of the input heap + // Handle the last few bytes of the input heap - switch (len) { + switch (len) { case 3: - h ^= data[2] << 16; + h ^= data[2] << 16; case 2: - h ^= data[1] << 8; + h ^= data[1] << 8; case 1: - h ^= data[0]; - h *= m; - }; + h ^= data[0]; + h *= m; + }; - // Do a few final mixes of the hash to ensure the last few - // bytes are well-incorporated. + // Do a few final mixes of the hash to ensure the last few + // bytes are well-incorporated. - h ^= h >> 13; - h *= m; - h ^= h >> 15; + h ^= h >> 13; + h *= m; + h ^= h >> 15; - return h; + return h; } //----------------------------------------------------------------------------- @@ -106,280 +106,276 @@ static uint32_t MurmurHash2(const void *key, int len, uint32_t seed) { // 64-bit hash for 64-bit platforms -[[maybe_unused]]static uint64_t MurmurHash64A_Bloom(const void *key, int len, uint64_t seed) { - const uint64_t m = BIG_CONSTANT(0xc6a4a7935bd1e995); - const int r = 47; +[[maybe_unused]] static uint64_t MurmurHash64A_Bloom(const void *key, int len, uint64_t seed) { + const uint64_t m = BIG_CONSTANT(0xc6a4a7935bd1e995); + const int r = 47; - uint64_t h = seed ^ (len * m); + uint64_t h = seed ^ (len * m); - const uint64_t *data = (const uint64_t *)key; - const uint64_t *end = data + (len / 8); + const uint64_t *data = (const uint64_t *)key; + const uint64_t *end = data + (len / 8); - while (data != end) { - uint64_t k = *data++; + while (data != end) { + uint64_t k = *data++; - k *= m; - k ^= k >> r; - k *= m; + k *= m; + k ^= k >> r; + k *= m; - h ^= k; - h *= m; - } + h ^= k; + h *= m; + } - const unsigned char *data2 = (const unsigned char *)data; + const unsigned char *data2 = (const unsigned char *)data; - switch (len & 7) { + switch (len & 7) { case 7: - h ^= ((uint64_t)data2[6]) << 48; + h ^= ((uint64_t)data2[6]) << 48; case 6: - h ^= ((uint64_t)data2[5]) << 40; + h ^= ((uint64_t)data2[5]) << 40; case 5: - h ^= ((uint64_t)data2[4]) << 32; + h ^= ((uint64_t)data2[4]) << 32; case 4: - h ^= ((uint64_t)data2[3]) << 24; + h ^= ((uint64_t)data2[3]) << 24; case 3: - h ^= ((uint64_t)data2[2]) << 16; + h ^= ((uint64_t)data2[2]) << 16; case 2: - h ^= ((uint64_t)data2[1]) << 8; + h ^= ((uint64_t)data2[1]) << 8; case 1: - h ^= ((uint64_t)data2[0]); - h *= m; - }; + h ^= ((uint64_t)data2[0]); + h *= m; + }; - h ^= h >> r; - h *= m; - h ^= h >> r; + h ^= h >> r; + h *= m; + h ^= h >> r; - return h; + return h; } // 64-bit hash for 32-bit platforms -[[maybe_unused]]static uint64_t MurmurHash64B(const void *key, int len, uint64_t seed) { - const uint32_t m = 0x5bd1e995; - const int r = 24; - - uint32_t h1 = (uint32_t)(seed ^ len); - uint32_t h2 = (uint32_t)(seed >> 32); - - const uint32_t *data = (const uint32_t *)key; - - while (len >= 8) { - uint32_t k1 = *data++; - k1 *= m; - k1 ^= k1 >> r; - k1 *= m; - h1 *= m; - h1 ^= k1; - len -= 4; - - uint32_t k2 = *data++; - k2 *= m; - k2 ^= k2 >> r; - k2 *= m; - h2 *= m; - h2 ^= k2; - len -= 4; - } +[[maybe_unused]] static uint64_t MurmurHash64B(const void *key, int len, uint64_t seed) { + const uint32_t m = 0x5bd1e995; + const int r = 24; - if (len >= 4) { - uint32_t k1 = *data++; - k1 *= m; - k1 ^= k1 >> r; - k1 *= m; - h1 *= m; - h1 ^= k1; - len -= 4; - } + uint32_t h1 = (uint32_t)(seed ^ len); + uint32_t h2 = (uint32_t)(seed >> 32); - switch (len) { - case 3: - h2 ^= ((unsigned char *)data)[2] << 16; - case 2: - h2 ^= ((unsigned char *)data)[1] << 8; - case 1: - h2 ^= ((unsigned char *)data)[0]; - h2 *= m; - }; + const uint32_t *data = (const uint32_t *)key; - h1 ^= h2 >> 18; + while (len >= 8) { + uint32_t k1 = *data++; + k1 *= m; + k1 ^= k1 >> r; + k1 *= m; h1 *= m; - h2 ^= h1 >> 22; + h1 ^= k1; + len -= 4; + + uint32_t k2 = *data++; + k2 *= m; + k2 ^= k2 >> r; + k2 *= m; h2 *= m; - h1 ^= h2 >> 17; + h2 ^= k2; + len -= 4; + } + + if (len >= 4) { + uint32_t k1 = *data++; + k1 *= m; + k1 ^= k1 >> r; + k1 *= m; h1 *= m; - h2 ^= h1 >> 19; - h2 *= m; + h1 ^= k1; + len -= 4; + } + + switch (len) { + case 3: + h2 ^= ((unsigned char *)data)[2] << 16; + case 2: + h2 ^= ((unsigned char *)data)[1] << 8; + case 1: + h2 ^= ((unsigned char *)data)[0]; + h2 *= m; + }; - uint64_t h = h1; + h1 ^= h2 >> 18; + h1 *= m; + h2 ^= h1 >> 22; + h2 *= m; + h1 ^= h2 >> 17; + h1 *= m; + h2 ^= h1 >> 19; + h2 *= m; - h = (h << 32) | h2; + uint64_t h = h1; - return h; + h = (h << 32) | h2; + + return h; } /* ---------------------------------------------------------------------- */ void BlockSplitTopK::heapifyDown(int start) { - size_t child = start; - - // check whether larger than children - if (heap_size < 2 || (heap_size - 2) / 2 < child) { - return; + size_t child = start; + + // check whether larger than children + if (heap_size < 2 || (heap_size - 2) / 2 < child) { + return; + } + + child = 2 * child + 1; + if ((child + 1) < heap_size && (heap[child].count > heap[child + 1].count)) { + ++child; + } + if (heap[child].count > heap[start].count) { + return; + } + + HeapBucket top; + memcpy(&top, &heap[start], sizeof(HeapBucket)); + do { + memcpy(&heap[start], &heap[child], sizeof(HeapBucket)); + start = child; + + if ((heap_size - 2) / 2 < child) { + break; } - child = 2 * child + 1; + if ((child + 1) < heap_size && (heap[child].count > heap[child + 1].count)) { - ++child; + ++child; } - if (heap[child].count > heap[start].count) { - return; - } - - HeapBucket top; - memcpy(&top, &heap[start], sizeof(HeapBucket)); - do { - memcpy(&heap[start], &heap[child], sizeof(HeapBucket)); - start = child; - - if ((heap_size - 2) / 2 < child) { - break; - } - child = 2 * child + 1; - - if ((child + 1) < heap_size && (heap[child].count > heap[child + 1].count)) { - ++child; - } - } while (heap[child].count < top.count); - memcpy(&heap[start], &top, sizeof(HeapBucket)); + } while (heap[child].count < top.count); + memcpy(&heap[start], &top, sizeof(HeapBucket)); } void BlockSplitTopK::heapifyUp(int start) { - size_t parent = start; - - // check whether smaller than parent - if (heap_size < 2 || parent == 0) { - return; + size_t parent = start; + + // check whether smaller than parent + if (heap_size < 2 || parent == 0) { + return; + } + + parent = (parent - 1) / 2; + if (heap[parent].count > heap[start].count) { + return; + } + + HeapBucket bottom; + memcpy(&bottom, &heap[start], sizeof(HeapBucket)); + do { + memcpy(&heap[start], &heap[parent], sizeof(HeapBucket)); + start = parent; + + if (start == 0) { + break; } - parent = (parent - 1) / 2; - if (heap[parent].count > heap[start].count) { - return; - } - - HeapBucket bottom; - memcpy(&bottom, &heap[start], sizeof(HeapBucket)); - do { - memcpy(&heap[start], &heap[parent], sizeof(HeapBucket)); - start = parent; - - if (start == 0) { - break; - } - parent = (parent - 1) / 2; - } while (heap[parent].count > bottom.count); - memcpy(&heap[start], &bottom, sizeof(HeapBucket)); + } while (heap[parent].count > bottom.count); + memcpy(&heap[start], &bottom, sizeof(HeapBucket)); } int BlockSplitTopK::checkExistInHeap(const std::string &item) { - uint32_t itemlen = item.size(); - const char *data = item.c_str(); - for (int32_t i = heap_size - 1; i >= 0; --i) { - if (heap[i].itemlen == itemlen && memcmp(heap[i].item, data, itemlen) == 0) { - return i; - } + uint32_t itemlen = item.size(); + const char *data = item.c_str(); + for (int32_t i = heap_size - 1; i >= 0; --i) { + if (heap[i].itemlen == itemlen && memcmp(heap[i].item, data, itemlen) == 0) { + return i; } - return -1; + } + return -1; } int BlockSplitTopK::cmpHeapBucketCount(const HeapBucket &a, const HeapBucket &b) { - return a.count < b.count ? 1 : a.count > b.count ? -1 : 0; + return a.count < b.count ? 1 : a.count > b.count ? -1 : 0; } void BlockSplitTopK::Add(const std::string &item, uint32_t increment) { - uint32_t itemlen = item.size(); - const char *data = item.c_str(); - counter_t maxCount = 0; - uint32_t fp = TOPK_HASH(data, itemlen, GA); - - int location = checkExistInHeap(item); - - for (size_t i = 0; i < depth; ++i) { - uint32_t loc = TOPK_HASH(data, itemlen, i) % width; - - loc += i * width; - if (buckets[loc].count == 0) { + uint32_t itemlen = item.size(); + const char *data = item.c_str(); + counter_t maxCount = 0; + uint32_t fp = TOPK_HASH(data, itemlen, GA); + + int location = checkExistInHeap(item); + + for (size_t i = 0; i < depth; ++i) { + uint32_t loc = TOPK_HASH(data, itemlen, i) % width; + + loc += i * width; + if (buckets[loc].count == 0) { + buckets[loc].fp = fp; + buckets[loc].count = increment; + maxCount = std::max(maxCount, buckets[loc].count); + } else if (buckets[loc].fp == fp && location != -1) { + buckets[loc].count += increment; + maxCount = std::max(maxCount, buckets[loc].count); + } else { + // decay + uint32_t local_incr = increment; + for (; local_incr > 0; --local_incr) { + double decay; + if (buckets[loc].count < TOPK_DECAY_LOOKUP_TABLE) { + decay = lookupTable[buckets[loc].count]; + } else { + decay = pow(lookupTable[TOPK_DECAY_LOOKUP_TABLE - 1], (buckets[loc].count / (TOPK_DECAY_LOOKUP_TABLE - 1))) * + lookupTable[buckets[loc].count % (TOPK_DECAY_LOOKUP_TABLE - 1)]; + } + double chance = rand() / (double)RAND_MAX; + if (chance < decay) { + --buckets[loc].count; + if (buckets[loc].count == 0) { buckets[loc].fp = fp; - buckets[loc].count = increment; + buckets[loc].count = 1; maxCount = std::max(maxCount, buckets[loc].count); - } else if (buckets[loc].fp == fp && location != -1) { - buckets[loc].count += increment; - maxCount = std::max(maxCount, buckets[loc].count); - } else { - // decay - uint32_t local_incr = increment; - for (; local_incr > 0; --local_incr) { - double decay; - if (buckets[loc].count < TOPK_DECAY_LOOKUP_TABLE) { - decay = lookupTable[buckets[loc].count]; - } else { - decay = pow(lookupTable[TOPK_DECAY_LOOKUP_TABLE - 1], - (buckets[loc].count / (TOPK_DECAY_LOOKUP_TABLE - 1))) * - lookupTable[buckets[loc].count % (TOPK_DECAY_LOOKUP_TABLE - 1)]; - } - double chance = rand() / (double)RAND_MAX; - if (chance < decay) { - -- buckets[loc].count; - if (buckets[loc].count == 0) { - buckets[loc].fp = fp; - buckets[loc].count = 1; - maxCount = std::max(maxCount, buckets[loc].count); - break; - } - } - } + break; + } } + } } + } - if (k == heap_size) { - if (location == -1) { - if (heap[0].count == maxCount || heap[0].count + 1 == maxCount) { - heap[0].fp = fp; - heap[0].itemlen = itemlen; - delete heap[0].item; - heap[0].item = new char[itemlen]; - memcpy(heap[0].item, data, itemlen); - - heap[0].count = maxCount; - - heapifyDown(0); - } - } else { - heap[location].count += increment; - heapifyDown(location); - } + if (k == heap_size) { + if (location == -1) { + if (heap[0].count == maxCount || heap[0].count + 1 == maxCount) { + heap[0].fp = fp; + heap[0].itemlen = itemlen; + delete heap[0].item; + heap[0].item = new char[itemlen]; + memcpy(heap[0].item, data, itemlen); + + heap[0].count = maxCount; + + heapifyDown(0); + } } else { - heap[heap_size].fp = fp; - heap[heap_size].itemlen = itemlen; - heap[heap_size].item = new char[itemlen]; - memcpy(heap[heap_size].item, data, itemlen); - heap[heap_size].count = maxCount; - - heapifyUp(heap_size); - heap_size ++; + heap[location].count += increment; + heapifyDown(location); } + } else { + heap[heap_size].fp = fp; + heap[heap_size].itemlen = itemlen; + heap[heap_size].item = new char[itemlen]; + memcpy(heap[heap_size].item, data, itemlen); + heap[heap_size].count = maxCount; + + heapifyUp(heap_size); + heap_size++; + } } -bool BlockSplitTopK::Query(const std::string &item) { - return checkExistInHeap(item) != -1; -} +bool BlockSplitTopK::Query(const std::string &item) { return checkExistInHeap(item) != -1; } std::vector BlockSplitTopK::List() { - std::vector result(heap_size); - for (uint32_t i = 0; i < heap_size; i ++) { - result[i] = heap[i]; - } - std::sort(result.begin(), result.end(), [this] (const HeapBucket &a, const HeapBucket &b) { - return cmpHeapBucketCount(a, b) > 0; - }); - return result; + std::vector result(heap_size); + for (uint32_t i = 0; i < heap_size; i++) { + result[i] = heap[i]; + } + std::sort(result.begin(), result.end(), + [this](const HeapBucket &a, const HeapBucket &b) { return cmpHeapBucketCount(a, b) > 0; }); + return result; } \ No newline at end of file diff --git a/src/types/topk.h b/src/types/topk.h index 559d4c010e3..34fbc684bb0 100644 --- a/src/types/topk.h +++ b/src/types/topk.h @@ -21,66 +21,67 @@ #pragma once #include -#include -#include + #include +#include #include +#include static constexpr int TOPK_DECAY_LOOKUP_TABLE = 256; using counter_t = uint32_t; struct HeapBucket { - uint32_t fp; - uint32_t itemlen; - char* item; - counter_t count; + uint32_t fp; + uint32_t itemlen; + char *item; + counter_t count; }; struct Bucket { - uint32_t fp; - counter_t count; + uint32_t fp; + counter_t count; }; class BlockSplitTopK { -public: - BlockSplitTopK() = delete; - explicit BlockSplitTopK(uint32_t k, uint32_t width, uint32_t depth, double decay) : - k(k), width(width), depth(depth), decay(decay), heap_size(0) { - buckets = new Bucket[width * depth]; - heap = new HeapBucket[k]; - std::fill_n(buckets, width * depth, Bucket{0, 0}); - std::fill_n(heap, k, HeapBucket{0, 0, nullptr, 0}); - for (int i = 0; i < TOPK_DECAY_LOOKUP_TABLE; ++i) { - lookupTable[i] = pow(decay, i); - } + public: + BlockSplitTopK() = delete; + explicit BlockSplitTopK(uint32_t k, uint32_t width, uint32_t depth, double decay) + : k(k), width(width), depth(depth), decay(decay), heap_size(0) { + buckets = new Bucket[width * depth]; + heap = new HeapBucket[k]; + std::fill_n(buckets, width * depth, Bucket{0, 0}); + std::fill_n(heap, k, HeapBucket{0, 0, nullptr, 0}); + for (int i = 0; i < TOPK_DECAY_LOOKUP_TABLE; ++i) { + lookupTable[i] = pow(decay, i); } + } - ~BlockSplitTopK() { - for (size_t i = 0; i < k; ++i) { - delete[] heap[i].item; - } - delete[] buckets; - delete[] heap; + ~BlockSplitTopK() { + for (size_t i = 0; i < k; ++i) { + delete[] heap[i].item; } + delete[] buckets; + delete[] heap; + } - void Add(const std::string &item, uint32_t increment); - bool Query(const std::string &item); - std::vector List(); + void Add(const std::string &item, uint32_t increment); + bool Query(const std::string &item); + std::vector List(); - void heapifyDown(int start); - void heapifyUp(int start); - int checkExistInHeap(const std::string &item); - int cmpHeapBucketCount(const HeapBucket &a, const HeapBucket &b); + void heapifyDown(int start); + void heapifyUp(int start); + int checkExistInHeap(const std::string &item); + int cmpHeapBucketCount(const HeapBucket &a, const HeapBucket &b); - uint32_t k; - uint32_t width; - uint32_t depth; - double decay; + uint32_t k; + uint32_t width; + uint32_t depth; + double decay; - size_t heap_size; + size_t heap_size; - Bucket *buckets; - HeapBucket *heap; - double lookupTable[TOPK_DECAY_LOOKUP_TABLE]; + Bucket *buckets; + HeapBucket *heap; + double lookupTable[TOPK_DECAY_LOOKUP_TABLE]; }; \ No newline at end of file diff --git a/tests/cppunit/types/topk_test.cc b/tests/cppunit/types/topk_test.cc index 21af3171480..f24ced1d601 100644 --- a/tests/cppunit/types/topk_test.cc +++ b/tests/cppunit/types/topk_test.cc @@ -32,9 +32,7 @@ static constexpr double decay = 0.9; class RedisTopKTest : public TestBase { protected: - explicit RedisTopKTest() : TestBase() { - top_k_ = std::make_unique(storage_.get(), "topk_ns"); - } + explicit RedisTopKTest() : TestBase() { top_k_ = std::make_unique(storage_.get(), "topk_ns"); } ~RedisTopKTest() override = default; void SetUp() override { From b585cf182672e730f70368464fb6f893843b9699 Mon Sep 17 00:00:00 2001 From: aibin <2573214643@qq.com> Date: Mon, 17 Nov 2025 11:10:06 +0800 Subject: [PATCH 08/18] fix: issue for conflict --- src/commands/commander.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/commands/commander.h b/src/commands/commander.h index bf24bbac67d..55607f987f3 100644 --- a/src/commands/commander.h +++ b/src/commands/commander.h @@ -117,6 +117,9 @@ enum class CommandCategory : uint8_t { ZSet, Timeseries, TopK, + // this is a special category for disabling commands, + // basically can be used for version releasing or debugging + Disabled, }; class Commander { From 132d4ce7e5572622fbe216934de8d05cea7b5b48 Mon Sep 17 00:00:00 2001 From: Aleks Lozovyuk Date: Thu, 20 Nov 2025 13:41:09 +0300 Subject: [PATCH 09/18] Fix typo in comment about topk removal --- tests/cppunit/types/topk_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/cppunit/types/topk_test.cc b/tests/cppunit/types/topk_test.cc index f24ced1d601..51470427ea9 100644 --- a/tests/cppunit/types/topk_test.cc +++ b/tests/cppunit/types/topk_test.cc @@ -110,7 +110,7 @@ TEST_F(RedisTopKTest, TestTopKAddAndQuery) { // heap is full, need remove values1. for (size_t i = 0; i < values2.size(); ++i) { bool found = false; - // due to decay, topk is possiable to remove values1. + // due to decay, topk is possible to remove values1. while (!found) { top_k_->Add(*ctx_, key_, values2[i]); top_k_->Query(*ctx_, key_, values2[i], &found); @@ -131,4 +131,4 @@ TEST_F(RedisTopKTest, TestTopKAddAndQuery) { for (size_t i = 0; i < top_k_list.size(); ++i) { ASSERT_TRUE(values_set2.find(top_k_list[i]) != values_set2.end()); } -} \ No newline at end of file +} From c318f2104cba816efb4e312d9393d357b06278f8 Mon Sep 17 00:00:00 2001 From: Aleks Lozovyuk Date: Thu, 20 Nov 2025 14:23:35 +0300 Subject: [PATCH 10/18] Make TopKMetadata constructor explicit --- src/storage/redis_metadata.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/storage/redis_metadata.h b/src/storage/redis_metadata.h index 516fde6ec05..a14504d1259 100644 --- a/src/storage/redis_metadata.h +++ b/src/storage/redis_metadata.h @@ -420,9 +420,9 @@ class TopKMetadata : public Metadata { explicit TopKMetadata(bool generate_version = true) : Metadata(kRedisTopK, generate_version) {} - TopKMetadata(uint64_t top_k, uint64_t width = 7, uint64_t depth = 8, double decay = 0.9, bool generate_version = true) + explicit TopKMetadata(uint64_t top_k, uint64_t width = 7, uint64_t depth = 8, double decay = 0.9, bool generate_version = true) : Metadata(kRedisTopK, generate_version), top_k(top_k), width(width), depth(depth), decay(decay) {} void Encode(std::string *dst) const override; rocksdb::Status Decode(Slice *input) override; -}; \ No newline at end of file +}; From 6ec5984ac4ce881c4f0774e2eec743f24e204f06 Mon Sep 17 00:00:00 2001 From: aibin <2573214643@qq.com> Date: Thu, 20 Nov 2025 19:58:25 +0800 Subject: [PATCH 11/18] fix clang code format --- src/storage/redis_metadata.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/storage/redis_metadata.h b/src/storage/redis_metadata.h index a14504d1259..2db0a422ab7 100644 --- a/src/storage/redis_metadata.h +++ b/src/storage/redis_metadata.h @@ -420,7 +420,8 @@ class TopKMetadata : public Metadata { explicit TopKMetadata(bool generate_version = true) : Metadata(kRedisTopK, generate_version) {} - explicit TopKMetadata(uint64_t top_k, uint64_t width = 7, uint64_t depth = 8, double decay = 0.9, bool generate_version = true) + explicit TopKMetadata(uint64_t top_k, uint64_t width = 7, uint64_t depth = 8, double decay = 0.9, + bool generate_version = true) : Metadata(kRedisTopK, generate_version), top_k(top_k), width(width), depth(depth), decay(decay) {} void Encode(std::string *dst) const override; From 55c2e677a827e1894a3f69cc28dc901be64a8d9b Mon Sep 17 00:00:00 2001 From: aibin <2573214643@qq.com> Date: Fri, 21 Nov 2025 12:56:10 +0800 Subject: [PATCH 12/18] fix clang code format --- src/commands/cmd_topk.cc | 15 ++- src/types/topk.cc | 179 +++++-------------------------- src/types/topk.h | 35 +++--- tests/cppunit/types/topk_test.cc | 33 +++--- 4 files changed, 75 insertions(+), 187 deletions(-) diff --git a/src/commands/cmd_topk.cc b/src/commands/cmd_topk.cc index 24404870eba..4375c555889 100644 --- a/src/commands/cmd_topk.cc +++ b/src/commands/cmd_topk.cc @@ -163,14 +163,13 @@ class CommandTopKInfo final : public Commander { CommandParser parser(args, 2); if (parser.Good()) { if (args.size() == 3) { - std::string type_str = args[2]; - if (type_str == "topk") { + if (args[2] == "topk") { type_ = TopKInfoType::kTopK; - } else if (type_str == "width") { + } else if (args[2] == "width") { type_ = TopKInfoType::kWidth; - } else if (type_str == "depth") { + } else if (args[2] == "depth") { type_ = TopKInfoType::kDepth; - } else if (type_str == "decay") { + } else if (args[2] == "decay") { type_ = TopKInfoType::kDecay; } else { return {Status::InvalidArgument, "Invalid info type"}; @@ -227,12 +226,12 @@ class CommandTopKQuery final : public Commander { redis::TopK topk(srv->storage, conn->GetNamespace()); CHECK(args_.size() == 3); - bool is_exists_; - auto s = topk.Query(ctx, args_[1], args_[2], &is_exists_); + bool is_exists = false; + auto s = topk.Query(ctx, args_[1], args_[2], &is_exists); if (!s.ok()) { return {Status::RedisExecErr, s.ToString()}; } - *output = redis::Bool(redis::RESP::v2, is_exists_); + *output = redis::Bool(redis::RESP::v2, is_exists); return Status::OK(); } }; diff --git a/src/types/topk.cc b/src/types/topk.cc index 19b0981fcbd..20ed1936a1b 100644 --- a/src/types/topk.cc +++ b/src/types/topk.cc @@ -26,9 +26,6 @@ #include #include -#define TOPK_HASH(item, itemlen, i) MurmurHash2(item, itemlen, i) -#define GA 1919 - //----------------------------------------------------------------------------- // MurmurHash2 was written by Austin Appleby, and is placed in the public // domain. The author hereby disclaims copyright to this source code. @@ -43,7 +40,6 @@ // 1. It will not work incrementally. // 2. It will not produce the same results on little-endian and big-endian // machines. -#define BIG_CONSTANT(x) (x##LLU) //----------------------------------------------------------------------------- @@ -60,7 +56,7 @@ static uint32_t MurmurHash2(const void *key, int len, uint32_t seed) { // Mix 4 bytes at a time into the hash - const unsigned char *data = (const unsigned char *)key; + auto *data = reinterpret_cast(key); while (len >= 4) { uint32_t k = *(uint32_t *)data; @@ -98,129 +94,12 @@ static uint32_t MurmurHash2(const void *key, int len, uint32_t seed) { return h; } -//----------------------------------------------------------------------------- -// MurmurHash2, 64-bit versions, by Austin Appleby - -// The same caveats as 32-bit MurmurHash2 apply here - beware of alignment -// and endian-ness issues if used across multiple platforms. - -// 64-bit hash for 64-bit platforms - -[[maybe_unused]] static uint64_t MurmurHash64A_Bloom(const void *key, int len, uint64_t seed) { - const uint64_t m = BIG_CONSTANT(0xc6a4a7935bd1e995); - const int r = 47; - - uint64_t h = seed ^ (len * m); - - const uint64_t *data = (const uint64_t *)key; - const uint64_t *end = data + (len / 8); - - while (data != end) { - uint64_t k = *data++; - - k *= m; - k ^= k >> r; - k *= m; - - h ^= k; - h *= m; - } - - const unsigned char *data2 = (const unsigned char *)data; - - switch (len & 7) { - case 7: - h ^= ((uint64_t)data2[6]) << 48; - case 6: - h ^= ((uint64_t)data2[5]) << 40; - case 5: - h ^= ((uint64_t)data2[4]) << 32; - case 4: - h ^= ((uint64_t)data2[3]) << 24; - case 3: - h ^= ((uint64_t)data2[2]) << 16; - case 2: - h ^= ((uint64_t)data2[1]) << 8; - case 1: - h ^= ((uint64_t)data2[0]); - h *= m; - }; - - h ^= h >> r; - h *= m; - h ^= h >> r; - - return h; -} - -// 64-bit hash for 32-bit platforms - -[[maybe_unused]] static uint64_t MurmurHash64B(const void *key, int len, uint64_t seed) { - const uint32_t m = 0x5bd1e995; - const int r = 24; - - uint32_t h1 = (uint32_t)(seed ^ len); - uint32_t h2 = (uint32_t)(seed >> 32); - - const uint32_t *data = (const uint32_t *)key; - - while (len >= 8) { - uint32_t k1 = *data++; - k1 *= m; - k1 ^= k1 >> r; - k1 *= m; - h1 *= m; - h1 ^= k1; - len -= 4; - - uint32_t k2 = *data++; - k2 *= m; - k2 ^= k2 >> r; - k2 *= m; - h2 *= m; - h2 ^= k2; - len -= 4; - } - - if (len >= 4) { - uint32_t k1 = *data++; - k1 *= m; - k1 ^= k1 >> r; - k1 *= m; - h1 *= m; - h1 ^= k1; - len -= 4; - } - - switch (len) { - case 3: - h2 ^= ((unsigned char *)data)[2] << 16; - case 2: - h2 ^= ((unsigned char *)data)[1] << 8; - case 1: - h2 ^= ((unsigned char *)data)[0]; - h2 *= m; - }; - - h1 ^= h2 >> 18; - h1 *= m; - h2 ^= h1 >> 22; - h2 *= m; - h1 ^= h2 >> 17; - h1 *= m; - h2 ^= h1 >> 19; - h2 *= m; - - uint64_t h = h1; - - h = (h << 32) | h2; - - return h; -} +static uint32_t TopkHash(const void *item, int itemlen, uint32_t i) { return MurmurHash2(item, itemlen, i); } +constexpr uint32_t GA = 1919; /* ---------------------------------------------------------------------- */ -void BlockSplitTopK::heapifyDown(int start) { - size_t child = start; +void BlockSplitTopK::HeapifyDown(int start) const { + int child = start; // check whether larger than children if (heap_size < 2 || (heap_size - 2) / 2 < child) { @@ -253,8 +132,8 @@ void BlockSplitTopK::heapifyDown(int start) { memcpy(&heap[start], &top, sizeof(HeapBucket)); } -void BlockSplitTopK::heapifyUp(int start) { - size_t parent = start; +void BlockSplitTopK::HeapifyUp(int start) const { + int parent = start; // check whether smaller than parent if (heap_size < 2 || parent == 0) { @@ -280,10 +159,10 @@ void BlockSplitTopK::heapifyUp(int start) { memcpy(&heap[start], &bottom, sizeof(HeapBucket)); } -int BlockSplitTopK::checkExistInHeap(const std::string &item) { +int BlockSplitTopK::CheckExistInHeap(const std::string &item) const { uint32_t itemlen = item.size(); const char *data = item.c_str(); - for (int32_t i = heap_size - 1; i >= 0; --i) { + for (int i = (int)heap_size - 1; i >= 0; --i) { if (heap[i].itemlen == itemlen && memcmp(heap[i].item, data, itemlen) == 0) { return i; } @@ -291,39 +170,39 @@ int BlockSplitTopK::checkExistInHeap(const std::string &item) { return -1; } -int BlockSplitTopK::cmpHeapBucketCount(const HeapBucket &a, const HeapBucket &b) { +int BlockSplitTopK::CmpHeapBucketCount(const HeapBucket &a, const HeapBucket &b) { return a.count < b.count ? 1 : a.count > b.count ? -1 : 0; } void BlockSplitTopK::Add(const std::string &item, uint32_t increment) { uint32_t itemlen = item.size(); const char *data = item.c_str(); - counter_t maxCount = 0; - uint32_t fp = TOPK_HASH(data, itemlen, GA); + CounterT max_count = 0; + uint32_t fp = TopkHash(data, (int)itemlen, GA); - int location = checkExistInHeap(item); + int location = CheckExistInHeap(item); for (size_t i = 0; i < depth; ++i) { - uint32_t loc = TOPK_HASH(data, itemlen, i) % width; + uint32_t loc = TopkHash(data, (int)itemlen, i) % width; loc += i * width; if (buckets[loc].count == 0) { buckets[loc].fp = fp; buckets[loc].count = increment; - maxCount = std::max(maxCount, buckets[loc].count); + max_count = std::max(max_count, buckets[loc].count); } else if (buckets[loc].fp == fp && location != -1) { buckets[loc].count += increment; - maxCount = std::max(maxCount, buckets[loc].count); + max_count = std::max(max_count, buckets[loc].count); } else { // decay uint32_t local_incr = increment; for (; local_incr > 0; --local_incr) { - double decay; + double decay = 0.0; if (buckets[loc].count < TOPK_DECAY_LOOKUP_TABLE) { - decay = lookupTable[buckets[loc].count]; + decay = lookup_table[buckets[loc].count]; } else { - decay = pow(lookupTable[TOPK_DECAY_LOOKUP_TABLE - 1], (buckets[loc].count / (TOPK_DECAY_LOOKUP_TABLE - 1))) * - lookupTable[buckets[loc].count % (TOPK_DECAY_LOOKUP_TABLE - 1)]; + decay = pow(lookup_table[TOPK_DECAY_LOOKUP_TABLE - 1], (buckets[loc].count / (TOPK_DECAY_LOOKUP_TABLE - 1))) * + lookup_table[buckets[loc].count % (TOPK_DECAY_LOOKUP_TABLE - 1)]; } double chance = rand() / (double)RAND_MAX; if (chance < decay) { @@ -331,7 +210,7 @@ void BlockSplitTopK::Add(const std::string &item, uint32_t increment) { if (buckets[loc].count == 0) { buckets[loc].fp = fp; buckets[loc].count = 1; - maxCount = std::max(maxCount, buckets[loc].count); + max_count = std::max(max_count, buckets[loc].count); break; } } @@ -341,34 +220,34 @@ void BlockSplitTopK::Add(const std::string &item, uint32_t increment) { if (k == heap_size) { if (location == -1) { - if (heap[0].count == maxCount || heap[0].count + 1 == maxCount) { + if (heap[0].count == max_count || heap[0].count + 1 == max_count) { heap[0].fp = fp; heap[0].itemlen = itemlen; delete heap[0].item; heap[0].item = new char[itemlen]; memcpy(heap[0].item, data, itemlen); - heap[0].count = maxCount; + heap[0].count = max_count; - heapifyDown(0); + HeapifyDown(0); } } else { heap[location].count += increment; - heapifyDown(location); + HeapifyDown(location); } } else { heap[heap_size].fp = fp; heap[heap_size].itemlen = itemlen; heap[heap_size].item = new char[itemlen]; memcpy(heap[heap_size].item, data, itemlen); - heap[heap_size].count = maxCount; + heap[heap_size].count = max_count; - heapifyUp(heap_size); + HeapifyUp((int)heap_size); heap_size++; } } -bool BlockSplitTopK::Query(const std::string &item) { return checkExistInHeap(item) != -1; } +bool BlockSplitTopK::Query(const std::string &item) const { return CheckExistInHeap(item) != -1; } std::vector BlockSplitTopK::List() { std::vector result(heap_size); @@ -376,6 +255,6 @@ std::vector BlockSplitTopK::List() { result[i] = heap[i]; } std::sort(result.begin(), result.end(), - [this](const HeapBucket &a, const HeapBucket &b) { return cmpHeapBucketCount(a, b) > 0; }); + [this](const HeapBucket &a, const HeapBucket &b) { return CmpHeapBucketCount(a, b) > 0; }); return result; } \ No newline at end of file diff --git a/src/types/topk.h b/src/types/topk.h index 34fbc684bb0..8834c3bc333 100644 --- a/src/types/topk.h +++ b/src/types/topk.h @@ -29,31 +29,40 @@ static constexpr int TOPK_DECAY_LOOKUP_TABLE = 256; -using counter_t = uint32_t; +using CounterT = uint32_t; struct HeapBucket { uint32_t fp; uint32_t itemlen; char *item; - counter_t count; + CounterT count; }; struct Bucket { uint32_t fp; - counter_t count; + CounterT count; }; class BlockSplitTopK { public: BlockSplitTopK() = delete; + BlockSplitTopK(const BlockSplitTopK &) = delete; + BlockSplitTopK &operator=(const BlockSplitTopK &) = delete; + BlockSplitTopK(BlockSplitTopK &&) = default; + BlockSplitTopK &operator=(BlockSplitTopK &&) = default; + explicit BlockSplitTopK(uint32_t k, uint32_t width, uint32_t depth, double decay) - : k(k), width(width), depth(depth), decay(decay), heap_size(0) { - buckets = new Bucket[width * depth]; - heap = new HeapBucket[k]; + : k(k), + width(width), + depth(depth), + decay(decay), + heap_size(0), + buckets(new Bucket[width * depth]), + heap(new HeapBucket[k]) { std::fill_n(buckets, width * depth, Bucket{0, 0}); std::fill_n(heap, k, HeapBucket{0, 0, nullptr, 0}); for (int i = 0; i < TOPK_DECAY_LOOKUP_TABLE; ++i) { - lookupTable[i] = pow(decay, i); + lookup_table[i] = pow(decay, i); } } @@ -66,13 +75,13 @@ class BlockSplitTopK { } void Add(const std::string &item, uint32_t increment); - bool Query(const std::string &item); + bool Query(const std::string &item) const; std::vector List(); - void heapifyDown(int start); - void heapifyUp(int start); - int checkExistInHeap(const std::string &item); - int cmpHeapBucketCount(const HeapBucket &a, const HeapBucket &b); + void HeapifyDown(int start) const; + void HeapifyUp(int start) const; + int CheckExistInHeap(const std::string &item) const; + static int CmpHeapBucketCount(const HeapBucket &a, const HeapBucket &b); uint32_t k; uint32_t width; @@ -83,5 +92,5 @@ class BlockSplitTopK { Bucket *buckets; HeapBucket *heap; - double lookupTable[TOPK_DECAY_LOOKUP_TABLE]; + double lookup_table[TOPK_DECAY_LOOKUP_TABLE]; }; \ No newline at end of file diff --git a/tests/cppunit/types/topk_test.cc b/tests/cppunit/types/topk_test.cc index 51470427ea9..1c7bd75085d 100644 --- a/tests/cppunit/types/topk_test.cc +++ b/tests/cppunit/types/topk_test.cc @@ -66,9 +66,10 @@ TEST_F(RedisTopKTest, TestTopKAddAndQuery) { auto s = top_k_->Add(*ctx_, no_exist_key, "1"); ASSERT_FALSE(s.ok()); - bool exist; + bool exist = false; s = top_k_->Query(*ctx_, no_exist_key, "1", &exist); ASSERT_FALSE(s.ok()); + ASSERT_TRUE(exist); std::vector list; s = top_k_->List(*ctx_, no_exist_key, list); @@ -81,21 +82,21 @@ TEST_F(RedisTopKTest, TestTopKAddAndQuery) { std::unordered_set values_set2(values2.begin(), values2.end()); // found not exist values1 - for (size_t i = 0; i < values1.size(); ++i) { + for (const auto &value : values1) { bool found = true; - top_k_->Query(*ctx_, key_, values1[i], &found); + top_k_->Query(*ctx_, key_, value, &found); ASSERT_FALSE(found); } // add values1, and query values1. - for (size_t i = 0; i < values1.size(); ++i) { - top_k_->Add(*ctx_, key_, values1[i]); + for (const auto &value : values1) { + top_k_->Add(*ctx_, key_, value); bool found = false; - top_k_->Query(*ctx_, key_, values1[i], &found); + top_k_->Query(*ctx_, key_, value, &found); ASSERT_TRUE(found); } - for (size_t i = 0; i < values1.size(); ++i) { + for (const auto &value : values1) { bool found = false; - top_k_->Query(*ctx_, key_, values1[i], &found); + top_k_->Query(*ctx_, key_, value, &found); ASSERT_TRUE(found); } @@ -108,27 +109,27 @@ TEST_F(RedisTopKTest, TestTopKAddAndQuery) { } // heap is full, need remove values1. - for (size_t i = 0; i < values2.size(); ++i) { + for (const auto &value : values2) { bool found = false; // due to decay, topk is possible to remove values1. while (!found) { - top_k_->Add(*ctx_, key_, values2[i]); - top_k_->Query(*ctx_, key_, values2[i], &found); + top_k_->Add(*ctx_, key_, value); + top_k_->Query(*ctx_, key_, value, &found); } - top_k_->Add(*ctx_, key_, values2[i]); + top_k_->Add(*ctx_, key_, value); } // values1 is removed. - for (size_t i = 0; i < values1.size(); ++i) { + for (const auto &value : values1) { bool found = true; - top_k_->Query(*ctx_, key_, values1[i], &found); + top_k_->Query(*ctx_, key_, value, &found); ASSERT_FALSE(found); } // found topk list. top_k_list.clear(); top_k_->List(*ctx_, key_, top_k_list); - for (size_t i = 0; i < top_k_list.size(); ++i) { - ASSERT_TRUE(values_set2.find(top_k_list[i]) != values_set2.end()); + for (const auto &value : top_k_list) { + ASSERT_TRUE(values_set2.find(value) != values_set2.end()); } } From 7a8f24734a0a276c45a17434327abbc22c0f3e34 Mon Sep 17 00:00:00 2001 From: aibin <2573214643@qq.com> Date: Fri, 21 Nov 2025 13:11:31 +0800 Subject: [PATCH 13/18] fix topk test --- tests/cppunit/types/topk_test.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/cppunit/types/topk_test.cc b/tests/cppunit/types/topk_test.cc index 1c7bd75085d..daeffc0aeac 100644 --- a/tests/cppunit/types/topk_test.cc +++ b/tests/cppunit/types/topk_test.cc @@ -69,7 +69,6 @@ TEST_F(RedisTopKTest, TestTopKAddAndQuery) { bool exist = false; s = top_k_->Query(*ctx_, no_exist_key, "1", &exist); ASSERT_FALSE(s.ok()); - ASSERT_TRUE(exist); std::vector list; s = top_k_->List(*ctx_, no_exist_key, list); From 5f99e27ff19f121a2bb6136a6d4ac97684ce79ad Mon Sep 17 00:00:00 2001 From: aibin <2573214643@qq.com> Date: Fri, 21 Nov 2025 13:55:27 +0800 Subject: [PATCH 14/18] fix topk heap_size type --- src/types/redis_topk.cc | 2 +- src/types/topk.cc | 6 +++--- src/types/topk.h | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/types/redis_topk.cc b/src/types/redis_topk.cc index a547cc5bb82..62b27300eba 100644 --- a/src/types/redis_topk.cc +++ b/src/types/redis_topk.cc @@ -175,7 +175,7 @@ rocksdb::Status TopK::getTopKData(engine::Context &ctx, const Slice &ns_key, con memcpy(topk->heap[j].item, hb_value.data(), hb_value.size()); } } else { - topk->heap_size = static_cast(std::stoul(pinnable_value.data())); + topk->heap_size = static_cast(std::stoul(pinnable_value.data())); } } return rocksdb::Status::OK(); diff --git a/src/types/topk.cc b/src/types/topk.cc index 20ed1936a1b..a2bec350d57 100644 --- a/src/types/topk.cc +++ b/src/types/topk.cc @@ -162,7 +162,7 @@ void BlockSplitTopK::HeapifyUp(int start) const { int BlockSplitTopK::CheckExistInHeap(const std::string &item) const { uint32_t itemlen = item.size(); const char *data = item.c_str(); - for (int i = (int)heap_size - 1; i >= 0; --i) { + for (int i = heap_size - 1; i >= 0; --i) { if (heap[i].itemlen == itemlen && memcmp(heap[i].item, data, itemlen) == 0) { return i; } @@ -218,7 +218,7 @@ void BlockSplitTopK::Add(const std::string &item, uint32_t increment) { } } - if (k == heap_size) { + if (k == (uint32_t)heap_size) { if (location == -1) { if (heap[0].count == max_count || heap[0].count + 1 == max_count) { heap[0].fp = fp; @@ -251,7 +251,7 @@ bool BlockSplitTopK::Query(const std::string &item) const { return CheckExistInH std::vector BlockSplitTopK::List() { std::vector result(heap_size); - for (uint32_t i = 0; i < heap_size; i++) { + for (int i = 0; i < heap_size; i++) { result[i] = heap[i]; } std::sort(result.begin(), result.end(), diff --git a/src/types/topk.h b/src/types/topk.h index 8834c3bc333..6b8320c5ee4 100644 --- a/src/types/topk.h +++ b/src/types/topk.h @@ -88,7 +88,7 @@ class BlockSplitTopK { uint32_t depth; double decay; - size_t heap_size; + int heap_size; Bucket *buckets; HeapBucket *heap; From 5db4141b4fd705d3bc141f18c6f5663d31ed01f8 Mon Sep 17 00:00:00 2001 From: aibin <2573214643@qq.com> Date: Sun, 23 Nov 2025 11:37:43 +0800 Subject: [PATCH 15/18] fix delete[] in topk.cc --- src/types/topk.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/types/topk.cc b/src/types/topk.cc index a2bec350d57..4ba3a0238f0 100644 --- a/src/types/topk.cc +++ b/src/types/topk.cc @@ -223,7 +223,7 @@ void BlockSplitTopK::Add(const std::string &item, uint32_t increment) { if (heap[0].count == max_count || heap[0].count + 1 == max_count) { heap[0].fp = fp; heap[0].itemlen = itemlen; - delete heap[0].item; + delete[] heap[0].item; heap[0].item = new char[itemlen]; memcpy(heap[0].item, data, itemlen); From 39d423c5d0dcb0b64279e61303cae5354cbc687b Mon Sep 17 00:00:00 2001 From: aibin <2573214643@qq.com> Date: Mon, 24 Nov 2025 10:21:01 +0800 Subject: [PATCH 16/18] fix: memory leaks --- src/types/redis_topk.cc | 113 ++++++++++++++++++++++++++++------------ src/types/redis_topk.h | 5 +- src/types/topk.cc | 64 ++++++++++------------- src/types/topk.h | 39 +++++++------- 4 files changed, 130 insertions(+), 91 deletions(-) diff --git a/src/types/redis_topk.cc b/src/types/redis_topk.cc index 62b27300eba..2af07e7b93d 100644 --- a/src/types/redis_topk.cc +++ b/src/types/redis_topk.cc @@ -59,6 +59,8 @@ rocksdb::Status TopK::IncrBy(engine::Context &ctx, const Slice &user_key, const s = getTopKData(ctx, ns_key, topk_metadata, &topk); if (!s.ok()) return s; + std::vector is_dirty_buckets(topk_metadata.width * topk_metadata.depth, false); + std::vector is_dirty_heaps(topk_metadata.top_k, false); topk.Add(items.data_, incr); s = setTopkData(ctx, ns_key, topk_metadata, topk); @@ -96,7 +98,7 @@ rocksdb::Status TopK::List(engine::Context &ctx, const Slice &user_key, std::vec auto heap_buckets = topk.List(); for (auto &bucket : heap_buckets) { - items.emplace_back(bucket.item, bucket.itemlen); + items.emplace_back(bucket.item); } return rocksdb::Status::OK(); @@ -140,7 +142,10 @@ rocksdb::Status TopK::createTopK(engine::Context &ctx, const Slice &ns_key, uint s = batch->Put(metadata_cf_handle_, ns_key, top_k_meta_bytes); if (!s.ok()) return s; - s = setTopkData(ctx, ns_key, *metadata, block_split_top_k); + // is dirty vector to optimize writes + std::vector is_dirty_buckets(width * depth, true); + std::vector is_dirty_heaps(k, true); + s = setTopkData(ctx, ns_key, *metadata, block_split_top_k, is_dirty_buckets, is_dirty_heaps); if (!s.ok()) return s; return storage_->Write(ctx, storage_->DefaultWriteOptions(), batch->GetWriteBatch()); @@ -154,25 +159,40 @@ rocksdb::Status TopK::getTopKData(engine::Context &ctx, const Slice &ns_key, con rocksdb::Status s = storage_->Get(ctx, ctx.GetReadOptions(), tk_key, &pinnable_value); if (!s.ok()) return s; if (i == 0) { - if (pinnable_value.size() != metadata.width * metadata.depth * sizeof(Bucket)) { - return rocksdb::Status::Corruption("TopK data corrupted: buckets size mismatch"); + // get buckets of topk structure + for (uint32_t j = 0; j < metadata.width * metadata.depth; j++) { + for (uint8_t k = 0; k < 2; k++) { + std::string bk_key = getSubKey(ns_key, metadata, i, j, k); + rocksdb::PinnableSlice bk_value; + rocksdb::Status s = storage_->Get(ctx, ctx.GetReadOptions(), bk_key, &bk_value); + if (!s.ok()) return s; + + int dep = j / metadata.width; + int wid = j % metadata.width; + if (k == 0) { + topk->buckets[dep][wid].fp = static_cast(std::stoul(pinnable_value.data())); + } else { + topk->buckets[dep][wid].count = static_cast(std::stoul(pinnable_value.data())); + } + } } - memcpy(topk->buckets, pinnable_value.data(), pinnable_value.size()); } else if (i == 1) { - if (pinnable_value.size() != metadata.top_k * sizeof(HeapBucket)) { - return rocksdb::Status::Corruption("TopK data corrupted: heap size mismatch"); - } - memcpy(topk->heap, pinnable_value.data(), pinnable_value.size()); + // get heapbucket of topk structure for (uint32_t j = 0; j < metadata.top_k; j++) { - std::string hb_key = getHBKey(ns_key, metadata, i, j); - rocksdb::PinnableSlice hb_value; - rocksdb::Status s = storage_->Get(ctx, ctx.GetReadOptions(), hb_key, &hb_value); - if (!s.ok()) return s; - if (hb_value.size() != topk->heap[j].itemlen) { - return rocksdb::Status::Corruption("TopK data corrupted: heap bucket size mismatch"); + for (uint8_t k = 0; k < 3; k++) { + std::string hb_key = getSubKey(ns_key, metadata, i, j, k); + rocksdb::PinnableSlice hb_value; + rocksdb::Status s = storage_->Get(ctx, ctx.GetReadOptions(), hb_key, &hb_value); + if (!s.ok()) return s; + + if (k == 0) { + topk->heap[j].count = static_cast(std::stoul(pinnable_value.data())); + } else if (k == 1) { + topk->heap[j].fp = static_cast(std::stoul(pinnable_value.data())); + } else { + topk->heap[j].item = hb_value.data(); + } } - topk->heap[j].item = new char[topk->heap[j].itemlen]; - memcpy(topk->heap[j].item, hb_value.data(), hb_value.size()); } } else { topk->heap_size = static_cast(std::stoul(pinnable_value.data())); @@ -182,30 +202,56 @@ rocksdb::Status TopK::getTopKData(engine::Context &ctx, const Slice &ns_key, con } rocksdb::Status TopK::setTopkData(engine::Context &ctx, const Slice &ns_key, const TopKMetadata &metadata, - const BlockSplitTopK &topk) { + const BlockSplitTopK &topk, const std::vector &is_dirty_buckets, + const std::vector &is_dirty_heaps) { auto batch = storage_->GetWriteBatchBase(); - WriteBatchLogData log_data(kRedisTopK, {"setTopkData"}); - rocksdb::Status s = batch->PutLogData(log_data.Encode()); - if (!s.ok()) return s; for (uint8_t i = 0; i < 3; i++) { - std::string tk_key = getTKKey(ns_key, metadata, i); - std::string tk_value; if (i == 0) { - tk_value.assign(reinterpret_cast(topk.buckets), metadata.width * metadata.depth * sizeof(Bucket)); + for (uint32_t j = 0; j < metadata.width * metadata.depth; j++) { + if (!is_dirty_buckets[j]) { + continue; + } + for (uint32_t k = 0; k < 2; k++) { + std::string sub_key = getSubKey(ns_key, metadata, i, j, k); + std::string sub_value; + int dep = j / metadata.width; + int wid = j % metadata.width; + if (k == 0) { + sub_value = std::to_string(topk.buckets[dep][wid].fp); + } else { + sub_value = std::to_string(topk.buckets[dep][wid].count); + } + rocksdb::Status s = batch->Put(sub_key, sub_value); + if (!s.ok()) return s; + } + } } else if (i == 1) { - tk_value.assign(reinterpret_cast(topk.heap), metadata.top_k * sizeof(HeapBucket)); for (uint32_t j = 0; j < metadata.top_k; j++) { - std::string hb_key = getHBKey(ns_key, metadata, i, j); - std::string hb_value(topk.heap[j].item, topk.heap[j].itemlen); - s = batch->Put(hb_key, hb_value); - if (!s.ok()) return s; + if (!is_dirty_heaps[j]) { + continue; + } + for (uint8_t k = 0; k < 3; k++) { + std::string sub_key = getSubKey(ns_key, metadata, i, j, k); + std::string sub_value; + if (k == 0) { + sub_value = std::to_string(topk.heap[j].count); + } else if (k == 1) { + sub_value = std::to_string(topk.heap[j].fp); + } else { + sub_value = topk.heap[j].item; + } + rocksdb::Status s = batch->Put(sub_key, sub_value); + if (!s.ok()) return s; + } } } else { + std::string tk_key = getTKKey(ns_key, metadata, i); + std::string tk_value; tk_value = std::to_string(topk.heap_size); + rocksdb::Status s = batch->Put(tk_key, tk_value); + if (!s.ok()) return s; } - rocksdb::Status s = batch->Put(tk_key, tk_value); - if (!s.ok()) return s; } return storage_->Write(ctx, storage_->DefaultWriteOptions(), batch->GetWriteBatch()); @@ -218,10 +264,11 @@ std::string TopK::getTKKey(const Slice &ns_key, const TopKMetadata &metadata, ui return bf_key; } -std::string TopK::getHBKey(const Slice &ns_key, const TopKMetadata &metadata, uint8_t topk_index, uint32_t hp_index) { +std::string TopK::getSubKey(const Slice &ns_key, const TopKMetadata &metadata, uint8_t topk_index, uint32_t sub_index, uint8_t index) { std::string sub_key; PutFixed8(&sub_key, topk_index); - PutFixed32(&sub_key, hp_index); + PutFixed32(&sub_key, sub_index); + PutFixed8(&sub_key, index); return InternalKey(ns_key, sub_key, metadata.version, storage_->IsSlotIdEncoded()).Encode(); } diff --git a/src/types/redis_topk.h b/src/types/redis_topk.h index 48f0e5a4759..79abe53b7a1 100644 --- a/src/types/redis_topk.h +++ b/src/types/redis_topk.h @@ -56,11 +56,12 @@ class TopK : public SubKeyScanner { rocksdb::Status getTopKData(engine::Context &ctx, const Slice &ns_key, const TopKMetadata &metadata, BlockSplitTopK *topk); rocksdb::Status setTopkData(engine::Context &ctx, const Slice &ns_key, const TopKMetadata &metadata, - const BlockSplitTopK &topk); + const BlockSplitTopK &topk, const std::vector &is_dirty_buckets, + const std::vector &is_dirty_heaps); std::string getTKKey(const Slice &ns_key, const TopKMetadata &metadata, uint8_t index); - std::string getHBKey(const Slice &ns_key, const TopKMetadata &metadata, uint8_t topk_index, uint32_t hp_index); + std::string getSubKey(const Slice &ns_key, const TopKMetadata &metadata, uint8_t topk_index, uint32_t sub_index, uint8_t index); }; } // namespace redis \ No newline at end of file diff --git a/src/types/topk.cc b/src/types/topk.cc index 4ba3a0238f0..ab0e4ced868 100644 --- a/src/types/topk.cc +++ b/src/types/topk.cc @@ -98,7 +98,7 @@ static uint32_t TopkHash(const void *item, int itemlen, uint32_t i) { return Mur constexpr uint32_t GA = 1919; /* ---------------------------------------------------------------------- */ -void BlockSplitTopK::HeapifyDown(int start) const { +void BlockSplitTopK::HeapifyDown(int start) { int child = start; // check whether larger than children @@ -114,10 +114,9 @@ void BlockSplitTopK::HeapifyDown(int start) const { return; } - HeapBucket top; - memcpy(&top, &heap[start], sizeof(HeapBucket)); + HeapBucket top = heap[start]; do { - memcpy(&heap[start], &heap[child], sizeof(HeapBucket)); + heap[start] = heap[child]; start = child; if ((heap_size - 2) / 2 < child) { @@ -129,10 +128,10 @@ void BlockSplitTopK::HeapifyDown(int start) const { ++child; } } while (heap[child].count < top.count); - memcpy(&heap[start], &top, sizeof(HeapBucket)); + heap[start] = top; } -void BlockSplitTopK::HeapifyUp(int start) const { +void BlockSplitTopK::HeapifyUp(int start) { int parent = start; // check whether smaller than parent @@ -145,10 +144,9 @@ void BlockSplitTopK::HeapifyUp(int start) const { return; } - HeapBucket bottom; - memcpy(&bottom, &heap[start], sizeof(HeapBucket)); + HeapBucket bottom = heap[start]; do { - memcpy(&heap[start], &heap[parent], sizeof(HeapBucket)); + heap[start] = heap[parent]; start = parent; if (start == 0) { @@ -156,14 +154,12 @@ void BlockSplitTopK::HeapifyUp(int start) const { } parent = (parent - 1) / 2; } while (heap[parent].count > bottom.count); - memcpy(&heap[start], &bottom, sizeof(HeapBucket)); + heap[start] = bottom; } int BlockSplitTopK::CheckExistInHeap(const std::string &item) const { - uint32_t itemlen = item.size(); - const char *data = item.c_str(); for (int i = heap_size - 1; i >= 0; --i) { - if (heap[i].itemlen == itemlen && memcmp(heap[i].item, data, itemlen) == 0) { + if (heap[i].item == item) { return i; } } @@ -185,32 +181,31 @@ void BlockSplitTopK::Add(const std::string &item, uint32_t increment) { for (size_t i = 0; i < depth; ++i) { uint32_t loc = TopkHash(data, (int)itemlen, i) % width; - loc += i * width; - if (buckets[loc].count == 0) { - buckets[loc].fp = fp; - buckets[loc].count = increment; - max_count = std::max(max_count, buckets[loc].count); - } else if (buckets[loc].fp == fp && location != -1) { - buckets[loc].count += increment; - max_count = std::max(max_count, buckets[loc].count); + if (buckets[i][loc].count == 0) { + buckets[i][loc].fp = fp; + buckets[i][loc].count = increment; + max_count = std::max(max_count, buckets[i][loc].count); + } else if (buckets[i][loc].fp == fp && location != -1) { + buckets[i][loc].count += increment; + max_count = std::max(max_count, buckets[i][loc].count); } else { // decay uint32_t local_incr = increment; for (; local_incr > 0; --local_incr) { double decay = 0.0; - if (buckets[loc].count < TOPK_DECAY_LOOKUP_TABLE) { - decay = lookup_table[buckets[loc].count]; + if (buckets[i][loc].count < TOPK_DECAY_LOOKUP_TABLE) { + decay = lookup_table[buckets[i][loc].count]; } else { - decay = pow(lookup_table[TOPK_DECAY_LOOKUP_TABLE - 1], (buckets[loc].count / (TOPK_DECAY_LOOKUP_TABLE - 1))) * - lookup_table[buckets[loc].count % (TOPK_DECAY_LOOKUP_TABLE - 1)]; + decay = pow(lookup_table[TOPK_DECAY_LOOKUP_TABLE - 1], (buckets[i][loc].count / (TOPK_DECAY_LOOKUP_TABLE - 1))) * + lookup_table[buckets[i][loc].count % (TOPK_DECAY_LOOKUP_TABLE - 1)]; } double chance = rand() / (double)RAND_MAX; if (chance < decay) { - --buckets[loc].count; - if (buckets[loc].count == 0) { - buckets[loc].fp = fp; - buckets[loc].count = 1; - max_count = std::max(max_count, buckets[loc].count); + --buckets[i][loc].count; + if (buckets[i][loc].count == 0) { + buckets[i][loc].fp = fp; + buckets[i][loc].count = 1; + max_count = std::max(max_count, buckets[i][loc].count); break; } } @@ -222,10 +217,7 @@ void BlockSplitTopK::Add(const std::string &item, uint32_t increment) { if (location == -1) { if (heap[0].count == max_count || heap[0].count + 1 == max_count) { heap[0].fp = fp; - heap[0].itemlen = itemlen; - delete[] heap[0].item; - heap[0].item = new char[itemlen]; - memcpy(heap[0].item, data, itemlen); + heap[0].item = item; heap[0].count = max_count; @@ -237,9 +229,7 @@ void BlockSplitTopK::Add(const std::string &item, uint32_t increment) { } } else { heap[heap_size].fp = fp; - heap[heap_size].itemlen = itemlen; - heap[heap_size].item = new char[itemlen]; - memcpy(heap[heap_size].item, data, itemlen); + heap[heap_size].item = item; heap[heap_size].count = max_count; HeapifyUp((int)heap_size); diff --git a/src/types/topk.h b/src/types/topk.h index 6b8320c5ee4..4f4c683f55f 100644 --- a/src/types/topk.h +++ b/src/types/topk.h @@ -23,6 +23,7 @@ #include #include +#include #include #include #include @@ -33,9 +34,17 @@ using CounterT = uint32_t; struct HeapBucket { uint32_t fp; - uint32_t itemlen; - char *item; CounterT count; + std::string item; + + HeapBucket& operator=(const HeapBucket& other) { + if (this != &other) { + fp = other.fp; + count = other.count; + item = other.item; + return *this; + } + } }; struct Bucket { @@ -48,8 +57,8 @@ class BlockSplitTopK { BlockSplitTopK() = delete; BlockSplitTopK(const BlockSplitTopK &) = delete; BlockSplitTopK &operator=(const BlockSplitTopK &) = delete; - BlockSplitTopK(BlockSplitTopK &&) = default; - BlockSplitTopK &operator=(BlockSplitTopK &&) = default; + BlockSplitTopK(BlockSplitTopK &&) = delete; + BlockSplitTopK &operator=(BlockSplitTopK &&) = delete; explicit BlockSplitTopK(uint32_t k, uint32_t width, uint32_t depth, double decay) : k(k), @@ -57,29 +66,21 @@ class BlockSplitTopK { depth(depth), decay(decay), heap_size(0), - buckets(new Bucket[width * depth]), - heap(new HeapBucket[k]) { - std::fill_n(buckets, width * depth, Bucket{0, 0}); - std::fill_n(heap, k, HeapBucket{0, 0, nullptr, 0}); + buckets(depth, std::vector(width, Bucket{0, 0})), + heap(k, HeapBucket{0, 0, ""}) { for (int i = 0; i < TOPK_DECAY_LOOKUP_TABLE; ++i) { lookup_table[i] = pow(decay, i); } } - ~BlockSplitTopK() { - for (size_t i = 0; i < k; ++i) { - delete[] heap[i].item; - } - delete[] buckets; - delete[] heap; - } + ~BlockSplitTopK() {} void Add(const std::string &item, uint32_t increment); bool Query(const std::string &item) const; std::vector List(); - void HeapifyDown(int start) const; - void HeapifyUp(int start) const; + void HeapifyDown(int start); + void HeapifyUp(int start); int CheckExistInHeap(const std::string &item) const; static int CmpHeapBucketCount(const HeapBucket &a, const HeapBucket &b); @@ -90,7 +91,7 @@ class BlockSplitTopK { int heap_size; - Bucket *buckets; - HeapBucket *heap; + std::vector> buckets; + std::vector heap; double lookup_table[TOPK_DECAY_LOOKUP_TABLE]; }; \ No newline at end of file From 60efb0ef0eb1bbb2f54f39f0956c3e012d32eee6 Mon Sep 17 00:00:00 2001 From: aibin <2573214643@qq.com> Date: Mon, 8 Dec 2025 16:05:17 +0800 Subject: [PATCH 17/18] feat: optimize TopK heapify with dirty tracking --- src/types/redis_topk.cc | 35 ++++++++++++++++---------------- src/types/redis_topk.h | 3 ++- src/types/topk.cc | 29 ++++++++++++++++++--------- src/types/topk.h | 44 +++++++++++++++++++++++++++++++++++------ 4 files changed, 78 insertions(+), 33 deletions(-) diff --git a/src/types/redis_topk.cc b/src/types/redis_topk.cc index 2af07e7b93d..983505e3b91 100644 --- a/src/types/redis_topk.cc +++ b/src/types/redis_topk.cc @@ -61,9 +61,9 @@ rocksdb::Status TopK::IncrBy(engine::Context &ctx, const Slice &user_key, const std::vector is_dirty_buckets(topk_metadata.width * topk_metadata.depth, false); std::vector is_dirty_heaps(topk_metadata.top_k, false); - topk.Add(items.data_, incr); + topk.Add(items.data_, incr, is_dirty_buckets, is_dirty_heaps); - s = setTopkData(ctx, ns_key, topk_metadata, topk); + s = setTopkData(ctx, ns_key, topk_metadata, topk, is_dirty_buckets, is_dirty_heaps); if (!s.ok()) return s; return storage_->Write(ctx, storage_->DefaultWriteOptions(), batch->GetWriteBatch()); @@ -154,10 +154,6 @@ rocksdb::Status TopK::createTopK(engine::Context &ctx, const Slice &ns_key, uint rocksdb::Status TopK::getTopKData(engine::Context &ctx, const Slice &ns_key, const TopKMetadata &metadata, BlockSplitTopK *topk) { for (uint8_t i = 0; i < 3; i++) { - std::string tk_key = getTKKey(ns_key, metadata, i); - rocksdb::PinnableSlice pinnable_value; - rocksdb::Status s = storage_->Get(ctx, ctx.GetReadOptions(), tk_key, &pinnable_value); - if (!s.ok()) return s; if (i == 0) { // get buckets of topk structure for (uint32_t j = 0; j < metadata.width * metadata.depth; j++) { @@ -166,13 +162,13 @@ rocksdb::Status TopK::getTopKData(engine::Context &ctx, const Slice &ns_key, con rocksdb::PinnableSlice bk_value; rocksdb::Status s = storage_->Get(ctx, ctx.GetReadOptions(), bk_key, &bk_value); if (!s.ok()) return s; - - int dep = j / metadata.width; - int wid = j % metadata.width; + + uint32_t dep = j / metadata.width; + uint32_t wid = j % metadata.width; if (k == 0) { - topk->buckets[dep][wid].fp = static_cast(std::stoul(pinnable_value.data())); + topk->buckets[dep][wid].fp = static_cast(std::stoul(bk_value.data())); } else { - topk->buckets[dep][wid].count = static_cast(std::stoul(pinnable_value.data())); + topk->buckets[dep][wid].count = static_cast(std::stoul(bk_value.data())); } } } @@ -186,15 +182,19 @@ rocksdb::Status TopK::getTopKData(engine::Context &ctx, const Slice &ns_key, con if (!s.ok()) return s; if (k == 0) { - topk->heap[j].count = static_cast(std::stoul(pinnable_value.data())); + topk->heap[j].count = static_cast(std::stoul(hb_value.data())); } else if (k == 1) { - topk->heap[j].fp = static_cast(std::stoul(pinnable_value.data())); + topk->heap[j].fp = static_cast(std::stoul(hb_value.data())); } else { topk->heap[j].item = hb_value.data(); } } } } else { + std::string tk_key = getTKKey(ns_key, metadata, i); + rocksdb::PinnableSlice pinnable_value; + rocksdb::Status s = storage_->Get(ctx, ctx.GetReadOptions(), tk_key, &pinnable_value); + if (!s.ok()) return s; topk->heap_size = static_cast(std::stoul(pinnable_value.data())); } } @@ -202,7 +202,7 @@ rocksdb::Status TopK::getTopKData(engine::Context &ctx, const Slice &ns_key, con } rocksdb::Status TopK::setTopkData(engine::Context &ctx, const Slice &ns_key, const TopKMetadata &metadata, - const BlockSplitTopK &topk, const std::vector &is_dirty_buckets, + const BlockSplitTopK &topk, const std::vector &is_dirty_buckets, const std::vector &is_dirty_heaps) { auto batch = storage_->GetWriteBatchBase(); @@ -215,8 +215,8 @@ rocksdb::Status TopK::setTopkData(engine::Context &ctx, const Slice &ns_key, con for (uint32_t k = 0; k < 2; k++) { std::string sub_key = getSubKey(ns_key, metadata, i, j, k); std::string sub_value; - int dep = j / metadata.width; - int wid = j % metadata.width; + uint32_t dep = j / metadata.width; + uint32_t wid = j % metadata.width; if (k == 0) { sub_value = std::to_string(topk.buckets[dep][wid].fp); } else { @@ -264,7 +264,8 @@ std::string TopK::getTKKey(const Slice &ns_key, const TopKMetadata &metadata, ui return bf_key; } -std::string TopK::getSubKey(const Slice &ns_key, const TopKMetadata &metadata, uint8_t topk_index, uint32_t sub_index, uint8_t index) { +std::string TopK::getSubKey(const Slice &ns_key, const TopKMetadata &metadata, uint8_t topk_index, uint32_t sub_index, + uint8_t index) { std::string sub_key; PutFixed8(&sub_key, topk_index); PutFixed32(&sub_key, sub_index); diff --git a/src/types/redis_topk.h b/src/types/redis_topk.h index 79abe53b7a1..089a850446c 100644 --- a/src/types/redis_topk.h +++ b/src/types/redis_topk.h @@ -61,7 +61,8 @@ class TopK : public SubKeyScanner { std::string getTKKey(const Slice &ns_key, const TopKMetadata &metadata, uint8_t index); - std::string getSubKey(const Slice &ns_key, const TopKMetadata &metadata, uint8_t topk_index, uint32_t sub_index, uint8_t index); + std::string getSubKey(const Slice &ns_key, const TopKMetadata &metadata, uint8_t topk_index, uint32_t sub_index, + uint8_t index); }; } // namespace redis \ No newline at end of file diff --git a/src/types/topk.cc b/src/types/topk.cc index ab0e4ced868..d7da238689b 100644 --- a/src/types/topk.cc +++ b/src/types/topk.cc @@ -98,7 +98,7 @@ static uint32_t TopkHash(const void *item, int itemlen, uint32_t i) { return Mur constexpr uint32_t GA = 1919; /* ---------------------------------------------------------------------- */ -void BlockSplitTopK::HeapifyDown(int start) { +void BlockSplitTopK::HeapifyDown(int start, std::vector &is_dirty_heaps) { int child = start; // check whether larger than children @@ -117,6 +117,7 @@ void BlockSplitTopK::HeapifyDown(int start) { HeapBucket top = heap[start]; do { heap[start] = heap[child]; + is_dirty_heaps[start] = true; start = child; if ((heap_size - 2) / 2 < child) { @@ -129,9 +130,10 @@ void BlockSplitTopK::HeapifyDown(int start) { } } while (heap[child].count < top.count); heap[start] = top; + is_dirty_heaps[start] = true; } -void BlockSplitTopK::HeapifyUp(int start) { +void BlockSplitTopK::HeapifyUp(int start, std::vector &is_dirty_heaps) { int parent = start; // check whether smaller than parent @@ -147,6 +149,7 @@ void BlockSplitTopK::HeapifyUp(int start) { HeapBucket bottom = heap[start]; do { heap[start] = heap[parent]; + is_dirty_heaps[start] = true; start = parent; if (start == 0) { @@ -155,6 +158,7 @@ void BlockSplitTopK::HeapifyUp(int start) { parent = (parent - 1) / 2; } while (heap[parent].count > bottom.count); heap[start] = bottom; + is_dirty_heaps[start] = true; } int BlockSplitTopK::CheckExistInHeap(const std::string &item) const { @@ -170,7 +174,8 @@ int BlockSplitTopK::CmpHeapBucketCount(const HeapBucket &a, const HeapBucket &b) return a.count < b.count ? 1 : a.count > b.count ? -1 : 0; } -void BlockSplitTopK::Add(const std::string &item, uint32_t increment) { +void BlockSplitTopK::Add(const std::string &item, uint32_t increment, std::vector &is_dirty_buckets, + std::vector &is_dirty_heaps) { uint32_t itemlen = item.size(); const char *data = item.c_str(); CounterT max_count = 0; @@ -184,10 +189,12 @@ void BlockSplitTopK::Add(const std::string &item, uint32_t increment) { if (buckets[i][loc].count == 0) { buckets[i][loc].fp = fp; buckets[i][loc].count = increment; + is_dirty_buckets[i * width + loc] = true; max_count = std::max(max_count, buckets[i][loc].count); } else if (buckets[i][loc].fp == fp && location != -1) { buckets[i][loc].count += increment; max_count = std::max(max_count, buckets[i][loc].count); + is_dirty_buckets[i * width + loc] = true; } else { // decay uint32_t local_incr = increment; @@ -196,8 +203,9 @@ void BlockSplitTopK::Add(const std::string &item, uint32_t increment) { if (buckets[i][loc].count < TOPK_DECAY_LOOKUP_TABLE) { decay = lookup_table[buckets[i][loc].count]; } else { - decay = pow(lookup_table[TOPK_DECAY_LOOKUP_TABLE - 1], (buckets[i][loc].count / (TOPK_DECAY_LOOKUP_TABLE - 1))) * - lookup_table[buckets[i][loc].count % (TOPK_DECAY_LOOKUP_TABLE - 1)]; + decay = + pow(lookup_table[TOPK_DECAY_LOOKUP_TABLE - 1], (buckets[i][loc].count / (TOPK_DECAY_LOOKUP_TABLE - 1))) * + lookup_table[buckets[i][loc].count % (TOPK_DECAY_LOOKUP_TABLE - 1)]; } double chance = rand() / (double)RAND_MAX; if (chance < decay) { @@ -205,6 +213,7 @@ void BlockSplitTopK::Add(const std::string &item, uint32_t increment) { if (buckets[i][loc].count == 0) { buckets[i][loc].fp = fp; buckets[i][loc].count = 1; + is_dirty_buckets[i * width + loc] = true; max_count = std::max(max_count, buckets[i][loc].count); break; } @@ -220,19 +229,21 @@ void BlockSplitTopK::Add(const std::string &item, uint32_t increment) { heap[0].item = item; heap[0].count = max_count; - - HeapifyDown(0); + is_dirty_heaps[0] = true; + HeapifyDown(0, is_dirty_heaps); } } else { heap[location].count += increment; - HeapifyDown(location); + HeapifyDown(location, is_dirty_heaps); } } else { heap[heap_size].fp = fp; heap[heap_size].item = item; heap[heap_size].count = max_count; - HeapifyUp((int)heap_size); + is_dirty_heaps[heap_size] = true; + + HeapifyUp((int)heap_size, is_dirty_heaps); heap_size++; } } diff --git a/src/types/topk.h b/src/types/topk.h index 4f4c683f55f..2791a8d7dfd 100644 --- a/src/types/topk.h +++ b/src/types/topk.h @@ -37,14 +37,45 @@ struct HeapBucket { CounterT count; std::string item; - HeapBucket& operator=(const HeapBucket& other) { + HeapBucket() = default; + + HeapBucket(uint32_t fp, CounterT count, std::string item) : fp(fp), count(count), item(std::move(item)) {} + + HeapBucket(const HeapBucket &other) { + if (this != &other) { + fp = other.fp; + count = other.count; + item = other.item; + } + } + + HeapBucket(const HeapBucket &&other) noexcept { + if (this != &other) { + fp = other.fp; + count = other.count; + item = other.item; + } + } + + HeapBucket &operator=(const HeapBucket &other) { + if (this != &other) { + fp = other.fp; + count = other.count; + item = other.item; + } + return *this; + } + + HeapBucket &operator=(const HeapBucket &&other) noexcept { if (this != &other) { fp = other.fp; count = other.count; item = other.item; - return *this; } + return *this; } + + ~HeapBucket() = default; }; struct Bucket { @@ -73,14 +104,15 @@ class BlockSplitTopK { } } - ~BlockSplitTopK() {} + ~BlockSplitTopK() = default; - void Add(const std::string &item, uint32_t increment); + void Add(const std::string &item, uint32_t increment, std::vector &is_dirty_buckets, + std::vector &is_dirty_heaps); bool Query(const std::string &item) const; std::vector List(); - void HeapifyDown(int start); - void HeapifyUp(int start); + void HeapifyDown(int start, std::vector &is_dirty_heaps); + void HeapifyUp(int start, std::vector &is_dirty_heaps); int CheckExistInHeap(const std::string &item) const; static int CmpHeapBucketCount(const HeapBucket &a, const HeapBucket &b); From b3f08c39344cf45ab8da8d79c37b7278944fc3bd Mon Sep 17 00:00:00 2001 From: hidedim <2573214643@qq.com> Date: Wed, 8 Apr 2026 21:49:55 +0800 Subject: [PATCH 18/18] fix: rand() function --- src/types/topk.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/types/topk.cc b/src/types/topk.cc index d7da238689b..6713d2fcdaf 100644 --- a/src/types/topk.cc +++ b/src/types/topk.cc @@ -25,6 +25,7 @@ #include #include #include +#include //----------------------------------------------------------------------------- // MurmurHash2 was written by Austin Appleby, and is placed in the public @@ -207,7 +208,10 @@ void BlockSplitTopK::Add(const std::string &item, uint32_t increment, std::vecto pow(lookup_table[TOPK_DECAY_LOOKUP_TABLE - 1], (buckets[i][loc].count / (TOPK_DECAY_LOOKUP_TABLE - 1))) * lookup_table[buckets[i][loc].count % (TOPK_DECAY_LOOKUP_TABLE - 1)]; } - double chance = rand() / (double)RAND_MAX; + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<> dis(0.0, 1.0); + double chance = dis(gen); if (chance < decay) { --buckets[i][loc].count; if (buckets[i][loc].count == 0) {