Skip to content

Commit d3090b8

Browse files
committed
config: add shard_local_cfg_unsafe()
And swap any uses of `shard_local_cfg()` which occur in the bootstrap process (before it is marked as `ready`) over to the unsafe accessor. These uses of potentially stale values are all audited and deemed safe, due to minimal or no impact on the behavior of the system post bootstrap. Two other relevant changes in this commit: * Pulling out `feature_table::setup_metrics()` into its own function, called after the bootstrap process (in order to properly respect `shard_local_cfg().disable_{public}_metrics`) * Updating `storage_resources` to _not_ invoke the `internal::chunks()` singleton constructor, which would in turn invoke `shard_local_cfg()`. We use `storage_resources for an early replay of the `kvstore` segments, so it is fine to use stale configuration values here. However, after bootstrapping is complete, it is *probably* important to set the `_append_chunk_size` back to the same value as what `internal::chunks()` is operating with. We now do so when initializing the storage system in `wire_up_bootstrap_services()`.
1 parent 43ac676 commit d3090b8

16 files changed

Lines changed: 105 additions & 38 deletions

src/v/cluster/cluster_discovery.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ cluster_discovery::cluster_discovery(
3737
ss::abort_source& as)
3838
: _node_uuid(node_uuid)
3939
, _cluster_uuid(std::move(cluster_uuid))
40-
, _join_retry_jitter(config::shard_local_cfg().join_retry_timeout_ms())
40+
, _join_retry_jitter(config::shard_local_cfg_unsafe().join_retry_timeout_ms())
4141
, _join_timeout(std::chrono::seconds(2))
4242
, _as(as) {}
4343

src/v/cluster/config_manager.cc

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,7 @@ ss::future<> config_manager::wait_for_bootstrap() {
145145
ss::future<> config_manager::do_bootstrap() {
146146
config_update_request update;
147147

148-
config::shard_local_cfg().for_each(
148+
config::shard_local_cfg_unsafe().for_each(
149149
[&update](const config::base_property& p) {
150150
if (!p.is_default()) {
151151
json::StringBuffer buf;
@@ -292,7 +292,7 @@ static void preload_local(
292292
const ss::sstring& raw_value,
293293
std::optional<std::reference_wrapper<config_manager::preload_result>>
294294
result) {
295-
auto& cfg = config::shard_local_cfg();
295+
auto& cfg = config::shard_local_cfg_unsafe();
296296
if (cfg.contains(key)) {
297297
auto& property = cfg.get(key);
298298
try {
@@ -339,7 +339,7 @@ static void preload_local(
339339
const YAML::Node& value,
340340
std::optional<std::reference_wrapper<config_manager::preload_result>>
341341
result) {
342-
auto& cfg = config::shard_local_cfg();
342+
auto& cfg = config::shard_local_cfg_unsafe();
343343
if (cfg.contains(key)) {
344344
auto& property = cfg.get(key);
345345
std::string raw_value;
@@ -408,7 +408,7 @@ config_manager::preload(const YAML::Node& legacy_config) {
408408
// to set something in redpanda.yaml and it's not working.
409409
if (legacy_config["redpanda"]) {
410410
const auto nag_properties
411-
= config::shard_local_cfg().property_names_and_aliases();
411+
= config::shard_local_cfg_unsafe().property_names_and_aliases();
412412
for (const auto& node : legacy_config["redpanda"]) {
413413
auto name = node.first.as<ss::sstring>();
414414
if (nag_properties.contains(name)) {
@@ -447,7 +447,7 @@ ss::future<bool> config_manager::load_bootstrap() {
447447

448448
// This node has never seen a cluster configuration message.
449449
// Bootstrap configuration from local yaml file.
450-
auto errors = config::shard_local_cfg().read_yaml(config);
450+
auto errors = config::shard_local_cfg_unsafe().read_yaml(config);
451451
for (const auto& i : errors) {
452452
vlog(
453453
clusterlog.warn,
@@ -457,18 +457,19 @@ ss::future<bool> config_manager::load_bootstrap() {
457457
}
458458

459459
co_await ss::smp::invoke_on_all(
460-
[&config] { config::shard_local_cfg().read_yaml(config); });
460+
[&config] { config::shard_local_cfg_unsafe().read_yaml(config); });
461461

462462
co_return true;
463463
}
464464

465465
ss::future<> config_manager::load_legacy(const YAML::Node& legacy_config) {
466-
co_await ss::smp::invoke_on_all(
467-
[&legacy_config] { config::shard_local_cfg().load(legacy_config); });
466+
co_await ss::smp::invoke_on_all([&legacy_config] {
467+
config::shard_local_cfg_unsafe().load(legacy_config);
468+
});
468469

469470
// This node has never seen a cluster configuration message.
470471
// Bootstrap configuration from local yaml file.
471-
auto errors = config::shard_local_cfg().load(legacy_config);
472+
auto errors = config::shard_local_cfg_unsafe().load(legacy_config);
472473

473474
// Report any invalid properties. Do not refuse to start redpanda,
474475
// as the properties will have been either ignored or clamped
@@ -625,7 +626,7 @@ ss::future<> config_manager::reconcile_status() {
625626
*/
626627
config_manager::apply_result
627628
apply_local(const cluster_config_delta_cmd_data& data, bool silent) {
628-
auto& cfg = config::shard_local_cfg();
629+
auto& cfg = config::shard_local_cfg_unsafe();
629630
auto result = config_manager::apply_result{};
630631
for (const auto& u : data.upsert) {
631632
if (!cfg.contains(u.key)) {
@@ -813,7 +814,7 @@ void config_manager::merge_apply_result(
813814
*/
814815
ss::future<>
815816
config_manager::store_delta(const cluster_config_delta_cmd_data& data) {
816-
auto& cfg = config::shard_local_cfg();
817+
auto& cfg = config::shard_local_cfg_unsafe();
817818

818819
for (const auto& u : data.upsert) {
819820
/// skip section

src/v/config/configuration.cc

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4981,8 +4981,18 @@ std::unique_ptr<configuration> make_config() {
49814981
}
49824982
}
49834983

4984-
configuration& shard_local_cfg() {
4984+
configuration& shard_local_cfg_unsafe() {
49854985
static thread_local std::unique_ptr<configuration> cfg = make_config();
49864986
return *cfg;
49874987
}
4988+
4989+
configuration& shard_local_cfg() {
4990+
auto& cfg = shard_local_cfg_unsafe();
4991+
vassert(
4992+
cfg.is_ready(),
4993+
"Used shard_local_cfg() before it was marked as ready by the startup "
4994+
"process");
4995+
return cfg;
4996+
}
4997+
49884998
} // namespace config

src/v/config/configuration.h

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -846,6 +846,9 @@ struct configuration final : public config_store {
846846

847847
development_feature_property<int> development_feature_property_testing_only;
848848

849+
void mark_ready(bool ready) { _ready = ready; }
850+
bool is_ready() const { return _ready; }
851+
849852
private:
850853
// to query if experimental features are enabled in order to log a nag. it
851854
// does not use the query to control any experimental feature.
@@ -874,6 +877,8 @@ struct configuration final : public config_store {
874877
}
875878

876879
ss::sstring store_name() const override { return "cluster"; }
880+
881+
bool _ready{true};
877882
};
878883

879884
template<typename T>
@@ -884,6 +889,21 @@ bool development_feature_property<T>::development_features_enabled(
884889

885890
std::unique_ptr<configuration> make_config();
886891

892+
// Accessors for the shard local cluster configuration.
893+
// shard_local_cfg() contains a vassert which strictly enforces that the
894+
// configuration has been marked as ready by the bootstrap process after we have
895+
// successfully preloaded from our node local state, and furthermore, recieved
896+
// an up to date snapshot from the controller leader of the cluster wide view of
897+
// the configuration.
898+
// However, some configurations are somewhat required to be accessed in the
899+
// bootstrapping process before this can happen. For that reason, we provide a
900+
// shard_local_cfg_unsafe() accessor, which does not perform this same check.
901+
// It is highly recommended that use of this is kept to a minimum, and if you do
902+
// need to use it, that you are confident that using a potentially stale
903+
// configuration value during the bootstrapping process will not have any
904+
// ramifications on the behavior of the system after its value is updated during
905+
// bootstrapping.
887906
configuration& shard_local_cfg();
907+
configuration& shard_local_cfg_unsafe();
888908

889909
} // namespace config

src/v/config/node_config.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -341,7 +341,7 @@ node_config::error_map_t node_config::load(const YAML::Node& root_node) {
341341
throw std::invalid_argument("'redpanda' root is required");
342342
}
343343

344-
auto ignore = shard_local_cfg().property_names_and_aliases();
344+
auto ignore = shard_local_cfg_unsafe().property_names_and_aliases();
345345

346346
auto errors = config_store::read_yaml(root_node["redpanda"]);
347347
validate_multi_node_property_config(errors);

src/v/features/feature_table.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -828,7 +828,7 @@ feature_table::decode_version_fence(model::record_batch batch) {
828828
void feature_table::set_original_version(cluster::cluster_version v) {
829829
_original_version = v;
830830
if (v != cluster::invalid_version) {
831-
config::shard_local_cfg().notify_original_version(
831+
config::shard_local_cfg_unsafe().notify_original_version(
832832
config::legacy_version{v});
833833
}
834834
}

src/v/redpanda/application.cc

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
#include "datalake/credential_manager.h"
3232
#include "datalake/datalake_manager.h"
3333
#include "datalake/datalake_usage_aggregator.h"
34+
#include "features/feature_table.h"
3435
#include "kafka/client/configuration.h"
3536
#include "kafka/server/rm_group_frontend.h"
3637
#include "metrics/prometheus_sanitize.h"
@@ -340,6 +341,7 @@ int application::run(int ac, char** av) {
340341
// checks), so crypto must be initialized first.
341342
wire_up_and_start_crypto_services();
342343
wire_up_pre_bootstrap_services();
344+
mark_config_ready(false).get();
343345
hydrate_cluster_config(node_cfg_yaml);
344346
wire_up_and_start_rpc_service();
345347
establish_cluster_view(app_signal);
@@ -802,6 +804,15 @@ void application::establish_cluster_view(::stop_signal& app_signal) {
802804
app_signal.abort_source());
803805

804806
bootstrap_controller_view().get();
807+
808+
// The shard_local_cfg() is now safe to use as we have a view consistent
809+
// with the rest of the cluster.
810+
mark_config_ready(true).get();
811+
}
812+
813+
ss::future<> application::mark_config_ready(bool ready) {
814+
co_await ss::smp::invoke_on_all(
815+
[ready] { config::shard_local_cfg_unsafe().mark_ready(ready); });
805816
}
806817

807818
void application::hydrate_cluster_config(const YAML::Node& config) {

src/v/redpanda/application.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -264,6 +264,8 @@ class application : public ssx::sharded_service_container {
264264
std::optional<cloud_storage_clients::bucket_name>& bucket_name,
265265
cloud_topics::test_fixture_cfg ct_test_cfg);
266266

267+
// Marks the shard_local_cfg as ready per the provided flag.
268+
ss::future<> mark_config_ready(bool ready);
267269
// Performs recovery on the local kvstore, applies a local feature table
268270
// snapshot, and sets in-memory node/cluster UUIDs.
269271
ss::future<> bootstrap_from_kvstore();

src/v/redpanda/application_bootstrap.cc

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,12 @@ void application::wire_up_and_start_crypto_services() {
9292

9393
void application::wire_up_bootstrap_services() {
9494
// Wire up local storage.
95+
storage
96+
.invoke_on_all([](storage::api& s) {
97+
s.resources().update_append_chunk_size(
98+
storage::internal::chunks().chunk_size());
99+
})
100+
.get();
95101
ss::smp::invoke_on_all([] {
96102
return storage::internal::chunks().start();
97103
}).get();

src/v/redpanda/application_config.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -175,8 +175,8 @@ storage::kvstore_config kvstore_config_from_global_config(
175175
* - ... #cores
176176
*/
177177
return storage::kvstore_config(
178-
config::shard_local_cfg().kvstore_max_segment_size(),
179-
config::shard_local_cfg().kvstore_flush_interval.bind(),
178+
config::shard_local_cfg_unsafe().kvstore_max_segment_size(),
179+
config::shard_local_cfg_unsafe().kvstore_flush_interval.bind(),
180180
config::node().data_directory().as_sstring(),
181181
sanitizer_config);
182182
}

0 commit comments

Comments
 (0)