Skip to content

Commit 0140ac7

Browse files
committed
config: add shard_local_cfg_unsafe()
And swap any uses of `shard_local_cfg()` which occur in the bootstrap process (before it is marked as `ready`) over to the unsafe accessor. These uses of potentially stale values are all audited and deemed safe, due to minimal or no impact on the behavior of the system post bootstrap. Two other relevant changes in this commit: * Pulling out `feature_table::setup_metrics()` into its own function, called after the bootstrap process (in order to properly respect `shard_local_cfg().disable_{public}_metrics`) * Updating `storage_resources` to _not_ invoke the `internal::chunks()` singleton constructor, which would in turn invoke `shard_local_cfg()`. We use `storage_resources for an early replay of the `kvstore` segments, so it is fine to use stale configuration values here. However, after bootstrapping is complete, it is *probably* important to set the `_append_chunk_size` back to the same value as what `internal::chunks()` is operating with. We now do so when initializing the storage system in `wire_up_bootstrap_services()`.
1 parent 1d3a34d commit 0140ac7

14 files changed

Lines changed: 93 additions & 32 deletions

src/v/cluster/cluster_discovery.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ cluster_discovery::cluster_discovery(
3737
ss::abort_source& as)
3838
: _node_uuid(node_uuid)
3939
, _cluster_uuid(std::move(cluster_uuid))
40-
, _join_retry_jitter(config::shard_local_cfg().join_retry_timeout_ms())
40+
, _join_retry_jitter(config::shard_local_cfg_unsafe().join_retry_timeout_ms())
4141
, _join_timeout(std::chrono::seconds(2))
4242
, _as(as) {}
4343

src/v/cluster/config_manager.cc

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,7 @@ ss::future<> config_manager::wait_for_bootstrap() {
145145
ss::future<> config_manager::do_bootstrap() {
146146
config_update_request update;
147147

148-
config::shard_local_cfg().for_each(
148+
config::shard_local_cfg_unsafe().for_each(
149149
[&update](const config::base_property& p) {
150150
if (!p.is_default()) {
151151
json::StringBuffer buf;
@@ -292,7 +292,7 @@ static void preload_local(
292292
const ss::sstring& raw_value,
293293
std::optional<std::reference_wrapper<config_manager::preload_result>>
294294
result) {
295-
auto& cfg = config::shard_local_cfg();
295+
auto& cfg = config::shard_local_cfg_unsafe();
296296
if (cfg.contains(key)) {
297297
auto& property = cfg.get(key);
298298
try {
@@ -339,7 +339,7 @@ static void preload_local(
339339
const YAML::Node& value,
340340
std::optional<std::reference_wrapper<config_manager::preload_result>>
341341
result) {
342-
auto& cfg = config::shard_local_cfg();
342+
auto& cfg = config::shard_local_cfg_unsafe();
343343
if (cfg.contains(key)) {
344344
auto& property = cfg.get(key);
345345
std::string raw_value;
@@ -408,7 +408,7 @@ config_manager::preload(const YAML::Node& legacy_config) {
408408
// to set something in redpanda.yaml and it's not working.
409409
if (legacy_config["redpanda"]) {
410410
const auto nag_properties
411-
= config::shard_local_cfg().property_names_and_aliases();
411+
= config::shard_local_cfg_unsafe().property_names_and_aliases();
412412
for (const auto& node : legacy_config["redpanda"]) {
413413
auto name = node.first.as<ss::sstring>();
414414
if (nag_properties.contains(name)) {
@@ -447,7 +447,7 @@ ss::future<bool> config_manager::load_bootstrap() {
447447

448448
// This node has never seen a cluster configuration message.
449449
// Bootstrap configuration from local yaml file.
450-
auto errors = config::shard_local_cfg().read_yaml(config);
450+
auto errors = config::shard_local_cfg_unsafe().read_yaml(config);
451451
for (const auto& i : errors) {
452452
vlog(
453453
clusterlog.warn,
@@ -457,18 +457,19 @@ ss::future<bool> config_manager::load_bootstrap() {
457457
}
458458

459459
co_await ss::smp::invoke_on_all(
460-
[&config] { config::shard_local_cfg().read_yaml(config); });
460+
[&config] { config::shard_local_cfg_unsafe().read_yaml(config); });
461461

462462
co_return true;
463463
}
464464

465465
ss::future<> config_manager::load_legacy(const YAML::Node& legacy_config) {
466-
co_await ss::smp::invoke_on_all(
467-
[&legacy_config] { config::shard_local_cfg().load(legacy_config); });
466+
co_await ss::smp::invoke_on_all([&legacy_config] {
467+
config::shard_local_cfg_unsafe().load(legacy_config);
468+
});
468469

469470
// This node has never seen a cluster configuration message.
470471
// Bootstrap configuration from local yaml file.
471-
auto errors = config::shard_local_cfg().load(legacy_config);
472+
auto errors = config::shard_local_cfg_unsafe().load(legacy_config);
472473

473474
// Report any invalid properties. Do not refuse to start redpanda,
474475
// as the properties will have been either ignored or clamped
@@ -625,7 +626,7 @@ ss::future<> config_manager::reconcile_status() {
625626
*/
626627
config_manager::apply_result
627628
apply_local(const cluster_config_delta_cmd_data& data, bool silent) {
628-
auto& cfg = config::shard_local_cfg();
629+
auto& cfg = config::shard_local_cfg_unsafe();
629630
auto result = config_manager::apply_result{};
630631
for (const auto& u : data.upsert) {
631632
if (!cfg.contains(u.key)) {
@@ -813,7 +814,7 @@ void config_manager::merge_apply_result(
813814
*/
814815
ss::future<>
815816
config_manager::store_delta(const cluster_config_delta_cmd_data& data) {
816-
auto& cfg = config::shard_local_cfg();
817+
auto& cfg = config::shard_local_cfg_unsafe();
817818

818819
for (const auto& u : data.upsert) {
819820
/// skip section

src/v/config/configuration.cc

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4940,8 +4940,18 @@ std::unique_ptr<configuration> make_config() {
49404940
}
49414941
}
49424942

4943-
configuration& shard_local_cfg() {
4943+
configuration& shard_local_cfg_unsafe() {
49444944
static thread_local std::unique_ptr<configuration> cfg = make_config();
49454945
return *cfg;
49464946
}
4947+
4948+
configuration& shard_local_cfg() {
4949+
auto& cfg = shard_local_cfg_unsafe();
4950+
vassert(
4951+
cfg.is_ready(),
4952+
"Used shard_local_cfg() before it was marked as ready by the startup "
4953+
"process");
4954+
return cfg;
4955+
}
4956+
49474957
} // namespace config

src/v/config/configuration.h

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -844,6 +844,9 @@ struct configuration final : public config_store {
844844

845845
development_feature_property<int> development_feature_property_testing_only;
846846

847+
void mark_ready(bool ready) { _ready = ready; }
848+
bool is_ready() const { return _ready; }
849+
847850
private:
848851
// to query if experimental features are enabled in order to log a nag. it
849852
// does not use the query to control any experimental feature.
@@ -872,6 +875,8 @@ struct configuration final : public config_store {
872875
}
873876

874877
ss::sstring store_name() const override { return "cluster"; }
878+
879+
bool _ready{true};
875880
};
876881

877882
template<typename T>
@@ -882,6 +887,21 @@ bool development_feature_property<T>::development_features_enabled(
882887

883888
std::unique_ptr<configuration> make_config();
884889

890+
// Accessors for the shard local cluster configuration.
891+
// shard_local_cfg() contains a vassert which strictly enforces that the
892+
// configuration has been marked as ready by the bootstrap process after we have
893+
// successfully preloaded from our node local state, and furthermore, recieved
894+
// an up to date snapshot from the controller leader of the cluster wide view of
895+
// the configuration.
896+
// However, some configurations are somewhat required to be accessed in the
897+
// bootstrapping process before this can happen. For that reason, we provide a
898+
// shard_local_cfg_unsafe() accessor, which does not perform this same check.
899+
// It is highly recommended that use of this is kept to a minimum, and if you do
900+
// need to use it, that you are confident that using a potentially stale
901+
// configuration value during the bootstrapping process will not have any
902+
// ramifications on the behavior of the system after its value is updated during
903+
// bootstrapping.
885904
configuration& shard_local_cfg();
905+
configuration& shard_local_cfg_unsafe();
886906

887907
} // namespace config

src/v/config/node_config.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -341,7 +341,7 @@ node_config::error_map_t node_config::load(const YAML::Node& root_node) {
341341
throw std::invalid_argument("'redpanda' root is required");
342342
}
343343

344-
auto ignore = shard_local_cfg().property_names_and_aliases();
344+
auto ignore = shard_local_cfg_unsafe().property_names_and_aliases();
345345

346346
auto errors = config_store::read_yaml(root_node["redpanda"]);
347347
validate_multi_node_property_config(errors);

src/v/features/feature_table.cc

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -384,7 +384,9 @@ feature_table::feature_table() {
384384
}
385385
}
386386
}
387+
}
387388

389+
void feature_table::setup_metrics() {
388390
_probe = std::make_unique<probe>(*this);
389391
_probe->setup_metrics();
390392
}
@@ -824,7 +826,7 @@ feature_table::decode_version_fence(model::record_batch batch) {
824826
void feature_table::set_original_version(cluster::cluster_version v) {
825827
_original_version = v;
826828
if (v != cluster::invalid_version) {
827-
config::shard_local_cfg().notify_original_version(
829+
config::shard_local_cfg_unsafe().notify_original_version(
828830
config::legacy_version{v});
829831
}
830832
}

src/v/features/feature_table.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -662,7 +662,7 @@ class feature_table {
662662
std::optional<feature> resolve_name(std::string_view feature_name) const;
663663

664664
void set_license(security::license license);
665-
665+
void setup_metrics();
666666
/// Sets the builtin trial license based on the cluster creation time
667667
void set_builtin_trial_license(model::timestamp cluster_creation_timestamp);
668668

src/v/redpanda/application.cc

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
#include "datalake/credential_manager.h"
3232
#include "datalake/datalake_manager.h"
3333
#include "datalake/datalake_usage_aggregator.h"
34+
#include "features/feature_table.h"
3435
#include "kafka/client/configuration.h"
3536
#include "kafka/server/rm_group_frontend.h"
3637
#include "metrics/prometheus_sanitize.h"
@@ -340,6 +341,7 @@ int application::run(int ac, char** av) {
340341
// checks), so crypto must be initialized first.
341342
wire_up_and_start_crypto_services();
342343
wire_up_pre_bootstrap_services();
344+
mark_config_ready(false).get();
343345
hydrate_cluster_config(node_cfg_yaml);
344346
establish_cluster_view(app_signal);
345347
log_cluster_config();
@@ -518,6 +520,7 @@ void application::initialize(
518520
}
519521

520522
void application::setup_metrics() {
523+
feature_table.invoke_on_all(&features::feature_table::setup_metrics).get();
521524
setup_internal_metrics();
522525
setup_public_metrics();
523526
}
@@ -773,6 +776,15 @@ void application::establish_cluster_view(::stop_signal& app_signal) {
773776
app_signal.abort_source());
774777

775778
bootstrap_controller_view().get();
779+
780+
// The shard_local_cfg() is now safe to use as we have a view consistent
781+
// with the rest of the cluster.
782+
mark_config_ready(true).get();
783+
}
784+
785+
ss::future<> application::mark_config_ready(bool ready) {
786+
co_await ss::smp::invoke_on_all(
787+
[ready] { config::shard_local_cfg_unsafe().mark_ready(ready); });
776788
}
777789

778790
void application::hydrate_cluster_config(const YAML::Node& config) {

src/v/redpanda/application.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -260,6 +260,8 @@ class application : public ssx::sharded_service_container {
260260
std::optional<cloud_storage_clients::bucket_name>& bucket_name,
261261
cloud_topics::test_fixture_cfg ct_test_cfg);
262262

263+
// Marks the shard_local_cfg as ready per the provided flag.
264+
ss::future<> mark_config_ready(bool ready);
263265
// Performs recovery on the local kvstore, applies a local feature table
264266
// snapshot, and sets in-memory node/cluster UUIDs.
265267
ss::future<> bootstrap_from_kvstore();

src/v/redpanda/application_bootstrap.cc

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,12 @@ void application::wire_up_and_start_crypto_services() {
9292

9393
void application::wire_up_bootstrap_services() {
9494
// Wire up local storage.
95+
storage
96+
.invoke_on_all([](storage::api& s) {
97+
s.resources().update_append_chunk_size(
98+
storage::internal::chunks().chunk_size());
99+
})
100+
.get();
95101
ss::smp::invoke_on_all([] {
96102
return storage::internal::chunks().start();
97103
}).get();

0 commit comments

Comments
 (0)