Skip to content

Commit a8a304a

Browse files
disagg: Detect cloud vendor according to the endpoint and refine Grafana panels (#10568) (#10572)
close #10569 disagg: Detect cloud vendor according to the endpoint and refine Grafana panels Signed-off-by: JaySon-Huang <tshent@qq.com> Co-authored-by: JaySon-Huang <tshent@qq.com>
1 parent 56eedda commit a8a304a

11 files changed

Lines changed: 608 additions & 326 deletions

File tree

dbms/src/Flash/Disaggregated/S3LockService.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include <Interpreters/Context.h>
2020
#include <Storages/DeltaMerge/File/DMFile.h>
2121
#include <Storages/KVStore/TMTContext.h>
22+
#include <Storages/S3/Lifecycle.h>
2223
#include <Storages/S3/S3Common.h>
2324
#include <Storages/S3/S3Filename.h>
2425
#include <TiDB/OwnerInfo.h>

dbms/src/Storages/S3/Credentials.cpp

Lines changed: 77 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -282,38 +282,95 @@ std::shared_ptr<Aws::Auth::AWSCredentialsProvider> buildECSCredentialsProvider(c
282282
return nullptr;
283283
}
284284

285+
class TiFlashEnvironmentCredentialsProvider : public Aws::Auth::AWSCredentialsProvider
286+
{
287+
public:
288+
/**
289+
* Reads credentials from the Environment variables ACCESS_KEY_ID and SECRET_ACCESS_KEY and SESSION_TOKEN if they exist.
290+
* If they are not found, empty credentials are returned.
291+
*/
292+
Aws::Auth::AWSCredentials GetAWSCredentials() override
293+
{
294+
auto log = Logger::get();
295+
Aws::Auth::AWSCredentials credentials;
296+
297+
if (auto access_key = Aws::Environment::GetEnv("ACCESS_KEY_ID"); !access_key.empty())
298+
{
299+
credentials.SetAWSAccessKeyId(access_key);
300+
301+
auto secret_key = Aws::Environment::GetEnv("SECRET_ACCESS_KEY");
302+
if (!secret_key.empty())
303+
{
304+
credentials.SetAWSSecretKey(secret_key);
305+
}
306+
307+
auto session_token = Aws::Environment::GetEnv("SESSION_TOKEN");
308+
if (!session_token.empty())
309+
{
310+
credentials.SetSessionToken(session_token);
311+
}
312+
313+
LOG_INFO(
314+
log,
315+
"Creating TiFlashEnvironmentCredentialsProvider with ACCESS_KEY_ID, "
316+
"access_key_id_size={} secret_key_size={} session_token_size={}",
317+
access_key.size(),
318+
secret_key.size(),
319+
session_token.size());
320+
}
321+
322+
return credentials;
323+
}
324+
};
325+
285326
/// S3CredentialsProviderChain ///
286327

287-
S3CredentialsProviderChain::S3CredentialsProviderChain(const Aws::Client::ClientConfiguration & cfg)
328+
S3CredentialsProviderChain::S3CredentialsProviderChain(const Aws::Client::ClientConfiguration & cfg, CloudVendor vendor)
288329
: log(Logger::get())
289330
{
290331
/// AWS API tries credentials providers one by one. Some of providers (like ProfileConfigFileAWSCredentialsProvider) can be
291332
/// quite verbose even if nobody configured them. So tiflash use our provider first and only after it use default providers.
292333
/// And ProcessCredentialsProvider is useless in tiflash deployment cases, removed.
293334

294-
if (auto provider = DB::S3::STSAssumeRoleWebIdentityCredentialsProvider::build(); provider != nullptr)
295-
AddProvider(provider);
296-
297-
// Alibaba Cloud credentials providers
298-
if (auto provider = DB::S3::AlibabaCloud::ECSRAMRoleCredentialsProvider::build(cfg); provider != nullptr)
335+
switch (vendor)
299336
{
300-
AddProvider(provider);
337+
case CloudVendor::AWS:
338+
{
339+
if (auto provider = DB::S3::STSAssumeRoleWebIdentityCredentialsProvider::build(); provider != nullptr)
340+
AddProvider(provider);
341+
// AWS environment variable credentials provider always added
342+
AddProvider(std::make_shared<Aws::Auth::EnvironmentAWSCredentialsProvider>());
343+
344+
// AWS ECS credentials provider
345+
if (auto provider = buildECSCredentialsProvider(log); provider != nullptr)
346+
AddProvider(provider);
347+
348+
/// Quite verbose provider (argues if file with credentials doesn't exist) so it's the last one
349+
/// in chain.
350+
AddProvider(std::make_shared<Aws::Auth::ProfileConfigFileAWSCredentialsProvider>());
351+
break;
301352
}
302-
if (auto provider = DB::S3::AlibabaCloud::OIDCCredentialsProvider::build(cfg); provider != nullptr)
353+
case CloudVendor::AlibabaCloud:
303354
{
304-
AddProvider(provider);
355+
// Alibaba Cloud credentials providers
356+
if (auto provider = DB::S3::AlibabaCloud::ECSRAMRoleCredentialsProvider::build(cfg); provider != nullptr)
357+
{
358+
AddProvider(provider);
359+
}
360+
if (auto provider = DB::S3::AlibabaCloud::OIDCCredentialsProvider::build(cfg); provider != nullptr)
361+
{
362+
AddProvider(provider);
363+
}
364+
break;
365+
}
366+
case CloudVendor::Unknown:
367+
{
368+
AddProvider(std::make_shared<TiFlashEnvironmentCredentialsProvider>());
369+
// Add AWS environment variable credentials provider as a default fallback
370+
AddProvider(std::make_shared<Aws::Auth::EnvironmentAWSCredentialsProvider>());
371+
break;
372+
}
305373
}
306-
307-
// AWS environment variable credentials provider always added
308-
AddProvider(std::make_shared<Aws::Auth::EnvironmentAWSCredentialsProvider>());
309-
310-
// AWS ECS credentials provider
311-
if (auto provider = buildECSCredentialsProvider(log); provider != nullptr)
312-
AddProvider(provider);
313-
314-
/// Quite verbose provider (argues if file with credentials doesn't exist) so it's the last one
315-
/// in chain.
316-
AddProvider(std::make_shared<Aws::Auth::ProfileConfigFileAWSCredentialsProvider>());
317374
}
318375

319376
} // namespace DB::S3

dbms/src/Storages/S3/Credentials.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#pragma once
1616

1717
#include <Common/Logger.h>
18+
#include <Storages/S3/S3Common.h>
1819
#include <aws/core/auth/AWSCredentialsProviderChain.h>
1920

2021
namespace DB::S3
@@ -23,7 +24,7 @@ namespace DB::S3
2324
class S3CredentialsProviderChain : public Aws::Auth::AWSCredentialsProviderChain
2425
{
2526
public:
26-
explicit S3CredentialsProviderChain(const Aws::Client::ClientConfiguration & cfg);
27+
explicit S3CredentialsProviderChain(const Aws::Client::ClientConfiguration & cfg, CloudVendor vendor);
2728

2829
private:
2930
LoggerPtr log;

dbms/src/Storages/S3/Lifecycle.cpp

Lines changed: 258 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,258 @@
1+
// Copyright 2025 PingCAP, Inc.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
#include <Common/FailPoint.h>
16+
#include <Common/Logger.h>
17+
#include <Storages/S3/Lifecycle.h>
18+
#include <Storages/S3/S3Common.h>
19+
#include <aws/s3/model/ExpirationStatus.h>
20+
#include <aws/s3/model/GetBucketLifecycleConfigurationRequest.h>
21+
#include <aws/s3/model/LifecycleConfiguration.h>
22+
#include <aws/s3/model/LifecycleExpiration.h>
23+
#include <aws/s3/model/LifecycleRule.h>
24+
#include <aws/s3/model/LifecycleRuleAndOperator.h>
25+
#include <aws/s3/model/PutBucketLifecycleConfigurationRequest.h>
26+
#include <aws/s3/model/Rule.h>
27+
#include <aws/s3/model/Tag.h>
28+
29+
namespace DB::FailPoints
30+
{
31+
extern const char force_set_lifecycle_resp[];
32+
} // namespace DB::FailPoints
33+
34+
namespace DB::S3
35+
{
36+
37+
Aws::S3::Model::BucketLifecycleConfiguration genNewLifecycleConfig(
38+
const Aws::Vector<Aws::S3::Model::LifecycleRule> & existing_rules,
39+
Int32 expire_days,
40+
bool use_ali_oss_format)
41+
{
42+
static_assert(TaggingObjectIsDeleted == "tiflash_deleted=true");
43+
std::vector<Aws::S3::Model::Tag> filter_tags{
44+
Aws::S3::Model::Tag().WithKey("tiflash_deleted").WithValue("true"),
45+
};
46+
Aws::S3::Model::LifecycleRule rule;
47+
rule.WithStatus(Aws::S3::Model::ExpirationStatus::Enabled)
48+
.WithExpiration(Aws::S3::Model::LifecycleExpiration().WithDays(expire_days))
49+
.WithID("tiflashgc");
50+
if (!use_ali_oss_format)
51+
{
52+
// Reference: https://docs.aws.amazon.com/AmazonS3/latest/userguide/S3OutpostsLifecycleCLIJava.html
53+
rule.WithFilter(Aws::S3::Model::LifecycleRuleFilter().WithAnd(
54+
Aws::S3::Model::LifecycleRuleAndOperator().WithPrefix("").WithTags(filter_tags)));
55+
}
56+
else
57+
{
58+
// Alibaba cloud oss format
59+
// Note that "Prefix" field is required
60+
// Reference: https://github.com/aliyun/aliyun-oss-cpp-sdk/blob/c42600fb0b2057494ae3b77b93afeff42dfba0a4/sdk/src/model/SetBucketLifecycleRequest.cc#L40-L44
61+
rule.WithFilter(Aws::S3::Model::LifecycleRuleFilter().WithTag(filter_tags[0]).WithPrefix("")) //
62+
.SetAliOssFormat(true);
63+
}
64+
65+
auto new_rules = existing_rules;
66+
new_rules.emplace_back(rule);
67+
Aws::S3::Model::BucketLifecycleConfiguration lifecycle_config;
68+
lifecycle_config.WithRules(new_rules);
69+
70+
return lifecycle_config;
71+
}
72+
73+
struct RuleInfo
74+
{
75+
bool check_success = false;
76+
bool rule_has_been_set = false;
77+
Aws::Vector<Aws::S3::Model::LifecycleRule> rules;
78+
};
79+
80+
RuleInfo checkLifecycleRuleExist(const TiFlashS3Client & client)
81+
{
82+
Aws::S3::Model::GetBucketLifecycleConfigurationRequest req;
83+
req.SetBucket(client.bucket());
84+
auto outcome = client.GetBucketLifecycleConfiguration(req);
85+
if (!outcome.IsSuccess())
86+
{
87+
const auto & error = outcome.GetError();
88+
// The life cycle is not added at all
89+
if (error.GetErrorType() == Aws::S3::S3Errors::RESOURCE_NOT_FOUND
90+
|| error.GetExceptionName() == "NoSuchLifecycleConfiguration")
91+
{
92+
return RuleInfo{.check_success = true, .rule_has_been_set = false, .rules = {}};
93+
}
94+
95+
LOG_ERROR(
96+
client.log,
97+
"GetBucketLifecycle fail, please check the bucket lifecycle configuration or create the lifecycle rule"
98+
" manually, bucket={} {}",
99+
client.bucket(),
100+
S3ErrorMessage(error));
101+
return RuleInfo{.check_success = false, .rule_has_been_set = false, .rules = {}};
102+
}
103+
104+
auto res = outcome.GetResultWithOwnership();
105+
auto old_rules = res.GetRules();
106+
fiu_do_on(FailPoints::force_set_lifecycle_resp, {
107+
if (auto v = FailPointHelper::getFailPointVal(FailPoints::force_set_lifecycle_resp); v)
108+
{
109+
auto rules = std::any_cast<std::vector<Aws::S3::Model::LifecycleRule>>(*v);
110+
old_rules = rules;
111+
}
112+
});
113+
114+
bool lifecycle_rule_has_been_set = false;
115+
static_assert(TaggingObjectIsDeleted == "tiflash_deleted=true");
116+
for (const auto & rule : old_rules)
117+
{
118+
if (rule.GetAliOssFormat())
119+
LOG_INFO(client.log, "Found existing lifecycle rule in AliOSS format, rule_id={}", rule.GetID());
120+
121+
const auto & filt = rule.GetFilter();
122+
123+
std::optional<Aws::S3::Model::Tag> tag;
124+
if (!filt.AndHasBeenSet())
125+
{
126+
// For AWS S3, filt.AndHasBeenSet() == false
127+
tag = filt.GetTag();
128+
}
129+
else
130+
{
131+
// For minio filt.AndHasBeenSet() == true
132+
const auto & and_op = filt.GetAnd();
133+
const auto & tags = and_op.GetTags();
134+
if (tags.size() != 1 || !and_op.PrefixHasBeenSet() || !and_op.GetPrefix().empty())
135+
{
136+
continue;
137+
}
138+
tag = tags[0];
139+
}
140+
if (!tag)
141+
continue;
142+
if (tag->GetKey() == "tiflash_deleted" && tag->GetValue() == "true")
143+
{
144+
if (rule.GetStatus() == Aws::S3::Model::ExpirationStatus::Enabled)
145+
{
146+
lifecycle_rule_has_been_set = true;
147+
}
148+
else
149+
{
150+
LOG_ERROR(
151+
client.log,
152+
"The lifecycle rule is added but not enabled, please check the bucket lifecycle "
153+
"configuration or create the lifecycle rule manually, "
154+
"rule_id={} rule_status={} tag.key={} tag.value={}",
155+
rule.GetID(),
156+
Aws::S3::Model::ExpirationStatusMapper::GetNameForExpirationStatus(rule.GetStatus()),
157+
tag->GetKey(),
158+
tag->GetValue());
159+
}
160+
break;
161+
}
162+
}
163+
164+
return RuleInfo{
165+
.check_success = true,
166+
.rule_has_been_set = lifecycle_rule_has_been_set,
167+
.rules = std::move(old_rules),
168+
};
169+
}
170+
171+
// Ensure the lifecycle rule with filter `TaggingObjectIsDeleted` has been added.
172+
// The lifecycle rule is required when using S3GCMethod::Lifecycle to expire the
173+
// deleted objects automatically using the cloud platform lifecycle mechanism.
174+
// If the lifecycle rule not exist, try to add the rule with `expire_days` days
175+
// to expire the objects.
176+
// Return true if the rule has been added or already exists.
177+
bool ensureLifecycleRuleExist(const TiFlashS3Client & client, Int32 expire_days)
178+
{
179+
auto rule_info = checkLifecycleRuleExist(client);
180+
if (!rule_info.check_success)
181+
{
182+
// Failed to check the lifecycle rule existence, return false
183+
return false;
184+
}
185+
186+
bool rule_has_been_set = rule_info.rule_has_been_set;
187+
auto & old_rules = rule_info.rules;
188+
if (rule_has_been_set)
189+
{
190+
LOG_INFO(
191+
client.log,
192+
"The lifecycle rule has been set, n_rules={} filter={}",
193+
old_rules.size(),
194+
TaggingObjectIsDeleted);
195+
return true;
196+
}
197+
198+
LOG_INFO(
199+
client.log,
200+
"The lifecycle rule with filter \"{}\" has not been added, n_rules={}",
201+
TaggingObjectIsDeleted,
202+
old_rules.size());
203+
204+
// Try add the tiflash rule to lifecycle
205+
bool use_ali_oss_format = false;
206+
auto lifecycle_config = genNewLifecycleConfig(old_rules, expire_days, use_ali_oss_format);
207+
Aws::S3::Model::PutBucketLifecycleConfigurationRequest request;
208+
request.WithBucket(client.bucket()).WithLifecycleConfiguration(lifecycle_config);
209+
auto outcome = client.PutBucketLifecycleConfiguration(request);
210+
if (outcome.IsSuccess())
211+
{
212+
LOG_INFO(
213+
client.log,
214+
"The lifecycle rule has been added, new_n_rules={} tag={} use_ali_oss_format={}",
215+
old_rules.size(),
216+
TaggingObjectIsDeleted,
217+
use_ali_oss_format);
218+
return true;
219+
}
220+
const auto & error = outcome.GetError();
221+
LOG_WARNING(
222+
client.log,
223+
"Create lifecycle rule with tag filter \"{}\" failed, retrying with another format, bucket={} "
224+
"use_ali_oss_format={} {}",
225+
TaggingObjectIsDeleted,
226+
client.bucket(),
227+
use_ali_oss_format,
228+
S3ErrorMessage(error));
229+
230+
// Retry with another format
231+
use_ali_oss_format = true;
232+
lifecycle_config = genNewLifecycleConfig(old_rules, expire_days, use_ali_oss_format);
233+
request.WithBucket(client.bucket()).WithLifecycleConfiguration(lifecycle_config);
234+
outcome = client.PutBucketLifecycleConfiguration(request);
235+
if (outcome.IsSuccess())
236+
{
237+
LOG_INFO(
238+
client.log,
239+
"The lifecycle rule has been added, new_n_rules={} tag={} use_ali_oss_format={}",
240+
old_rules.size(),
241+
TaggingObjectIsDeleted,
242+
use_ali_oss_format);
243+
return true;
244+
}
245+
246+
// Still failed to create lifecycle rule, log an error
247+
LOG_ERROR(
248+
client.log,
249+
"Create lifecycle rule with tag filter \"{}\" failed, please check the bucket lifecycle configuration or "
250+
"create the lifecycle rule manually, bucket={} use_ali_oss_format={} {}",
251+
TaggingObjectIsDeleted,
252+
client.bucket(),
253+
use_ali_oss_format,
254+
S3ErrorMessage(error));
255+
return false;
256+
}
257+
258+
} // namespace DB::S3

0 commit comments

Comments
 (0)