Skip to content

Commit d6e069d

Browse files
committed
[fix](be) Include HDFS connection in file handle cache key
### What problem does this PR solve? Issue Number: None Related PR: None Problem Summary: HDFS file handles were cached only by path and mtime, so a later query using a different hdfsFS authentication context could reuse a handle opened from another context when the same file path and mtime matched. ### Release note None ### Check List (For Author) - Test: Manual test - `build-support/clang-format.sh be/src/io/fs/file_handle_cache.cpp be/src/io/fs/file_handle_cache.h be/test/io/fs/file_handle_cache_test.cpp` - `git diff --cached --check` - BE UT not run locally per request - Behavior changed: No - Does this need documentation: No
1 parent cbe23a3 commit d6e069d

3 files changed

Lines changed: 77 additions & 7 deletions

File tree

be/src/io/fs/file_handle_cache.cpp

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121

2222
#include "io/fs/file_handle_cache.h"
2323

24+
#include <cstdint>
2425
#include <thread>
2526
#include <tuple>
2627

@@ -100,7 +101,7 @@ FileHandleCache::Accessor::~Accessor() {
100101
#ifdef USE_HADOOP_HDFS
101102
if (hdfsUnbufferFile(get()->file()) != 0) {
102103
VLOG_FILE << "FS does not support file handle unbuffering, closing file="
103-
<< _cache_accessor.get_key()->first;
104+
<< _cache_accessor.get_key()->second.first;
104105
destroy();
105106
} else {
106107
// Calling explicit release to handle metrics
@@ -148,11 +149,13 @@ Status FileHandleCache::get_file_handle(const hdfsFS& fs, const std::string& fna
148149
FileHandleCache::Accessor* accessor, bool* cache_hit) {
149150
DCHECK_GE(mtime, 0);
150151
// Hash the key and get appropriate partition
151-
int index =
152-
HashUtil::hash(fname.data(), cast_set<int>(fname.size()), 0) % _cache_partitions.size();
152+
uintptr_t fs_identity = reinterpret_cast<uintptr_t>(fs);
153+
uint32_t seed = HashUtil::hash(&fs_identity, sizeof(fs_identity), 0);
154+
int index = HashUtil::hash(fname.data(), cast_set<int>(fname.size()), seed) %
155+
_cache_partitions.size();
153156
FileHandleCachePartition& p = _cache_partitions[index];
154157

155-
auto cache_key = std::make_pair(fname, mtime);
158+
auto cache_key = make_cache_key(fs, fname, mtime);
156159

157160
// If this requires a new handle, skip to the creation codepath. Otherwise,
158161
// find an unused entry with the same mtime
@@ -187,6 +190,15 @@ Status FileHandleCache::get_file_handle(const hdfsFS& fs, const std::string& fna
187190
return Status::OK();
188191
}
189192

193+
#ifdef BE_TEST
194+
bool FileHandleCache::same_cache_key_for_test(const hdfsFS& lhs_fs, const std::string& lhs_fname,
195+
int64_t lhs_mtime, const hdfsFS& rhs_fs,
196+
const std::string& rhs_fname, int64_t rhs_mtime) {
197+
return make_cache_key(lhs_fs, lhs_fname, lhs_mtime) ==
198+
make_cache_key(rhs_fs, rhs_fname, rhs_mtime);
199+
}
200+
#endif
201+
190202
void FileHandleCache::_evict_handles_loop() {
191203
while (!_is_shut_down.load()) {
192204
if (_unused_handle_timeout_secs) {

be/src/io/fs/file_handle_cache.h

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,12 @@
2222
#pragma once
2323

2424
#include <array>
25+
#include <cstdint>
2526
#include <list>
2627
#include <map>
2728
#include <memory>
29+
#include <string>
30+
#include <utility>
2831

2932
#include "common/status.h"
3033
#include "io/fs/file_system.h"
@@ -111,13 +114,15 @@ class ExclusiveHdfsFileHandle : public HdfsFileHandle {
111114
/// mtime is older than the file's current mtime.
112115
class FileHandleCache {
113116
private:
117+
using CacheKey = std::pair<hdfsFS, std::pair<std::string, int64_t>>;
118+
114119
/// Each partition operates independently, and thus has its own thread-safe cache.
115120
/// To avoid contention on the lock_ due to false sharing the partitions are
116121
/// aligned to cache line boundaries.
117122
struct FileHandleCachePartition : public CacheLineAligned {
118-
// Cache key is a pair of filename and mtime
119-
// Using std::pair to spare boilerplate of hash function
120-
typedef LruMultiCache<std::pair<std::string, int64_t>, CachedHdfsFileHandle> CacheType;
123+
// The same HDFS path can be opened through different hdfsFS instances with
124+
// different authentication contexts, so the filesystem handle is part of the key.
125+
typedef LruMultiCache<CacheKey, CachedHdfsFileHandle> CacheType;
121126
CacheType cache;
122127
};
123128

@@ -176,7 +181,17 @@ class FileHandleCache {
176181
int64_t file_size, bool require_new_handle, Accessor* accessor,
177182
bool* cache_hit) WARN_UNUSED_RESULT;
178183

184+
#ifdef BE_TEST
185+
static bool same_cache_key_for_test(const hdfsFS& lhs_fs, const std::string& lhs_fname,
186+
int64_t lhs_mtime, const hdfsFS& rhs_fs,
187+
const std::string& rhs_fname, int64_t rhs_mtime);
188+
#endif
189+
179190
private:
191+
static CacheKey make_cache_key(const hdfsFS& fs, const std::string& fname, int64_t mtime) {
192+
return {fs, {fname, mtime}};
193+
}
194+
180195
/// Periodic check to evict unused file handles. Only executed by _eviction_thread.
181196
void _evict_handles_loop();
182197

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
#include "io/fs/file_handle_cache.h"
19+
20+
#include <gtest/gtest.h>
21+
22+
#include <cstdint>
23+
#include <string>
24+
25+
namespace doris::io {
26+
27+
TEST(FileHandleCacheTest, CacheKeyIncludesHdfsFs) {
28+
auto first_fs = reinterpret_cast<hdfsFS>(static_cast<uintptr_t>(0x1));
29+
auto second_fs = reinterpret_cast<hdfsFS>(static_cast<uintptr_t>(0x2));
30+
const std::string fname = "/user/hive/warehouse/table/data.parquet";
31+
constexpr int64_t mtime = 12345;
32+
33+
EXPECT_TRUE(FileHandleCache::same_cache_key_for_test(first_fs, fname, mtime, first_fs, fname,
34+
mtime));
35+
EXPECT_FALSE(FileHandleCache::same_cache_key_for_test(first_fs, fname, mtime, second_fs, fname,
36+
mtime));
37+
EXPECT_FALSE(FileHandleCache::same_cache_key_for_test(first_fs, fname, mtime, first_fs,
38+
fname + ".other", mtime));
39+
EXPECT_FALSE(FileHandleCache::same_cache_key_for_test(first_fs, fname, mtime, first_fs, fname,
40+
mtime + 1));
41+
}
42+
43+
} // namespace doris::io

0 commit comments

Comments
 (0)