Skip to content

Commit f424d26

Browse files
authored
[core] Add FileType enum for classifying Paimon files (#7613)
1 parent fe9b975 commit f424d26

5 files changed

Lines changed: 357 additions & 3 deletions

File tree

paimon-core/src/main/java/org/apache/paimon/consumer/ConsumerManager.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ public class ConsumerManager implements Serializable {
4545

4646
private static final long serialVersionUID = 1L;
4747

48-
private static final String CONSUMER_PREFIX = "consumer-";
48+
public static final String CONSUMER_PREFIX = "consumer-";
4949

5050
private final FileIO fileIO;
5151
private final Path tablePath;

paimon-core/src/main/java/org/apache/paimon/schema/SchemaManager.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@
109109
@ThreadSafe
110110
public class SchemaManager implements Serializable {
111111

112-
private static final String SCHEMA_PREFIX = "schema-";
112+
public static final String SCHEMA_PREFIX = "schema-";
113113

114114
private final FileIO fileIO;
115115
private final Path tableRoot;
Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
19+
package org.apache.paimon.utils;
20+
21+
import org.apache.paimon.consumer.ConsumerManager;
22+
import org.apache.paimon.fs.Path;
23+
import org.apache.paimon.io.DataFilePathFactory;
24+
import org.apache.paimon.schema.SchemaManager;
25+
import org.apache.paimon.service.ServiceManager;
26+
27+
/**
28+
* Classification of Paimon files.
29+
*
30+
* <ul>
31+
* <li>{@link #META}: snapshot, schema, manifest, statistics, tag, changelog metadata, hint files,
32+
* _SUCCESS, consumer, service files
33+
* <li>{@link #DATA}: data files and any unrecognized files (default)
34+
* <li>{@link #BUCKET_INDEX}: bucket level index files (Hash, DV)
35+
* <li>{@link #GLOBAL_INDEX}: table level global index files (btree, bitmap, lumina, tantivy)
36+
* <li>{@link #FILE_INDEX}: data-file index files (bloom filter, bitmap, etc.)
37+
* </ul>
38+
*/
39+
public enum FileType {
40+
META,
41+
DATA,
42+
BUCKET_INDEX,
43+
GLOBAL_INDEX,
44+
FILE_INDEX;
45+
46+
private static final String MANIFEST = "manifest";
47+
private static final String CHANGELOG_DIR = "changelog";
48+
private static final String GLOBAL_INDEX_INFIX = "global-index-";
49+
50+
/** Returns {@code true} if this file type is any kind of index. */
51+
public boolean isIndex() {
52+
return this == BUCKET_INDEX || this == GLOBAL_INDEX || this == FILE_INDEX;
53+
}
54+
55+
/**
56+
* Classify a file based on its full path.
57+
*
58+
* <p>When the file does not match any known pattern, it defaults to {@link #DATA}.
59+
*/
60+
public static FileType classify(Path filePath) {
61+
String name = filePath.getName();
62+
63+
// meta file prefixes: snapshot-, schema-, stat-, tag-, consumer-, service-
64+
if (name.startsWith(SnapshotManager.SNAPSHOT_PREFIX)
65+
|| name.startsWith(SchemaManager.SCHEMA_PREFIX)
66+
|| name.startsWith(FileStorePathFactory.STATISTICS_PREFIX)
67+
|| name.startsWith(TagManager.TAG_PREFIX)
68+
|| name.startsWith(ConsumerManager.CONSUMER_PREFIX)
69+
|| name.startsWith(ServiceManager.SERVICE_PREFIX)) {
70+
return META;
71+
}
72+
73+
// file index: {data-file}.index (e.g. data-xxx.orc.index)
74+
// must check before global index since global index also ends with ".index"
75+
if (name.endsWith(DataFilePathFactory.INDEX_PATH_SUFFIX)) {
76+
if (name.contains(GLOBAL_INDEX_INFIX)) {
77+
return GLOBAL_INDEX;
78+
}
79+
return FILE_INDEX;
80+
}
81+
82+
// manifest, manifest-list, index-manifest: name contains "manifest"
83+
if (name.contains(MANIFEST)) {
84+
return META;
85+
}
86+
87+
// bucket index: name starts with "index-" (e.g. index-{uuid}-{N})
88+
if (name.startsWith(FileStorePathFactory.INDEX_PREFIX)) {
89+
return BUCKET_INDEX;
90+
}
91+
92+
// hint files
93+
if ("EARLIEST".equals(name) || "LATEST".equals(name)) {
94+
return META;
95+
}
96+
97+
// success files
98+
if ("_SUCCESS".equals(name) || name.endsWith("_SUCCESS")) {
99+
return META;
100+
}
101+
102+
// changelog metadata: parent dir is "changelog" and name starts with "changelog-"
103+
if (name.startsWith(ChangelogManager.CHANGELOG_PREFIX)
104+
&& CHANGELOG_DIR.equals(filePath.getParent().getName())) {
105+
return META;
106+
}
107+
108+
// default: DATA
109+
return DATA;
110+
}
111+
}

paimon-core/src/main/java/org/apache/paimon/utils/TagManager.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ public class TagManager {
6262

6363
private static final Logger LOG = LoggerFactory.getLogger(TagManager.class);
6464

65-
private static final String TAG_PREFIX = "tag-";
65+
public static final String TAG_PREFIX = "tag-";
6666

6767
private final FileIO fileIO;
6868
private final Path tablePath;
Lines changed: 243 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,243 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
19+
package org.apache.paimon.utils;
20+
21+
import org.apache.paimon.fs.Path;
22+
23+
import org.junit.jupiter.api.Test;
24+
25+
import static org.assertj.core.api.Assertions.assertThat;
26+
27+
/** Tests for {@link FileType}. */
28+
public class FileTypeTest {
29+
30+
private static final String TABLE_ROOT = "hdfs://cluster/warehouse/db.db/table";
31+
32+
// ===== META files =====
33+
34+
@Test
35+
public void testMetaFiles() {
36+
// snapshot
37+
assertThat(FileType.classify(new Path(TABLE_ROOT + "/snapshot/snapshot-1")))
38+
.isEqualTo(FileType.META);
39+
assertThat(FileType.classify(new Path(TABLE_ROOT + "/snapshot/snapshot-100")))
40+
.isEqualTo(FileType.META);
41+
// schema
42+
assertThat(FileType.classify(new Path(TABLE_ROOT + "/schema/schema-0")))
43+
.isEqualTo(FileType.META);
44+
assertThat(FileType.classify(new Path(TABLE_ROOT + "/schema/schema-5")))
45+
.isEqualTo(FileType.META);
46+
// manifest
47+
assertThat(FileType.classify(new Path(TABLE_ROOT + "/manifest/manifest-a1b2c3d4-0")))
48+
.isEqualTo(FileType.META);
49+
// manifest-list
50+
assertThat(FileType.classify(new Path(TABLE_ROOT + "/manifest/manifest-list-a1b2c3d4-0")))
51+
.isEqualTo(FileType.META);
52+
// index-manifest
53+
assertThat(FileType.classify(new Path(TABLE_ROOT + "/manifest/index-manifest-a1b2c3d4-0")))
54+
.isEqualTo(FileType.META);
55+
// statistics
56+
assertThat(FileType.classify(new Path(TABLE_ROOT + "/statistics/stat-a1b2c3d4-0")))
57+
.isEqualTo(FileType.META);
58+
// tag
59+
assertThat(FileType.classify(new Path(TABLE_ROOT + "/tag/tag-2024-01-01")))
60+
.isEqualTo(FileType.META);
61+
assertThat(FileType.classify(new Path(TABLE_ROOT + "/tag/tag-myTag")))
62+
.isEqualTo(FileType.META);
63+
// changelog metadata
64+
assertThat(FileType.classify(new Path(TABLE_ROOT + "/changelog/changelog-1")))
65+
.isEqualTo(FileType.META);
66+
assertThat(FileType.classify(new Path(TABLE_ROOT + "/changelog/changelog-100")))
67+
.isEqualTo(FileType.META);
68+
// hint files
69+
assertThat(FileType.classify(new Path(TABLE_ROOT + "/snapshot/EARLIEST")))
70+
.isEqualTo(FileType.META);
71+
assertThat(FileType.classify(new Path(TABLE_ROOT + "/snapshot/LATEST")))
72+
.isEqualTo(FileType.META);
73+
// success files
74+
assertThat(FileType.classify(new Path(TABLE_ROOT + "/dt=2024-01-01/bucket-0/_SUCCESS")))
75+
.isEqualTo(FileType.META);
76+
assertThat(FileType.classify(new Path(TABLE_ROOT + "/tag/tag-success-file/myTag_SUCCESS")))
77+
.isEqualTo(FileType.META);
78+
// consumer
79+
assertThat(FileType.classify(new Path(TABLE_ROOT + "/consumer/consumer-myGroup")))
80+
.isEqualTo(FileType.META);
81+
// service
82+
assertThat(FileType.classify(new Path(TABLE_ROOT + "/service/service-primary-key-lookup")))
83+
.isEqualTo(FileType.META);
84+
}
85+
86+
// ===== BUCKET_INDEX files =====
87+
88+
@Test
89+
public void testBucketIndexFiles() {
90+
// under /index/ dir
91+
assertThat(FileType.classify(new Path(TABLE_ROOT + "/index/index-a1b2c3d4-0")))
92+
.isEqualTo(FileType.BUCKET_INDEX);
93+
// under bucket dir
94+
assertThat(
95+
FileType.classify(
96+
new Path(TABLE_ROOT + "/dt=2024-01-01/bucket-0/index-a1b2c3d4-0")))
97+
.isEqualTo(FileType.BUCKET_INDEX);
98+
}
99+
100+
// ===== GLOBAL_INDEX files =====
101+
102+
@Test
103+
public void testGlobalIndexFiles() {
104+
// btree global index
105+
assertThat(
106+
FileType.classify(
107+
new Path(
108+
TABLE_ROOT
109+
+ "/index/btree-global-index-a1b2c3d4-e5f6.index")))
110+
.isEqualTo(FileType.GLOBAL_INDEX);
111+
// bitmap global index
112+
assertThat(
113+
FileType.classify(
114+
new Path(
115+
TABLE_ROOT
116+
+ "/index/bitmap-global-index-a1b2c3d4-e5f6.index")))
117+
.isEqualTo(FileType.GLOBAL_INDEX);
118+
// lumina vector global index
119+
assertThat(
120+
FileType.classify(
121+
new Path(
122+
TABLE_ROOT
123+
+ "/index/lumina-vector-ann-global-index-a1b2c3d4.index")))
124+
.isEqualTo(FileType.GLOBAL_INDEX);
125+
// tantivy fulltext global index
126+
assertThat(
127+
FileType.classify(
128+
new Path(
129+
TABLE_ROOT
130+
+ "/index/tantivy-fulltext-global-index-a1b2c3d4.index")))
131+
.isEqualTo(FileType.GLOBAL_INDEX);
132+
}
133+
134+
// ===== FILE_INDEX files =====
135+
136+
@Test
137+
public void testFileIndexFiles() {
138+
assertThat(
139+
FileType.classify(
140+
new Path(
141+
TABLE_ROOT
142+
+ "/dt=2024-01-01/bucket-0/data-a1b2c3d4-0.orc.index")))
143+
.isEqualTo(FileType.FILE_INDEX);
144+
assertThat(
145+
FileType.classify(
146+
new Path(
147+
TABLE_ROOT
148+
+ "/dt=2024-01-01/bucket-0/data-a1b2c3d4-0.parquet.index")))
149+
.isEqualTo(FileType.FILE_INDEX);
150+
}
151+
152+
// ===== isIndex() =====
153+
154+
@Test
155+
public void testIsIndex() {
156+
assertThat(FileType.BUCKET_INDEX.isIndex()).isTrue();
157+
assertThat(FileType.GLOBAL_INDEX.isIndex()).isTrue();
158+
assertThat(FileType.FILE_INDEX.isIndex()).isTrue();
159+
assertThat(FileType.META.isIndex()).isFalse();
160+
assertThat(FileType.DATA.isIndex()).isFalse();
161+
}
162+
163+
// ===== DATA files =====
164+
165+
@Test
166+
public void testDataFiles() {
167+
// orc data file
168+
assertThat(
169+
FileType.classify(
170+
new Path(
171+
TABLE_ROOT
172+
+ "/dt=2024-01-01/bucket-0/data-a1b2c3d4-0.orc")))
173+
.isEqualTo(FileType.DATA);
174+
// parquet data file
175+
assertThat(
176+
FileType.classify(
177+
new Path(
178+
TABLE_ROOT
179+
+ "/dt=2024-01-01/bucket-0/data-a1b2c3d4-0.parquet")))
180+
.isEqualTo(FileType.DATA);
181+
// changelog data file
182+
assertThat(
183+
FileType.classify(
184+
new Path(
185+
TABLE_ROOT
186+
+ "/dt=2024-01-01/bucket-0/changelog-a1b2c3d4-0.orc")))
187+
.isEqualTo(FileType.DATA);
188+
// blob file
189+
assertThat(
190+
FileType.classify(
191+
new Path(
192+
TABLE_ROOT
193+
+ "/dt=2024-01-01/bucket-0/data-a1b2c3d4-0.blob")))
194+
.isEqualTo(FileType.DATA);
195+
// vector file
196+
assertThat(
197+
FileType.classify(
198+
new Path(
199+
TABLE_ROOT
200+
+ "/dt=2024-01-01/bucket-0/data-a1b2c3d4-0.vector.lance")))
201+
.isEqualTo(FileType.DATA);
202+
// unknown file defaults to DATA
203+
assertThat(
204+
FileType.classify(
205+
new Path(TABLE_ROOT + "/dt=2024-01-01/bucket-0/unknown-file.bin")))
206+
.isEqualTo(FileType.DATA);
207+
}
208+
209+
// ===== Edge cases =====
210+
211+
@Test
212+
public void testChangelogDirInParentPathNotMisjudged() {
213+
// table root path itself contains "changelog", should not be misjudged as META
214+
String tricky = "hdfs://cluster/changelog/warehouse/db.db/table";
215+
assertThat(
216+
FileType.classify(
217+
new Path(tricky + "/dt=2024-01-01/bucket-0/data-a1b2c3d4-0.orc")))
218+
.isEqualTo(FileType.DATA);
219+
assertThat(
220+
FileType.classify(
221+
new Path(
222+
tricky
223+
+ "/dt=2024-01-01/bucket-0/changelog-a1b2c3d4-0.orc")))
224+
.isEqualTo(FileType.DATA);
225+
}
226+
227+
@Test
228+
public void testBranchPaths() {
229+
String branchRoot = TABLE_ROOT + "/branch/branch-dev";
230+
assertThat(FileType.classify(new Path(branchRoot + "/snapshot/snapshot-1")))
231+
.isEqualTo(FileType.META);
232+
assertThat(FileType.classify(new Path(branchRoot + "/schema/schema-0")))
233+
.isEqualTo(FileType.META);
234+
assertThat(FileType.classify(new Path(branchRoot + "/changelog/changelog-1")))
235+
.isEqualTo(FileType.META);
236+
assertThat(FileType.classify(new Path(branchRoot + "/index/index-a1b2c3d4-0")))
237+
.isEqualTo(FileType.BUCKET_INDEX);
238+
assertThat(
239+
FileType.classify(
240+
new Path(branchRoot + "/index/btree-global-index-a1b2c3d4.index")))
241+
.isEqualTo(FileType.GLOBAL_INDEX);
242+
}
243+
}

0 commit comments

Comments
 (0)