Skip to content

Commit fab01a1

Browse files
authored
Use plain functions for copy function in duckdb (#8109)
This also fixes the segfault of empty "name" passed to copy function, see https://dataengineeringcentral.substack.com/p/benchmarking-vortex-file-format-vs Run bindgen before cpp so that changes to ffi for duckdb would propagate to vortex.h before C++ code compilation. Signed-off-by: Mikhail Kot <mikhail@spiraldb.com>
1 parent b8236af commit fab01a1

15 files changed

Lines changed: 293 additions & 525 deletions

File tree

vortex-duckdb/build.rs

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -304,7 +304,8 @@ fn try_build_duckdb(
304304
}
305305
}
306306

307-
fn c2rust(crate_dir: &Path, duckdb_include_dir: &Path) {
307+
/// Generate rust functions with bindgen from C sources.
308+
fn bindgen_c2rust(crate_dir: &Path, duckdb_include_dir: &Path) {
308309
let bindings = bindgen::Builder::default()
309310
.header("cpp/include/duckdb_vx.h")
310311
.override_abi(Abi::CUnwind, ".*")
@@ -350,7 +351,8 @@ fn c2rust(crate_dir: &Path, duckdb_include_dir: &Path) {
350351
}
351352
}
352353

353-
fn cpp(duckdb_include_dir: &Path) {
354+
/// Generate libvortex_duckdb.*
355+
fn compile_cpp(duckdb_include_dir: &Path) {
354356
cc::Build::new()
355357
.std("c++20")
356358
.flags(["-Wall", "-Wextra", "-Wpedantic", "-Werror"])
@@ -367,7 +369,8 @@ fn cpp(duckdb_include_dir: &Path) {
367369
}
368370
}
369371

370-
fn rust2c(crate_dir: &Path) {
372+
/// Generate include/vortex.h from rust sources
373+
fn cbindgen_rust2c(crate_dir: &Path) {
371374
let header = crate_dir.join("include/vortex.h");
372375
let output = cbindgen::Builder::new()
373376
.with_config(cbindgen::Config::from_file(crate_dir.join("cbindgen.toml")).unwrap())
@@ -484,7 +487,7 @@ fn main() {
484487

485488
let duckdb_include_dir = inner_dir.join("src").join("include");
486489
println!("cargo:rerun-if-changed=cpp/include");
487-
c2rust(&crate_dir, &duckdb_include_dir);
488-
cpp(&duckdb_include_dir);
489-
rust2c(&crate_dir);
490+
bindgen_c2rust(&crate_dir, &duckdb_include_dir);
491+
cbindgen_rust2c(&crate_dir);
492+
compile_cpp(&duckdb_include_dir);
490493
}

vortex-duckdb/cbindgen.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
language = "C"
22
cpp_compat = true
3+
usize_is_size_t = true
4+
style = "type"
5+
braces = "SameLine"
36

47
header = """
58
// SPDX-License-Identifier: Apache-2.0
Lines changed: 67 additions & 107 deletions
Original file line numberDiff line numberDiff line change
@@ -1,172 +1,132 @@
11
// SPDX-License-Identifier: Apache-2.0
22
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3-
4-
#include "duckdb_vx.h"
53
#include "duckdb_vx/data.hpp"
64
#include "duckdb_vx/error.hpp"
7-
5+
#include "duckdb_vx/table_function.h"
6+
#include "vortex.h"
87
#include "duckdb/function/copy_function.hpp"
98
#include "duckdb/main/capi/capi_internal.hpp"
109
#include "duckdb/main/client_context.hpp"
1110
#include "duckdb/main/connection.hpp"
1211
#include "duckdb/parser/parsed_data/create_copy_function_info.hpp"
1312

1413
using namespace duckdb;
14+
using vortex::CData;
15+
using vortex::IntoErrString;
1516

16-
namespace vortex {
17-
18-
struct CCopyBindData final : TableFunctionData {
19-
CCopyBindData(const duckdb_vx_copy_func_vtab_t vtab_p, unique_ptr<CData> ffi_data_p)
20-
: vtab(vtab_p), ffi_data(std::move(ffi_data_p)) {
17+
struct CopyBindData final : TableFunctionData {
18+
CopyBindData(unique_ptr<CData> ffi_data) : ffi_data(std::move(ffi_data)) {
2119
}
22-
23-
const duckdb_vx_copy_func_vtab_t vtab;
2420
unique_ptr<CData> ffi_data;
2521
};
2622

27-
struct CCopyGlobalData final : GlobalFunctionData {
28-
explicit CCopyGlobalData(unique_ptr<CData> ffi_data_p) : ffi_data(std::move(ffi_data_p)) {
23+
struct CopyGlobalData final : GlobalFunctionData {
24+
CopyGlobalData(unique_ptr<CData> ffi_data) : ffi_data(std::move(ffi_data)) {
2925
}
3026

3127
unique_ptr<CData> ffi_data;
3228
};
3329

34-
struct CCopyLocalData final : LocalFunctionData {
35-
explicit CCopyLocalData(unique_ptr<CData> ffi_data_p) : ffi_data(std::move(ffi_data_p)) {
30+
unique_ptr<FunctionData> copy_to_bind(ClientContext &,
31+
CopyFunctionBindInput &,
32+
const vector<string> &column_names,
33+
const vector<LogicalType> &column_types) {
34+
vector<const char *> ffi_column_names(column_names.size());
35+
for (size_t i = 0; i < column_names.size(); ++i) {
36+
ffi_column_names[i] = column_names[i].c_str();
3637
}
3738

38-
unique_ptr<CData> ffi_data;
39-
};
40-
41-
static duckdb_vx_copy_func_vtab_t copy_vtab_one;
42-
43-
unique_ptr<FunctionData> c_bind_one(ClientContext & /*context*/,
44-
CopyFunctionBindInput &info,
45-
const vector<string> &column_names,
46-
const vector<LogicalType> &column_types) {
47-
48-
auto c_column_names = vector<char *>();
49-
c_column_names.reserve(column_names.size());
50-
for (const auto &col_id : column_names) {
51-
c_column_names.push_back(const_cast<char *>(col_id.c_str()));
52-
}
53-
54-
auto c_column_types = vector<duckdb_logical_type>();
55-
c_column_types.reserve(c_column_types.size());
56-
for (auto &col_type : column_types) {
57-
c_column_types.push_back(reinterpret_cast<duckdb_logical_type>(const_cast<LogicalType *>(&col_type)));
39+
vector<duckdb_logical_type> ffi_column_types(column_types.size());
40+
for (size_t i = 0; i < column_types.size(); ++i) {
41+
// duckdb C api doesn't allow passing const LogicalTypes. We never
42+
// modify input in copy function.
43+
ffi_column_types[i] =
44+
reinterpret_cast<duckdb_logical_type>(const_cast<LogicalType *>(&column_types[i]));
5845
}
5946

6047
duckdb_vx_error error_out = nullptr;
61-
// TODO(myrrc): do we pass ownership of c_column_names in bind?
62-
// If yes, it's a UB as we'd double delete on function return
63-
auto ffi_bind_data = copy_vtab_one.bind(reinterpret_cast<duckdb_vx_copy_func_bind_input>(&info),
64-
c_column_names.data(),
65-
c_column_names.size(),
66-
c_column_types.data(),
67-
c_column_types.size(),
68-
&error_out);
48+
const duckdb_vx_data ffi_bind_data = duckdb_copy_function_copy_to_bind(ffi_column_names.data(),
49+
ffi_column_names.size(),
50+
ffi_column_types.data(),
51+
ffi_column_types.size(),
52+
&error_out);
6953
if (error_out) {
7054
throw BinderException(IntoErrString(error_out));
7155
}
72-
73-
return make_uniq<CCopyBindData>(
74-
// This should only be filled out once
75-
copy_vtab_one,
76-
unique_ptr<CData>(reinterpret_cast<CData *>(ffi_bind_data)));
56+
auto cdata = unique_ptr<CData>(reinterpret_cast<CData *>(ffi_bind_data));
57+
return make_uniq<CopyBindData>(std::move(cdata));
7758
}
7859

7960
unique_ptr<GlobalFunctionData>
80-
c_init_global(ClientContext &context, FunctionData &bind_data, const string &file_path) {
81-
auto &bind = bind_data.Cast<CCopyBindData>();
82-
duckdb_vx_error error_out = nullptr;
83-
auto global_data = bind.vtab.init_global(reinterpret_cast<duckdb_client_context>(&context),
84-
bind.ffi_data->DataPtr(),
85-
file_path.c_str(),
86-
&error_out);
87-
if (error_out) {
88-
throw ExecutorException(IntoErrString(error_out));
89-
}
61+
copy_to_initialize_global(ClientContext &context, FunctionData &bind_data, const string &file_path) {
62+
void *const ffi_bind = bind_data.Cast<CopyBindData>().ffi_data->DataPtr();
63+
const auto ffi_ctx = reinterpret_cast<duckdb_client_context>(&context);
9064

91-
return make_uniq<CCopyGlobalData>(unique_ptr<CData>(reinterpret_cast<CData *>(global_data)));
92-
}
93-
94-
unique_ptr<LocalFunctionData> c_init_local(ExecutionContext & /*context*/, FunctionData &bind_data) {
95-
auto &bind = bind_data.Cast<CCopyBindData>();
9665
duckdb_vx_error error_out = nullptr;
97-
auto data = bind.vtab.init_local(bind.ffi_data->DataPtr(), &error_out);
66+
const duckdb_vx_data ffi_global =
67+
duckdb_copy_function_copy_to_initialize_global(ffi_ctx, ffi_bind, file_path.c_str(), &error_out);
9868
if (error_out) {
9969
throw ExecutorException(IntoErrString(error_out));
10070
}
10171

102-
return make_uniq<CCopyLocalData>(unique_ptr<CData>(reinterpret_cast<CData *>(data)));
72+
auto cdata = unique_ptr<CData>(reinterpret_cast<CData *>(ffi_global));
73+
return make_uniq<CopyGlobalData>(std::move(cdata));
10374
}
10475

105-
void c_copy_to_sink(ExecutionContext & /*context*/,
106-
FunctionData &bind_data,
107-
GlobalFunctionData &gstate,
108-
LocalFunctionData &lstate,
109-
DataChunk &input) {
110-
auto &bind = bind_data.Cast<CCopyBindData>();
111-
auto &global = gstate.Cast<CCopyGlobalData>();
112-
auto &local = lstate.Cast<CCopyLocalData>();
76+
void copy_to_sink(ExecutionContext &,
77+
FunctionData &bind_data,
78+
GlobalFunctionData &gstate,
79+
LocalFunctionData &,
80+
DataChunk &input) {
81+
void *const ffi_bind = bind_data.Cast<CopyBindData>().ffi_data->DataPtr();
82+
void *const ffi_global = gstate.Cast<CopyGlobalData>().ffi_data->DataPtr();
83+
auto ffi_chunk = reinterpret_cast<duckdb_data_chunk>(&input);
11384
duckdb_vx_error error_out = nullptr;
114-
bind.vtab.copy_to_sink(bind.ffi_data->DataPtr(),
115-
global.ffi_data->DataPtr(),
116-
local.ffi_data->DataPtr(),
117-
reinterpret_cast<duckdb_data_chunk>(&input),
118-
&error_out);
85+
duckdb_copy_function_copy_to_sink(ffi_bind, ffi_global, ffi_chunk, &error_out);
11986
if (error_out) {
12087
throw ExecutorException(IntoErrString(error_out));
12188
}
12289
}
12390

124-
void copy_to_finalize(ClientContext & /*context*/, FunctionData &bind_data, GlobalFunctionData &gstate) {
125-
auto &bind = bind_data.Cast<CCopyBindData>();
126-
auto &global = gstate.Cast<CCopyGlobalData>();
91+
void copy_to_finalize(ClientContext &, FunctionData &, GlobalFunctionData &gstate) {
92+
void *const ffi_global = gstate.Cast<CopyGlobalData>().ffi_data->DataPtr();
12793
duckdb_vx_error error_out = nullptr;
128-
bind.vtab.copy_to_finalize(bind.ffi_data->DataPtr(), global.ffi_data->DataPtr(), &error_out);
94+
duckdb_copy_function_copy_to_finalize(ffi_global, &error_out);
12995
if (error_out) {
13096
throw ExecutorException(IntoErrString(error_out));
13197
}
13298
}
13399

134-
extern "C" duckdb_vx_copy_func_vtab_t *get_vtab_one() {
135-
return &copy_vtab_one;
136-
}
100+
extern "C" duckdb_state duckdb_vx_register_copy_function(duckdb_database ffi_db) {
101+
D_ASSERT(ffi_db);
102+
const DatabaseWrapper &wrapper = *reinterpret_cast<DatabaseWrapper *>(ffi_db);
103+
DatabaseInstance &db = *wrapper.database->instance;
137104

138-
extern "C" duckdb_state duckdb_vx_copy_func_register_vtab_one(duckdb_database ffi_db) {
139-
if (!ffi_db) {
140-
return DuckDBError;
141-
}
142-
143-
auto wrapper = reinterpret_cast<duckdb::DatabaseWrapper *>(ffi_db);
144-
auto db = wrapper->database->instance;
145-
auto copy_function = CopyFunction(copy_vtab_one.name);
146-
147-
copy_function.copy_to_bind = c_bind_one;
148-
copy_function.copy_to_initialize_global = c_init_global;
149-
copy_function.copy_to_initialize_local = c_init_local;
150-
151-
copy_function.copy_to_sink = c_copy_to_sink;
152-
copy_function.copy_to_finalize = copy_to_finalize;
153-
copy_function.extension = copy_vtab_one.extension;
105+
CopyFunction fn("vortex");
106+
fn.copy_to_bind = copy_to_bind;
107+
fn.copy_to_initialize_global = copy_to_initialize_global;
108+
fn.copy_to_initialize_local = [](auto &, auto &) {
109+
return make_uniq<LocalFunctionData>();
110+
};
111+
fn.copy_to_sink = copy_to_sink;
112+
fn.copy_to_finalize = copy_to_finalize;
113+
fn.extension = "vortex";
154114

155115
// TODO(joe): expose this via c our api
156-
copy_function.execution_mode = [](bool /*preserve_insertion_order*/, bool /*supports_batch_index*/) {
116+
fn.execution_mode = [](bool, bool) {
157117
return CopyFunctionExecutionMode::REGULAR_COPY_TO_FILE;
158118
};
159119
// TODO(joe): handle parameters as in table_function
160120

161121
try {
162-
auto &system_catalog = Catalog::GetSystemCatalog(*db);
163-
auto data = CatalogTransaction::GetSystemTransaction(*db);
164-
CreateCopyFunctionInfo copy_info(std::move(copy_function));
122+
Catalog &system_catalog = Catalog::GetSystemCatalog(db);
123+
CatalogTransaction data = CatalogTransaction::GetSystemTransaction(db);
124+
CreateCopyFunctionInfo copy_info(std::move(fn));
165125
system_catalog.CreateCopyFunction(data, copy_info);
166-
} catch (...) {
126+
} catch (const std::exception &e) {
127+
ErrorData data(e);
128+
DUCKDB_LOG_ERROR(db, "Failed to create Vortex copy function:\t" + data.Message());
167129
return DuckDBError;
168130
}
169131
return DuckDBSuccess;
170132
}
171-
172-
} // namespace vortex
Lines changed: 8 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -1,79 +1,20 @@
11
// SPDX-License-Identifier: Apache-2.0
22
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3-
4-
/**
5-
* We redefine a C API for DuckDB Copy Functions used to copy/write values from duckdb to a vortex file.
6-
* See table_filter.h for more info
7-
*/
83
#pragma once
9-
10-
#include "data.h"
11-
#include "error.h"
12-
#include "logical_type.h"
13-
#include "duckdb_vx/data.h"
14-
15-
#ifdef __cplusplus /* If compiled as C++, use C ABI */
16-
extern "C" {
17-
#endif
18-
19-
// Info passed into the bind callback. The callback should set error or else add result columns.
20-
typedef struct duckdb_vx_copy_func_bind_input_ *duckdb_vx_copy_func_bind_input;
21-
22-
// Input data passed into the init_global and init_local callbacks.
23-
typedef struct {
24-
const void *bind_data;
25-
const void *local_state;
26-
const void *global_state;
27-
} duckdb_vx_copy_func_init_input;
4+
#include "duckdb.h"
285

296
// TODO(joe): expose via c api
307
// typedef enum copy_function_execution_mode_ {
318
// REGULAR_COPY_TO_FILE = 1,
329
// PARALLEL_COPY_TO_FILE,
3310
// BATCH_COPY_TO_FILE
3411
// } copy_function_execution_mode;
12+
//
13+
// TODO(joe): expose via c api
14+
// copy_function_execution_mode (*execution_mode)(bool preserve_insertion_order, bool
3515

36-
// A transparent DuckDB copy function vtable, which can be used to configure a copy function.
37-
typedef struct {
38-
// The name of the copy function.
39-
const char *name;
40-
41-
// The extension of the files written by the copy function.
42-
const char *extension;
43-
44-
duckdb_vx_data (*bind)(duckdb_vx_copy_func_bind_input input,
45-
const char *const *column_names,
46-
unsigned long column_name_count,
47-
const duckdb_logical_type *column_types,
48-
unsigned long column_type_count,
49-
duckdb_vx_error *error_out);
50-
51-
duckdb_vx_data (*init_global)(duckdb_client_context ctx,
52-
const void *bind_data,
53-
const char *file_path,
54-
duckdb_vx_error *error_out);
55-
56-
duckdb_vx_data (*init_local)(const void *bind_data, duckdb_vx_error *error_out);
57-
58-
void (*copy_to_sink)(const void *bind_data,
59-
void *global_data,
60-
void *local_data,
61-
duckdb_data_chunk data_chunk_out,
62-
duckdb_vx_error *error_out);
63-
64-
void (*copy_to_finalize)(const void *bind_data, void *global_data, duckdb_vx_error *error_out);
65-
66-
// TODO(joe): expose via c api
67-
// copy_function_execution_mode (*execution_mode)(bool preserve_insertion_order, bool
68-
// supports_batch_index);
69-
} duckdb_vx_copy_func_vtab_t;
70-
71-
// Due to a limitation in the copy function duckdb api we have to have global (copy) vtabs.
72-
duckdb_vx_copy_func_vtab_t *get_vtab_one();
73-
74-
// A single function for configuring the DuckDB table function vtable.
75-
duckdb_state duckdb_vx_copy_func_register_vtab_one(duckdb_database ffi_db);
76-
77-
#ifdef __cplusplus /* End C ABI */
78-
};
16+
#ifdef __cplusplus
17+
extern "C" duckdb_state duckdb_vx_register_copy_function(duckdb_database ffi_db);
18+
#else
19+
duckdb_state duckdb_vx_register_copy_function(duckdb_database ffi_db);
7920
#endif

vortex-duckdb/cpp/table_function.cpp

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -34,12 +34,7 @@ struct CTableBindData final : FunctionData {
3434
};
3535

3636
unique_ptr<FunctionData> CTableBindData::Copy() const {
37-
duckdb_vx_error error_out = nullptr;
38-
const auto copied_ffi_data = duckdb_table_function_bind_data_clone(ffi_data->DataPtr(), &error_out);
39-
if (error_out) {
40-
throw BinderException(IntoErrString(error_out));
41-
}
42-
37+
const auto copied_ffi_data = duckdb_table_function_bind_data_clone(ffi_data->DataPtr());
4338
auto ffi_data_p = unique_ptr<CData>(reinterpret_cast<CData *>(copied_ffi_data));
4439
return make_uniq<CTableBindData>(std::move(ffi_data_p), types);
4540
}

0 commit comments

Comments
 (0)