diff --git a/CMakeLists.txt b/CMakeLists.txt index bc5a94f..bea9760 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,6 +7,9 @@ set(CMAKE_CXX_STANDARD_REQUIRED True) # Set extension name here set(TARGET_NAME duckdb_rdkit) +# Find Boost with components before RDKit, so that Boost::system etc. targets +# are created (vcpkg's BoostConfig only creates targets for requested components) +find_package(Boost REQUIRED COMPONENTS system serialization iostreams) find_package(RDKit REQUIRED) set(EXTENSION_NAME ${TARGET_NAME}_extension) diff --git a/duckdb b/duckdb index 1986445..e05f4ff 160000 --- a/duckdb +++ b/duckdb @@ -1 +1 @@ -Subproject commit 19864453f7d0ed095256d848b46e7b8630989bac +Subproject commit e05f4ff81a8fbed460514f80e457260043ff8e9d diff --git a/extension-ci-tools b/extension-ci-tools index 1a6fd9d..e6882cf 160000 --- a/extension-ci-tools +++ b/extension-ci-tools @@ -1 +1 @@ -Subproject commit 1a6fd9db1d65240095da86892d6616a47b117f0f +Subproject commit e6882cf3b0e9afa1616980dff8f7b2d8cbaf49db diff --git a/notes/rdkit-vcpkg-port.md b/notes/rdkit-vcpkg-port.md new file mode 100644 index 0000000..c1cda09 --- /dev/null +++ b/notes/rdkit-vcpkg-port.md @@ -0,0 +1,123 @@ +# RDKit vcpkg Port Notes + +## Summary + +RDKit is not available in vcpkg. Creating a C++ only port is feasible. + +## Current Build Issue + +The project uses: +- **vcpkg toolchain** for DuckDB extension build system +- **Spack-installed RDKit** at `/mnt/aux-data/teague/Dev/spack/var/spack/environments/duckdb/.spack-env/view/` + +The conflict: RDKit's cmake config (`rdkit-targets.cmake:61-64`) declares dependencies on `Boost::system`, `Boost::serialization`, `Boost::iostreams`. When vcpkg's toolchain intercepts `find_package(Boost)`, it looks in vcpkg's install tree where Boost doesn't exist. 
+ +## RDKit Dependencies (from Spack recipe) + +### Required +- `boost` (+system +serialization +iostreams) - **in vcpkg** +- `sqlite` - **in vcpkg** + +### Optional (for full build) +- `freetype` - **in vcpkg** +- `eigen3` (for 3D descriptors) - **in vcpkg** +- `coordgen` - **NOT in vcpkg** (would need port) +- `maeparser` - **NOT in vcpkg** (would need port) +- `freesasa` - **NOT in vcpkg** +- Python/NumPy (for wrappers) - not needed for C++ only + +### For C++ Only Build +Only need: boost, sqlite, optionally eigen3/freetype. All available in vcpkg. + +## vcpkg Port Structure + +A port requires two files in `/home/teague/Dev/vcpkg/ports/rdkit/`: + +### vcpkg.json +```json +{ + "name": "rdkit", + "version": "2024.03.3", + "description": "RDKit: Open-Source Cheminformatics Software", + "homepage": "https://www.rdkit.org", + "license": "BSD-3-Clause", + "dependencies": [ + "boost-system", + "boost-serialization", + "boost-iostreams", + "sqlite3", + { + "name": "vcpkg-cmake", + "host": true + }, + { + "name": "vcpkg-cmake-config", + "host": true + } + ], + "features": { + "freetype": { + "description": "Build with FreeType support", + "dependencies": ["freetype"] + }, + "3d": { + "description": "Build 3D descriptor calculators", + "dependencies": ["eigen3"] + } + } +} +``` + +### portfile.cmake (skeleton) +```cmake +vcpkg_from_github( + OUT_SOURCE_PATH SOURCE_PATH + REPO rdkit/rdkit + REF Release_2024_03_3 + SHA512 + HEAD_REF master +) + +vcpkg_cmake_configure( + SOURCE_PATH "${SOURCE_PATH}" + OPTIONS + -DRDK_INSTALL_INTREE=OFF + -DRDK_BUILD_PYTHON_WRAPPERS=OFF + -DRDK_BUILD_COORDGEN_SUPPORT=OFF + -DRDK_BUILD_MAEPARSER_SUPPORT=OFF + -DRDK_BUILD_FREESASA_SUPPORT=OFF + -DRDK_BUILD_YAEHMOP_SUPPORT=OFF + -DRDK_BUILD_XYZ2MOL_SUPPORT=OFF +) + +vcpkg_cmake_install() +vcpkg_cmake_config_fixup(CONFIG_PATH lib/cmake/rdkit) +vcpkg_copy_pdbs() + +file(REMOVE_RECURSE "${CURRENT_PACKAGES_DIR}/debug/include") +vcpkg_install_copyright(FILE_LIST "${SOURCE_PATH}/license.txt") +``` + 
+## Effort Estimate (C++ Only) + +| Task | Time | +|------|------| +| Create vcpkg.json | 15 min | +| Create portfile.cmake | 30 min | +| Debug build issues | 1-2 hours | +| Test on Linux | 30 min | +| **Total** | **2-4 hours** | + +## Alternative: Fix Current Build + +Instead of creating a port, could: +1. Add boost to vcpkg manifest so `Boost::system` target exists +2. Disable vcpkg toolchain's find_package override for specific packages +3. Use Spack entirely (remove vcpkg toolchain) + +## References + +- Spack recipe: `spack edit rdkit` +- RDKit CMake docs: https://github.com/rdkit/rdkit/blob/master/Code/cmake/Modules/ +- vcpkg port tutorial: https://learn.microsoft.com/en-us/vcpkg/get_started/get-started-adding-to-registry +- Example complex port: `/home/teague/Dev/vcpkg/ports/eigen3/` diff --git a/src/cast.cpp b/src/cast.cpp index 4be63d5..32c0e74 100644 --- a/src/cast.cpp +++ b/src/cast.cpp @@ -4,7 +4,6 @@ #include "duckdb/common/types/string_type.hpp" #include "duckdb/common/types/vector.hpp" #include "duckdb/function/cast/default_casts.hpp" -#include "duckdb/main/extension_util.hpp" #include "mol_formats.hpp" #include "types.hpp" #include "umbra_mol.hpp" @@ -74,14 +73,12 @@ bool MolToVarcharCast(Vector &source, Vector &result, idx_t count, return true; } -void RegisterCasts(DatabaseInstance &instance) { - ExtensionUtil::RegisterCastFunction(instance, LogicalType::VARCHAR, - ::duckdb_rdkit::Mol(), - BoundCastInfo(VarcharToMolCast), 1); +void RegisterCasts(ExtensionLoader &loader) { + loader.RegisterCastFunction(LogicalType::VARCHAR, ::duckdb_rdkit::Mol(), + BoundCastInfo(VarcharToMolCast), 1); - ExtensionUtil::RegisterCastFunction(instance, duckdb_rdkit::Mol(), - LogicalType::VARCHAR, - BoundCastInfo(MolToVarcharCast), 1); + loader.RegisterCastFunction(duckdb_rdkit::Mol(), LogicalType::VARCHAR, + BoundCastInfo(MolToVarcharCast), 1); } } // namespace duckdb_rdkit diff --git a/src/duckdb_rdkit_extension.cpp b/src/duckdb_rdkit_extension.cpp index 
5768555..e59b7f9 100644 --- a/src/duckdb_rdkit_extension.cpp +++ b/src/duckdb_rdkit_extension.cpp @@ -4,7 +4,7 @@ #define DUCKDB_EXTENSION_MAIN #include "cast.hpp" -#include "duckdb/main/extension_util.hpp" +#include "duckdb/main/extension/extension_loader.hpp" #include "duckdb_rdkit_extension.hpp" #include "mol_compare.hpp" #include "mol_formats.hpp" @@ -20,39 +20,43 @@ namespace duckdb { -static void LoadInternal(DatabaseInstance &instance) { - duckdb_rdkit::RegisterTypes(instance); - duckdb_rdkit::RegisterCasts(instance); - duckdb_rdkit::RegisterFormatFunctions(instance); - duckdb_rdkit::RegisterCompareFunctions(instance); - duckdb_rdkit::RegisterDescriptorFunctions(instance); +static void LoadInternal(ExtensionLoader &loader) { + duckdb_rdkit::RegisterTypes(loader); + duckdb_rdkit::RegisterCasts(loader); + duckdb_rdkit::RegisterFormatFunctions(loader); + duckdb_rdkit::RegisterCompareFunctions(loader); + duckdb_rdkit::RegisterDescriptorFunctions(loader); for (auto &fun : SDFFunctions::GetTableFunctions()) { - ExtensionUtil::RegisterFunction(instance, fun); + loader.RegisterFunction(fun); } // SDF replacement scan + auto &instance = loader.GetDatabaseInstance(); auto &config = DBConfig::GetConfig(instance); config.replacement_scans.emplace_back(SDFFunctions::ReadSDFReplacement); } -void DuckdbRdkitExtension::Load(DuckDB &db) { LoadInternal(*db.instance); } +void DuckdbRdkitExtension::Load(ExtensionLoader &loader) { + LoadInternal(loader); +} std::string DuckdbRdkitExtension::Name() { return "duckdb_rdkit"; } } // namespace duckdb +#ifdef DUCKDB_BUILD_LOADABLE_EXTENSION extern "C" { -DUCKDB_EXTENSION_API void duckdb_rdkit_init(duckdb::DatabaseInstance &db) { - duckdb::DuckDB db_wrapper(db); - db_wrapper.LoadExtension(); +DUCKDB_CPP_EXTENSION_ENTRY(duckdb_rdkit, loader) { + duckdb::LoadInternal(loader); } DUCKDB_EXTENSION_API const char *duckdb_rdkit_version() { return duckdb::DuckDB::LibraryVersion(); } } +#endif #ifndef DUCKDB_EXTENSION_MAIN #error 
DUCKDB_EXTENSION_MAIN not defined diff --git a/src/include/cast.hpp b/src/include/cast.hpp index 7814334..81e4200 100644 --- a/src/include/cast.hpp +++ b/src/include/cast.hpp @@ -9,6 +9,6 @@ bool VarcharToMolCast(Vector &source, Vector &result, idx_t count, void MolToVarchar(Vector &source, Vector &result, idx_t count); bool MolToVarcharCast(Vector &source, Vector &result, idx_t count, CastParameters &parameters); -void RegisterCasts(DatabaseInstance &instance); +void RegisterCasts(ExtensionLoader &loader); } // namespace duckdb_rdkit diff --git a/src/include/common.hpp b/src/include/common.hpp index 1753225..8e1ad48 100644 --- a/src/include/common.hpp +++ b/src/include/common.hpp @@ -2,7 +2,7 @@ #include "duckdb.hpp" #include "duckdb/common/helper.hpp" -#include "duckdb/main/extension_util.hpp" +#include "duckdb/main/extension/extension_loader.hpp" // including common.hpp into the other files makes it so that // it is not necessary to put duckdb::FUNCTION. Brings in the namespace diff --git a/src/include/duckdb_rdkit_extension.hpp b/src/include/duckdb_rdkit_extension.hpp index be18a75..495ee19 100644 --- a/src/include/duckdb_rdkit_extension.hpp +++ b/src/include/duckdb_rdkit_extension.hpp @@ -4,9 +4,11 @@ namespace duckdb { +class ExtensionLoader; + class DuckdbRdkitExtension : public Extension { public: - void Load(DuckDB &db) override; + void Load(ExtensionLoader &loader) override; std::string Name() override; }; diff --git a/src/include/mol_compare.hpp b/src/include/mol_compare.hpp index 98827bf..fea0247 100644 --- a/src/include/mol_compare.hpp +++ b/src/include/mol_compare.hpp @@ -1,5 +1,5 @@ #pragma once #include "common.hpp" namespace duckdb_rdkit { -void RegisterCompareFunctions(DatabaseInstance &instance); +void RegisterCompareFunctions(ExtensionLoader &loader); } // namespace duckdb_rdkit diff --git a/src/include/mol_descriptors.hpp b/src/include/mol_descriptors.hpp index 520fe09..e2229c8 100644 --- a/src/include/mol_descriptors.hpp +++ 
b/src/include/mol_descriptors.hpp @@ -1,5 +1,5 @@ #pragma once #include "common.hpp" namespace duckdb_rdkit { -void RegisterDescriptorFunctions(DatabaseInstance &instance); +void RegisterDescriptorFunctions(ExtensionLoader &loader); } diff --git a/src/include/mol_formats.hpp b/src/include/mol_formats.hpp index ef0ba21..2256ecc 100644 --- a/src/include/mol_formats.hpp +++ b/src/include/mol_formats.hpp @@ -12,5 +12,5 @@ std::string rdkit_mol_to_binary_mol(const RDKit::ROMol mol); std::unique_ptr rdkit_binary_mol_to_mol(std::string bmol); std::string rdkit_mol_to_smiles(RDKit::ROMol mol); -void RegisterFormatFunctions(DatabaseInstance &instance); +void RegisterFormatFunctions(ExtensionLoader &loader); } // namespace duckdb_rdkit diff --git a/src/include/sdf_scanner/sdf_scan.hpp b/src/include/sdf_scanner/sdf_scan.hpp index b6e33c3..619874c 100644 --- a/src/include/sdf_scanner/sdf_scan.hpp +++ b/src/include/sdf_scanner/sdf_scan.hpp @@ -1,6 +1,6 @@ #pragma once #include "GraphMol/FileParsers/MolSupplier.h" -#include "duckdb/common/multi_file_reader.hpp" +#include "duckdb/common/multi_file/multi_file_reader.hpp" #include "duckdb/common/unique_ptr.hpp" #include "duckdb/execution/execution_context.hpp" #include "duckdb/function/function.hpp" diff --git a/src/include/types.hpp b/src/include/types.hpp index a31416d..cf692b7 100644 --- a/src/include/types.hpp +++ b/src/include/types.hpp @@ -8,5 +8,5 @@ namespace duckdb_rdkit { LogicalType Mol(); -void RegisterTypes(DatabaseInstance &instance); +void RegisterTypes(ExtensionLoader &loader); } // namespace duckdb_rdkit diff --git a/src/mol_compare.cpp b/src/mol_compare.cpp index 04afac9..11c84f3 100644 --- a/src/mol_compare.cpp +++ b/src/mol_compare.cpp @@ -2,7 +2,6 @@ #include "duckdb/common/types/vector.hpp" #include "duckdb/execution/expression_executor_state.hpp" #include "duckdb/function/scalar_function.hpp" -#include "duckdb/main/extension_util.hpp" #include "mol_formats.hpp" #include "types.hpp" #include "umbra_mol.hpp" @@ 
-145,18 +144,18 @@ static void is_substruct(DataChunk &args, ExpressionState &state, }); } -void RegisterCompareFunctions(DatabaseInstance &instance) { +void RegisterCompareFunctions(ExtensionLoader &loader) { ScalarFunctionSet set("is_exact_match"); // left type and right type set.AddFunction(ScalarFunction({duckdb_rdkit::Mol(), duckdb_rdkit::Mol()}, LogicalType::BOOLEAN, is_exact_match)); - ExtensionUtil::RegisterFunction(instance, set); + loader.RegisterFunction(set); ScalarFunctionSet set_is_substruct("is_substruct"); set_is_substruct.AddFunction( ScalarFunction({duckdb_rdkit::Mol(), duckdb_rdkit::Mol()}, LogicalType::BOOLEAN, is_substruct)); - ExtensionUtil::RegisterFunction(instance, set_is_substruct); + loader.RegisterFunction(set_is_substruct); } } // namespace duckdb_rdkit diff --git a/src/mol_descriptors.cpp b/src/mol_descriptors.cpp index aef4947..246ddd6 100644 --- a/src/mol_descriptors.cpp +++ b/src/mol_descriptors.cpp @@ -8,7 +8,6 @@ #include "duckdb/common/vector_operations/unary_executor.hpp" #include "duckdb/execution/expression_executor_state.hpp" #include "duckdb/function/function_set.hpp" -#include "duckdb/main/extension_util.hpp" #include "mol_formats.hpp" #include "qed.hpp" #include "types.hpp" @@ -131,45 +130,45 @@ void mol_num_rotatable_bonds(DataChunk &args, ExpressionState &state, Vector &re }); } -void RegisterDescriptorFunctions(DatabaseInstance &instance) { +void RegisterDescriptorFunctions(ExtensionLoader &loader) { ScalarFunctionSet set_mol_amw("mol_amw"); set_mol_amw.AddFunction( ScalarFunction({duckdb_rdkit::Mol()}, LogicalType::FLOAT, mol_amw)); - ExtensionUtil::RegisterFunction(instance, set_mol_amw); + loader.RegisterFunction(set_mol_amw); ScalarFunctionSet set_mol_exactmw("mol_exactmw"); set_mol_exactmw.AddFunction( ScalarFunction({duckdb_rdkit::Mol()}, LogicalType::FLOAT, mol_exactmw)); - ExtensionUtil::RegisterFunction(instance, set_mol_exactmw); + loader.RegisterFunction(set_mol_exactmw); ScalarFunctionSet 
set_mol_tpsa("mol_tpsa"); set_mol_tpsa.AddFunction( ScalarFunction({duckdb_rdkit::Mol()}, LogicalType::FLOAT, mol_tpsa)); - ExtensionUtil::RegisterFunction(instance, set_mol_tpsa); + loader.RegisterFunction(set_mol_tpsa); ScalarFunctionSet set_mol_qed("mol_qed"); set_mol_qed.AddFunction( ScalarFunction({duckdb_rdkit::Mol()}, LogicalType::FLOAT, mol_qed)); - ExtensionUtil::RegisterFunction(instance, set_mol_qed); + loader.RegisterFunction(set_mol_qed); ScalarFunctionSet set_mol_logp("mol_logp"); set_mol_logp.AddFunction( ScalarFunction({duckdb_rdkit::Mol()}, LogicalType::FLOAT, mol_logp)); - ExtensionUtil::RegisterFunction(instance, set_mol_logp); + loader.RegisterFunction(set_mol_logp); ScalarFunctionSet set_mol_hbd("mol_hbd"); set_mol_hbd.AddFunction( ScalarFunction({duckdb_rdkit::Mol()}, LogicalType::INTEGER, mol_hbd)); - ExtensionUtil::RegisterFunction(instance, set_mol_hbd); + loader.RegisterFunction(set_mol_hbd); ScalarFunctionSet set_mol_hba("mol_hba"); set_mol_hba.AddFunction( ScalarFunction({duckdb_rdkit::Mol()}, LogicalType::INTEGER, mol_hba)); - ExtensionUtil::RegisterFunction(instance, set_mol_hba); + loader.RegisterFunction(set_mol_hba); ScalarFunctionSet set_mol_num_rotatable_bonds("mol_num_rotatable_bonds"); set_mol_num_rotatable_bonds.AddFunction( ScalarFunction({duckdb_rdkit::Mol()}, LogicalType::INTEGER, mol_num_rotatable_bonds)); - ExtensionUtil::RegisterFunction(instance, set_mol_num_rotatable_bonds); + loader.RegisterFunction(set_mol_num_rotatable_bonds); } } // namespace duckdb_rdkit diff --git a/src/mol_formats.cpp b/src/mol_formats.cpp index f4a313b..be11297 100644 --- a/src/mol_formats.cpp +++ b/src/mol_formats.cpp @@ -4,7 +4,6 @@ #include "duckdb/common/types.hpp" #include "duckdb/execution/expression_executor_state.hpp" #include "duckdb/function/function_set.hpp" -#include "duckdb/main/extension_util.hpp" #include "types.hpp" #include "umbra_mol.hpp" #include @@ -114,22 +113,22 @@ void mol_to_rdkit_mol(DataChunk &args, ExpressionState 
&state, Vector &result) { }); } -void RegisterFormatFunctions(DatabaseInstance &instance) { +void RegisterFormatFunctions(ExtensionLoader &loader) { // Register scalar functions ScalarFunctionSet mol_from_smiles_set("mol_from_smiles"); mol_from_smiles_set.AddFunction( ScalarFunction({LogicalType::VARCHAR}, Mol(), mol_from_smiles)); - ExtensionUtil::RegisterFunction(instance, mol_from_smiles_set); + loader.RegisterFunction(mol_from_smiles_set); ScalarFunctionSet mol_to_smiles_set("mol_to_smiles"); mol_to_smiles_set.AddFunction( ScalarFunction({Mol()}, LogicalType::VARCHAR, mol_to_smiles)); - ExtensionUtil::RegisterFunction(instance, mol_to_smiles_set); + loader.RegisterFunction(mol_to_smiles_set); ScalarFunctionSet mol_to_rdkit_mol_set("mol_to_rdkit_mol"); mol_to_rdkit_mol_set.AddFunction( ScalarFunction({Mol()}, LogicalType::BLOB, mol_to_rdkit_mol)); - ExtensionUtil::RegisterFunction(instance, mol_to_rdkit_mol_set); + loader.RegisterFunction(mol_to_rdkit_mol_set); } } // namespace duckdb_rdkit diff --git a/src/sdf_scanner/sdf_functions.cpp b/src/sdf_scanner/sdf_functions.cpp index 322115a..16ef70e 100644 --- a/src/sdf_scanner/sdf_functions.cpp +++ b/src/sdf_scanner/sdf_functions.cpp @@ -1,7 +1,8 @@ #include "sdf_scanner/sdf_functions.hpp" #include "duckdb/common/assert.hpp" -#include "duckdb/common/multi_file_list.hpp" -#include "duckdb/common/multi_file_reader.hpp" +#include "duckdb/common/multi_file/multi_file_list.hpp" +#include "duckdb/common/multi_file/multi_file_reader.hpp" +#include "duckdb/common/open_file_info.hpp" #include "duckdb/common/types.hpp" #include "duckdb/common/types/vector.hpp" #include "duckdb/function/function.hpp" @@ -135,9 +136,17 @@ unique_ptr ReadSDFBind(ClientContext &context, // } } } - //! get the files - SimpleMultiFileList file_list(std::move(bind_data->files)); - bind_data->files = file_list.GetAllFiles(); + //! 
get the files - convert string paths to OpenFileInfo + vector file_infos; + for (auto &file : bind_data->files) { + file_infos.emplace_back(file); + } + SimpleMultiFileList file_list(std::move(file_infos)); + auto all_files = file_list.GetAllFiles(); + bind_data->files.clear(); + for (auto &file_info : all_files) { + bind_data->files.push_back(file_info.path); + } if (bind_data->files.size() > 1) { throw NotImplementedException( "Reading more than one sdf file is currently not supported."); diff --git a/src/sdf_scanner/sdf_scan.cpp b/src/sdf_scanner/sdf_scan.cpp index 0b6ab33..90b29a7 100644 --- a/src/sdf_scanner/sdf_scan.cpp +++ b/src/sdf_scanner/sdf_scan.cpp @@ -1,7 +1,7 @@ #include "sdf_scanner/sdf_scan.hpp" #include "duckdb/common/allocator.hpp" #include "duckdb/common/helper.hpp" -#include "duckdb/common/multi_file_reader.hpp" +#include "duckdb/common/multi_file/multi_file_reader.hpp" #include "duckdb/common/types.hpp" #include "duckdb/common/unique_ptr.hpp" #include "duckdb/common/vector_size.hpp" @@ -19,7 +19,11 @@ void SDFScanData::Bind(ClientContext &context, TableFunctionBindInput &input) { auto multi_file_reader = MultiFileReader::Create(input.table_function); auto file_list = multi_file_reader->CreateFileList(context, input.inputs[0]); - files = file_list->GetAllFiles(); + auto all_files = file_list->GetAllFiles(); + files.clear(); + for (auto &file_info : all_files) { + files.push_back(file_info.path); + } } unique_ptr diff --git a/src/types.cpp b/src/types.cpp index 0f05bec..87d7415 100644 --- a/src/types.cpp +++ b/src/types.cpp @@ -5,7 +5,6 @@ #include "duckdb/common/constants.hpp" #include "duckdb/common/types.hpp" #include "duckdb/main/database.hpp" -#include "duckdb/main/extension_util.hpp" namespace duckdb_rdkit { @@ -15,9 +14,9 @@ LogicalType Mol() { return blob_type; } -void RegisterTypes(DatabaseInstance &instance) { +void RegisterTypes(ExtensionLoader &loader) { // Register Mol type - ExtensionUtil::RegisterType(instance, "Mol", Mol()); + 
loader.RegisterType("Mol", Mol()); } } // namespace duckdb_rdkit diff --git a/test/sql/edge_cases.test b/test/sql/edge_cases.test new file mode 100644 index 0000000..49b84d1 --- /dev/null +++ b/test/sql/edge_cases.test @@ -0,0 +1,258 @@ +# name: test/sql/edge_cases.test +# description: Edge case tests for duckdb_rdkit extension +# group: [duckdb_rdkit] + +require duckdb_rdkit + +# ============================================================================ +# NULL handling tests +# ============================================================================ + +# mol_from_smiles with NULL input +query I +SELECT mol_from_smiles(NULL); +---- +NULL + +# mol_to_smiles with NULL input +query I +SELECT mol_to_smiles(NULL::mol); +---- +NULL + +# descriptors with NULL mol +query I +SELECT mol_amw(NULL::mol); +---- +NULL + +query I +SELECT mol_exactmw(NULL::mol); +---- +NULL + +query I +SELECT mol_logp(NULL::mol); +---- +NULL + +query I +SELECT mol_tpsa(NULL::mol); +---- +NULL + +query I +SELECT mol_hbd(NULL::mol); +---- +NULL + +query I +SELECT mol_hba(NULL::mol); +---- +NULL + +query I +SELECT mol_qed(NULL::mol); +---- +NULL + +query I +SELECT mol_num_rotatable_bonds(NULL::mol); +---- +NULL + +# is_exact_match with NULL +query I +SELECT is_exact_match(NULL::mol, 'CCO'::mol); +---- +NULL + +query I +SELECT is_exact_match('CCO'::mol, NULL::mol); +---- +NULL + +query I +SELECT is_exact_match(NULL::mol, NULL::mol); +---- +NULL + +# is_substruct with NULL +query I +SELECT is_substruct(NULL::mol, 'CCO'::mol); +---- +NULL + +query I +SELECT is_substruct('CCO'::mol, NULL::mol); +---- +NULL + +# ============================================================================ +# Invalid SMILES handling +# ============================================================================ + +# Invalid SMILES returns NULL (not error) +query I +SELECT mol_from_smiles('not_a_smiles'); +---- +NULL + +query I +SELECT mol_from_smiles('C(C)(C)(C)(C)C'); +---- +NULL + +# Empty SMILES returns empty (not 
NULL) - this is consistent with RDKit behavior +query I +SELECT mol_from_smiles('') IS NOT NULL; +---- +true + +# ============================================================================ +# Stereochemistry tests +# ============================================================================ + +# E/Z isomers - note: is_exact_match currently treats E/Z isomers as equivalent +# This tests current behavior; stereochemistry comparison may be enhanced in future +query I +SELECT is_exact_match('C/C=C/C'::mol, 'C/C=C\C'::mol); +---- +true + +# Same E isomer with different notation +query I +SELECT is_exact_match('C/C=C/C'::mol, mol_from_smiles('C/C=C/C')); +---- +true + +# R/S chirality - note: is_exact_match treats enantiomers as equivalent +# This tests current behavior; stereochemistry comparison may be enhanced in future +query I +SELECT is_exact_match('[C@H](O)(F)Cl'::mol, '[C@@H](O)(F)Cl'::mol); +---- +true + +# Same chiral molecule +query I +SELECT is_exact_match('[C@H](O)(F)Cl'::mol, '[C@H](O)(F)Cl'::mol); +---- +true + +# Chiral center preserved through round-trip +query I +SELECT mol_to_smiles('[C@H](Cl)(F)O'::mol) LIKE '%@%'; +---- +true + +# ============================================================================ +# Large/Complex molecule tests +# ============================================================================ + +# Taxol (paclitaxel) - a complex natural product +statement ok +CREATE TABLE complex_mols (name VARCHAR, m Mol); + +statement ok +INSERT INTO complex_mols VALUES + ('taxol', 'CC1=C2[C@@]([C@]([C@H]([C@@H]3[C@]4([C@H](OC4)C[C@@H]([C@]3(C(=O)[C@@H]2OC(=O)C)C)O)OC(=O)C)OC(=O)c5ccccc5)(C[C@@H]1OC(=O)[C@H](O)[C@@H](NC(=O)c6ccccc6)c7ccccc7)O)(C)C'), + ('aspirin', 'CC(=O)Oc1ccccc1C(=O)O'), + ('caffeine', 'Cn1cnc2c1c(=O)n(c(=O)n2C)C'); + +# Complex molecule can be stored and retrieved +query I +SELECT COUNT(*) FROM complex_mols WHERE m IS NOT NULL; +---- +3 + +# Descriptors work on complex molecules +query I +SELECT name FROM complex_mols 
WHERE mol_amw(m) > 500 ORDER BY name; +---- +taxol + +# Substructure search on complex molecule - find molecules with benzene +query I +SELECT name FROM complex_mols WHERE is_substruct(m, 'c1ccccc1'::mol) ORDER BY name; +---- +aspirin +taxol + +# ============================================================================ +# Molecule with charged atoms +# ============================================================================ + +query I +SELECT mol_to_smiles('[NH4+]'::mol); +---- +[NH4+] + +query I +SELECT mol_to_smiles('[O-]C(=O)C'::mol); +---- +CC(=O)[O-] + +# Zwitterion +query I +SELECT mol_to_smiles('[NH3+]CCC([O-])=O'::mol) IS NOT NULL; +---- +true + +# ============================================================================ +# Molecules with isotopes +# ============================================================================ + +query I +SELECT mol_to_smiles('[13CH4]'::mol); +---- +[13CH4] + +query I +SELECT mol_to_smiles('[2H]C([2H])([2H])[2H]'::mol) IS NOT NULL; +---- +true + +# ============================================================================ +# Ring systems +# ============================================================================ + +# Fused rings - naphthalene +query I +SELECT is_substruct('c1ccc2ccccc2c1'::mol, 'c1ccccc1'::mol); +---- +true + +# Spiro compound +query I +SELECT mol_to_smiles('C1CCC2(CC1)CCCCC2'::mol) IS NOT NULL; +---- +true + +# Bridged bicyclic - norbornane +query I +SELECT mol_to_smiles('C1CC2CCC1C2'::mol) IS NOT NULL; +---- +true + +# ============================================================================ +# Edge case SMILES +# ============================================================================ + +# Single atom +query I +SELECT mol_to_smiles('C'::mol); +---- +C + +# Disconnected fragments (salts) +query I +SELECT mol_to_smiles('[Na+].[Cl-]'::mol) IS NOT NULL; +---- +true + +# Very long chain +query I +SELECT mol_amw('CCCCCCCCCCCCCCCCCCCCCCCCCCCCCC'::mol) > 400; +---- +true + diff --git 
a/test/sql/mol_search.test b/test/sql/mol_search.test index 16759bd..3f47fe8 100644 --- a/test/sql/mol_search.test +++ b/test/sql/mol_search.test @@ -32,3 +32,65 @@ SELECT m FROM molecules WHERE is_exact_match(m,'CCO'); CCO CCO +# ============================================================================ +# is_substruct tests - substructure searching +# ============================================================================ + +# benzene is a substructure of toluene +query I +SELECT is_substruct('Cc1ccccc1'::mol, 'c1ccccc1'::mol); +---- +true + +# toluene is NOT a substructure of benzene (has extra methyl) +query I +SELECT is_substruct('c1ccccc1'::mol, 'Cc1ccccc1'::mol); +---- +false + +# ethanol contains C-O substructure +query I +SELECT is_substruct('CCO'::mol, 'CO'::mol); +---- +true + +# find all molecules containing benzene ring +query I rowsort +SELECT m FROM molecules WHERE is_substruct(m, 'c1ccccc1'::mol); +---- +Cc1ccccc1 +c1ccccc1 + +# find all molecules containing pyridine ring +query I rowsort +SELECT m FROM molecules WHERE is_substruct(m, 'c1ccncc1'::mol); +---- +c1ccc(-c2ccccn2)nc1 +c1ccncc1 + +# bipyridine contains pyridine as substructure +query I +SELECT is_substruct('c1ccc(-c2ccccn2)nc1'::mol, 'c1ccncc1'::mol); +---- +true + +# same molecule is substructure of itself +query I +SELECT is_substruct('CCO'::mol, 'CCO'::mol); +---- +true + +# SMARTS-like pattern: any aromatic ring (using benzene as proxy) +query I +SELECT COUNT(*) FROM molecules WHERE is_substruct(m, 'c1ccccc1'::mol); +---- +2 + +# hydroxyl group substructure search +query I rowsort +SELECT m FROM molecules WHERE is_substruct(m, 'O'::mol); +---- +CCO +CCO + + diff --git a/vcpkg.json b/vcpkg.json index 1128b92..62a08a8 100644 --- a/vcpkg.json +++ b/vcpkg.json @@ -1,3 +1,7 @@ { - "dependencies": ["openssl"] + "dependencies": [ + "boost-system", + "boost-serialization", + "boost-iostreams" + ] }