diff --git a/CMakeLists.txt b/CMakeLists.txt index bc5a94f..fbfbff4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -35,7 +35,9 @@ set(DUCKDB_RDKIT_LIBRARIES RDKit::SmilesParse RDKit::GraphMol RDKit::Descriptors - ) + RDKit::Fingerprints + RDKit::DataStructs + ) # Link OpenSSL in both the static library as the loadable extension target_link_libraries(${EXTENSION_NAME} ${DUCKDB_RDKIT_LIBRARIES}) target_link_libraries(${LOADABLE_EXTENSION_NAME} ${DUCKDB_RDKIT_LIBRARIES}) diff --git a/README.md b/README.md index 543d38b..8a38827 100644 --- a/README.md +++ b/README.md @@ -55,7 +55,7 @@ cheminformatics work with DuckDB. - Example: `SELECT mol, id FROM 'test.sdf';` -### Searches +### Molecular comparisons - `is_exact_match(mol1, mol2)`: exact structure search. Returns true if the two molecules are the same. (Chirality sensitive search is not on) - Note: if you are looking for very specific capabilities with exact match with regards @@ -63,6 +63,8 @@ cheminformatics work with DuckDB. might be an option to consider. You would need to write this to your DB and then you can do a simple VARCHAR based search on those columns. - `is_substruct(mol1, mol2)`: returns true if mol2 is a substructure of mol1. +- `tanimoto_similarity(mol1, mol2)`: returns the tanimoto similarity between both Morgan Fingerprints (radius=2, nb_bits=2048) +- `tanimoto_similarity(mol1, mol2, radius, nb_bits)`: returns the tanimoto similarity between both Morgan Fingerprints with specific radius & nb_bits ### Molecule conversion functions diff --git a/src/mol_compare.cpp b/src/mol_compare.cpp index b15f0ca..f545ea9 100644 --- a/src/mol_compare.cpp +++ b/src/mol_compare.cpp @@ -11,6 +11,11 @@ #include #include #include +#include +#include +#include +#include + #include #include @@ -137,6 +142,47 @@ static void is_substruct(DataChunk &args, ExpressionState &state, }); } +double tanimoto_morgan(const std::string &m1_bmol, const std::string &m2_bmol, int radius = 2, int nb_bits = 2048) { + std::unique_ptr m1(new RDKit::ROMol()); + std::unique_ptr m2(new RDKit::ROMol()); + RDKit::MolPickler::molFromPickle(m1_bmol, *m1); + RDKit::MolPickler::molFromPickle(m2_bmol, *m2); + + auto fp1 = RDKit::MorganFingerprints::getFingerprintAsBitVect(*m1, radius, nb_bits); + auto fp2 = RDKit::MorganFingerprints::getFingerprintAsBitVect(*m2, radius, nb_bits); + + return TanimotoSimilarity(*fp1, *fp2); +} + +static void tanimoto_similarity(DataChunk &args, ExpressionState &state, Vector &result) { + D_ASSERT(args.ColumnCount() == 2 || args.ColumnCount() == 4); + + auto &left = args.data[0]; + auto &right = args.data[1]; + + int radius = 2; + int nb_bits = 2048; + + if (args.ColumnCount() == 4) { + auto &radius_vec = args.data[2]; + if (!duckdb::FlatVector::IsNull(radius_vec, 0)) { + radius = duckdb::FlatVector::GetData(radius_vec)[0]; + } + auto &nb_bits_vec = args.data[3]; + if (!duckdb::FlatVector::IsNull(nb_bits_vec, 0)) { + nb_bits = duckdb::FlatVector::GetData(nb_bits_vec)[0]; + } + } + + BinaryExecutor::Execute( + left, right, result, args.size(), + [&](string_t &left_umbra_blob, string_t &right_umbra_blob) { + auto left_mol = umbra_mol_t(left_umbra_blob); + auto right_mol = umbra_mol_t(right_umbra_blob); + return tanimoto_morgan(left_mol.GetBinaryMol(), right_mol.GetBinaryMol(), radius, nb_bits); + }); +} + void RegisterCompareFunctions(DatabaseInstance &instance) { ScalarFunctionSet set("is_exact_match"); // left type and right type @@ -149,6 +195,15 @@ void RegisterCompareFunctions(DatabaseInstance &instance) { ScalarFunction({duckdb_rdkit::Mol(), duckdb_rdkit::Mol()}, LogicalType::BOOLEAN, is_substruct)); ExtensionUtil::RegisterFunction(instance, set_is_substruct); + + ScalarFunctionSet set_tanimoto("tanimoto_similarity"); + set_tanimoto.AddFunction(ScalarFunction( + {duckdb_rdkit::Mol(), duckdb_rdkit::Mol()}, + LogicalType::DOUBLE, tanimoto_similarity)); + set_tanimoto.AddFunction(ScalarFunction( + {duckdb_rdkit::Mol(), duckdb_rdkit::Mol(), LogicalType::INTEGER, LogicalType::INTEGER}, + LogicalType::DOUBLE, tanimoto_similarity)); + ExtensionUtil::RegisterFunction(instance, set_tanimoto); } } // namespace duckdb_rdkit diff --git a/test/sql/mol_search.test b/test/sql/mol_compare.test similarity index 50% rename from test/sql/mol_search.test rename to test/sql/mol_compare.test index 16759bd..93a0a13 100644 --- a/test/sql/mol_search.test +++ b/test/sql/mol_compare.test @@ -32,3 +32,36 @@ SELECT m FROM molecules WHERE is_exact_match(m,'CCO'); CCO CCO +# test substruct search when some matches +query I +SELECT m FROM molecules WHERE is_substruct(m,'c1ccncc1'); +---- +c1ccncc1 +c1ccc(-c2ccccn2)nc1 + +# test substruct search when no match +query I +SELECT m FROM molecules WHERE is_substruct(m,'c1cCcncc1'); +---- + +# test similarity computation with default parameters +query II +SELECT m, tanimoto_similarity(m,mol_from_smiles('c1ccncc1')) FROM molecules; +---- +c1ccccc1 0.3333333333333333 +CCO 0.0 +c1ccncc1 1.0 +c1ccc(-c2ccccn2)nc1 0.2777777777777778 +CCO 0.0 +Cc1ccccc1 0.17647058823529413 + +# test similarity computation with custom parameters +query II +SELECT m, tanimoto_similarity(m,mol_from_smiles('c1ccncc1'), 4, 4096) FROM molecules; +---- +c1ccccc1 0.2727272727272727 +CCO 0.0 +c1ccncc1 1.0 +c1ccc(-c2ccccn2)nc1 0.20833333333333334 +CCO 0.0 +Cc1ccccc1 0.15