Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,9 @@ set(DUCKDB_RDKIT_LIBRARIES
RDKit::SmilesParse
RDKit::GraphMol
RDKit::Descriptors
)
RDKit::Fingerprints
RDKit::DataStructs
)
# Link OpenSSL in both the static library as the loadable extension
target_link_libraries(${EXTENSION_NAME} ${DUCKDB_RDKIT_LIBRARIES})
target_link_libraries(${LOADABLE_EXTENSION_NAME} ${DUCKDB_RDKIT_LIBRARIES})
Expand Down
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,14 +55,16 @@ cheminformatics work with DuckDB.

- Example: `SELECT mol, id FROM 'test.sdf';`

### Searches
### Molecular comparisons

- `is_exact_match(mol1, mol2)`: exact structure search. Returns true if the two molecules are the same. (Chirality sensitive search is not on)
- Note: if you are looking for very specific capabilities with exact match with regards
to stereochemistry or tautomers, the `RegistrationHash` (https://rdkit.org/docs/source/rdkit.Chem.RegistrationHash.html)
might be an option to consider. You would need to write this to your DB and
then you can do a simple VARCHAR based search on those columns.
- `is_substruct(mol1, mol2)`: returns true if mol2 is a substructure of mol1.
- `tanimoto_similarity(mol1, mol2)`: returns the tanimoto similarity between both Morgan Fingerprints (radius=2, nb_bits=2048)
- `tanimoto_similarity(mol1, mol2, radius, nb_bits)`: returns the tanimoto similarity between both Morgan Fingerprints with specific radius & nb_bits

### Molecule conversion functions

Expand Down
55 changes: 55 additions & 0 deletions src/mol_compare.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,11 @@
#include <GraphMol/MolPickler.h>
#include <GraphMol/SmilesParse/SmilesWrite.h>
#include <GraphMol/Substruct/SubstructMatch.h>
#include <GraphMol/Fingerprints/MorganFingerprints.h>
#include <DataStructs/ExplicitBitVect.h>
#include <DataStructs/BitVect.h>
#include <DataStructs/BitVects.h>

#include <cstdio>
#include <memory>

Expand Down Expand Up @@ -137,6 +142,47 @@ static void is_substruct(DataChunk &args, ExpressionState &state,
});
}

double tanimoto_morgan(const std::string &m1_bmol, const std::string &m2_bmol, int radius = 2, int nb_bits = 2048) {
std::unique_ptr<RDKit::ROMol> m1(new RDKit::ROMol());
std::unique_ptr<RDKit::ROMol> m2(new RDKit::ROMol());
RDKit::MolPickler::molFromPickle(m1_bmol, *m1);
RDKit::MolPickler::molFromPickle(m2_bmol, *m2);

auto fp1 = RDKit::MorganFingerprints::getFingerprintAsBitVect(*m1, radius, nb_bits);
auto fp2 = RDKit::MorganFingerprints::getFingerprintAsBitVect(*m2, radius, nb_bits);

return TanimotoSimilarity(*fp1, *fp2);
}

static void tanimoto_similarity(DataChunk &args, ExpressionState &state, Vector &result) {
D_ASSERT(args.ColumnCount() == 2 || args.ColumnCount() == 4);

auto &left = args.data[0];
auto &right = args.data[1];

int radius = 2;
int nb_bits = 2048;

if (args.ColumnCount() == 4) {
auto &radius_vec = args.data[2];
if (!duckdb::FlatVector::IsNull(radius_vec, 0)) {
radius = duckdb::FlatVector::GetData<int32_t>(radius_vec)[0];
}
auto &nb_bits_vec = args.data[3];
if (!duckdb::FlatVector::IsNull(nb_bits_vec, 0)) {
nb_bits = duckdb::FlatVector::GetData<int32_t>(nb_bits_vec)[0];
}
}

BinaryExecutor::Execute<string_t, string_t, double>(
left, right, result, args.size(),
[&](string_t &left_umbra_blob, string_t &right_umbra_blob) {
auto left_mol = umbra_mol_t(left_umbra_blob);
auto right_mol = umbra_mol_t(right_umbra_blob);
return tanimoto_morgan(left_mol.GetBinaryMol(), right_mol.GetBinaryMol(), radius, nb_bits);
});
}

void RegisterCompareFunctions(DatabaseInstance &instance) {
ScalarFunctionSet set("is_exact_match");
// left type and right type
Expand All @@ -149,6 +195,15 @@ void RegisterCompareFunctions(DatabaseInstance &instance) {
ScalarFunction({duckdb_rdkit::Mol(), duckdb_rdkit::Mol()},
LogicalType::BOOLEAN, is_substruct));
ExtensionUtil::RegisterFunction(instance, set_is_substruct);

ScalarFunctionSet set_tanimoto("tanimoto_similarity");
set_tanimoto.AddFunction(ScalarFunction(
{duckdb_rdkit::Mol(), duckdb_rdkit::Mol()},
LogicalType::DOUBLE, tanimoto_similarity));
set_tanimoto.AddFunction(ScalarFunction(
{duckdb_rdkit::Mol(), duckdb_rdkit::Mol(), LogicalType::INTEGER, LogicalType::INTEGER},
LogicalType::DOUBLE, tanimoto_similarity));
ExtensionUtil::RegisterFunction(instance, set_tanimoto);
}

} // namespace duckdb_rdkit
33 changes: 33 additions & 0 deletions test/sql/mol_search.test → test/sql/mol_compare.test
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,36 @@ SELECT m FROM molecules WHERE is_exact_match(m,'CCO');
CCO
CCO

# test substruct search when some matches
query I
SELECT m FROM molecules WHERE is_substruct(m,'c1ccncc1');
----
c1ccncc1
c1ccc(-c2ccccn2)nc1

# test substruct search when no match
query I
SELECT m FROM molecules WHERE is_substruct(m,'c1cCcncc1');
----

# test similarity computation with default parameters
query II
SELECT m, tanimoto_similarity(m,mol_from_smiles('c1ccncc1')) FROM molecules;
----
c1ccccc1 0.3333333333333333
CCO 0.0
c1ccncc1 1.0
c1ccc(-c2ccccn2)nc1 0.2777777777777778
CCO 0.0
Cc1ccccc1 0.17647058823529413

# test similarity computation with custom parameters
query II
SELECT m, tanimoto_similarity(m,mol_from_smiles('c1ccncc1'), 4, 4096) FROM molecules;
----
c1ccccc1 0.2727272727272727
CCO 0.0
c1ccncc1 1.0
c1ccc(-c2ccccn2)nc1 0.20833333333333334
CCO 0.0
Cc1ccccc1 0.15