-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathmol_compare.cpp
More file actions
161 lines (142 loc) · 6.36 KB
/
mol_compare.cpp
File metadata and controls
161 lines (142 loc) · 6.36 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#include "common.hpp"
#include "duckdb/common/types/vector.hpp"
#include "duckdb/execution/expression_executor_state.hpp"
#include "duckdb/function/scalar_function.hpp"
#include "mol_formats.hpp"
#include "types.hpp"
#include "umbra_mol.hpp"
#include <GraphMol/Descriptors/MolDescriptors.h>
#include <GraphMol/GraphMol.h>
#include <GraphMol/MolPickler.h>
#include <GraphMol/SmilesParse/SmilesWrite.h>
#include <GraphMol/Substruct/SubstructMatch.h>
#include <cstdio>
#include <memory>
namespace duckdb_rdkit {
// credit: code is from chemicalite
// https://github.com/rvianello/chemicalite
// See mol_search.test for an example of
// a molecule which can return false negative, if the SMILES is different from
// the query
bool mol_cmp(std::string m1_bmol, std::string m2_bmol) {
std::unique_ptr<RDKit::ROMol> m1(new RDKit::ROMol());
std::unique_ptr<RDKit::ROMol> m2(new RDKit::ROMol());
RDKit::MolPickler::molFromPickle(m1_bmol, *m1);
RDKit::MolPickler::molFromPickle(m2_bmol, *m2);
// credit: code is from chemicalite
// https://github.com/rvianello/chemicalite
// See mol_search.test for an example of
// a molecule which can return false negative, if the SMILES is different
// from the query if m1 is substruct of m2 and m2 is substruct of m1,
// likely to be the same molecule
RDKit::MatchVectType matchVect;
bool recursion_possible = false;
bool do_chiral_match = false; /* FIXME: make configurable getDoChiralSSS(); */
bool ss1 = RDKit::SubstructMatch(*m1, *m2, matchVect, recursion_possible,
do_chiral_match);
bool ss2 = RDKit::SubstructMatch(*m2, *m1, matchVect, recursion_possible,
do_chiral_match);
if (ss1 && !ss2) {
return false;
} else if (!ss1 && ss2) {
return false;
}
// the above can still fail in some chirality cases
std::string smi1 = RDKit::MolToSmiles(*m1, do_chiral_match);
std::string smi2 = RDKit::MolToSmiles(*m2, do_chiral_match);
return smi1 == smi2;
}
static void is_exact_match(DataChunk &args, ExpressionState &state,
Vector &result) {
D_ASSERT(args.ColumnCount() == 2);
// args.data[i] is a FLAT_VECTOR
auto &left = args.data[0];
auto &right = args.data[1];
BinaryExecutor::Execute<string_t, string_t, bool>(
left, right, result, args.size(),
[&](string_t &left_umbra_blob, string_t &right_umbra_blob) {
auto left = umbra_mol_t(left_umbra_blob);
auto right = umbra_mol_t(right_umbra_blob);
// The prefix of a umbra_mol contains a bit vector for substructure
// screens. We also use this to check exact match. If the molecules
// being compared do not have the same substructures marked by the
// dalke_fp, they cannot be an exact match
if (memcmp(left.GetPrefix(), right.GetPrefix(),
umbra_mol_t::PREFIX_BYTES) != 0) {
return false;
};
// otherwise, do the more extensive check with rdkit
return mol_cmp(left.GetBinaryMol(), right.GetBinaryMol());
});
}
bool _is_substruct(umbra_mol_t target, umbra_mol_t query) {
// if the fragment exists in the query but not in the target,
// there is no way for a match. This only works in one direction
//
// If the fragment exists in the target, but not the query, it is still
// possible there is something in the query that matches the target, but
// is not captured in the dalke fingerprint
//
// If all fragments that are on in the query are also on in the target,
// this does not mean that the query is a substructure. It is possible
// that there is something in the query not captured in the fingerprint
// that is present in the query, but not in the target. For example,
// if the query has NCCCCCCCC, and the target has the N bit set,
// but it could be that the target is only NC
//
// It is only possible to short-circuit in the false case, not in the
// true case
auto q_prefix = query.GetPrefixAsInt();
auto t_prefix = target.GetPrefixAsInt();
// The 4 byte prefix in string_t is inlined. This is very fast to check.
// If we cannot conclude that the query is NOT a substructure of the target,
// we need the rest of the dalke fp. This requires chasing a pointer to the
// data of which the next 4 bytes of the dalke fp is at the front of.
if ((q_prefix & t_prefix) == q_prefix) {
auto q_dalke_fp = query.GetDalkeFP();
auto t_dalke_fp = target.GetDalkeFP();
if ((q_dalke_fp & t_dalke_fp) == q_dalke_fp) {
// query might be substructure of the target -- run a substructure match
// on the molecule objects
std::unique_ptr<RDKit::ROMol> left_mol(new RDKit::ROMol());
std::unique_ptr<RDKit::ROMol> right_mol(new RDKit::ROMol());
RDKit::MolPickler::molFromPickle(target.GetBinaryMol(), *left_mol);
RDKit::MolPickler::molFromPickle(query.GetBinaryMol(), *right_mol);
// copied from chemicalite
RDKit::MatchVectType matchVect;
bool recursion_possible = true;
bool do_chiral_match =
false; /* FIXME: make configurable getDoChiralSSS(); */
return RDKit::SubstructMatch(*left_mol, *right_mol, matchVect,
recursion_possible, do_chiral_match);
}
}
return false;
}
static void is_substruct(DataChunk &args, ExpressionState &state,
Vector &result) {
D_ASSERT(args.ColumnCount() == 2);
// args.data[i] is a FLAT_VECTOR
auto &left = args.data[0];
auto &right = args.data[1];
BinaryExecutor::Execute<string_t, string_t, bool>(
left, right, result, args.size(),
[&](string_t &left_umbra_blob, string_t &right_umbra_blob) {
auto left_umbra_mol = umbra_mol_t(left_umbra_blob);
auto right_umbra_mol = umbra_mol_t(right_umbra_blob);
return _is_substruct(left_umbra_mol, right_umbra_mol);
});
}
void RegisterCompareFunctions(ExtensionLoader &loader) {
ScalarFunctionSet set("is_exact_match");
// left type and right type
set.AddFunction(ScalarFunction({duckdb_rdkit::Mol(), duckdb_rdkit::Mol()},
LogicalType::BOOLEAN, is_exact_match));
loader.RegisterFunction(set);
ScalarFunctionSet set_is_substruct("is_substruct");
set_is_substruct.AddFunction(
ScalarFunction({duckdb_rdkit::Mol(), duckdb_rdkit::Mol()},
LogicalType::BOOLEAN, is_substruct));
loader.RegisterFunction(set_is_substruct);
}
} // namespace duckdb_rdkit