Skip to content

Commit b68c4d7

Browse files
authored
Feature: graph and dataset anonymizer (#415)
1 parent 8518a96 commit b68c4d7

10 files changed

Lines changed: 437 additions & 47 deletions

File tree

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -147,6 +147,7 @@ add_library(rdf4cpp
147147
src/rdf4cpp/writer/SerializationState.cpp
148148
src/rdf4cpp/IRIView.cpp
149149
src/rdf4cpp/IRIFactory.cpp
150+
src/rdf4cpp/util/Anonymizer.cpp
150151
private/rdf4cpp/parser/IStreamQuadIteratorSerdImpl.cpp
151152
private/rdf4cpp/regex/RegexImpl.cpp
152153
private/rdf4cpp/regex/RegexReplacerImpl.cpp

conanfile.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -38,7 +38,7 @@ def requirements(self):
3838
self.requires("highway/1.2.0")
3939
self.requires("dice-hash/0.4.11", transitive_headers=True)
4040
self.requires("dice-sparse-map/0.2.9", transitive_headers=True)
41-
self.requires("dice-template-library/1.13.0", transitive_headers=True)
41+
self.requires("dice-template-library/1.19.0", transitive_headers=True)
4242

4343
if self.options.with_test_deps:
4444
self.test_requires("doctest/2.4.11")

src/rdf4cpp/Dataset.cpp

Lines changed: 44 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
#include "Dataset.hpp"
1+
#include <rdf4cpp/Dataset.hpp>
22
#include <rdf4cpp/Graph.hpp>
33
#include <rdf4cpp/writer/TryWrite.hpp>
44
#include <rdf4cpp/writer/SerializationState.hpp>
@@ -181,6 +181,10 @@ Dataset::iterator &Dataset::iterator::operator++() noexcept {
181181
return *this;
182182
}
183183

184+
void Dataset::iterator::operator++(int) noexcept {
185+
++*this;
186+
}
187+
184188
Dataset::iterator::reference Dataset::iterator::operator*() const noexcept {
185189
return cur_;
186190
}
@@ -189,12 +193,12 @@ Dataset::iterator::pointer Dataset::iterator::operator->() const noexcept {
189193
return &cur_;
190194
}
191195

192-
bool Dataset::iterator::operator==(Dataset::sentinel) const noexcept {
193-
return giter_ == gend_;
196+
bool operator==(Dataset::iterator const &self, Dataset::sentinel) noexcept {
197+
return self.giter_ == self.gend_;
194198
}
195199

196-
bool Dataset::iterator::operator!=(Dataset::sentinel) const noexcept {
197-
return !(*this == Dataset::sentinel{});
200+
bool operator==(Dataset::sentinel, Dataset::iterator const &self) noexcept {
201+
return self.giter_ == self.gend_;
198202
}
199203

200204
void Dataset::solution_iterator::fill_solution() noexcept {
@@ -208,6 +212,17 @@ void Dataset::solution_iterator::fill_solution() noexcept {
208212
}
209213
}
210214

215+
void Dataset::solution_iterator::advance_until_result() {
216+
while (iter_ == std::default_sentinel) {
217+
++giter_;
218+
if (giter_ == gend_) {
219+
return;
220+
}
221+
222+
iter_ = giter_->second.match(pat_.without_graph()).begin();
223+
}
224+
}
225+
211226
Dataset::solution_iterator::solution_iterator(Dataset const *parent,
212227
query::QuadPattern const &pat,
213228
typename storage_type::const_iterator beg,
@@ -217,6 +232,7 @@ Dataset::solution_iterator::solution_iterator(Dataset const *parent,
217232

218233
if (pat_.graph().is_variable()) {
219234
iter_ = giter_->second.match(tpat).begin();
235+
advance_until_result();
220236
} else if (auto const *g = parent_->find_graph(pat_.graph()); g != nullptr) {
221237
iter_ = g->match(tpat).begin();
222238
}
@@ -229,20 +245,17 @@ Dataset::solution_iterator &Dataset::solution_iterator::operator++() noexcept {
229245
++iter_;
230246

231247
if (pat_.graph().is_variable()) {
232-
while (iter_ == std::default_sentinel) {
233-
++giter_;
234-
if (giter_ == gend_) {
235-
return *this;
236-
}
237-
238-
iter_ = giter_->second.match(pat_.without_graph()).begin();
239-
}
248+
advance_until_result();
240249
}
241250

242251
fill_solution();
243252
return *this;
244253
}
245254

255+
void Dataset::solution_iterator::operator++(int) noexcept {
256+
++*this;
257+
}
258+
246259
Dataset::solution_iterator::reference Dataset::solution_iterator::operator*() const noexcept {
247260
return cur_;
248261
}
@@ -251,12 +264,26 @@ Dataset::solution_iterator::pointer Dataset::solution_iterator::operator->() con
251264
return &cur_;
252265
}
253266

254-
bool Dataset::solution_iterator::operator==(Dataset::sentinel) const noexcept {
255-
return iter_ == Dataset::sentinel{};
267+
bool operator==(Dataset::solution_iterator const &self, Dataset::sentinel) noexcept {
268+
return self.iter_ == Dataset::sentinel{};
269+
}
270+
271+
bool operator==(Dataset::sentinel, Dataset::solution_iterator const &self) noexcept {
272+
return self.iter_ == Dataset::sentinel{};
256273
}
257274

258-
bool Dataset::solution_iterator::operator!=(Dataset::sentinel) const noexcept {
259-
return !(*this == Dataset::sentinel{});
275+
Dataset Dataset::anonymize(util::Anonymizer &anonymizer) const {
276+
Dataset anon{anonymizer.node_storage()};
277+
278+
for (auto const &[graph_id, graph] : graphs_) {
279+
anon.graphs_.emplace(
280+
anonymizer.anonymize(to_node(graph_id)).backend_handle().id(),
281+
graph.anonymize(anonymizer)
282+
);
283+
}
284+
285+
return anon;
260286
}
261287

288+
262289
} // namespace rdf4cpp

src/rdf4cpp/Dataset.hpp

Lines changed: 16 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@
55
#include <rdf4cpp/Quad.hpp>
66
#include <rdf4cpp/query/QuadPattern.hpp>
77
#include <rdf4cpp/writer/BufWriter.hpp>
8+
#include <rdf4cpp/util/Anonymizer.hpp>
89

910
#include <dice/sparse-map/sparse_map.hpp>
1011

@@ -49,11 +50,12 @@ struct Dataset {
4950
typename storage_type::const_iterator gend) noexcept;
5051

5152
iterator &operator++() noexcept;
53+
void operator++(int) noexcept;
5254
reference operator*() const noexcept;
5355
pointer operator->() const noexcept;
5456

55-
bool operator==(sentinel) const noexcept;
56-
bool operator!=(sentinel) const noexcept;
57+
friend bool operator==(iterator const &self, sentinel) noexcept;
58+
friend bool operator==(sentinel, iterator const &self) noexcept;
5759
};
5860

5961
using const_iterator = iterator;
@@ -75,6 +77,7 @@ struct Dataset {
7577
Graph::solution_iterator iter_;
7678
value_type cur_;
7779

80+
void advance_until_result();
7881
void fill_solution() noexcept;
7982

8083
public:
@@ -84,11 +87,12 @@ struct Dataset {
8487
typename storage_type::const_iterator end) noexcept;
8588

8689
solution_iterator &operator++() noexcept;
90+
void operator++(int) noexcept;
8791
reference operator*() const noexcept;
8892
pointer operator->() const noexcept;
8993

90-
bool operator==(sentinel) const noexcept;
91-
bool operator!=(sentinel) const noexcept;
94+
friend bool operator==(solution_iterator const &self, sentinel) noexcept;
95+
friend bool operator==(sentinel, solution_iterator const &self) noexcept;
9296
};
9397

9498
struct solution_sequence {
@@ -202,6 +206,14 @@ struct Dataset {
202206

203207
friend std::ostream &operator<<(std::ostream &os, Dataset const &self);
204208

209+
210+
/**
211+
* Anonymize the dataset by removing all information except for the dataset structure itself.
212+
*
213+
* See `rdf4cpp::util::Anonymizer` for details
214+
*/
215+
[[nodiscard]] Dataset anonymize(util::Anonymizer &anonymizer) const;
216+
205217
// TODO: support union (+) and difference (-)
206218
// TODO: add empty
207219
};

src/rdf4cpp/Graph.cpp

Lines changed: 40 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -1,31 +1,44 @@
1-
#include "Graph.hpp"
1+
#include <rdf4cpp/Graph.hpp>
22
#include <rdf4cpp/Dataset.hpp>
33
#include <rdf4cpp/writer/TryWrite.hpp>
44
#include <rdf4cpp/writer/SerializationState.hpp>
55

66
#include <utility>
7+
#include <ranges>
78

89
namespace rdf4cpp {
910

1011
storage::identifier::NodeBackendID Graph::to_node_id(Node node) noexcept {
1112
return node.backend_handle().id();
1213
}
1314

15+
Graph::triple Graph::to_id_triple(Statement const &stmt) noexcept {
16+
return triple{
17+
to_node_id(stmt.subject()),
18+
to_node_id(stmt.predicate()),
19+
to_node_id(stmt.object())
20+
};
21+
}
22+
1423
Node Graph::to_node(storage::identifier::NodeBackendID id) const noexcept {
1524
return Node{storage::identifier::NodeBackendHandle{id, node_storage_}};
1625
}
1726

27+
Statement Graph::to_statement(triple const &t) const noexcept {
28+
return Statement{to_node(t[0]), to_node(t[1]), to_node(t[2])};
29+
}
30+
1831
Graph::Graph(storage::DynNodeStoragePtr node_storage) noexcept : node_storage_{node_storage} {
1932
}
2033

2134
void Graph::add(Statement const &stmt_) {
2235
auto stmt = stmt_.to_node_storage(node_storage_);
23-
triples_.insert(triple{to_node_id(stmt.subject()), to_node_id(stmt.predicate()), to_node_id(stmt.object())});
36+
triples_.insert(to_id_triple(stmt));
2437
}
2538

2639
bool Graph::contains(Statement const &stmt_) const noexcept {
2740
auto const stmt = stmt_.try_get_in_node_storage(node_storage_);
28-
return triples_.contains(triple{to_node_id(stmt.subject()), to_node_id(stmt.predicate()), to_node_id(stmt.object())});
41+
return triples_.contains(to_id_triple(stmt));
2942
}
3043

3144
Graph::iterator Graph::begin() const noexcept {
@@ -86,27 +99,27 @@ std::ostream &operator<<(std::ostream &os, Graph const &graph) {
8699
return os;
87100
}
88101

89-
Statement Graph::iterator::to_statement(rdf4cpp::Graph::triple const &t) const noexcept {
90-
return Statement{parent_->to_node(t[0]), parent_->to_node(t[1]), parent_->to_node(t[2])};
91-
}
92-
93102
Graph::iterator::iterator(Graph const *parent, typename triple_storage_type::const_iterator beg, typename triple_storage_type::const_iterator end) noexcept : parent_{parent},
94103
iter_{beg},
95104
end_{end} {
96105
if (iter_ != end_) {
97-
cur_ = to_statement(*iter_);
106+
cur_ = parent_->to_statement(*iter_);
98107
}
99108
}
100109

101110
Graph::iterator &Graph::iterator::operator++() noexcept {
102111
++iter_;
103112
if (iter_ != end_) {
104-
cur_ = to_statement(*iter_);
113+
cur_ = parent_->to_statement(*iter_);
105114
}
106115

107116
return *this;
108117
}
109118

119+
void Graph::iterator::operator++(int) noexcept {
120+
++*this;
121+
}
122+
110123
Graph::reference Graph::iterator::operator*() const noexcept {
111124
return cur_;
112125
}
@@ -115,12 +128,12 @@ Graph::pointer Graph::iterator::operator->() const noexcept {
115128
return &cur_;
116129
}
117130

118-
bool Graph::iterator::operator==(Graph::sentinel) const noexcept {
119-
return iter_ == end_;
131+
bool operator==(Graph::iterator const &self, Graph::sentinel) noexcept {
132+
return self.iter_ == self.end_;
120133
}
121134

122-
bool Graph::iterator::operator!=(Graph::sentinel) const noexcept {
123-
return !(*this == Graph::sentinel{});
135+
bool operator==(Graph::sentinel, Graph::iterator const &self) noexcept {
136+
return self.iter_ == self.end_;
124137
}
125138

126139
bool Graph::solution_iterator::check_solution() noexcept {
@@ -168,12 +181,22 @@ Graph::solution_iterator::pointer Graph::solution_iterator::operator->() const n
168181
return &cur_;
169182
}
170183

171-
bool Graph::solution_iterator::operator==(Graph::sentinel) const noexcept {
172-
return iter_ == Graph::sentinel{};
184+
bool operator==(Graph::solution_iterator const &self, Graph::sentinel) noexcept {
185+
return self.iter_ == Graph::sentinel{};
186+
}
187+
188+
bool operator==(Graph::sentinel, Graph::solution_iterator const &self) noexcept {
189+
return self.iter_ == Graph::sentinel{};
173190
}
174191

175-
bool Graph::solution_iterator::operator!=(Graph::sentinel) const noexcept {
176-
return iter_ != Graph::sentinel{};
192+
Graph Graph::anonymize(util::Anonymizer &anonymizer) const {
193+
Graph anon{anonymizer.node_storage()};
194+
195+
for (auto const &non_anon_triple : triples_) {
196+
anon.triples_.insert(to_id_triple(anonymizer.anonymize(to_statement(non_anon_triple))));
197+
}
198+
199+
return anon;
177200
}
178201

179202
} // namespace rdf4cpp

src/rdf4cpp/Graph.hpp

Lines changed: 20 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@
77
#include <rdf4cpp/writer/BufWriter.hpp>
88
#include <rdf4cpp/writer/SerializationState.hpp>
99
#include <rdf4cpp/parser/RDFFileParser.hpp>
10+
#include <rdf4cpp/util/Anonymizer.hpp>
1011

1112
#include <dice/sparse-map/sparse_set.hpp>
1213

@@ -50,18 +51,18 @@ struct Graph {
5051

5152
Statement cur_;
5253

53-
Statement to_statement(triple const &t) const noexcept;
54-
5554
public:
5655
iterator() noexcept = default;
5756
iterator(Graph const *parent, typename triple_storage_type::const_iterator beg, typename triple_storage_type::const_iterator end) noexcept;
5857

5958
iterator &operator++() noexcept;
59+
void operator++(int) noexcept;
60+
6061
reference operator*() const noexcept;
6162
pointer operator->() const noexcept;
6263

63-
bool operator==(sentinel) const noexcept;
64-
bool operator!=(sentinel) const noexcept;
64+
friend bool operator==(iterator const &self, sentinel) noexcept;
65+
friend bool operator==(sentinel, iterator const &self) noexcept;
6566
};
6667

6768
using const_iterator = iterator;
@@ -87,15 +88,16 @@ struct Graph {
8788
query::TriplePattern const &pat) noexcept;
8889

8990
solution_iterator &operator++() noexcept;
91+
void operator++(int) noexcept;
9092
reference operator*() const noexcept;
9193
pointer operator->() const noexcept;
9294

93-
bool operator==(sentinel) const noexcept;
94-
bool operator!=(sentinel) const noexcept;
95+
friend bool operator==(solution_iterator const &self, sentinel) noexcept;
96+
friend bool operator==(sentinel, solution_iterator const &self) noexcept;
9597
};
9698

9799
struct solution_sequence {
98-
using value_type = Statement;
100+
using value_type = query::Solution;
99101
using size_type = size_t;
100102
using difference_type = ptrdiff_t;
101103
using reference = value_type const &;
@@ -127,7 +129,10 @@ struct Graph {
127129
triple_storage_type triples_;
128130

129131
static storage::identifier::NodeBackendID to_node_id(Node node) noexcept;
130-
Node to_node(storage::identifier::NodeBackendID id) const noexcept;
132+
static triple to_id_triple(Statement const &stmt) noexcept;
133+
134+
[[nodiscard]] Node to_node(storage::identifier::NodeBackendID id) const noexcept;
135+
[[nodiscard]] Statement to_statement(triple const &t) const noexcept;
131136

132137
public:
133138
explicit Graph(storage::DynNodeStoragePtr node_storage = storage::default_node_storage) noexcept;
@@ -196,6 +201,13 @@ struct Graph {
196201
*/
197202
friend std::ostream &operator<<(std::ostream &os, Graph const &graph);
198203

204+
/**
205+
* Anonymize the graph by removing all information except for the graph structure itself.
206+
*
207+
* See `rdf4cpp::util::Anonymizer` for details
208+
*/
209+
[[nodiscard]] Graph anonymize(util::Anonymizer &anonymizer) const;
210+
199211
// TODO: support union (+) and difference (-); open question: which graph name should be assigned?
200212
// TODO: add empty
201213
};

0 commit comments

Comments
 (0)