diff --git a/.gitmodules b/.gitmodules index f4e5432..87e7ce9 100644 --- a/.gitmodules +++ b/.gitmodules @@ -7,3 +7,6 @@ [submodule "specifications/normalization"] path = specifications/normalization url = https://github.com/json-ld/normalization.git +[submodule "specifications/rdf-canon"] + path = specifications/rdf-canon + url = https://github.com/w3c/rdf-canon.git diff --git a/lib/pyld/canon.py b/lib/pyld/canon.py index a22d85e..4dd7a1b 100644 --- a/lib/pyld/canon.py +++ b/lib/pyld/canon.py @@ -1,8 +1,14 @@ import copy import hashlib +import rdflib +from rdflib import XSD, BNode, Dataset, Literal, Node +from rdflib.graph import DATASET_DEFAULT_GRAPH_ID +from rdflib.plugins.parsers.nquads import NQuadsParser +from rdflib.plugins.serializers.nt import _quote_encode + from pyld.identifier_issuer import IdentifierIssuer -from pyld.nquads import parse_nquads, serialize_nquad +from pyld.util import from_legacy_dataset, to_legacy_dataset class URDNA2015: @@ -14,11 +20,12 @@ def __init__(self): self.blank_node_info = {} self.hash_to_blank_nodes = {} self.canonical_issuer = IdentifierIssuer('_:c14n') - self.quads = [] - self.POSITIONS = {'subject': 's', 'object': 'o', 'name': 'g'} + self.dataset = None + self.POSITIONS = ['s', 'p', 'o', 'g'] + self.hash_algorithm = hashlib.sha256 # 4.4) Normalization Algorithm - def main(self, dataset, options): + def main(self, dataset: str | dict | Dataset, options) -> str | dict: # handle invalid output format if 'format' in options and ( options['format'] != 'application/n-quads' @@ -26,32 +33,69 @@ def main(self, dataset, options): ): raise UnknownFormatError('Unknown output format.', options['format']) + # handle different input types: nquads string, dict (legacy), or rdflib Dataset + rdflib_dataset = Dataset() + parser = NQuadsParser() + if isinstance(dataset, str): + # Only support N-Quads string input for now + if ( + options['inputFormat'] != 'application/n-quads' + and options['inputFormat'] != 'application/nquads' + ): + 
raise UnknownFormatError('Unknown input format.', options['format']) + rdflib.NORMALIZE_LITERALS = False + parser.parse(rdflib.parser.StringInputSource(dataset), rdflib_dataset) + elif isinstance(dataset, dict): + rdflib_dataset = from_legacy_dataset(dataset) + elif isinstance(dataset, Dataset): + rdflib_dataset = dataset + else: + raise ValueError(f'Unsupported dataset type: {type(dataset)}') + + normalized, bnode_map = self._canonicalize(rdflib_dataset) + + # Merge any new bnode IDs from the parser into the id_map, + # mapping old bnode IDs to their new canonical IDs + for k, v in parser._bnode_ids.items(): + bnode_id = str(v) + if bnode_id in bnode_map: + bnode_map[k] = bnode_map[bnode_id] + del bnode_map[bnode_id] + + # If outputMap option is set, return the map of blank node identifiers. + if options.get('outputMap'): + return dict(sorted(bnode_map.items(), key=lambda item: item[1])) + + # 8) Return the normalized dataset. + if ( + options.get('format') == 'application/n-quads' + or options.get('format') == 'application/nquads' + ): + return normalized + + # If the output format is not nquads, return a dataset object. + result = Dataset().parse(data=normalized, format='nquads') + return to_legacy_dataset(result) + + def _canonicalize(self, dataset: Dataset) -> tuple[str, dict]: + """ + Performs RDF Dataset Canonicalization on the given rdflib.Dataset + and returns the normalized output along with a map of blank node + identifiers to their canonical identifiers. + """ + + self.dataset = dataset # 1) Create the normalization state. 
# 2) For every quad in input dataset: - for graph_name, triples in dataset.items(): - if graph_name == '@default': - graph_name = None - for triple in triples: - quad = triple - if graph_name is not None: - if graph_name.startswith('_:'): - quad['name'] = {'type': 'blank node'} - else: - quad['name'] = {'type': 'IRI'} - quad['name']['value'] = graph_name - self.quads.append(quad) - - # 2.1) For each blank node that occurs in the quad, add a - # reference to the quad using the blank node identifier in the - # blank node to quads map, creating a new entry if necessary. - for key, component in quad.items(): - if key == 'predicate' or component['type'] != 'blank node': - continue - id_ = component['value'] - self.blank_node_info.setdefault(id_, {'quads': []})['quads'].append( - quad - ) + for s, p, o, g in dataset.quads((None, None, None, None)): + # 2.1) For each blank node that occurs in the quad, add a + # reference to the quad using the blank node identifier in the + # blank node to quads map, creating a new entry if necessary. + for component in (s, o, g if g else None): + if isinstance(component, BNode): + id_ = str(component) + self.blank_node_info.setdefault(id_, {'quads': []})['quads'].append((s, p, o, g)) # 3) Create a list of non-normalized blank node identifiers and # populate it using the keys from the blank node to quads map. @@ -151,33 +195,41 @@ def main(self, dataset, options): # 7) For each quad, quad, in input dataset: normalized = [] - for quad in self.quads: + id_map = {} + for s, p, o, g in self.dataset.quads((None, None, None, None)): # 7.1) Create a copy, quad copy, of quad and replace any existing # blank node identifiers using the canonical identifiers previously # issued by canonical issuer. Note: We optimize away the copy here. 
- for key, component in quad.items(): - if key == 'predicate': - continue - if component['type'] == 'blank node' and not component[ - 'value' - ].startswith(self.canonical_issuer.prefix): - component['value'] = self.canonical_issuer.get_id( - component['value'] - ) + + # Helper to map nodes + def map_node(node): + if isinstance(node, BNode): + node_id = str(node) + # Only issue a new ID if it's not already canonicalized + cid = self.canonical_issuer.get_id(node_id) + if cid.startswith('_:'): + cid = cid[2:] # Strip '_:' prefix for rdflib BNode compatibility + id_map[node_id] = cid + return BNode(cid) + return node + + # Transform Subject, Object, and Graph Name (Predicate is never a BNode in RDFC1.0) + s_n = map_node(s) + p_n = p # Predicates are never BNodes in standard RDF + o_n = map_node(o) + g_n = map_node(g) + + # Use modified version of rdflib's internal _nq_row for standardized string output + line = self._nq_row((s_n, p_n, o_n),g_n) # 7.2) Add quad copy to the normalized dataset. - normalized.append(serialize_nquad(quad)) + normalized.append(line) # sort normalized output normalized.sort() - # 8) Return the normalized dataset. 
- if ( - options.get('format') == 'application/n-quads' - or options.get('format') == 'application/nquads' - ): - return ''.join(normalized) - return parse_nquads(''.join(normalized)) + # return nquads string + return ''.join(normalized), id_map # 4.6) Hash First Degree Quads def hash_first_degree_quads(self, id_): @@ -195,23 +247,25 @@ def hash_first_degree_quads(self, id_): quads = info['quads'] # 3) For each quad quad in quads: - for quad in quads: + for s, p, o, g in quads: # 3.1) Serialize the quad in N-Quads format with the following # special rule: # 3.1.1) If any component in quad is an blank node, then serialize # it using a special identifier as follows: - copy = {} - for key, component in quad.items(): - if key == 'predicate': - copy[key] = component - continue - # 3.1.2) If the blank node's existing blank node identifier - # matches the reference blank node identifier then use the - # blank node identifier _:a, otherwise, use the blank node - # identifier _:z. - copy[key] = self.modify_first_degree_component(id_, component, key) - nquads.append(serialize_nquad(copy)) + p_n = p # Predicates are never BNodes in standard RDF + + # 3.1.2) If the blank node's existing blank node identifier + # matches the reference blank node identifier then use the + # blank node identifier _:a, otherwise, use the blank node + # identifier _:z (reference BNode -> _:a, all others -> _:z for hashing). + s_n = self.modify_first_degree_component(id_, s) + o_n = self.modify_first_degree_component(id_, o) + g_n = self.modify_first_degree_component(id_, g) + + # Use the _nq_row helper (adapted from rdflib) for standardized string output + line = self._nq_row((s_n, p_n, o_n), g_n) + nquads.append(line) # 4) Sort nquads in lexicographical order. 
nquads.sort() @@ -222,12 +276,10 @@ def hash_first_degree_quads(self, id_): return info['hash'] # helper for modifying component during Hash First Degree Quads - def modify_first_degree_component(self, id_, component, key): - if component['type'] != 'blank node': + def modify_first_degree_component(self, id_: str, component: Node, key: str = None): + if not isinstance(component, BNode): return component - component = copy.deepcopy(component) - component['value'] = '_:a' if component['value'] == id_ else '_:z' - return component + return BNode("a") if str(component) == id_ else BNode("z") # 4.7) Hash Related Blank Node def hash_related_blank_node(self, related, quad, issuer, position): @@ -261,7 +313,8 @@ def hash_related_blank_node(self, related, quad, issuer, position): # helper for getting a related predicate def get_related_predicate(self, quad): - return '<' + quad['predicate']['value'] + '>' + # quad is (s, p, o, g) + return f"<{str(quad[1])}>" # 4.8) Hash N-Degree Quads def hash_n_degree_quads(self, id_, issuer): @@ -400,20 +453,18 @@ def create_hash_to_related(self, id_, issuer): # 3.1) For each component in quad, if component is the subject, # object, and graph name and it is a blank node that is not # identified by identifier: - for key, component in quad.items(): - if ( - key != 'predicate' - and component['type'] == 'blank node' - and component['value'] != id_ - ): + for i, component in enumerate(quad): + if i != 1 and isinstance(component, BNode) and str(component) != id_: # 3.1.1) Set hash to the result of the Hash Related Blank # Node algorithm, passing the blank node identifier for # component as related, quad, path identifier issuer as # issuer, and position as either s, o, or g based on # whether component is a subject, object, graph name, # respectively. 
- related = component['value'] - position = self.POSITIONS[key] + + related = str(component) + # correct position codes: subject='s', object='o', graph='g' + position = self.POSITIONS[i] hash = self.hash_related_blank_node(related, quad, issuer, position) # 3.1.2) Add a mapping of hash to the blank node identifier @@ -425,7 +476,7 @@ def create_hash_to_related(self, id_, issuer): # helper to create appropriate hash object def create_hash(self): - return hashlib.sha256() + return self.hash_algorithm() # helper to hash a list of nquads def hash_nquads(self, nquads): @@ -434,6 +485,32 @@ def hash_nquads(self, nquads): md.update(nquad.encode('utf8')) return md.hexdigest() + # TODO: use drop-in replacements to not serialize with xsd:string; better to solve this at the rdflib level + def _nq_row(self, triple, context): + graph_name = ( + context.n3() + " " + if context and context != DATASET_DEFAULT_GRAPH_ID + else "" + ) + if isinstance(triple[2], Literal): + return f"{triple[0].n3()} {triple[1].n3()} {self._quoteLiteral(triple[2])} {graph_name}.\n" + else: + return f"{triple[0].n3()} {triple[1].n3()} {triple[2].n3()} {graph_name}.\n" + + def _quoteLiteral(self, l_: Literal) -> str: # noqa: N802 + """A simpler version of term.Literal.n3()""" + + encoded = _quote_encode(l_) + + if l_.language: + if l_.datatype: + raise Exception("Literal has datatype AND language!") + return f"{encoded}@{l_.language}" + elif l_.datatype and l_.datatype != XSD.string: + return f"{encoded}^^<{l_.datatype}>" + else: + return f"{encoded}" + class URGNA2012(URDNA2015): """ @@ -442,21 +519,19 @@ class URGNA2012(URDNA2015): def __init__(self): URDNA2015.__init__(self) + self.hash_algorithm = hashlib.sha1 # helper for modifying component during Hash First Degree Quads - def modify_first_degree_component(self, id_, component, key): - if component['type'] != 'blank node': + def modify_first_degree_component(self, id_: str, component: Node, key: str = None): + if not isinstance(component, BNode): 
return component - component = copy.deepcopy(component) if key == 'name': - component['value'] = '_:g' - else: - component['value'] = '_:a' if component['value'] == id_ else '_:z' - return component + return BNode("g") + return BNode("a") if str(component) == id_ else BNode("z") # helper for getting a related predicate def get_related_predicate(self, quad): - return quad['predicate']['value'] + return str(quad[1]) # helper for creating hash to related blank nodes map def create_hash_to_related(self, id_, issuer): @@ -470,16 +545,17 @@ def create_hash_to_related(self, id_, issuer): # 3) For each quad in quads: for quad in quads: + s, p , o, g = quad # 3.1) If the quad's subject is a blank node that does not match # identifier, set hash to the result of the Hash Related Blank Node # algorithm, passing the blank node identifier for subject as # related, quad, path identifier issuer as issuer, and p as # position. if ( - quad['subject']['type'] == 'blank node' - and quad['subject']['value'] != id_ + isinstance(s, BNode) + and str(s) != id_ ): - related = quad['subject']['value'] + related = str(s) position = 'p' # 3.2) Otherwise, if quad's object is a blank node that does # not match identifier, to the result of the Hash Related Blank @@ -487,10 +563,10 @@ def create_hash_to_related(self, id_, issuer): # as related, quad, path identifier issuer as issuer, and r # as position. elif ( - quad['object']['type'] == 'blank node' - and quad['object']['value'] != id_ + isinstance(o, BNode) + and str(o) != id_ ): - related = quad['object']['value'] + related = str(o) position = 'r' # 3.3) Otherwise, continue to the next quad. else: @@ -504,57 +580,73 @@ def create_hash_to_related(self, id_, issuer): return hash_to_related - # helper to create appropriate hash object - def create_hash(self): - return hashlib.sha1() +class RDFC10(URDNA2015): + """ + RDFC10 implements the RDF Canonicalization algorithm version 1.0. 
+ """ + + def __init__(self, hash_algorithm = None): + URDNA2015.__init__(self) + # determine hash algorithm to use + if hash_algorithm is not None: + if hash_algorithm.lower() not in hashlib.algorithms_available: + raise UnknownFormatError('Unknown hash algorithm.', hash_algorithm) + self.hash_algorithm = getattr(hashlib, hash_algorithm.lower()) + + def _quoteLiteral(self, l_: Literal) -> str: # noqa: N802 + """A simpler version of term.Literal.n3()""" + + encoded = self._quote_encode(l_) + + if l_.language: + if l_.datatype: + raise Exception("Literal has datatype AND language!") + return f"{encoded}@{l_.language}" + elif l_.datatype and l_.datatype != XSD.string: + return f"{encoded}^^<{l_.datatype}>" + else: + return f"{encoded}" + + def _quote_encode(self, l_: str) -> str: + # Accept either an rdflib Literal or a plain string + s = str(l_) + + parts = [] + for ch in s: + code = ord(ch) + if ch == "\\": + parts.append('\\\\') + elif ch == '"': + parts.append('\\"') + elif ch == "\n": + parts.append('\\n') + elif ch == "\r": + parts.append('\\r') + elif ch == "\t": + parts.append('\\t') + elif code == 0x08: # backspace + parts.append('\\b') + elif code == 0x0C: # form feed + parts.append('\\f') + elif code == 0x0B or (code < 0x20) or (code == 0x7F): # vertical tab -> use \u000B + parts.append(f'\\u{code:04X}') + else: + parts.append(ch) + + return '"' + ''.join(parts) + '"' def permutations(elements): """ Generates all of the possible permutations for the given list of elements. + Uses itertools.permutations on a sorted copy. :param elements: the list of elements to permutate. """ - # begin with sorted elements - elements.sort() - # initialize directional info for permutation algorithm - left = {} - for v in elements: - left[v] = True - - length = len(elements) - last = length - 1 - while True: - yield elements - - # Calculate the next permutation using the Steinhaus-Johnson-Trotter - # permutation algorithm. 
- - # get largest mobile element k - # (mobile: element is greater than the one it is looking at) - k, pos = None, 0 - for i in range(length): - e = elements[i] - is_left = left[e] - if (k is None or e > k) and ( - (is_left and i > 0 and e > elements[i - 1]) - or (not is_left and i < last and e > elements[i + 1]) - ): - k, pos = e, i - - # no more permutations - if k is None: - return - - # swap k and the element it is looking at - swap = pos - 1 if left[k] else pos + 1 - elements[pos], elements[swap] = elements[swap], k - - # reverse the direction of all elements larger than k - for i in range(length): - if elements[i] > k: - left[elements[i]] = not left[elements[i]] - + from itertools import permutations as _it_permutations + els = sorted(elements) + for perm in _it_permutations(els): + yield list(perm) class UnknownFormatError(ValueError): """ diff --git a/lib/pyld/jsonld.py b/lib/pyld/jsonld.py index 672dce6..14c2a5d 100644 --- a/lib/pyld/jsonld.py +++ b/lib/pyld/jsonld.py @@ -31,7 +31,7 @@ from c14n.Canonicalize import canonicalize from pyld.__about__ import __copyright__, __license__, __version__ -from pyld.canon import URDNA2015, URGNA2012, UnknownFormatError +from pyld.canon import RDFC10, URDNA2015, URGNA2012, UnknownFormatError from pyld.identifier_issuer import IdentifierIssuer from pyld.nquads import ParserError, parse_nquads, serialize_nquad, serialize_nquads @@ -286,8 +286,8 @@ def normalize(input_, options=None): :param input_: the JSON-LD input to normalize. :param [options]: the options to use. - [algorithm] the algorithm to use: `URDNA2015` or `URGNA2012` - (default: `URGNA2012`). + [algorithm] the algorithm to use: `RDFC10`, `URDNA2015` or `URGNA2012` + (default: `RDFC10`). [base] the base IRI to use. [inputFormat] the format if input is not JSON-LD: 'application/n-quads' for N-Quads. @@ -925,8 +925,10 @@ def normalize(self, input_, options): :param input_: the JSON-LD input to normalize. :param options: the options to use. 
- [algorithm] the algorithm to use: `URDNA2015` or `URGNA2012` - (default: `URGNA2012`). + [algorithm] the algorithm to use: `RDFC10`, `URDNA2015` or `URGNA2012` + (default: `RDFC10`). + [hashAlgorithm] the hashing algorithm to use; only applicable to `RDFC10`. + (default: `SHA256`). [base] the base IRI to use. [contextResolver] internal use only. [inputFormat] the format if input is not JSON-LD: @@ -935,12 +937,15 @@ def normalize(self, input_, options): 'application/n-quads' for N-Quads. [documentLoader(url, options)] the document loader (default: _default_document_loader). + [outputMap] if True, the function will return a map of blank node + identifiers to their normalized identifiers instead of the + normalized dataset (default: False). - :return: the normalized output. + :return: the normalized output or the map of blank node identifiers. """ # set default options options = options.copy() if options else {} - options.setdefault('algorithm', 'URGNA2012') + options.setdefault('algorithm', 'RDFC10') options.setdefault('base', input_ if _is_string(input_) else '') options.setdefault('documentLoader', _default_document_loader) options.setdefault( @@ -950,21 +955,14 @@ def normalize(self, input_, options): options.setdefault('extractAllScripts', True) options.setdefault('processingMode', 'json-ld-1.1') - if options['algorithm'] not in ['URDNA2015', 'URGNA2012']: + if options['algorithm'] not in ['RDFC10', 'URDNA2015', 'URGNA2012']: raise JsonLdError( 'Unsupported normalization algorithm.', 'jsonld.NormalizeError' ) try: if 'inputFormat' in options: - if ( - options['inputFormat'] != 'application/n-quads' - and options['inputFormat'] != 'application/nquads' - ): - raise JsonLdError( - 'Unknown normalization input format.', 'jsonld.NormalizeError' - ) - dataset = JsonLdProcessor.parse_nquads(input_) + dataset = input_ else: # convert to RDF dataset then do normalization opts = dict(options) @@ -979,16 +977,22 @@ def normalize(self, input_, options): ) from cause # do 
normalization - if options['algorithm'] == 'URDNA2015': - try: - return URDNA2015().main(dataset, options) - except UnknownFormatError as cause: - raise JsonLdError( - str(cause), 'jsonld.UnknownFormat', {'format': cause.format} - ) from cause - + if options['algorithm'] == 'RDFC10': + algorithm = RDFC10(hash_algorithm = options.get('hashAlgorithm')) + elif options['algorithm'] == 'URDNA2015': + algorithm = URDNA2015() # assume URGNA2012 - return URGNA2012().main(dataset, options) + else: + algorithm = URGNA2012() + + try: + return algorithm.main(dataset, options) + except UnknownFormatError as cause: + raise JsonLdError( + str(cause), + 'jsonld.UnknownFormat', + {'format': cause.format}) from cause + def from_rdf(self, dataset, options): """ diff --git a/lib/pyld/util.py b/lib/pyld/util.py new file mode 100644 index 0000000..b983005 --- /dev/null +++ b/lib/pyld/util.py @@ -0,0 +1,120 @@ +from rdflib import BNode, Dataset, Literal, URIRef +from rdflib.graph import DATASET_DEFAULT_GRAPH_ID + + +# Helpers for converting between rdflib.Dataset and legacy dict structure used in pyld +def to_legacy_dataset(dataset: Dataset) -> dict: + """ + Transforms an rdflib.Dataset into the RDF.js-style dictionary structure, + ensuring Blank Node values start with '_:'. + """ + compat_dataset = {'@default':[]} + + for s, p, o, g in dataset.quads((None, None, None, None)): + # 1. Determine Graph Key + graph_id = '@default' + if g is not None and g != DATASET_DEFAULT_GRAPH_ID: + graph_id = f"_:{str(g)}" if isinstance(g, BNode) else str(g) + + if graph_id not in compat_dataset: + compat_dataset[graph_id] = [] + + # 2. 
Helper to convert nodes + def term_to_dict(node): + if isinstance(node, BNode): + # Ensure the value starts with _: + val = str(node) + if not val.startswith('_:'): + val = f"_:{val}" + return {'type': 'blank node', 'value': val} + + elif isinstance(node, URIRef): + return {'type': 'IRI', 'value': str(node)} + + elif isinstance(node, Literal): + res = {'type': 'literal', 'value': str(node)} + if node.language: + res['language'] = node.language + if node.datatype: + res['datatype'] = str(node.datatype) + return res + raise ValueError(f'Illegal node type {type(node)}') + + # 3. Build legacy quad + compat_dataset[graph_id].append( + { + 'subject': term_to_dict(s), + 'predicate': term_to_dict(p), + 'object': term_to_dict(o), + } + ) + + return compat_dataset + + +def from_legacy_dataset(dataset: dict) -> Dataset: + """ + Converts legacy dict structure back into an rdflib.Dataset. + """ + ds = Dataset() + + for graph_name, triples in dataset.items(): + # Handle graph name + try: + g = from_legacy_graph(graph_name, ds.default_graph) + except Exception as err: + raise ValueError(f'Illegal graph name: {graph_name}') from err + + for t in triples: + s, p, o = from_legacy_triple(t) + ds.add((s, p, o, g)) + + return ds + +def from_legacy_graph(graph: str, default_graph = DATASET_DEFAULT_GRAPH_ID) -> URIRef | BNode: + """ + Converts a legacy graph name into an rdflib URIRef or BNode. + """ + if graph == '@default': + return default_graph + # Check if graph name is a blank node or IRI + elif graph.startswith('_:'): + return BNode(graph[2:]) + else: + return URIRef(graph) + +def from_legacy_triple(triple: dict, normalize=False) -> tuple: + """ + Converts a legacy triple dict into an rdflib triple tuple. 
+ """ + if not all(k in triple for k in ('subject', 'predicate', 'object')): + raise ValueError(f'Illegal quad structure: {triple}') + + def to_node(comp): + if not isinstance(comp, dict) or 'type' not in comp or 'value' not in comp: + raise ValueError(f'Illegal quad structure: {comp}') + + val = comp['value'] + if comp['type'] == 'blank node': + # Strip '_:' because RDFLib adds it back internally + return BNode(val[2:] if val.startswith('_:') else val) + elif comp['type'] == 'IRI': + return URIRef(val) + elif comp['type'] == 'literal': + return Literal( + val, + lang=comp.get('language'), + datatype=URIRef(comp['datatype']) + if comp.get('datatype') and not comp.get('language') + else None, + # Don't normalize literal values to prevent datetime issues + # TODO: this means only rdflib.Dataset() created with normalization turned off will work properly. + normalize=normalize, + ) + raise ValueError('Illegal component type {}'.format(comp['type'])) + + s = to_node(triple['subject']) + p = to_node(triple['predicate']) + o = to_node(triple['object']) + + return (s, p, o) diff --git a/requirements.txt b/requirements.txt index ba95135..630bd17 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,4 @@ aiohttp; python_version >= '3.5' lxml cachetools frozendict +rdflib \ No newline at end of file diff --git a/setup.py b/setup.py index b9d27ae..03304e9 100644 --- a/setup.py +++ b/setup.py @@ -52,11 +52,13 @@ 'cachetools', 'frozendict', 'lxml', + 'rdflib', ], extras_require={ 'requests': ['requests'], 'aiohttp': ['aiohttp'], 'cachetools': ['cachetools'], 'frozendict': ['frozendict'], + 'rdflib': ['rdflib'], } ) diff --git a/tests/runtests.py b/tests/runtests.py index 68354c7..323b1c9 100644 --- a/tests/runtests.py +++ b/tests/runtests.py @@ -91,12 +91,14 @@ 'https://w3c.github.io/json-ld-api/tests', 'https://w3c.github.io/json-ld-framing/tests', 'https://github.com/json-ld/normalization/tests', + 'https://w3c.github.io/rdf-canon/tests/vocab#' ] SPEC_DIRS = [ 
'../specifications/json-ld-api/tests/', '../specifications/json-ld-framing/tests/', '../specifications/normalization/tests/', + '../specifications/rdf-canon/tests/' ] # NOTE: The following TestRunner class can be removed because pytest now @@ -477,6 +479,10 @@ def runTest(self): print('ACTUAL: ', result) raise AssertionError('results differ') elif not self.is_negative: + # If the result is a dict and the expected value is a string, + # the expected value is probably JSON. + if isinstance(result, dict) and isinstance(expect, str): + expect = json.loads(expect) # Perform order-independent equivalence test if equal_unordered(result, expect): self.assertTrue(True) @@ -662,6 +668,7 @@ def create(test): if k not in http_options: options[k] = v options['documentLoader'] = create_document_loader(test) + options['hashAlgorithm'] = test.data.get('hashAlgorithm') options.update(opts or {}) if 'expandContext' in options: filename = os.path.join(test.dirname, options['expandContext']) @@ -1058,11 +1065,7 @@ def write(self, filename): }, 'rdfn:Urgna2012EvalTest': { 'pending': {'idRegex': []}, - 'skip': { - 'idRegex': [ - '.*manifest-urgna2012#test060$', - ] - }, + 'skip': {'idRegex': []}, 'fn': 'normalize', 'params': [ read_test_property('action'), @@ -1077,11 +1080,7 @@ def write(self, filename): }, 'rdfn:Urdna2015EvalTest': { 'pending': {'idRegex': []}, - 'skip': { - 'idRegex': [ - '.*manifest-urdna2015#test060$', - ] - }, + 'skip': {'idRegex': []}, 'fn': 'normalize', 'params': [ read_test_property('action'), @@ -1094,6 +1093,37 @@ def write(self, filename): ), ], }, + 'rdfc:RDFC10EvalTest': { + 'pending': {'idRegex': []}, + 'skip': {'idRegex': []}, + 'fn': 'normalize', + 'params': [ + read_test_property('action'), + create_test_options({ + 'algorithm': 'RDFC10', + 'inputFormat': 'application/n-quads', + 'format': 'application/n-quads' + }) + ] + }, + 'rdfc:RDFC10MapTest': { + 'pending': { + 'idRegex': [] + }, + 'skip': { + 'idRegex': [] + }, + 'fn': 'normalize', + 'params': [ 
+ read_test_property('action'), + create_test_options({ + 'algorithm': 'RDFC10', + 'inputFormat': 'application/n-quads', + 'format': 'application/n-quads', + 'outputMap': True + }) + ] + } } diff --git a/tests/test_util.py b/tests/test_util.py new file mode 100644 index 0000000..0bd6c07 --- /dev/null +++ b/tests/test_util.py @@ -0,0 +1,320 @@ +import pytest +from rdflib import BNode, Dataset, Literal, URIRef +from rdflib.graph import DATASET_DEFAULT_GRAPH_ID + +from pyld.util import from_legacy_dataset, to_legacy_dataset + + +class TestToLegacyDataset: + """A comprehensive class-based test suite for `to_legacy_dataset` split into granular tests.""" + + def setup_method(self): + """Initialize a clean Dataset before each test.""" + self.dataset = Dataset() + + def test_to_legacy_dataset_empty_dataset_returns_minimal_structure(self): + """Ensure an empty dataset produces a minimal dict with exactly one graph ('@default') and no quads.""" + result = to_legacy_dataset(Dataset()) + assert len(result) == 1 + assert '@default' in result + assert len(result['@default']) == 0 + + def test_to_legacy_dataset_handles_blank_subject(self): + """Ensure blank node subjects are correctly converted to dict with type 'blank node' and value prefixed by '_:'.""" + self.dataset.add((BNode('b1'), URIRef('p1'), Literal('o1', lang='en'))) + result = to_legacy_dataset(self.dataset) + + # Get the quad from @default graph + quads_in_graph = result['@default'] + assert len(quads_in_graph) == 1 + + subject_entry = quads_in_graph[0]['subject'] + assert subject_entry['type'] == 'blank node' + assert subject_entry['value'].startswith('_:') + assert ( + subject_entry['value'][2:] == 'b1' + ) # Check that the original BNode identifier is preserved after prefix + + def test_to_legacy_dataset_handles_blank_object(self): + """Ensure blank node objects are correctly converted to dict with type 'blank node' and prefixed value.""" + self.dataset.add((URIRef('s2'), URIRef('p2'), BNode('b3'))) + result = 
to_legacy_dataset(self.dataset) + + # Get the quad from @default graph + quads_in_graph = result['@default'] + assert len(quads_in_graph) == 1 + + object_entry = quads_in_graph[0]['object'] + assert object_entry['type'] == 'blank node' + assert object_entry['value'].startswith('_:') + assert ( + object_entry['value'][2:] == 'b3' + ) # Check that the original BNode identifier is preserved after prefix + + def test_to_legacy_dataset_preserves_literal_language(self): + """Ensure literal objects with language are preserved in the output as a 'language' field.""" + + self.dataset.add((URIRef('s1'), URIRef('p2'), Literal('o3', lang='fr'))) + result = to_legacy_dataset(self.dataset) + + # Get the quad from @default graph + quads_in_graph = result['@default'] + assert len(quads_in_graph) == 1 + + object_entry = quads_in_graph[0]['object'] + assert object_entry.get('language') is not None + assert object_entry['language'] == 'fr' + + def test_to_legacy_dataset_preserves_literal_datatype(self): + """Ensure literal objects with a datatype are preserved in the output as 'datatype' field.""" + self.dataset.add( + ( + URIRef('s1'), + URIRef('p2'), + Literal("x", datatype="http://example.org/float"), + ), + ) + result = to_legacy_dataset(self.dataset) + + # Get the quad from @default graph + quads_in_graph = result['@default'] + assert len(quads_in_graph) == 1 + + object_entry = quads_in_graph[0]['object'] + assert object_entry.get('datatype') is not None + assert object_entry['datatype'] == "http://example.org/float" + + def test_to_legacy_dataset_correctly_maps_graph(self): + """Ensure graph key is correctly derived.""" + self.dataset.add( + ( + URIRef('s1'), + URIRef('p2'), + Literal('o3'), + URIRef('http://example.org/graph'), + ) + ) + result = to_legacy_dataset(self.dataset) + + # There should be two graphs: '@default' (empty) and 'http://example.org/graph' (with one quad) + assert len(result.keys()) == 2 + + # Check that '@default' graph exists and is empty + assert 
'@default' in result + assert len(result['@default']) == 0 + + # Check that 'http://example.org/graph' exists and contains the quad + assert 'http://example.org/graph' in result + assert len(result['http://example.org/graph']) == 1 + + def test_to_legacy_dataset_correctly_maps_bnode_graph(self): + """Ensure graph key is correctly prefixed with '_:' for BNodes.""" + self.dataset.add((URIRef('s1'), URIRef('p2'), Literal('o3'), BNode('g4'))) + result = to_legacy_dataset(self.dataset) + + # There should be two graphs: '@default' (empty) and '_:g4' (with one quad) + assert len(result.keys()) == 2 + + # Check that '@default' graph exists and is empty + assert '@default' in result + assert len(result['@default']) == 0 + + # Check that '_:g4' graph exists and contains the quad + assert '_:g4' in result + assert len(result['_:g4']) == 1 + + def test_to_legacy_dataset_correctly_maps_default_graph(self): + """Ensure graph key is correctly derived as '@default' for None or default graph.""" + self.dataset.add( + (URIRef('s1'), URIRef('p2'), Literal('o3'), DATASET_DEFAULT_GRAPH_ID) + ) + self.dataset.add((URIRef('s2'), URIRef('p2'), Literal('o3'), None)) + print(list(self.dataset.quads((None, None, None, None)))) + result = to_legacy_dataset(self.dataset) + + assert len(result.keys()) == 1 + assert '@default' in result + assert ( + len(result['@default']) == 1 + ) # Only first quad should be in the @default graph + + +class TestFromLegacyDataset: + """A comprehensive class-based test suite for `from_legacy_dataset`, split into granular tests.""" + + def test_from_legacy_dataset_restores_default_graph(self): + """Ensure from_legacy_dataset correctly reconstructs the default graph.""" + legacy_data = { + '@default': [ + { + 'subject': {'type': 'blank node', 'value': '_:s1'}, + 'predicate': {'type': 'IRI', 'value': 'p1'}, + 'object': {'type': 'literal', 'value': 'o1'}, + } + ] + } + + restored_dataset = from_legacy_dataset(legacy_data) + + for quad in restored_dataset.quads((None, 
class TestFromLegacyDataset:
    """Granular tests for `from_legacy_dataset`.

    Each "restores" test converts a one-quad legacy dict and inspects the
    quads of the returned dataset. The quad list is asserted to be non-empty
    first, so a conversion that yields nothing cannot pass vacuously.
    """

    def test_from_legacy_dataset_restores_default_graph(self):
        """Check that a quad under '@default' lands in the default graph."""
        legacy_data = {
            '@default': [
                {
                    'subject': {'type': 'blank node', 'value': '_:s1'},
                    'predicate': {'type': 'IRI', 'value': 'p1'},
                    'object': {'type': 'literal', 'value': 'o1'},
                }
            ]
        }

        restored_dataset = from_legacy_dataset(legacy_data)

        quads = list(restored_dataset.quads((None, None, None, None)))
        assert quads, 'from_legacy_dataset produced no quads'
        for s, _p, _o, g in quads:
            assert g == DATASET_DEFAULT_GRAPH_ID
            assert isinstance(s, BNode)
            # The '_:' prefix of the legacy value must not leak into the BNode id.
            assert str(s) == 's1'

    @pytest.mark.parametrize(
        "legacy_data",
        [
            {
                'http://example.org': [
                    {
                        'subject': {'type': 'blank node', 'value': '_:s1'},
                        'predicate': {'type': 'IRI', 'value': 'p1'},
                        'object': {'type': 'literal', 'value': 'o1'},
                    }
                ]
            },
            {
                '_:g2': [
                    {
                        'subject': {'type': 'blank node', 'value': '_:s2'},
                        'predicate': {'type': 'blank node', 'value': '_:p4'},
                        'object': {'type': 'literal', 'value': 'o2'},
                    }
                ]
            },
        ],
    )
    def test_from_legacy_dataset_restores_blank_subject(self, legacy_data):
        """Check that a blank node subject is restored as a BNode without the '_:' prefix."""
        restored_dataset = from_legacy_dataset(legacy_data)

        quads = list(restored_dataset.quads((None, None, None, None)))
        assert quads, 'from_legacy_dataset produced no quads'
        for s, _p, _o, g in quads:
            assert isinstance(s, BNode)
            assert not str(s).startswith('_:')
            # Rebuild the legacy graph key from the restored graph term; a
            # wrongly mapped graph name surfaces here as a KeyError.
            graph_key = '_:' + str(g) if isinstance(g, BNode) else str(g)
            expected_id = legacy_data[graph_key][0]['subject']['value'][2:]
            assert str(s) == expected_id

    @pytest.mark.parametrize(
        "legacy_data",
        [
            {
                '@default': [
                    {
                        'subject': {'type': 'IRI', 'value': 's1'},
                        'predicate': {'type': 'blank node', 'value': '_:p2'},
                        'object': {'type': 'literal', 'value': 'o3'},
                    }
                ]
            }
        ],
    )
    def test_from_legacy_dataset_restores_blank_predicate(self, legacy_data):
        """Check that a blank node predicate is restored as a BNode without the '_:' prefix."""
        restored_dataset = from_legacy_dataset(legacy_data)
        expected_id = legacy_data['@default'][0]['predicate']['value'][2:]

        quads = list(restored_dataset.quads((None, None, None, None)))
        assert quads, 'from_legacy_dataset produced no quads'
        for _s, p, _o, _g in quads:
            assert isinstance(p, BNode)
            assert not str(p).startswith('_:')
            assert str(p) == expected_id

    @pytest.mark.parametrize(
        "legacy_data",
        [
            {
                '@default': [
                    {
                        'subject': {'type': 'IRI', 'value': 's1'},
                        'predicate': {'type': 'IRI', 'value': 'p2'},
                        'object': {'type': 'literal', 'value': 'o3', 'language': 'en'},
                    }
                ]
            }
        ],
    )
    def test_from_legacy_dataset_restores_literal_with_language(self, legacy_data):
        """Check that a literal's language tag survives the conversion."""
        restored_dataset = from_legacy_dataset(legacy_data)
        expected_language = legacy_data['@default'][0]['object']['language']

        quads = list(restored_dataset.quads((None, None, None, None)))
        assert quads, 'from_legacy_dataset produced no quads'
        for _s, _p, o, _g in quads:
            assert o.language == expected_language

    @pytest.mark.parametrize(
        "legacy_data",
        [
            {
                '@default': [
                    {
                        'subject': {'type': 'IRI', 'value': 's1'},
                        'predicate': {'type': 'IRI', 'value': 'p2'},
                        'object': {
                            'type': 'literal',
                            'value': 'o3',
                            'datatype': 'http://example.org/float',
                        },
                    }
                ]
            }
        ],
    )
    def test_from_legacy_dataset_restores_literal_with_datatype(self, legacy_data):
        """Check that a literal's datatype IRI survives the conversion."""
        restored_dataset = from_legacy_dataset(legacy_data)
        expected_datatype = legacy_data['@default'][0]['object']['datatype']

        quads = list(restored_dataset.quads((None, None, None, None)))
        assert quads, 'from_legacy_dataset produced no quads'
        for _s, _p, o, _g in quads:
            # The only object in the input is a literal, so anything else is a failure.
            assert isinstance(o, Literal)
            assert str(o.datatype) == expected_datatype

    def test_from_legacy_dataset_invalid_graph_raises_value_error(self):
        """Check that a `None` graph name is rejected with a ValueError."""
        with pytest.raises(ValueError, match="Illegal graph name: None"):
            from_legacy_dataset(
                {
                    None: [
                        {
                            'subject': {'type': 'IRI', 'value': 's1'},
                            'predicate': {'type': 'IRI', 'value': 'p2'},
                            'object': {
                                'type': 'literal',
                                'value': 'o3',
                                'datatype': 'http://example.org/float',
                            },
                        }
                    ]
                }
            )

    def test_from_legacy_dataset_incomplete_quad_raises_value_error(self):
        """Check that a quad missing its predicate and object is rejected with a ValueError."""
        with pytest.raises(ValueError, match="Illegal quad structure"):
            from_legacy_dataset({"@default": [{"subject": {'type': 'IRI', 'value': 's1'}}]})

    def test_from_legacy_dataset_invalid_quad_term_raises_value_error(self):
        """Check that a quad whose subject is a bare string, not a term dict, is rejected with a ValueError."""
        with pytest.raises(ValueError, match="Illegal quad structure"):
            from_legacy_dataset(
                {
                    "@default": [
                        {
                            'subject': "bad",
                            'predicate': {'type': 'IRI', 'value': 'p2'},
                            'object': {
                                'type': 'literal',
                                'value': 'o3',
                                'datatype': 'http://example.org/float',
                            },
                        }
                    ]
                }
            )