diff --git a/docs/developers.md b/docs/developers.md index 39367104..23e879c8 100644 --- a/docs/developers.md +++ b/docs/developers.md @@ -4,12 +4,19 @@ See [interface.py], which defines the interface of a backend and may serve as a template for creating new backends. -### Developing the sparqlwrapper +## Setting up local GraphDB and Fuseki services for testing Tripper comes with an inbuilt backend to the SPARQLWrapper. In order to test this properly a real triplestore is needed. This is not done in the -automatic workflows on github. However, a local graphDB can be setup as described below and tested with test_sparqlwrapper_graphdb.py. +automatic workflows on github. However, local graphDB and Fuseki services +can be setup as described below and tested with +`tests/backends/test_sparqlwrapper_graphdb_fuseki.py`. +The backend configurations corresponding to the local GraphDB and Fuseki services +can be found in `[tests/input/session.yaml]`. + + +### Setting up GraphDB service To create the local instance of graphdb: ```bash docker pull ontotext/graphdb:10.8.3 # latest tag 17.02.2025 @@ -30,6 +37,7 @@ You can now run the test test_sparqlwrapper_graphdb_fuseki.py with graphdb. Note that if the graphdb instance is not found the test will just be skipped. +### Setting up Fuseki service Similarly a jena-fuseki instance can be tested locally as follows: ```bash @@ -37,7 +45,7 @@ docker pull stain/jena-fuseki docker run -d --name fuseki -p 3030:3030 -e ADMIN_PASSWORD=admin0 -e=FUSEKI_DATASET_1=test_repo stain/jena-fuseki ``` -You can now run the test test_sparqlwrapper_graphdb_fuseki.py with fuseki. +You can now run the test `test_sparqlwrapper_graphdb_fuseki.py` with fuseki. Note that if the fuseki instance is not found the test will just be skipped. @@ -75,3 +83,4 @@ Then open http://127.0.0.1:8000/tripper/ in your browser. 
[interface.py]: https://github.com/EMMC-ASBL/tripper/blob/master/tripper/interface.py [mkdocs]: https://www.mkdocs.org/ +[tests/input/session.yaml]: https://github.com/EMMC-ASBL/tripper/blob/master/tests/input/session.yaml diff --git a/docs/session.md b/docs/session.md index d1dc168b..00332fe3 100644 --- a/docs/session.md +++ b/docs/session.md @@ -11,18 +11,28 @@ The default location of this configuration file depends on the system: - Windows: `$HOME/AppData/Local/tripper/Config/session.yaml` - Darwin: `$HOME/Library/Config/tripper/session.yaml` -Add some default +The schema of the YAML file is simple. +A session should have a name that identifies it and should be followed by keyword arguments accepted by the `Triplestore` constructor. + +Here is an example of a possible session file: ``` +--- + +RdflibTest: + backend: rdflib + GraphDBTest: backend: sparqlwrapper base_iri: http://localhost:7200/repositories/test_repo update_iri: http://localhost:7200/repositories/test_repo/statements + check_url: http://localhost:7200/repositories FusekiTest: backend: sparqlwrapper base_iri: http://localhost:3030/test_repo update_iri: http://localhost:3030/test_repo/update + check_url: http://localhost:3030 username: admin password: admin0 @@ -30,15 +40,18 @@ MyKB: backend: sparqlwrapper base_iri: https://graphdb.myproject.eu/repositories/test_repo update_iri: https://graphdb.myproject.eu/repositories/test_repo/statements + check_url: https://graphdb.myproject.eu/repositories username: myname password: KEYRING ``` -The two first entries correspond to the GraphDB and Fuseki services -that can be started with docker as described in the [developers] -section. +The first entry is an in-memory rdflib backend. + +The second and third entries correspond to GraphDB and Fuseki services, +respectively. +These can be started with docker as described in the [developers] section. -The third entry is just a dummy example, showing how to use [keyring]. 
+The fourth entry is just a dummy example, showing how to use [keyring]. Each entry starts with the name identifying the configured triplestore. The keywords following it, correspond to the keyword arguments passed to the diff --git a/tests/backends/test_sparqlwrapper_graphdb_fuseki.py b/tests/backends/test_sparqlwrapper_graphdb_fuseki.py index 9d2b3dcc..eac9f54e 100644 --- a/tests/backends/test_sparqlwrapper_graphdb_fuseki.py +++ b/tests/backends/test_sparqlwrapper_graphdb_fuseki.py @@ -5,57 +5,37 @@ https://emmc-asbl.github.io/tripper/latest/developers/. """ +from pathlib import Path + import pytest +from tripper import Session + pytest.importorskip("pyld") -# URL to check if GraphDB is running. -GRAPHDB_CHECK_URL = "http://localhost:7200/repositories" -FUSEKI_CHECK_URL = "http://localhost:3030" - - -def get_triplestore(tsname: str) -> "Triplestore": - """Help function that returns a new triplestore object.""" - from tripper import Triplestore - - if tsname == "GraphDB": - ts = Triplestore( - backend="sparqlwrapper", - base_iri="http://localhost:7200/repositories/test_repo", - update_iri=( - "http://localhost:7200/repositories/test_repo/statements" - ), - ) - elif tsname == "Fuseki": - ts = Triplestore( - backend="sparqlwrapper", - base_iri=f"{FUSEKI_CHECK_URL}/test_repo", - update_iri=f"{FUSEKI_CHECK_URL}/test_repo/update", - username="admin", - password="admin0", - ) - else: - raise ValueError(f"Unsupported triplestore name: {tsname}") - - return ts +thisdir = Path(__file__).resolve().parent +indir = thisdir.parent / "input" + +session = Session(config=indir / "session.yaml") # if True: -# tsname = "Fuseki" -def populate_and_search(tsname): # pylint: disable=too-many-statements +# sessionName = "GraphDBTest" +# sessionName = "FusekiTest" +def populate_and_search(sessionName): # pylint: disable=too-many-statements """Do the test on the desried backend.""" # pylint: disable=too-many-locals - from pathlib import Path - from tripper import Literal from 
tripper.datadoc import acquire, save_datadoc, search - thisdir = Path(__file__).resolve().parent + ts = session.get_triplestore(sessionName) + if ts.check_url and not ts.available(timeout=1): + pytest.skip(f"{sessionName} service not available; skipping test.") + datasetinput = thisdir / "datadocumentation_sample.yaml" datasetinput2 = thisdir / "datadocumentation_sample2.yaml" - ts = get_triplestore(tsname) EX = ts.bind("ex", "http://www.example.org/") # Test DELETE query - clear the triplestore @@ -194,28 +174,12 @@ def populate_and_search(tsname): # pylint: disable=too-many-statements def test_graphdb(): - """ - Test the sparqlwrapper backend using GraphDB. - """ - # Check if GraphDB is available and write a warning if it is not. - from tripper.utils import check_service_availability - - if not check_service_availability(GRAPHDB_CHECK_URL, timeout=1): - pytest.skip("GraphDB instance not available locally; skipping tests.") - - print("Testing graphdb") - populate_and_search("GraphDB") + """Test the sparqlwrapper backend using GraphDB.""" + # Use service configured in tests/input/session.yaml + populate_and_search("GraphDBTest") def test_fuseki(): - """ - Test the sparqlwrapper backend using Fuseki. - """ - # Check if Fuseki is available and write a warning if it is not. 
- from tripper.utils import check_service_availability - - if not check_service_availability(FUSEKI_CHECK_URL, timeout=1): - pytest.skip("Fuseki instance not available locally; skipping tests.") - - print("Testing fuseki") - populate_and_search("Fuseki") + """Test the sparqlwrapper backend using Fuseki.""" + # Use service configured in tests/input/session.yaml + populate_and_search("FusekiTest") diff --git a/tests/datadoc/dataset_paths.py b/tests/datadoc/dataset_paths.py index 32b5b092..f813098b 100644 --- a/tests/datadoc/dataset_paths.py +++ b/tests/datadoc/dataset_paths.py @@ -6,8 +6,12 @@ from pathlib import Path +from tripper import Session + testdir = Path(__file__).absolute().parent.parent.resolve() rootdir = testdir.parent.resolve() ontodir = testdir / "ontologies" indir = testdir / "input" outdir = testdir / "output" + +session = Session(config=indir / "session.yaml") diff --git a/tests/datadoc/test_dataset.py b/tests/datadoc/test_dataset.py index 85a10d7f..673ea72f 100644 --- a/tests/datadoc/test_dataset.py +++ b/tests/datadoc/test_dataset.py @@ -1,7 +1,6 @@ """Test the dataset module.""" # pylint: disable=invalid-name,too-many-locals,duplicate-code - import pytest pytest.importorskip("yaml") @@ -356,20 +355,25 @@ def test_update_classes(): } in r3["subClassOf"] -def test_datadoc(): +# sessionName = "FusekiTest" +sessionName = "RdflibTest" +if True: + # def datasettest(sessionName): """Test save_datadoc() and acquire()/store().""" # pylint: disable=too-many-statements - from dataset_paths import indir # pylint: disable=import-error + from dataset_paths import indir, session # pylint: disable=import-error - from tripper import CHAMEO, DCAT, DCTERMS, EMMO, OTEIO, Triplestore + from tripper import CHAMEO, DCAT, DCTERMS, EMMO, OTEIO from tripper.datadoc import acquire, save_datadoc, search, store from tripper.datadoc.errors import NoSuchTypeError pytest.importorskip("dlite") pytest.importorskip("rdflib") - ts = Triplestore("rdflib") + ts = 
session.get_triplestore(sessionName) + if ts.check_url and not ts.available(timeout=1): + pytest.skip(f"{sessionName} service not available; skipping test.") # Load data documentation into triplestore datadoc = save_datadoc(ts, indir / "semdata.yaml") @@ -381,6 +385,8 @@ def test_datadoc(): SEMDATA = ts.namespaces["semdata"] iri = SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"] d = acquire(ts, iri, use_sparql=False) + print("----") + print(d) assert d["@id"] == iri assert set(d["@type"]) == { DCAT.Dataset, @@ -492,6 +498,115 @@ def test_datadoc(): SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"], } + # Filter on criterion, but without required value + assert set( + search( + ts, + criteria={"creator.name": None}, + ) + ) == { + SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"], + SEMDATA["SEM_cement_batch2/77600-23-001"], + SEMDATA["SEM_cement_batch2"], + } + + # Filter on criterion, but with any predicate + assert set( + search( + ts, + criteria={None: ["Named Lab Assistant"]}, + ) + ) == { + SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"], + SEMDATA["SEM_cement_missingcreator"], + } + + # Filter on criterion, but with any predicate + assert set( + search( + ts, + criteria={None: "Named Lab Assistant"}, + ) + ) == { + SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"], + SEMDATA["SEM_cement_missingcreator"], + } + + # Filter on more criteria with any predicate, testlabel tests that + # indirect search through inSeries works. 
+ assert set( + search( + ts, + criteria={None: ["Named Lab Assistant", "testlabel"]}, + ) + ) == { + SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"], + SEMDATA["SEM_cement_missingcreator"], + SEMDATA["SEM_cement_batch2/77600-23-001"], + } + + # Filter on two different criteria in a dict) + assert set( + search( + ts, + criteria={"creator.name": "Sigurd Wenner", "label": "testlabel"}, + ) + ) == { + SEMDATA["SEM_cement_batch2"], + } + + # Filter on two different criteria in a list of tuples + assert set( + search( + ts, + criteria=[ + ("creator.name", "Sigurd Wenner"), + ("label", "testlabel"), + ], + ) + ) == { + SEMDATA["SEM_cement_batch2"], + } + + assert set( + search( + ts, + criteria=[ + (None, "Sigurd Wenner"), + (None, "testlabel"), + ], + ) + ) == { + SEMDATA["SEM_cement_batch2"], + SEMDATA["SEM_cement_batch2/77600-23-001"], + SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"], + } + + assert set( + search( + ts, + criteria=[ + (None, "http://onto-ns.com/meta/matchmaker/0.2/SEMImage"), + ], + ) + ) == { + SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"], + } + + assert set( + search( + ts, + criteria=[ + ( + "https://w3id.org/emmo/domain/oteio#hasDatamodel", + "http://onto-ns.com/meta/matchmaker/0.2/SEMImage", + ), + ], + ) + ) == { + SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"], + } + with pytest.raises(NoSuchTypeError): search(ts, type="invalid-type") @@ -499,10 +614,12 @@ def test_datadoc(): assert set(search(ts, regex={"dcterms:title": "SEM images"})) == { SEMDATA.SEM_cement_batch2, SAMPLE["SEM_cement_batch2/77600-23-001"], + SEMDATA.SEM_cement_missingcreator, } assert set(search(ts, regex={"dcterms:title": "SEM i[^ ]*s"})) == { SEMDATA.SEM_cement_batch2, SAMPLE["SEM_cement_batch2/77600-23-001"], + SEMDATA.SEM_cement_missingcreator, } # Get individual with given IRI @@ -581,7 +698,7 @@ def test_validate(): def test_pipeline(): """Test creating OTEAPI pipeline.""" - 
pytest.skip() + # pytest.skip() from tripper import Triplestore @@ -614,27 +731,6 @@ def test_pipeline(): pipeline.get() -def test_fuseki(): - """Test save and load dataset with Fuseki.""" - import os - - from tripper import Triplestore - - host = os.getenv("TRIPLESTORE_HOST", "localhost") - port = os.getenv("TRIPLESTORE_PORT", "3030") - fuseki_args = { - "backend": "fusekix", - "base_iri": "http://example.com/ontology#", - "triplestore_url": f"http://{host}:{port}", - "database": "openmodel", - } - try: - ts = Triplestore(**fuseki_args) - except ModuleNotFoundError: - pytest.skip("Cannot connect to Fuseki server") - ts.remove_database(**fuseki_args) - - def test_deprecated(): """Test deprecated save_dict(), load_dict() and search_iris().""" from tripper import Triplestore @@ -667,3 +763,49 @@ def test_deprecated(): with pytest.warns(DeprecationWarning): iris = search_iris(ts, criterias={"creator.name": "John Doe"}) assert iris == [EX.exdata] + + +def get_triplestore(tsname: str) -> "Triplestore": + """Help function that returns a new triplestore object.""" + from tripper import Triplestore + + if tsname == "GraphDB": + ts = Triplestore( + backend="sparqlwrapper", + base_iri="http://localhost:7200/repositories/test_repo", + update_iri=( + "http://localhost:7200/repositories/test_repo/statements" + ), + ) + elif tsname == "Fuseki": + ts = Triplestore( + backend="sparqlwrapper", + base_iri=f"{FUSEKI_CHECK_URL}/test_repo", + update_iri=f"{FUSEKI_CHECK_URL}/test_repo/update", + username="admin", + password="admin0", + ) + elif tsname == "rdflib": + ts = Triplestore("rdflib") + else: + raise ValueError(f"Unsupported triplestore name: {tsname}") + + return ts + + +# Use service configured in tests/input/session.yaml + + +def test_rdflib_datadoc(): + """Test the dataset module using rdflib.""" + datasettest("RdflibTest") + + +def test_graphdb_datadoc(): + """Test the dataset module using GraphDB.""" + datasettest("GraphDBTest") + + +def test_fuseki_datadoc(): + """Test 
the dataset module using Fuseki.""" + datasettest("FusekiTest") diff --git a/tests/input/semdata.yaml b/tests/input/semdata.yaml index ce66d8d1..176cf8f4 100644 --- a/tests/input/semdata.yaml +++ b/tests/input/semdata.yaml @@ -9,6 +9,7 @@ prefixes: dm: http://onto-ns.com/meta/characterisation/0.1/SEMImage# par: http://sintef.no/dlite/parser# gen: http://sintef.no/dlite/generator# + chameo: https://w3id.org/emmo/domain/characterisation-methodology/chameo# # List of documented datasets @@ -68,11 +69,25 @@ Dataset: contactPoint: hasName: Sigurd Wenner hasEmail: - + label: testlabel distribution: downloadURL: sftp://nas.aimen.es/P_MATCHMAKER_SHARE_SINTEF/SEM_cement_batch2 mediaType: inode/directory + - "@id": semdata:SEM_cement_missingcreator + "@type": sem:SEMImageSeries + title: Nested series of SEM images which is missing a creator + description: ... + curator: + - name: Named Lab Assistant + + distribution: + downloadURL: sftp://nas.aimen.es/P_MATCHMAKER_SHARE_SINTEF/SEM_cement_batch_missingcreator + mediaType: inode/directory + + + + Parser: - "@id": par:sem_hitachi diff --git a/tests/input/session.yaml b/tests/input/session.yaml index d2a86d6e..21b52d5d 100644 --- a/tests/input/session.yaml +++ b/tests/input/session.yaml @@ -1,3 +1,9 @@ +# Default sessions used for testing +# +# See https://emmc-asbl.github.io/tripper/latest/developers/ for how to set +# up local instances of GraphDB and Fuseki corresponding to the settings below. 
+ + RdflibTest: backend: rdflib @@ -5,6 +11,7 @@ FusekiTest: backend: sparqlwrapper base_iri: http://localhost:3030/test_repo update_iri: http://localhost:3030/test_repo/update + check_url: http://localhost:3030 username: admin password: admin0 @@ -12,3 +19,4 @@ GraphDBTest: backend: sparqlwrapper base_iri: http://localhost:7200/repositories/test_repo update_iri: http://localhost:7200/repositories/test_repo/statements + check_url: http://localhost:7200/repositories diff --git a/tests/test_triplestore.py b/tests/test_triplestore.py index a4ba5879..29108142 100644 --- a/tests/test_triplestore.py +++ b/tests/test_triplestore.py @@ -274,6 +274,18 @@ def test_restriction() -> None: # pylint: disable=too-many-statements ] +def test_availability(): + """Test availability().""" + # Already tested in backends/test_sparqlwrapper_graphdb_fuseki.py + # Just add test for missing `check_url` + pytest.importorskip("rdflib") + from tripper.triplestore import Triplestore + + ts = Triplestore("rdflib") + with pytest.raises(ValueError): + ts.available() + + def test_backend_rdflib(expected_function_triplestore: str) -> None: """Specifically test the rdflib backend Triplestore. diff --git a/tests/test_utils.py b/tests/test_utils.py index 3ff0772e..984b212c 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -525,6 +525,44 @@ def test_prefix_iri(): prefix_iri("xxx", prefixes, require_prefixed=True) +def test_substitute_query(): + """Test substitute_query().""" + from tripper import FOAF + from tripper.utils import substitute_query + + assert ( + substitute_query( + query="SELECT ?s WHERE { ?s $name $obj }", + iris={"name": "foaf:name"}, + literals={"obj": "John Dow"}, + prefixes={"foaf": str(FOAF)}, + ) + == f'SELECT ?s WHERE {{ ?s <{FOAF.name}> "John Dow" }}' + ) + + assert ( + substitute_query( + query="SELECT ?s WHERE { ?s $name $obj }", + iris={ + "name": ( + 'http://xmlns.com/foaf/0.1/name> "x" . ' + " <" + ) + }, + literals={"obj": 'John Dow" . 
"'}, + ) + ) == ( + "SELECT ?s WHERE { ?s " + r' "John Dow\" . \"" }' + ) + + assert substitute_query("$x $y", iris={"x": "X"}) == " $y" + assert substitute_query("$x", iris={"x": "X"}, iriquote="[]") == "[X]" + assert substitute_query("$x", iris={"x": "X"}, iriquote=" ") == " X " + assert substitute_query("$x", iris={"x": "X"}, iriquote=None) == "X" + + def test_get_entry_points(): """Test get_entry_points()""" from tripper.utils import get_entry_points diff --git a/tripper/datadoc/dataset.py b/tripper/datadoc/dataset.py index 8f6fb889..0de6e3bc 100644 --- a/tripper/datadoc/dataset.py +++ b/tripper/datadoc/dataset.py @@ -31,12 +31,14 @@ from __future__ import annotations -# pylint: disable=invalid-name,redefined-builtin,import-outside-toplevel -# pylint: disable=too-many-branches import json import logging import re import warnings + +# pylint: disable=invalid-name,redefined-builtin,import-outside-toplevel +# pylint: disable=too-many-branches +from itertools import groupby from pathlib import Path from typing import TYPE_CHECKING @@ -1184,7 +1186,7 @@ def make_query( ts: Triplestore, type=None, criterias: "Optional[dict]" = None, # deprecated - criteria: "Optional[dict]" = None, # new preferred name + criteria: "Optional[Union[dict, list[tuple]]]" = None, # new preferred name regex: "Optional[dict]" = None, flags: "Optional[str]" = None, keywords: "Optional[Keywords]" = None, @@ -1201,7 +1203,10 @@ def make_query( if criterias is not None: warnings.warn( - "`criterias` is deprecated, use `criteria` instead", + ( + "`criterias` argument to make_query() is deprecated, use " + "the `criteria` instead" + ), category=DeprecationWarning, stacklevel=2, ) @@ -1209,6 +1214,15 @@ def make_query( if criteria is None: criteria = criterias + if isinstance(criteria, list): + criteria = sorted(criteria, key=lambda x: (x[0] is None, x[0])) + + res = { + key: [value for key, value in group] + for key, group in groupby(criteria, key=lambda x: x[0]) + } + criteria = res + keywords = 
get_keywords(keywords=keywords) context = get_context(keywords=keywords) context._create_caches() # pylint: disable=protected-access @@ -1229,7 +1243,7 @@ def make_query( cid = criteria.pop("@id", criteria.pop("_id", None)) rid = regex.pop("@id", regex.pop("_id", None)) if cid: - filters.append(f'FILTER(STR(?iri) = "{ts.expand_iri(cid)}") .') + filters.append(f'FILTER(STR(?iri) = "{ts.expand_iri(cid)}") .') # type: ignore elif rid: filters.append( f'FILTER REGEX(STR(?iri), "{ts.expand_iri(rid)}"{flags_arg}) .' @@ -1252,7 +1266,73 @@ def make_query( def add_crit(k, v, regex=False, s="iri"): """Add criteria to SPARQL query.""" nonlocal n - key = f"@{k[1:]}" if k.startswith("_") else k + + key = None if k is None else (f"@{k[1:]}" if k.startswith("_") else k) + + if key is None: + # any predicate on first hop; keep ?s (= ?iri) as the resource + + n += 1 + pvar = f"p{n}" + bn = f"bn{n}" + n += 1 + qvar = f"q{n}" + var = f"v{n}" + + # ?s ?p ?bn . ?bn ?q ?var . + crit.append(f"?{s} ?{pvar} ?{bn} .") + crit.append(f"?{bn} ?{qvar} ?{var} .") + # Only return non-blank subjects + if s == "iri": + filters.append("FILTER(!isBlank(?iri)) .") + + # Support list of values → VALUES (equality) or a single alternation for regex + if isinstance(v, list): + if regex: + pattern = "(" + "|".join(str(p) for p in v) + ")" + filters.append( + f'FILTER REGEX(STR(?{var}), "{pattern}"{flags_arg}) .' 
+ ) + else: + vals = [] + for ele in v: + if ele in expanded: + vals.append(f"<{expanded[ele]}>") + elif isinstance(ele, str): + vals.append( + f"<{ele}>" + if re.match("^[a-z][a-z0-9.+-]*://", ele) + else f'"{ele}"' + ) + elif ele not in ("", None): + vals.append(ele) + if vals: + crit.append(f"VALUES ?{var} {{ {' '.join(vals)} }}") + else: + # single value + if v in expanded: + value = f"<{expanded[v]}>" + elif isinstance(v, str): + value = ( + f"<{v}>" + if re.match("^[a-z][a-z0-9.+-]*://", v) + else f'"{v}"' + ) + else: + value = v + if value: + if regex: + filters.append( + f"FILTER REGEX(STR(?{var}), {value}{flags_arg}) ." + ) + else: + # If it's an IRI token, compare directly; otherwise compare STR() + if isinstance(value, str) and value.startswith("<"): + filters.append(f"FILTER(?{var} = {value}) .") + else: + filters.append(f"FILTER(STR(?{var}) = {value}) .") + return + if isinstance(v, list): for ele in v: add_crit(key, ele, regex=regex, s=s) @@ -1281,12 +1361,15 @@ def add_crit(k, v, regex=False, s="iri"): n += 1 var = f"v{n}" crit.append(f"?{s} <{ts.expand_iri(key)}> ?{var} .") - if regex: - filters.append( - f"FILTER REGEX(STR(?{var}), {value}{flags_arg}) ." - ) - else: - filters.append(f"FILTER(STR(?{var}) = {value}) .") + + if value: + + if regex: + filters.append( + f"FILTER REGEX(STR(?{var}), {value}{flags_arg}) ." 
+ ) + else: + filters.append(f"FILTER(STR(?{var}) = {value}) .") for k, v in criteria.items(): add_crit(k, v) @@ -1297,6 +1380,8 @@ def add_crit(k, v, regex=False, s="iri"): for k, v in regex.items(): add_crit(k, v, regex=True) + # Make sure that iris are iris (not blank nodes) + filters.append("FILTER(!isBlank(?iri)) .") where_statements = "\n ".join(crit + filters) query = f""" PREFIX rdf: <{RDF}> @@ -1312,7 +1397,7 @@ def search( ts: Triplestore, type=None, criterias: "Optional[dict]" = None, # deprecated - criteria: "Optional[dict]" = None, # new preferred name + criteria: "Optional[Union[list[tuple], dict]]" = None, # new preferred name regex: "Optional[dict]" = None, flags: "Optional[str]" = None, keywords: "Optional[Keywords]" = None, @@ -1323,17 +1408,34 @@ def search( Arguments: ts: Triplestore to search. type: Either a [resource type] (ex: "Dataset", "Distribution", ...) - or the IRI of a class to limit the search to. + or the IRI of a class to limit the search to. Can also be given + as a list of resource types or IRIs. criteria: Exact match criteria. A dict of IRI, value pairs, where the - IRIs refer to data properties on the resource match. The IRIs - may use any prefix defined in `ts`. E.g. if the prefix `dcterms` + IRIs refer to data properties on the resource match. If more than + one value is desired for a given criterion, values can be provided + in a list. It can also be given as a list of (key, value) tuples. + A combination of tuples and dict is not supported. + + The IRIsmay use any prefix defined in `ts`. E.g. if the prefix `dcterms` is in `ts`, it is expanded and the match criteria `dcterms:title` is correctly parsed. - regex: Like `criteria` but the values in the provided dict are regular - expressions used for the matching. - flags: Flags passed to regular expressions. - - `s`: Dot-all mode. The . matches any character. The default - doesn't match newline or carriage return. 
+ + If the object (value) is given as None, all matches + that have any value for the given predicate are returned. + + If predicate (key) is given as None, search on all objects irrespective + of predicate is performed. + + Note that more than one value for a given key broadens the + search, i.e. it is an OR operation. + + The different key-value pairs in the dict are combined with AND. + + regex: Like `criteria` but the values in the provided dict are regular + expressions used for the matching. + flags: Flags passed to regular expressions. + - `s`: Dot-all mode. The . matches any character. The default + doesn't match newline or carriage return. - `m`: Multi-line mode. The ^ and $ characters matches beginning or end of line instead of beginning or end of string. - `i`: Case-insensitive mode. @@ -1354,10 +1456,36 @@ def search( search(ts, criteria={"contactPoint.hasName": "John Doe"}) + List IRIs of all resources with John Doe and Jane Doe as `contactPoint`: + + search(ts, criteria={"contactPoint.hasName": ["John Doe", "Jane Doe"]}) + + List IRIs of all resources that have a `contactPoint`: + + search(ts, criteria={"contactPoint.hasName": None}) + + List IRIs of all resources that have Jane Doe or Blue as object (value): + + search(ts, criteria={None: ["Jane Doe", "Blue"]}) + + Search with critera given as list of tuples: + search( + ts, + criteria=[ + ("contactPoint.hasName", "John Doe"), + ("fromSample", SAMPLE.batch2/sample3), + ], + ) + List IRIs of all samples: search(ts, type=CHAMEO.Sample) + List IRIs of all samples that are liquids: + search(ts, type=[CHAMEO.Sample, EMMO.Liquid] ) + + + List IRIs of all datasets with John Doe as `contactPoint` AND are measured on a given sample: diff --git a/tripper/triplestore.py b/tripper/triplestore.py index 9d13968f..c6b66f1e 100644 --- a/tripper/triplestore.py +++ b/tripper/triplestore.py @@ -47,6 +47,7 @@ ) from tripper.utils import ( bnode_iri, + check_service_availability, en, expand_iri, function_id, @@ -54,6 
+55,7 @@ infer_iri, prefix_iri, split_iri, + substitute_query, ) if TYPE_CHECKING: # pragma: no cover @@ -134,6 +136,7 @@ def __init__( base_iri: "Optional[str]" = None, database: "Optional[str]" = None, package: "Optional[str]" = None, + check_url: "Optional[str]" = None, **kwargs, ) -> None: """Initialise triplestore using the backend with the given name. @@ -159,6 +162,7 @@ def __init__( supports it). package: Required when `backend` is a relative module. In that case, it is relative to `package`. + check_url: A URL to use for checking that the backend is available. kwargs: Keyword arguments passed to the backend's __init__() method. @@ -170,6 +174,7 @@ def __init__( namespaces: Dict mapping namespace prefixes to IRIs. package: Name of Python package if the backend is implemented as a relative module. Assigned to the `package` argument. + check_url: The value of the `check_url` argument. Notes: If the backend establishes a connection that should be closed @@ -192,6 +197,7 @@ def __init__( self.backend_name = backend_name self.database = database self.package = package + self.check_url = check_url self.kwargs = kwargs.copy() self.backend = cls(base_iri=base_iri, database=database, **kwargs) @@ -412,11 +418,32 @@ def serialize( ts.bind(prefix, iri) return ts.serialize(destination=destination, format=format, **kwargs) - def query(self, query_object, **kwargs) -> "Any": + def query( + self, + query: str, + iris: "Optional[dict]" = None, + literals: "Optional[dict]" = None, + **kwargs, + ) -> "Any": """SPARQL query. + The `query` argument may contain variables for IRIs and literals, + to be substituted using the `iris` and `literals` arguments. These + variables are prefixed `$`. This makes them easy to distinguish from + query variables, that are typically prefixed with `?`. + + The query substitutions may be useful when the query is constructed + from user input, since they are properly escaped and will be inserted + in the query as a single token. 
This may prevent sparql injection + attacks. + Arguments: - query_object: String with the SPARQL query. + query: String with the SPARQL query. + iris: Dict used for query substitutions that maps IRI variables + to IRIs. The IRIs may be provided as fully expanded or + prefixed with a prefix registered in the triplestore namespace. + literals: Dict used for query substitutions that maps literal + variables to literals. kwargs: Keyword arguments passed to the backend query() method. Returns: @@ -432,24 +459,63 @@ def query(self, query_object, **kwargs) -> "Any": Not all backends may support all types of queries. + Examples: + Query for everyone with the name "John Dow": + + >>> from tripper import FOAF, Literal, Triplestore + >>> ts = Triplestore(backend="rdflib") + >>> ts.bind("foaf", FOAF) + Namespace('http://xmlns.com/foaf/0.1/') + + >>> ts.add_triples([ + ... (":john", FOAF.name, Literal("John Dow")), + ... (":jack", FOAF.name, Literal("Jack Hudson")), + ... ]) + >>> ts.query( + ... "SELECT ?s WHERE { ?s $name $obj .}", + ... iris={"name": "foaf:name"}, + ... literals={"obj": "John Dow"}, + ... ) + [(':john',)] + """ self._check_method("query") - return self.backend.query(query_object=query_object, **kwargs) + new_query = substitute_query( + query, iris=iris, literals=literals, prefixes=self.namespaces + ) + return self.backend.query(new_query, **kwargs) - def update(self, update_object, **kwargs) -> None: + def update( + self, + query: str, + iris: "Optional[dict]" = None, + literals: "Optional[dict]" = None, + **kwargs, + ) -> None: """Update triplestore with SPARQL. Arguments: - update_object: String with the SPARQL query. + query: String with the SPARQL query. + iris: Dict used for query substitutions that maps IRI variables + to IRIs. The IRIs may be provided as fully expanded or + prefixed with a prefix registered in the triplestore namespace. + literals: Dict used for query substitutions that maps literal + variables to literals. 
kwargs: Keyword arguments passed to the backend update() method. Note: + See `query()` for how to use the query substitution arguments `iris` + and `literals`. + This method is intended for INSERT and DELETE queries. Use the query() method for SELECT, ASK, CONSTRUCT and DESCRIBE queries. """ self._check_method("update") - return self.backend.update(update_object=update_object, **kwargs) + new_query = substitute_query( + query, iris=iris, literals=literals, prefixes=self.namespaces + ) + return self.backend.update(new_query, **kwargs) @overload def bind( @@ -1002,6 +1068,29 @@ def _get_restriction_dict(self, iri): "value": dct[p], } + def available(self, timeout: float = 5, interval: float = 1) -> bool: + """Checks if the backend is available. + + This is done by sending a request to the URL specified + in the `check_url` attribute and checking for the response. + + Arguments: + timeout: Total time in seconds to wait for a response. + interval: Interval for checking response. + + Returns: + Returns true if the service responds with code 200, + otherwise false is returned. 
+ + """ + if self.check_url is None: + raise ValueError( + "`check_url` must be assigned before calling available()" + ) + return check_service_availability( + self.check_url, timeout=timeout, interval=interval + ) + def map( self, source: str, diff --git a/tripper/utils.py b/tripper/utils.py index b1ac83f3..e93ae3a1 100644 --- a/tripper/utils.py +++ b/tripper/utils.py @@ -10,6 +10,7 @@ import sys import tempfile import urllib +import warnings from contextlib import contextmanager from pathlib import Path from typing import TYPE_CHECKING @@ -64,7 +65,9 @@ "extend_namespace", "expand_iri", "prefix_iri", + "substitute_query", "get_entry_points", + "check_service_availability", ) MATCH_PREFIXED_IRI = re.compile( @@ -732,6 +735,71 @@ def prefix_iri( return iri +def substitute_query( + query: str, + iris: "Optional[dict]" = None, + literals: "Optional[dict]" = None, + prefixes: "Optional[dict]" = None, + iriquote: str = "<>", +) -> "Any": + """Substitute IRI and literal variables in a SPARQL query. + + Arguments: + query: String with the SPARQL query. + iris: Dict used for query substitutions that maps IRI variables + to IRIs. The IRIs may be provided as fully expanded or + prefixed with the prefix defined in `prefixes`. + literals: Dict used for query substitutions that maps literal + variables to literals. For common datatypes, like strings + and numbers, the values can just be normal Python objects. + For special cases or more control, provide the values as + instances of `tripper.Literal`. + prefixes: Dict mapping prefixes to namespace URLs. + iriquote: Quote characters to use for IRIs. Should be a string of + length 2, with the start and end quote. + + Notes: + The `query` argument may contain variables for IRIs and literals, + to be substituted using the `iris` and `literals` arguments. These + variables are prefixed `$`. This makes them easy to distinguish from + query variables, that are typically prefixed with `?`. 
+ + The query substitutions may be useful when the query is constructed + from user input, since they are properly escaped and will be inserted + in the query as a single token. This may prevent sparql injection + attacks. + """ + safe = "-._~:/?#@+&;=" # special IRI characters that are not escaped + mapping = {} + + if iriquote: + if len(iriquote) == 1: + iriquote = iriquote[0] * 2 + elif len(iriquote) > 2: + raise ValueError( + f"`iriquote` cannot be more than 2 characters: '{iriquote}'" + ) + if iriquote[1].isalnum() or iriquote[1] in safe: + warnings.warn( + f"End quote '{iriquote[1]}' is alphanumeric or in '{safe}'" + ) + + if iris: + if prefixes is None: + prefixes = {} + for k, v in iris.items(): + expanded = expand_iri(v, prefixes=prefixes) + quoted = urllib.parse.quote(expanded, safe=safe) + q1, q2 = iriquote if iriquote else ("", "") # type: ignore[misc] + mapping[k] = f"{q1}{quoted}{q2}" + + if literals: + for k, v in literals.items(): + mapping[k] = Literal(v).n3() + + return string.Template(query).safe_substitute(mapping) + + def get_entry_points(group: str): """Consistent interface to entry points for the given group. @@ -766,7 +834,9 @@ def get_entry_points(group: str): return eps -def check_service_availability(url: str, timeout=5, interval=1) -> bool: +def check_service_availability( + url: str, timeout: float = 5, interval: float = 1 +) -> bool: """Check whether the service with given URL is available. Arguments: