Web-Algebra/src/web_algebra/client.py at 25c269f80893a70a2ac55ea9ad75add6cfd99378 · AtomGraph/Web-Algebra · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
import json
import logging
import ssl
import time
import urllib.error
import urllib.parse
import urllib.request
from email.utils import parsedate_to_datetime
from http.client import HTTPResponse
from typing import Optional

from rdflib import Graph
from rdflib.plugins.sparql.parser import parseQuery


# Maps each RDF media type this module can read to the rdflib format name used
# to parse it. LinkedDataClient.get() joins the keys, in this order, into its
# Accept header and looks up the response Content-Type here to pick the parser.
MEDIA_TYPES = {
    "application/n-triples": "nt",
    "text/turtle": "turtle",
    "application/ld+json": "json-ld",
    "application/rdf+xml": "xml",
}


class HTTPRedirectHandler308(urllib.request.HTTPRedirectHandler):
    """Redirect handler that replays the original method and body on HTTP 308.

    For every other status code the decision is delegated unchanged to
    ``urllib.request.HTTPRedirectHandler``.
    """

    # ``redirect_request`` is only reached through an ``http_error_<code>`` hook.
    # The standard library documents ``http_error_308`` as added in Python 3.11,
    # so define it explicitly to make the 308 branch below reachable on older
    # interpreters too (on newer ones this is the same function that is inherited).
    http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Build the follow-up request for a redirect response.

        :param req: The request that received the redirect.
        :param fp: The redirect response body (unused for 308).
        :param code: The HTTP status code of the redirect.
        :param msg: The HTTP reason phrase.
        :param headers: The redirect response headers.
        :param newurl: The absolute redirect target.
        :return: A new Request for 308 that keeps method, body and headers;
                 otherwise whatever the base handler returns.
        :raises urllib.error.HTTPError: Propagated from the base handler for
                 redirects it refuses to follow.
        """
        if code != 308:
            return super().redirect_request(req, fp, code, msg, headers, newurl)

        return urllib.request.Request(
            # Same lenient escaping the base handler applies to redirect targets.
            newurl.replace(" ", "%20"),
            data=req.data,
            headers=req.headers,
            # Carry over the redirect bookkeeping the base handler also sets.
            origin_req_host=req.origin_req_host,
            unverifiable=True,
            method=req.get_method(),
        )


class LinkedDataClient:
    """HTTP client that reads and writes RDF over GET/POST/PUT/DELETE/PATCH.

    All requests go through a shared opener configured with the client's SSL
    context, a 308-aware redirect handler and a fixed User-Agent, and are
    retried automatically on HTTP 429.
    """

    def __init__(
        self,
        cert_pem_path: Optional[str] = None,
        cert_password: Optional[str] = None,
        verify_ssl: bool = True,
    ):
        """
        Initializes the LinkedDataClient with SSL configuration.

        :param cert_pem_path: Path to the certificate .pem file (containing both private key and certificate).
        :param cert_password: Password for the encrypted private key in the .pem file.
        :param verify_ssl: Whether to verify the server's SSL certificate. Default is True.
        """
        # Always create SSL context
        self.ssl_context = ssl.create_default_context()

        # Load client certificate if provided.
        # NOTE: the certificate is only loaded when BOTH path and password are
        # given; a path without a password is ignored.
        if cert_pem_path and cert_password:
            self.ssl_context.load_cert_chain(
                certfile=cert_pem_path, password=cert_password
            )

        # Configure SSL verification (check_hostname must be disabled before
        # verify_mode can be set to CERT_NONE)
        if not verify_ssl:
            self.ssl_context.check_hostname = False
            self.ssl_context.verify_mode = ssl.CERT_NONE

        # Create an HTTPS handler with the configured SSL context
        self.opener = urllib.request.build_opener(
            urllib.request.HTTPSHandler(context=self.ssl_context),
            HTTPRedirectHandler308(),
        )

        # Add proper User-Agent header for external services like Wikidata
        self.opener.addheaders = [
            (
                "User-Agent",
                "Web-Algebra/1.0 (LinkedData Processing System; https://github.com/atomgraph/Web-Algebra)",
            )
        ]

    @staticmethod
    def _retry_delay(retry_after: Optional[str], attempt: int) -> float:
        """
        Compute how many seconds to wait before the next retry.

        :param retry_after: Raw Retry-After header value, or None if absent.
        :param attempt: Zero-based number of retries already performed.
        :return: A non-negative delay in seconds.
        """
        # Exponential backoff (1, 2, 4, ... seconds) capped at 60 seconds
        backoff = min(2 ** attempt, 60)

        if not retry_after:
            return backoff

        # Retry-After as delay-seconds; clamp so a negative value can never
        # reach time.sleep(), which rejects negative durations
        try:
            return max(0, int(retry_after))
        except ValueError:
            pass

        # Retry-After as HTTP-date
        try:
            retry_date = parsedate_to_datetime(retry_after)
        except (TypeError, ValueError):
            return backoff

        # A date without usable timezone information cannot be compared
        # reliably against the current time; fall back to backoff
        if retry_date.tzinfo is None:
            return backoff

        try:
            return max(0.0, retry_date.timestamp() - time.time())
        except (OverflowError, ValueError):
            return backoff

    def _request_with_retry(self, request: urllib.request.Request, max_retries: int = 5) -> HTTPResponse:
        """
        Execute HTTP request with automatic retry on 429 (Too Many Requests) responses.

        Respects the Retry-After header if present, otherwise uses exponential backoff.
        All other HTTP errors are raised immediately without retry.

        :param request: The urllib Request object to execute
        :param max_retries: Maximum number of retry attempts (default 5)
        :return: HTTPResponse object on success
        :raises: urllib.error.HTTPError for non-429 errors or after max retries
        """
        attempt = 0

        # Every iteration either returns, raises, or schedules another attempt,
        # so the method can never fall through and return None.
        while True:
            try:
                return self.opener.open(request)
            except urllib.error.HTTPError as e:
                # Only retry on 429 (Too Many Requests)
                if e.code != 429:
                    raise

                # Check if we've exhausted retries
                if attempt >= max_retries:
                    logging.error(
                        "Max retries (%s) exceeded for %s", max_retries, request.full_url
                    )
                    raise

                retry_after = e.headers.get("Retry-After") if e.headers is not None else None
                wait_time = self._retry_delay(retry_after, attempt)

                # The error object wraps the 429 response; release it before
                # sleeping instead of leaking one connection per retry
                e.close()

            attempt += 1
            logging.warning(
                "HTTP 429 received for %s. Retry %s/%s after %.1f seconds",
                request.full_url,
                attempt,
                max_retries,
                wait_time,
            )
            time.sleep(wait_time)

    def _send_graph(self, method: str, url: str, graph: Graph) -> HTTPResponse:
        """
        Serializes a graph as N-Triples and sends it with the given HTTP method.

        :param method: The HTTP method to use (e.g. "POST" or "PUT").
        :param url: The URL to send RDF data to.
        :param graph: An RDFLib Graph containing the data to send.
        :return: The HTTPResponse object.
        """
        # Serialize the RDF data to N-Triples
        payload = graph.serialize(format="nt")
        headers = {
            "Content-Type": "application/n-triples",
            "Accept": "application/n-triples",
        }
        request = urllib.request.Request(
            url, data=payload.encode("utf-8"), headers=headers, method=method
        )

        return self._request_with_retry(request)

    def get(self, url: str) -> Graph:
        """
        Fetches RDF data from the given URL and returns it as an RDFLib Graph.

        :param url: The URL to fetch RDF data from.
        :return: An RDFLib Graph object containing the parsed RDF data.
        :raises ValueError: If the response has no Content-Type header or its
                            media type is not one of the supported RDF types.
        """
        # Set the Accept header
        headers = {"Accept": ", ".join(MEDIA_TYPES)}
        request = urllib.request.Request(url, headers=headers)

        # Perform the HTTP request with retry on 429; the context manager
        # closes the response even if validation below fails
        with self._request_with_retry(request) as response:
            raw_content_type = response.headers.get("Content-Type")
            if raw_content_type is None:
                raise ValueError(
                    f"Response has no Content-Type header. Supported types are: {', '.join(MEDIA_TYPES.keys())}"
                )

            # Drop parameters such as charset and normalize whitespace/case,
            # since media type names are case-insensitive
            content_type = raw_content_type.split(";", 1)[0].strip().lower()
            rdf_format = MEDIA_TYPES.get(content_type)
            if not rdf_format:
                raise ValueError(
                    f"Unsupported Content-Type: {content_type}. Supported types are: {', '.join(MEDIA_TYPES.keys())}"
                )

            # Read and decode the response data
            data = response.read().decode("utf-8")

        # Parse the RDF data into an RDFLib Graph, using the request URL as base
        g = Graph()
        g.parse(data=data, format=rdf_format, publicID=url)
        return g

    def post(self, url: str, graph: Graph) -> HTTPResponse:
        """
        Sends RDF data to the given URL using HTTP POST.

        :param url: The URL to send RDF data to.
        :param graph: An RDFLib Graph containing the data to send.
        :return: The HTTPResponse object.
        """
        return self._send_graph("POST", url, graph)

    def put(self, url: str, graph: Graph) -> HTTPResponse:
        """
        Sends RDF data to the given URL using HTTP PUT.

        :param url: The URL to send RDF data to.
        :param graph: An RDFLib Graph containing the data to send.
        :return: The HTTPResponse object.
        """
        return self._send_graph("PUT", url, graph)

    def delete(self, url: str) -> HTTPResponse:
        """
        Sends an HTTP DELETE request to the given URL.

        :param url: The URL to send the DELETE request to.
        :return: The HTTPResponse object.
        """
        request = urllib.request.Request(url, method="DELETE")

        return self._request_with_retry(request)

    def patch(self, url: str, sparql_update: str) -> HTTPResponse:
        """
        Sends a SPARQL UPDATE query to the given URL using HTTP PATCH.

        :param url: The URL to send the SPARQL UPDATE to.
        :param sparql_update: The SPARQL UPDATE query string.
        :return: The HTTPResponse object.
        """
        headers = {
            "Content-Type": "application/sparql-update",
            "Accept": "application/n-triples",
        }
        request = urllib.request.Request(
            url, data=sparql_update.encode("utf-8"), headers=headers, method="PATCH"
        )

        return self._request_with_retry(request)


class SPARQLClient:
    """HTTP client that runs SPARQL queries against an endpoint via GET."""

    def __init__(
        self,
        cert_pem_path: Optional[str] = None,
        cert_password: Optional[str] = None,
        verify_ssl: bool = True,
    ):
        """
        Initializes the SPARQLClient with optional SSL certificate.

        :param cert_pem_path: Path to .pem file containing cert+key
        :param cert_password: Password for the PEM file
        :param verify_ssl: Whether to verify server SSL certificate
        """
        # Always create SSL context
        self.ssl_context = ssl.create_default_context()

        # Load client certificate if provided.
        # NOTE: the certificate is only loaded when BOTH path and password are
        # given; a path without a password is ignored.
        if cert_pem_path and cert_password:
            self.ssl_context.load_cert_chain(
                certfile=cert_pem_path, password=cert_password
            )

        # Configure SSL verification (check_hostname must be disabled before
        # verify_mode can be set to CERT_NONE)
        if not verify_ssl:
            self.ssl_context.check_hostname = False
            self.ssl_context.verify_mode = ssl.CERT_NONE

        self.opener = urllib.request.build_opener(
            urllib.request.HTTPSHandler(context=self.ssl_context)
        )

        # Add proper User-Agent header for external services like Wikidata
        self.opener.addheaders = [
            (
                "User-Agent",
                "Web-Algebra/1.0 (LinkedData Processing System; https://github.com/atomgraph/Web-Algebra)",
            )
        ]

    @staticmethod
    def _build_query_url(endpoint_url: str, query_string: str) -> str:
        """
        Appends the URL-encoded ``query`` parameter to the endpoint URL.

        :param endpoint_url: The SPARQL endpoint URL, with or without an existing query string
        :param query_string: SPARQL query string
        :return: The request URL carrying the query
        """
        params = urllib.parse.urlencode({"query": query_string})

        # Pick the separator so an endpoint that already has parameters
        # (e.g. "?default-graph-uri=...") still yields a well-formed URL
        if "?" not in endpoint_url:
            separator = "?"
        elif endpoint_url.endswith(("?", "&")):
            separator = ""
        else:
            separator = "&"

        return f"{endpoint_url}{separator}{params}"

    def query(self, endpoint_url: str, query_string: str) -> dict:
        """
        Executes a SPARQL query and returns the result as parsed JSON.

        SELECT/ASK results are requested as SPARQL JSON results and returned
        as decoded by ``json.loads``. CONSTRUCT/DESCRIBE results are requested
        as N-Triples, loaded into an RDFLib Graph, re-serialized as JSON-LD and
        returned as decoded by ``json.loads``.
        NOTE(review): the JSON-LD document may decode to a list rather than a
        dict depending on the serializer output - confirm against callers.

        :param endpoint_url: The SPARQL endpoint URL
        :param query_string: SPARQL query string
        :return: The parsed JSON result
        :raises ValueError: If the query is not SELECT, ASK, CONSTRUCT or DESCRIBE
        """
        parsed = parseQuery(query_string)
        query_type = parsed[1].name  # e.g., 'SelectQuery', 'ConstructQuery'

        if query_type in {"SelectQuery", "AskQuery"}:
            accept = "application/sparql-results+json"
        elif query_type in {"ConstructQuery", "DescribeQuery"}:
            accept = "application/n-triples"
        else:
            raise ValueError(f"Unsupported query type: {query_type}")

        url = self._build_query_url(endpoint_url, query_string)
        request = urllib.request.Request(url, headers={"Accept": accept})

        # Close the response as soon as the body has been read
        with self.opener.open(request) as response:
            body = response.read().decode("utf-8")

        if accept == "application/n-triples":
            # convert N-Triples to JSON-LD
            g = Graph()
            g.parse(data=body, format="nt")
            return json.loads(g.serialize(format="json-ld"))

        # return SPARQL JSON results as a dict
        return json.loads(body)