Skip to content

Commit 476a6a2

Browse files
authored
Merge pull request #16 from AtomGraph/feat-ldh-add-file
Add ldh-AddFile op + FileClient for LDH file uploads
2 parents faf7a7e + a6ec5b1 commit 476a6a2

6 files changed

Lines changed: 436 additions & 2 deletions

File tree

formal-semantics.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,12 @@ Abstract: URI × Literal × Maybe Literal × Maybe Literal × Maybe Literal →
246246
Python: def execute(self, url: URIRef, value: Literal, title: Literal = None, description: Literal = None, fragment: Literal = None) -> Any
247247
```
248248

249+
**ldh-AddFile** - Add file (binary) to LinkedDataHub document via multipart RDF/POST
250+
```
251+
Abstract: URI × Literal × Literal × Maybe Literal × Maybe Literal → Any
252+
Python: def execute(self, url: URIRef, file_path: Literal, title: Literal, description: Literal = None, content_type: Literal = None) -> Any
253+
```
254+
249255
**ldh-RemoveBlock** - Remove content block from LinkedDataHub document
250256
```
251257
Abstract: URI × Maybe URI → Any

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ dependencies = [
2424
"openai",
2525
"mcp[cli]==1.10.1",
2626
"pydantic-settings",
27+
"urllib3",
2728
]
2829

2930
[project.urls]

src/web_algebra/client.py

Lines changed: 121 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
from typing import Optional
1+
from typing import Optional, Tuple
2+
import hashlib
23
import ssl
34
import json
45
import time
@@ -10,6 +11,7 @@
1011
from http.client import HTTPResponse
1112
from rdflib import Graph
1213
from rdflib.plugins.sparql.parser import parseQuery
14+
from urllib3.filepost import encode_multipart_formdata
1315

1416

1517
MEDIA_TYPES = {
@@ -194,6 +196,124 @@ def patch(self, url: str, sparql_update: str) -> HTTPResponse:
194196
return self.opener.open(request)
195197

196198

199+
class FileClient:
    """HTTP client that uploads a file to LinkedDataHub as a multipart RDF/POST.

    A file upload carries raw bytes plus a Content-Type instead of an RDF
    graph, which is why it lives in a dedicated client and not on
    `LinkedDataClient`. The TLS context and opener are configured inline,
    following the same convention as `LinkedDataClient` / `SPARQLClient`
    in this module.

    The request body follows the wire format of LinkedDataHub's
    `bin/add-file.sh` script: a multipart/form-data document in LDH's
    RDF/POST dialect. Every `pu=<predicate>` field is paired with the
    `ol=<literal>` or `ou=<uri>` field that follows it, and all pairs
    describe one blank-node subject declared with `sb=`. The file bytes
    travel as a multipart file part named `ol` carrying the supplied
    Content-Type. Per the upstream description, LDH stores the bytes under
    its built-in `/uploads/{sha1}` namespace and appends the file's RDF
    description (filename, MIME type, sha1, title) to the target document.
    """

    # Predicate / class URIs used in the RDF/POST form fields.
    _NFO_FILE_NAME = "http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#fileName"
    _NFO_FILE_DATA_OBJECT = "http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#FileDataObject"
    _DCT_TITLE = "http://purl.org/dc/terms/title"
    _DCT_DESCRIPTION = "http://purl.org/dc/terms/description"
    _RDF_TYPE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"

    def __init__(
        self,
        cert_pem_path: Optional[str] = None,
        cert_password: Optional[str] = None,
        verify_ssl: bool = True,
    ):
        """Build the TLS context and the URL opener used for uploads.

        :param cert_pem_path: Path to a PEM client certificate. It is only
            loaded when `cert_password` is supplied as well.
        :param cert_password: Password for the client certificate.
        :param verify_ssl: When false, hostname checking and certificate
            verification are both switched off.
        """
        context = ssl.create_default_context()

        # The client certificate is loaded only when both pieces are given.
        if cert_pem_path and cert_password:
            context.load_cert_chain(certfile=cert_pem_path, password=cert_password)

        if not verify_ssl:
            # check_hostname has to be cleared before verify_mode can be
            # lowered to CERT_NONE; the ssl module rejects the reverse order.
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE

        self.ssl_context = context

        handlers = (
            urllib.request.HTTPSHandler(context=context),
            HTTPRedirectHandler308(),
            RetryAfterHandler(),
        )
        self.opener = urllib.request.build_opener(*handlers)

        user_agent = "Web-Algebra/1.0 (LinkedData Processing System; https://github.com/atomgraph/Web-Algebra)"
        self.opener.addheaders = [("User-Agent", user_agent)]

    def add_file(
        self,
        target_url: str,
        file_body: bytes,
        content_type: str,
        title: str,
        description: Optional[str] = None,
        filename: Optional[str] = None,
    ) -> Tuple[HTTPResponse, str]:
        """POST `file_body` to `target_url` as a multipart RDF/POST request.

        :param target_url: URI of the document that receives the file's RDF
            description. This is not where the bytes end up: per the
            upstream description LDH keeps them under `/uploads/{sha1}`.
        :param file_body: Raw bytes of the file.
        :param content_type: MIME type attached to the file part
            (e.g. `image/png`).
        :param title: Value for the `dct:title` literal.
        :param description: Value for the `dct:description` literal; the
            pair is omitted when this is `None` or empty.
        :param filename: Filename placed in the file part's
            `Content-Disposition`; `"upload"` is used when not given.
        :return: `(HTTPResponse, sha1_hex)`, where `sha1_hex` is the SHA-1
            of `file_body` computed locally so callers can derive the
            `<base>/uploads/{sha1}` URI without reading the response body.
        """
        digest = hashlib.sha1(file_body).hexdigest()
        part_filename = filename or "upload"

        # Field order is significant: the RDF/POST parser pairs each `pu`
        # with the `ol` / `ou` that comes right after it. A plain string
        # value is encoded as a form field, whereas a
        # `(filename, body, content_type)` tuple is encoded as a file part.
        form_fields: list[tuple[str, object]] = [
            ("rdf", ""),
            ("sb", "file"),
            ("pu", self._NFO_FILE_NAME),
            ("ol", (part_filename, file_body, content_type)),
            ("pu", self._DCT_TITLE),
            ("ol", title),
            ("pu", self._RDF_TYPE),
            ("ou", self._NFO_FILE_DATA_OBJECT),
        ]
        if description:
            form_fields += [
                ("pu", self._DCT_DESCRIPTION),
                ("ol", description),
            ]

        payload, multipart_content_type = encode_multipart_formdata(form_fields)
        request = urllib.request.Request(
            target_url,
            data=payload,
            headers={
                "Content-Type": multipart_content_type,
                "Accept": "text/turtle",
            },
            method="POST",
        )
        return self.opener.open(request), digest
315+
316+
197317
class SPARQLClient:
198318
def __init__(
199319
self,
Lines changed: 228 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,228 @@
1+
from typing import Any, Optional
2+
import logging
3+
import mimetypes
4+
import urllib.parse
5+
from pathlib import Path
6+
7+
from mcp import types
8+
from rdflib import Literal, URIRef
9+
from rdflib.namespace import XSD
10+
from rdflib.query import Result
11+
12+
from web_algebra.client import FileClient
13+
from web_algebra.json_result import JSONResult
14+
from web_algebra.mcp_tool import MCPTool
15+
from web_algebra.operation import Operation
16+
17+
18+
class AddFile(Operation, MCPTool):
    """RDF/POST a file to a LinkedDataHub document, returning the minted upload URI.

    The file's RDF description (`nfo:FileDataObject` + filename + MIME type +
    sha1 + title) is appended to the target document; the file bytes
    themselves are stored by LDH at `<base>/uploads/{sha1}` under its
    built-in upload namespace, independent of the target document's URI.

    Unlike the rest of the `ldh-Add*` family, this op does not subclass
    `POST` — file upload uses `multipart/form-data` with LDH's RDF/POST
    dialect rather than an N-triples body, so it carries its own
    `FileClient` instance instead of inheriting `LinkedDataClient` plumbing.
    """

    def model_post_init(self, __context: Any) -> None:
        """Create the `FileClient` used by `execute` from the op's settings."""
        # NOTE(review): TLS verification is switched off unconditionally
        # here; confirm this is intended outside of local/self-signed setups.
        self.client = FileClient(
            cert_pem_path=getattr(self.settings, "cert_pem_path", None),
            cert_password=getattr(self.settings, "cert_password", None),
            verify_ssl=False,
        )

    @classmethod
    def name(cls):
        """Return the operation name this class is registered under."""
        return "ldh-AddFile"

    @classmethod
    def description(cls) -> str:
        """Return the human-readable tool description."""
        return """Adds a file to a LinkedDataHub document via multipart RDF/POST.

Appends `a nfo:FileDataObject ; nfo:fileName ; dct:title ; ...`
to the target document and stores the file bytes at
`<base>/uploads/{sha1}` (LDH's built-in upload namespace).

Arguments:
- `url` — URI of the target document to add the file's description to.
- `file` — absolute local file path. The bytes are read and streamed
to the server.
- `title` — human-readable title (`dct:title`).
- `description` — optional description (`dct:description`).
- `content_type` — optional MIME-type override; auto-detected from
the file path if absent.

Returns a result with `url` (the minted `<base>/uploads/{sha1}` URI
the file resource is now addressable at) and `status` (HTTP status
code) bindings.
"""

    @classmethod
    def inputSchema(cls) -> dict:
        """Return the JSON schema describing the tool's arguments."""
        return {
            "type": "object",
            "properties": {
                "url": {
                    "type": "string",
                    "description": "Target document URI to add the file's description to.",
                },
                "file": {
                    "type": "string",
                    "description": "Absolute local file path. The bytes are read and uploaded.",
                },
                "title": {
                    "type": "string",
                    "description": "Title of the file (dct:title).",
                },
                "description": {
                    "type": "string",
                    "description": "Optional description (dct:description).",
                },
                "content_type": {
                    "type": "string",
                    "description": "Optional MIME-type override; auto-detected from path if absent.",
                },
            },
            "required": ["url", "file", "title"],
        }

    def execute(
        self,
        url: URIRef,
        file_path: Literal,
        title: Literal,
        description: Optional[Literal] = None,
        content_type: Optional[Literal] = None,
    ) -> Result:
        """Read a file from disk and RDF/POST it to `url`.

        :param url: Target document URI the file's description is added to.
        :param file_path: Local path of the file to upload.
        :param title: `dct:title` of the file.
        :param description: Optional `dct:description`.
        :param content_type: Optional MIME type. When omitted it is guessed
            from the path, falling back to `application/octet-stream`.
        :return: A single-row result with `status` (HTTP status code) and
            `url` (the `<scheme>://<host>/uploads/<sha1>` URI).
        :raises TypeError: If an argument is not of the expected RDFLib type.
        :raises OSError: If the file cannot be opened or read.
        """
        if not isinstance(url, URIRef):
            raise TypeError(
                f"AddFile.execute expects url to be URIRef, got {type(url)}"
            )
        if not isinstance(file_path, Literal):
            raise TypeError(
                f"AddFile.execute expects file_path to be Literal, got {type(file_path)}"
            )
        if not isinstance(title, Literal):
            raise TypeError(
                f"AddFile.execute expects title to be Literal, got {type(title)}"
            )
        if description is not None and not isinstance(description, Literal):
            raise TypeError(
                f"AddFile.execute expects description to be Literal or None, got {type(description)}"
            )
        if content_type is not None and not isinstance(content_type, Literal):
            raise TypeError(
                f"AddFile.execute expects content_type to be Literal or None, got {type(content_type)}"
            )

        path_str = str(file_path)
        with open(path_str, "rb") as f:
            body = f.read()

        # An explicit content type wins; otherwise guess from the file
        # name and fall back to a generic binary type.
        ct: Optional[str] = str(content_type) if content_type is not None else None
        if ct is None:
            ct, _ = mimetypes.guess_type(path_str)
        if ct is None:
            ct = "application/octet-stream"

        url_str = str(url)
        logging.info(
            "RDF/POSTing file %s (%d bytes, %s) to <%s>",
            path_str, len(body), ct, url_str,
        )

        response, sha1 = self.client.add_file(
            target_url=url_str,
            file_body=body,
            content_type=ct,
            title=str(title),
            description=str(description) if description is not None else None,
            filename=Path(path_str).name,
        )

        # Only the status code is used, so release the underlying
        # connection instead of leaving the response open.
        try:
            status = response.status
        finally:
            response.close()

        # The minted file URI lives at `<scheme>://<host>/uploads/<sha1>`
        # regardless of which target document we RDF/POSTed to. Reconstruct
        # from the target URL's host so callers don't need to thread the
        # base URL through separately.
        parsed = urllib.parse.urlparse(url_str)
        file_uri = f"{parsed.scheme}://{parsed.netloc}/uploads/{sha1}"

        logging.info("AddFile status %s → <%s>", status, file_uri)

        return JSONResult(
            vars=["status", "url"],
            bindings=[
                {
                    "status": Literal(status, datatype=XSD.integer),
                    "url": URIRef(file_uri),
                }
            ],
        )

    def execute_json(
        self, arguments: dict, variable_stack: Optional[list] = None
    ) -> Result:
        """JSON execution: process arguments with strict type checking.

        :param arguments: Operation arguments; `url`, `file` and `title` are
            required, `description` and `content_type` are optional.
        :param variable_stack: Variable bindings passed through to
            `Operation.process_json`. A fresh list is used when omitted.
        :return: The result of `execute`.
        :raises TypeError: If `url` does not evaluate to a `URIRef`.
        """
        # Avoid a shared mutable default: each call gets its own list.
        if variable_stack is None:
            variable_stack = []

        url_data = Operation.process_json(
            self.settings, arguments["url"], self.context, variable_stack
        )
        if not isinstance(url_data, URIRef):
            raise TypeError(
                f"ldh-AddFile expects 'url' to be URIRef, got {type(url_data)}"
            )

        file_data = Operation.process_json(
            self.settings, arguments["file"], self.context, variable_stack
        )
        file_literal = self.to_string_literal(file_data)

        title_data = Operation.process_json(
            self.settings, arguments["title"], self.context, variable_stack
        )
        title_literal = self.to_string_literal(title_data)

        description_literal: Optional[Literal] = None
        if "description" in arguments:
            description_data = Operation.process_json(
                self.settings, arguments["description"], self.context, variable_stack
            )
            description_literal = self.to_string_literal(description_data)

        content_type_literal: Optional[Literal] = None
        if "content_type" in arguments:
            content_type_data = Operation.process_json(
                self.settings, arguments["content_type"], self.context, variable_stack
            )
            content_type_literal = self.to_string_literal(content_type_data)

        return self.execute(
            url_data,
            file_literal,
            title_literal,
            description_literal,
            content_type_literal,
        )

    def mcp_run(self, arguments: dict, context: Any = None) -> Any:
        """MCP execution: plain args → plain results.

        :param arguments: Plain tool arguments matching `inputSchema`.
        :param context: Unused by this operation.
        :return: A one-element list with a text message naming the file URI.
        """
        url = URIRef(arguments["url"])
        file_path = Literal(arguments["file"], datatype=XSD.string)
        title = Literal(arguments["title"], datatype=XSD.string)

        # An optional argument sent as an explicit JSON null is treated the
        # same as an omitted one, rather than being wrapped in a Literal.
        description_arg = arguments.get("description")
        description = (
            Literal(description_arg, datatype=XSD.string)
            if description_arg is not None
            else None
        )
        content_type_arg = arguments.get("content_type")
        content_type = (
            Literal(content_type_arg, datatype=XSD.string)
            if content_type_arg is not None
            else None
        )

        result = self.execute(url, file_path, title, description, content_type)
        url_binding = result.bindings[0]["url"]
        return [types.TextContent(type="text", text=f"File added: {url_binding}")]

0 commit comments

Comments
 (0)