Skip to content

Commit 44ad5a4

Browse files
authored
Merge pull request #24 from eccenca/feature/improveTypedEntities-CMEM-6243
Add RDF-Quad typed entities (CMEM-6243)
2 parents 2366c25 + 16e9cb7 commit 44ad5a4

7 files changed

Lines changed: 512 additions & 24 deletions

File tree

CHANGELOG.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,13 @@ All notable changes to this project will be documented in this file.
55

66
The format is based on [Keep a Changelog](http://keepachangelog.com/) and this project adheres to [Semantic Versioning](https://semver.org/)
77

8+
## [Unreleased]
9+
10+
### Added
11+
12+
- RDF Quad entity type (CMEM-6243).
13+
14+
815
## [4.11.0] 2025-06-19
916

1017
### Added

cmem_plugin_base/dataintegration/typed_entities/file.py

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -99,15 +99,16 @@ class FileEntitySchema(TypedEntitySchema[File]):
9999
"""Entity schema that holds a collection of files."""
100100

101101
def __init__(self):
    """Initialise the file schema once per singleton instance."""
    # The parent class TypedEntitySchema implements a singleton pattern, so the
    # shared instance must not be initialised a second time.
    if not hasattr(self, "_initialized"):
        super().__init__(
            type_uri=type_uri("File"),
            paths=[
                EntityPath(path_uri("filePath"), is_single_value=True),
                EntityPath(path_uri("fileType"), is_single_value=True),
                EntityPath(path_uri("mimeType"), is_single_value=True),
                # Restored: the previous schema declared this path and the
                # entity conversion methods were not changed alongside it.
                # NOTE(review): confirm against to_entity/from_entity.
                EntityPath(path_uri("entryPath"), is_single_value=True),
            ],
        )
111112

112113
def to_entity(self, value: File) -> Entity:
113114
"""Create a generic entity from a file"""
Lines changed: 224 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,224 @@
1+
"""Quad entities"""
2+
3+
import uuid
4+
from typing import ClassVar, cast
5+
6+
from pydantic import BaseModel
7+
8+
from cmem_plugin_base.dataintegration.entity import Entity, EntityPath
9+
from cmem_plugin_base.dataintegration.typed_entities import path_uri, type_uri
10+
from cmem_plugin_base.dataintegration.typed_entities.typed_entities import (
11+
TypedEntitySchema,
12+
)
13+
14+
# --- RDF Node Types ---
15+
16+
17+
class RdfNode(BaseModel):
    """Common base class of all RDF nodes.

    Concrete subclasses define the ``type`` code; this class only declares it.
    """

    # Type code that identifies the kind of RDF node. Must be set by subclasses.
    type: ClassVar[str]

    # Value of the node: typically a URI, a blank node identifier, or a literal.
    value: str
25+
26+
27+
class ConcreteNode(RdfNode):
    """Base class for the non-literal RDF nodes: ``Resource`` and ``BlankNode``."""
29+
30+
31+
class Resource(ConcreteNode):
    """An RDF resource node, typically identified by a URI."""

    type: ClassVar[str] = "URI"

    # The URI of the resource.
    value: str
37+
38+
39+
class BlankNode(ConcreteNode):
    """An RDF blank node."""

    type: ClassVar[str] = "BlankNode"

    # Blank node identifier, usually kept without the '_:' prefix internally.
    value: str
45+
46+
47+
class Literal(RdfNode):
    """Base class shared by all RDF literal node types."""
49+
50+
51+
class PlainLiteral(Literal):
    """A plain literal that carries neither a language tag nor a datatype."""

    type: ClassVar[str] = "Literal"

    # Lexical value of the literal.
    value: str
57+
58+
59+
class LanguageLiteral(Literal):
    """A literal that is annotated with a language tag."""

    type: ClassVar[str] = "LangLiteral"

    # Lexical value of the literal.
    value: str
    # Language tag of the literal (required).
    language: str
66+
67+
68+
class DataTypeLiteral(Literal):
    """A literal that is annotated with a datatype IRI."""

    type: ClassVar[str] = "TypedLiteral"

    # Lexical value of the literal.
    value: str
    # Datatype IRI; falls back to xsd:string when not given.
    data_type: str = "http://www.w3.org/2001/XMLSchema#string"
75+
76+
77+
class Quad(BaseModel):
    """An RDF quad: subject, predicate and object plus an optional graph."""

    # Subject is restricted to resources and blank nodes.
    subject: ConcreteNode
    # Predicate is always a resource.
    predicate: Resource
    # Object may be any RDF node, including literals.
    object: RdfNode
    # Graph the statement belongs to; defaults to None.
    graph: Resource | None = None
84+
85+
86+
def create_node(
    type_name: str, value: str, language: str | None = None, data_type: str | None = None
) -> RdfNode:
    """Build the RDF node whose type code equals ``type_name``.

    ``language`` is only used for language literals and ``data_type`` only for
    typed literals; both are ignored for every other node type.

    Raises:
        ValueError: if ``type_name`` is not a known type code, or if the
            language / data type required by the requested literal is missing.
    """
    if type_name == Resource.type:
        return Resource(value=value)
    if type_name == BlankNode.type:
        return BlankNode(value=value)
    if type_name == PlainLiteral.type:
        return PlainLiteral(value=value)
    if type_name == LanguageLiteral.type:
        if language is None:
            raise ValueError("Language must be provided for LanguageLiteral.")
        return LanguageLiteral(value=value, language=language)
    if type_name == DataTypeLiteral.type:
        if data_type is None:
            raise ValueError("Data type must be provided for DataTypeLiteral.")
        return DataTypeLiteral(value=value, data_type=data_type)
    # No known node type matched the given code.
    raise ValueError(f"Unknown type: {type_name}")
107+
108+
109+
# --- RDF Quad Schema ---
110+
111+
112+
class QuadEntitySchema(TypedEntitySchema[Quad]):
    """Entity schema that holds a collection of RDF quads.

    A quad is stored as eight value lists, in the order of the schema paths:
    subject, subject type, predicate, object, object type, object language,
    object data type and graph. The last three lists are empty when they do
    not apply to the quad.
    """

    def __init__(self):
        # The parent class TypedEntitySchema implements a singleton pattern, so
        # the shared instance must not be initialised a second time.
        if not hasattr(self, "_initialized"):
            super().__init__(
                type_uri=type_uri("Quad"),
                paths=[
                    EntityPath(path_uri("quad/subject")),
                    EntityPath(path_uri("quad/subjectType")),
                    EntityPath(path_uri("quad/predicate")),
                    EntityPath(path_uri("quad/object")),
                    EntityPath(path_uri("quad/objectType")),
                    EntityPath(path_uri("quad/objectLanguage")),
                    EntityPath(path_uri("quad/objectDataType")),
                    EntityPath(path_uri("quad/graph")),
                ],
            )

    def to_entity(self, quad: Quad) -> Entity:
        """Create a generic entity from an RDF quad.

        The entity URI is a name-based (version 5) UUID derived from every
        stored component of the quad, so equal quads map to the same URI.
        """
        node = quad.object
        # Language and data type only exist on the matching literal classes.
        object_language = [node.language] if isinstance(node, LanguageLiteral) else []
        object_data_type = [node.data_type] if isinstance(node, DataTypeLiteral) else []

        values = [
            [quad.subject.value],
            [quad.subject.type],
            [quad.predicate.value],
            [node.value],
            [node.type],
            object_language,
            object_data_type,
            [quad.graph.value] if quad.graph is not None else [],
        ]

        # Length-prefix every component (including the node type codes) so that
        # different quads can never produce the same UUID name: a plain
        # concatenation makes e.g. ("ab", "c") and ("a", "bc") collide, and
        # cannot tell a resource from a literal with the same value.
        uri_components = "".join(
            f"{len(component)}:{component}"
            for component in ("".join(value_list) for value_list in values)
        )
        uri = f"urn:uuid:{uuid.uuid5(uuid.NAMESPACE_DNS, uri_components)}"

        return Entity(uri=uri, values=values)

    def from_entity(self, entity: Entity) -> Quad:
        """Create an RDF quad from a generic entity.

        Raises:
            ValueError: if the entity has too few value lists, a subject,
                predicate or object value is missing, or a node type is
                missing, ambiguous, unknown or not allowed in that position.
        """
        # Indices for the values in the entity
        subject_index = 0
        subject_type_index = 1
        predicate_index = 2
        object_index = 3
        object_type_index = 4
        object_language_index = 5
        object_data_type_index = 6
        graph_index = 7

        values = entity.values
        if len(values) <= graph_index:
            raise ValueError(
                f"Invalid quad entity: expected {graph_index + 1} value lists, "
                f"got {len(values)}."
            )

        # Fail with a descriptive error instead of an IndexError further down.
        required = (
            ("subject", subject_index),
            ("predicate", predicate_index),
            ("object", object_index),
        )
        for name, index in required:
            if not values[index]:
                raise ValueError(f"Missing {name} value in quad entity.")

        # Subject
        subject_type_list = values[subject_type_index]
        if len(subject_type_list) != 1:
            raise ValueError(f"Invalid subject type: {subject_type_list}. Expected a single value.")
        if subject_type_list[0] not in (Resource.type, BlankNode.type):
            # Literals are not valid subjects; reject them before the cast below.
            raise ValueError(
                f"Invalid subject type: {subject_type_list}. Expected a resource or blank node."
            )
        subject = create_node(subject_type_list[0], values[subject_index][0])

        # Predicate
        predicate = Resource(value=values[predicate_index][0])

        # Object
        object_type_list = values[object_type_index]
        if len(object_type_list) != 1:
            raise ValueError(f"Invalid object type: {object_type_list}. Expected a single element.")
        lang_list = values[object_language_index]
        type_list = values[object_data_type_index]
        object_value = create_node(
            object_type_list[0],
            values[object_index][0],
            lang_list[0] if lang_list else None,
            type_list[0] if type_list else None,
        )

        # Graph
        graph_list = values[graph_index]
        graph: Resource | None = Resource(value=graph_list[0]) if graph_list else None

        # Build the Quad; the subject type was checked above, so the cast is safe.
        return Quad(
            subject=cast("ConcreteNode", subject),
            predicate=predicate,
            object=object_value,
            graph=graph,
        )

cmem_plugin_base/dataintegration/typed_entities/typed_entities.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
from abc import abstractmethod
44
from collections.abc import Iterator, Sequence
5-
from typing import Generic, TypeVar
5+
from typing import ClassVar, Generic, TypeVar
66

77
from cmem_plugin_base.dataintegration.entity import Entities, Entity, EntityPath, EntitySchema
88

@@ -12,8 +12,20 @@
1212
class TypedEntitySchema(EntitySchema, Generic[T]):
1313
"""A custom entity schema that holds entities of a specific type (e.g. files)."""
1414

15-
def __init__(self, type_uri: str, paths: Sequence[EntityPath]):
16-
super().__init__(type_uri, paths)
15+
# Class variable to store singleton instances for each subclass
16+
_instances: ClassVar[dict[type["TypedEntitySchema"], "TypedEntitySchema"]] = {}
17+
18+
def __new__(cls, *args, **kwargs) -> "TypedEntitySchema":  # noqa: ANN002, ANN003, ARG004
    """Return the single shared instance of ``cls``, creating it on first use.

    Instances are kept in the class-level ``_instances`` dictionary, keyed by
    the concrete class, so every subclass gets its own singleton.
    """
    instance = cls._instances.get(cls)
    if instance is None:
        instance = super().__new__(cls)
        cls._instances[cls] = instance
    return instance
23+
24+
def __init__(self, type_uri: str, paths: Sequence[EntityPath]) -> None:
    """Initialise the schema on first construction only.

    ``__init__`` runs on every instantiation, but the instance is shared, so
    later calls return without touching the already initialised schema.
    """
    if hasattr(self, "_initialized"):
        return
    super().__init__(type_uri, paths)
    self._initialized = True
1729

1830
@abstractmethod
1931
def to_entity(self, value: T) -> Entity:

0 commit comments

Comments
 (0)