Skip to content

Commit 4dce368

Browse files
authored
Merge pull request #28 from eccenca/feature/zipFileEntities-CMEM-6629
Add entry_path attribute and read_stream method to File classes
2 parents d431822 + 2c66984 commit 4dce368

3 files changed

Lines changed: 97 additions & 16 deletions

File tree

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
55

66
The format is based on [Keep a Changelog](http://keepachangelog.com/) and this project adheres to [Semantic Versioning](https://semver.org/)
77

8+
## [Unreleased]
9+
10+
### Added
11+
12+
- `File` entities: add `entry_path` attribute and `read_stream` method
13+
814
## [4.10.2] 2025-05-15 - shipped with DI v25.1.1
915

1016
### Fixed
Lines changed: 82 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,13 @@
11
"""File entities"""
22

3+
import zipfile
4+
from abc import abstractmethod
5+
from io import BytesIO
6+
from pathlib import Path
7+
from typing import IO
8+
9+
from cmem.cmempy.workspace.projects.resources.resource import get_resource_response
10+
311
from cmem_plugin_base.dataintegration.entity import Entity, EntityPath
412
from cmem_plugin_base.dataintegration.typed_entities import instance_uri, path_uri, type_uri
513
from cmem_plugin_base.dataintegration.typed_entities.typed_entities import (
@@ -8,26 +16,83 @@
816

917

1018
class File:
11-
"""A file entity that can be held in a FileEntitySchema."""
19+
"""A file entity that can be held in a FileEntitySchema.
20+
21+
:param path: The file path.
22+
:param file_type: The type of the file (one of: "Local", "Project").
23+
:param mime: The MIME type of the file, if known.
24+
:param entry_path: If the file path points to an archive, the entry within the archive.
25+
"""
1226

13-
def __init__(self, path: str, file_type: str, mime: str | None) -> None:
27+
def __init__(self, path: str, file_type: str, mime: str | None, entry_path: str | None) -> None:
1428
self.path = path
1529
self.file_type = file_type
1630
self.mime = mime
31+
self.entry_path = entry_path
32+
33+
@abstractmethod
34+
def read_stream(self, project_id: str) -> IO[bytes]:
35+
"""Open the referenced file as a stream.
36+
37+
Returns a file-like object (stream) in binary mode.
38+
Caller is responsible for closing the stream.
39+
"""
1740

1841

1942
class LocalFile(File):
2043
"""A file that's located on the local file system."""
2144

22-
def __init__(self, path: str, mime: str | None = None) -> None:
23-
super().__init__(path, "Local", mime)
45+
def __init__(self, path: str, mime: str | None = None, entry_path: str | None = None) -> None:
46+
super().__init__(path, "Local", mime, entry_path)
47+
48+
def read_stream(self, project_id: str) -> IO[bytes]:
49+
"""Open the referenced file as a stream.
50+
51+
Returns a file-like object (stream) in binary mode.
52+
Caller is responsible for closing the stream.
53+
"""
54+
if self.entry_path:
55+
archive = zipfile.ZipFile(self.path, "r")
56+
try:
57+
return archive.open(self.entry_path, "r")
58+
except KeyError as err:
59+
archive.close()
60+
raise FileNotFoundError(
61+
f"Entry '{self.entry_path}' not found in archive '{self.path}'."
62+
) from err
63+
else:
64+
if not Path(self.path).is_file():
65+
raise FileNotFoundError(f"File '{self.path}' does not exist.")
66+
return Path(self.path).open("rb")
2467

2568

2669
class ProjectFile(File):
2770
"""A project file"""
2871

29-
def __init__(self, path: str, mime: str | None = None) -> None:
30-
super().__init__(path, "Project", mime)
72+
def __init__(self, path: str, mime: str | None = None, entry_path: str | None = None) -> None:
73+
super().__init__(path, "Project", mime, entry_path)
74+
75+
def read_stream(self, project_id: str) -> IO[bytes]:
76+
"""Open the referenced file as a stream.
77+
78+
Returns a file-like object (stream) in binary mode.
79+
Caller is responsible for closing the stream.
80+
"""
81+
response = get_resource_response(project_id, self.path)
82+
if response.status_code != 200: # noqa: PLR2004
83+
raise FileNotFoundError(f"Project file '{self.path}' not found.")
84+
response_bytes = BytesIO(response.raw.read())
85+
if self.entry_path:
86+
archive = zipfile.ZipFile(response_bytes, "r")
87+
try:
88+
return archive.open(self.entry_path, "r")
89+
except KeyError as err:
90+
archive.close()
91+
raise FileNotFoundError(
92+
f"Entry '{self.entry_path}' not found in project file '{self.path}'."
93+
) from err
94+
else:
95+
return response_bytes
3196

3297

3398
class FileEntitySchema(TypedEntitySchema[File]):
@@ -40,25 +105,33 @@ def __init__(self):
40105
EntityPath(path_uri("filePath"), is_single_value=True),
41106
EntityPath(path_uri("fileType"), is_single_value=True),
42107
EntityPath(path_uri("mimeType"), is_single_value=True),
108+
EntityPath(path_uri("entryPath"), is_single_value=True),
43109
],
44110
)
45111

46112
def to_entity(self, value: File) -> Entity:
47113
"""Create a generic entity from a file"""
48114
return Entity(
49115
uri=instance_uri(value.path),
50-
values=[[value.path], [value.file_type], [value.mime] if value.mime else []],
116+
values=[
117+
[value.path],
118+
[value.file_type],
119+
[value.mime] if value.mime else [],
120+
[value.entry_path] if value.entry_path else [],
121+
],
51122
)
52123

53124
def from_entity(self, entity: Entity) -> File:
54125
"""Create a file entity from a generic entity."""
55126
path = entity.values[0][0]
56127
file_type = entity.values[1][0]
57128
mime = entity.values[2][0] if entity.values[2] and entity.values[2][0] else None
129+
entry_path = entity.values[3][0] if entity.values[3] and entity.values[3][0] else None
130+
58131
match file_type:
59132
case "Local":
60-
return LocalFile(path, mime)
133+
return LocalFile(path, mime, entry_path)
61134
case "Project":
62-
return ProjectFile(path, mime)
135+
return ProjectFile(path, mime, entry_path)
63136
case _:
64137
raise ValueError(f"File '{path}' has unexpected type '{file_type}'.")

tests/typed_entities/test_typed_entities.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from cmem_plugin_base.dataintegration.plugins import WorkflowPlugin
1313
from cmem_plugin_base.dataintegration.ports import FixedNumberOfInputs, FixedSchemaPort
1414
from cmem_plugin_base.dataintegration.typed_entities.file import FileEntitySchema, LocalFile
15+
from cmem_plugin_base.testing import TestTaskContext
1516

1617

1718
class ConcatFilesOperator(WorkflowPlugin):
@@ -29,9 +30,8 @@ def execute(self, inputs: Sequence[Entities], context: ExecutionContext) -> Enti
2930
output_name = o_file.name
3031
for file in input_files.values:
3132
if isinstance(file, LocalFile):
32-
with Path(file.path).open("rb") as f:
33-
contents = f.read()
34-
o_file.write(contents)
33+
with file.read_stream(context.task.project_id()) as in_stream:
34+
o_file.write(in_stream.read())
3535

3636
return FileEntitySchema().to_entities(iter([LocalFile(output_name)]))
3737

@@ -51,7 +51,9 @@ def test_files(self) -> None:
5151
input_entities = FileEntitySchema().to_entities(
5252
iter([LocalFile(temp1.name), LocalFile(temp2.name)])
5353
)
54-
output = ConcatFilesOperator().execute([input_entities], ExecutionContext())
54+
context = ExecutionContext()
55+
context.task = TestTaskContext(project_id="TestProject", task_id="TestTask")
56+
output = ConcatFilesOperator().execute([input_entities], context)
5557

5658
# Check output
5759
assert output is not None
@@ -63,15 +65,15 @@ def test_files(self) -> None:
6365

6466
def test_file_entity_conversion(self) -> None:
6567
"""Test conversion from entity to file"""
66-
file_entity = Entity(uri="test.uri", values=[["test.txt"], ["Project"], []])
68+
file_entity = Entity(uri="test.uri", values=[["test.txt"], ["Local"], [], []])
6769
assert FileEntitySchema().from_entity(file_entity)
6870

69-
file_entity = Entity(uri="test.uri", values=[["test.txt"], ["Project"], [""]])
71+
file_entity = Entity(uri="test.uri", values=[["test.txt"], ["Local"], [""], [""]])
7072
assert FileEntitySchema().from_entity(file_entity)
7173

7274
with pytest.raises(ValueError, match="File 'test.txt' has unexpected type 'Wrong Type'"):
7375
FileEntitySchema().from_entity(
74-
Entity(uri="test.uri", values=[["test.txt"], ["Wrong Type"], []])
76+
Entity(uri="test.uri", values=[["test.txt"], ["Wrong Type"], [], []])
7577
)
7678

7779

0 commit comments

Comments
 (0)