Skip to content

Commit be97119

Browse files
author
Thierry RAMORASOAVINA
committed
Support for Azure storage
- Among all the Azure storage services, Khiops supports (via its specific driver) only "Files" and "Blobs" (Binary Large Objects). - The only supported authentication method is currently the `AZURE_STORAGE_CONNECTION_STRING`, which embeds the account name and account key.
1 parent 17439c1 commit be97119

File tree

4 files changed

+476
-35
lines changed

4 files changed

+476
-35
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
### Added
1212
- (`sklearn`) `keep_selected_variables_only` parameter to the predictors (`KhiopsClassifier` and `KhiopsRegressor`)
13+
- (General) Support for Azure storage
1314

1415
### Changed
1516
- (`core`) Rename `variable_part_dimensions` to `inner_variable_dimensions` in Coclustering results.

khiops/core/internals/filesystems.py

Lines changed: 332 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,18 @@
3838
except ImportError as import_error:
3939
gcs_import_error = import_error
4040

41+
# Import azure packages if available
42+
# Delay an ImportError raising to an instantiation of the resource
43+
try:
44+
from azure.core import credentials, utils
45+
46+
# pylint: disable=redefined-outer-name,unused-import
47+
from azure.storage import blob, fileshare
48+
49+
azure_import_error = None
50+
except ImportError as import_error:
51+
azure_import_error = import_error
52+
4153
# pylint: enable=invalid-name
4254

4355
######################
@@ -76,6 +88,7 @@ def create_resource(uri_or_path):
7688
- ``file`` or empty: Local filesystem resource
7789
- ``s3``: Amazon S3 resource
7890
- ``gs``: Google Cloud Storage resource
91+
- ``https``: Azure Storage resource (files or blobs)
7992
8093
Returns
8194
-------
@@ -94,6 +107,14 @@ def create_resource(uri_or_path):
94107
return AmazonS3Resource(uri_or_path)
95108
elif uri_info.scheme == "gs":
96109
return GoogleCloudStorageResource(uri_or_path)
110+
elif uri_info.scheme == "https":
111+
# Create the corresponding instance of Azure storage resource
112+
# based on the well-known name pattern
113+
if AzureStorageResourceMixin.is_netloc_of_file_share(uri_info.netloc):
114+
return AzureStorageFileResource(uri_or_path)
115+
else:
116+
# Assume we have a netloc of a blob
117+
return AzureStorageBlobResource(uri_or_path)
97118
elif scheme == "file":
98119
# Reject URI if authority is not empty
99120
if uri_info.netloc:
@@ -503,7 +524,7 @@ def create_parent(self):
503524
class GoogleCloudStorageResource(FilesystemResource):
504525
"""Google Cloud Storage Resource
505526
506-
By default it reads the configuration from standard location.
527+
By default, it reads the configuration from standard location.
507528
"""
508529

509530
def __init__(self, uri):
@@ -733,3 +754,313 @@ def create_parent(self):
733754

734755

735756
# pylint: enable=no-member
757+
758+
759+
class AzureStorageResourceMixin:
    """Azure compatible Storage Resource Mixin

    Among all the Azure storages, Khiops supports only (via its specific driver):

    - Files
    - Blobs (Binary Large Objects)

    For shared Files, the URI pattern of a resource is the following::

        https://<storage-account-name>.file.core.windows.net/<share name>/<folders>/<file name>

    One may confuse the "share name" with a "folder name"
    because nothing helps distinguish one from the other.

    For blobs, the URI pattern of a resource is the following::

        https://<storage-account-name>.blob.core.windows.net/<container name>/<folders>/<file name>

    One may confuse the "container name" with a "folder name"
    because nothing helps distinguish one from the other.

    By default, this resource reads the configuration from a standard location
    (environment variables for the moment).
    """

    def __init__(self, uri):
        """Azure Storage Resource initializer common to Files and Blobs

        Parameters
        ----------
        uri : str
            URI of the remote resource (file share or blob).

        Raises
        ------
        ImportError
            If the azure packages are not installed.
        ValueError
            If the ``AZURE_STORAGE_CONNECTION_STRING`` environment variable is
            not set.
        """
        # Stop initialization if Azure modules are not available
        if azure_import_error is not None:
            warnings.warn(
                "Could not import azure modules. "
                "Make sure you have installed the azure.core and "
                "azure.storage.fileshare packages to "
                "access Azure Storage files."
            )
            raise azure_import_error

        super().__init__(uri)

        # Create the authentication object using the connection string
        # Fail early with a clear message if the connection string is missing;
        # otherwise parse_connection_string(None) fails with a cryptic error
        connection_string = os.environ.get("AZURE_STORAGE_CONNECTION_STRING")
        if not connection_string:
            raise ValueError(
                "The AZURE_STORAGE_CONNECTION_STRING environment variable "
                "must be set to access Azure Storage resources"
            )
        mappings = utils.parse_connection_string(connection_string)
        creds = credentials.AzureNamedKeyCredential(
            mappings.get("accountname"), mappings.get("accountkey")
        )

        # Allow the extraction from the URL, the "share name" (for file shares)
        # or the "container name" (for blobs)
        # assuming it is the first part of the path.
        # Warning: parts[0] contains '/'
        parts = self._splitall(self.uri_info.path)

        if self.is_netloc_of_file_share(self.uri_info.netloc):
            share_url = f"{self.uri_info.scheme}://{self.uri_info.netloc}/{parts[1]}"
            # Instantiate a ShareClient and attach it to the current instance.
            # Later, instances of ShareFileClient or ShareDirectoryClient
            # will be created.
            self.azure_share_client = fileshare.ShareClient.from_share_url(
                share_url=share_url, credential=creds
            )
        else:
            # For blobs: most of the time, there is no need to distinguish
            # between `container_url` and the remaining parts of the path.
            # The `BlobClient` knows how to handle the absolute path.
            # But for blob listing, there is an exception
            # as we need a `ContainerClient`
            container_url = (
                f"{self.uri_info.scheme}://" f"{self.uri_info.netloc}/{parts[1]}"
            )

            # Instantiate a `ContainerClient` and attach it to the current instance.
            self.azure_blob_container_client = blob.ContainerClient.from_container_url(
                container_url=container_url, credential=creds
            )

            # Instantiate a `BlobClient` and attach it to the current instance.
            self.azure_blob_client = blob.BlobClient.from_blob_url(
                blob_url=self.uri_info.geturl(), credential=creds
            )

        # This attribute will be used in a "Shared File" only,
        # for blobs the client will manage with absolute paths
        self.relative_remaining_path = "/".join(parts[2:])

    @staticmethod
    def _splitall(path):
        """Build a list of path parts from a path given as a str"""
        allparts = []
        while True:
            parts = os.path.split(path)
            if parts[0] == path:  # sentinel for absolute paths
                allparts.insert(0, parts[0])
                break
            elif parts[1] == path:  # sentinel for relative paths
                allparts.insert(0, parts[1])
                break
            else:
                path = parts[0]
                allparts.insert(0, parts[1])
        return allparts

    @staticmethod
    def is_netloc_of_file_share(netloc):
        """Check whether a URI netloc belongs to an Azure file share"""
        return netloc.endswith(".file.core.windows.net")
867+
868+
869+
class AzureStorageFileResource(AzureStorageResourceMixin, FilesystemResource):
    """Azure compatible Storage File Resource

    The remote "file share" on Azure MUST exist before any call.
    """

    def write(self, data):
        """Write data to the remote file

        Notes
        -----
        FIXME: In order to be consistent with all the other filesystems,
        if the directory hierarchy does not exist, it must be created.
        """
        file_share_client = self.azure_share_client.get_file_client(
            self.relative_remaining_path
        )
        file_share_client.upload_file(data)

    def exists(self):
        """Check if the target resource exists (file or directory)"""
        file_share_client = self.azure_share_client.get_file_client(
            self.relative_remaining_path
        )
        # Bug fix: the directory check requires a directory client
        # (the original code created a second *file* client here)
        directory_share_client = self.azure_share_client.get_directory_client(
            self.relative_remaining_path
        )
        # Both clients are required because
        # - the check against `file_share_client` is ``False``
        #   if the target is a directory.
        # - the check against `directory_share_client` is ``False``
        #   if the target is a file.
        return file_share_client.exists() or directory_share_client.exists()

    def remove(self):
        """Remove the target resource whether it is a file or a directory

        If neither deletion succeeds, the first deletion error is raised.
        """
        file_share_client = self.azure_share_client.get_file_client(
            self.relative_remaining_path
        )
        delete_file_error = None
        delete_directory_error = None
        try:
            file_share_client.delete_file()
            # If the deletion of the file succeeds then return immediately
            return
        except Exception as delete_error:  # pylint: disable=broad-exception-caught
            delete_file_error = delete_error

        directory_share_client = self.azure_share_client.get_directory_client(
            self.relative_remaining_path
        )
        try:
            directory_share_client.delete_directory()
            # If the deletion of the directory succeeds then return immediately
            return
        except Exception as delete_error:  # pylint: disable=broad-exception-caught
            delete_directory_error = delete_error

        raise delete_file_error or delete_directory_error

    def copy_from_local(self, local_path):
        """Upload a local file to the remote file

        Notes
        -----
        Compared to other storages implementations, if the parent directories
        are missing they are NOT created. The caller must ensure the whole
        hierarchy exists, otherwise an error will occur.
        """
        file_share_client = self.azure_share_client.get_file_client(
            self.relative_remaining_path
        )
        # Bug fix: read in binary mode so non-text files upload intact
        with open(local_path, "rb") as input_file:
            file_share_client.upload_file(input_file.read())

    def copy_to_local(self, local_path):
        """Download the remote file to a local path"""
        file_share_client = self.azure_share_client.get_file_client(
            self.relative_remaining_path
        )
        # Bug fix: binary mode does not accept an `encoding` argument
        # (open(..., "wb", encoding="utf8") raises ValueError)
        with open(local_path, "wb") as output_file:
            data = file_share_client.download_file()
            data.readinto(output_file)

    def list_dir(self):
        """List the files (not the directories) of the current directory

        Notes
        -----
        This is not a recursive listing operation.
        """
        directory_share_client = self.azure_share_client.get_directory_client(
            self.relative_remaining_path
        )
        return [
            item["name"]
            for item in directory_share_client.list_directories_and_files()
            if not item["is_directory"]
        ]

    def make_dir(self):
        """Create the remote directory if it does not already exist

        Notes
        -----
        To avoid the exception that would otherwise occur while attempting to
        create an existing directory, its existence is first checked.
        """
        directory_share_client = self.azure_share_client.get_directory_client(
            self.relative_remaining_path
        )
        if not directory_share_client.exists():
            self.azure_share_client.create_directory(self.relative_remaining_path)

    def read(self, size=None):
        """Read at most `size` bytes of the remote file (all of it when None)"""
        file_share_client = self.azure_share_client.get_file_client(
            self.relative_remaining_path
        )
        return file_share_client.download_file(length=size).readall()

    def create_child(self, file_name):
        """Create a resource for a child of this (directory) resource"""
        return create_resource(child_uri_info(self.uri_info, file_name).geturl())

    def create_parent(self):
        """Create a resource for the parent of this resource"""
        return create_resource(parent_uri_info(self.uri_info).geturl())
994+
995+
996+
class AzureStorageBlobResource(AzureStorageResourceMixin, FilesystemResource):
    """Azure compatible Storage Blob Resource

    The remote "container" on Azure MUST exist before any call.
    """

    def read(self, size=None):
        """Read at most `size` bytes of the blob (all of it when None)"""
        return self.azure_blob_client.download_blob(length=size).readall()

    def write(self, data):
        """Write data to the blob

        Notes
        -----
        In order to be consistent with the other drivers, this method
        overwrites the destination blob if it exists. This is not the default
        behavior of the SDK.
        """
        self.azure_blob_client.upload_blob(data, overwrite=True)

    def exists(self):
        """Check if the blob exists"""
        return self.azure_blob_client.exists()

    def remove(self):
        """Delete the blob; an exception is raised on failure"""
        self.azure_blob_client.delete_blob()

    def copy_from_local(self, local_path):
        """Upload a local file to the blob

        Notes
        -----
        In order to be consistent with the other drivers, this method
        overwrites the destination blob if it exists. This is not the default
        behavior of the SDK.
        """
        # Bug fix: read in binary mode so non-text files upload intact
        with open(local_path, "rb") as input_file:
            self.azure_blob_client.upload_blob(input_file.read(), overwrite=True)

    def copy_to_local(self, local_path):
        """Download the blob to a local path"""
        # Bug fix: binary mode does not accept an `encoding` argument
        # (open(..., "wb", encoding="utf8") raises ValueError)
        with open(local_path, "wb") as output_file:
            data = self.azure_blob_client.download_blob()
            data.readinto(output_file)

    def list_dir(self):
        """List the blobs belonging to the current "virtual directory"

        Notes
        -----
        By default, the SDK lists all the blobs belonging to the container.
        An extra filter is then required to simulate a directory listing.
        FIXME: suppress the virtual parent directory.
        """
        return list(
            self.azure_blob_container_client.list_blob_names(
                # Keep only the blobs belonging to the "virtual directory"
                name_starts_with=self.relative_remaining_path,
            )
        )

    def make_dir(self):
        """Non-operation: blob storage has no real directory hierarchy"""
        warnings.warn(
            "'make_dir' is a non-operation on Azure Storage for Blobs. "
            "See the documentation at "
            "https://learn.microsoft.com/en-us/rest/api/storageservices/"
            "operations-on-containers and "
            "https://learn.microsoft.com/en-us/rest/api/storageservices/"
            "naming-and-referencing-containers--blobs--and-metadata#blob-names "
            "(a virtual hierarchy can be created in naming blobs)"
        )

    def create_child(self, file_name):
        """Create a resource for a child of this (virtual directory) resource"""
        return create_resource(child_uri_info(self.uri_info, file_name).geturl())

    def create_parent(self):
        """Create a resource for the parent of this resource

        Bug fix: dropped the spurious, unused `file_name` parameter so the
        signature matches the `create_parent(self)` interface of the other
        filesystem resources.
        """
        return create_resource(parent_uri_info(self.uri_info).geturl())

pyproject.toml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,13 +113,20 @@ dependencies = [
113113
Homepage = "https://khiops.org"
114114

115115
[project.optional-dependencies]
116+
# Warning : Conda does not fully support PEP508 requirements specification
117+
# namely the "~=" operator. Use version ranges instead.
116118
s3 = [
117119
# do not use the latest version, to avoid undesired breaking changes
118120
"boto3>=1.17.39,<=1.35.69",
119121
]
120122
gcs = [
121123
"google-cloud-storage>=1.37.0",
122124
]
125+
azure = [
126+
"azure-core>=1.39.0,<2.0.0",
127+
"azure-storage-file-share>=12.10.1,<13.0.0",
128+
"azure-storage-blob>=12.28.0,<13.0.0",
129+
]
123130

124131
[tool.setuptools.packages.find]
125132
include = ["khiops", "khiops.*"]

0 commit comments

Comments
 (0)