diff --git a/.github/workflows/dev-docker.yml b/.github/workflows/dev-docker.yml index bb3f236c..cd152169 100644 --- a/.github/workflows/dev-docker.yml +++ b/.github/workflows/dev-docker.yml @@ -5,8 +5,9 @@ env: DEFAULT_IMAGE_INCREMENT: 0 DEFAULT_SERVER_REVISION: main DEFAULT_PYTHON_VERSIONS: 3.10 3.11 3.12 3.13 3.14 - DEFAULT_KHIOPS_GCS_DRIVER_REVISION: 0.0.14 - DEFAULT_KHIOPS_S3_DRIVER_REVISION: 0.0.14 + DEFAULT_KHIOPS_GCS_DRIVER_REVISION: 0.0.16 + DEFAULT_KHIOPS_S3_DRIVER_REVISION: 0.0.15 + DEFAULT_KHIOPS_AZURE_DRIVER_REVISION: 0.0.6 on: pull_request: paths: [packaging/docker/khiopspydev/Dockerfile.*, .github/workflows/dev-docker.yml] @@ -38,12 +39,16 @@ on: description: Khiops Server Revision khiops-gcs-driver-revision: type: string - default: 0.0.14 + default: 0.0.16 description: Driver version for Google Cloud Storage remote files khiops-s3-driver-revision: type: string - default: 0.0.14 + default: 0.0.15 description: Driver version for AWS-S3 remote files + khiops-azure-driver-revision: + type: string + default: 0.0.6 + description: Driver version for Azure remote files and blobs concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true @@ -67,6 +72,7 @@ jobs: echo "IMAGE_URL=ghcr.io/khiopsml/khiops-python/khiopspydev-${{ matrix.khiopsdev-os }}" >> "$GITHUB_ENV" echo "KHIOPS_GCS_DRIVER_REVISION=${{ inputs.khiops-gcs-driver-revision || env.DEFAULT_KHIOPS_GCS_DRIVER_REVISION }}" >> "$GITHUB_ENV" echo "KHIOPS_S3_DRIVER_REVISION=${{ inputs.khiops-s3-driver-revision || env.DEFAULT_KHIOPS_S3_DRIVER_REVISION }}" >> "$GITHUB_ENV" + echo "KHIOPS_AZURE_DRIVER_REVISION=${{ inputs.khiops-azure-driver-revision || env.DEFAULT_KHIOPS_AZURE_DRIVER_REVISION }}" >> "$GITHUB_ENV" - name: Checkout khiops-python sources uses: actions/checkout@v4 - name: Set up Docker Buildx @@ -105,6 +111,7 @@ jobs: "PYTHON_VERSIONS=${{ inputs.python-versions || env.DEFAULT_PYTHON_VERSIONS }}" "KHIOPS_GCS_DRIVER_REVISION=${{ env.KHIOPS_GCS_DRIVER_REVISION }}" "KHIOPS_S3_DRIVER_REVISION=${{ env.KHIOPS_S3_DRIVER_REVISION }}" + "KHIOPS_AZURE_DRIVER_REVISION=${{ env.KHIOPS_AZURE_DRIVER_REVISION }}" tags: ${{ env.DOCKER_IMAGE_TAGS }} # Push only on manual request push: ${{ inputs.push || false }} diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index b92fdc5f..0eb20e5c 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -138,14 +138,22 @@ jobs: echo "Generated AWS configuration..." cat ${GITHUB_WORKSPACE}/.aws/configuration /scripts/run_fake_remote_file_servers.sh . # launch the servers in the background + # Set environment variables for the tests with GCS GCS_BUCKET_NAME=data-test-khiops-driver-gcs/khiops_data GCS_DRIVER_LOGLEVEL=info # set to debug for diagnosis + # Set environment variables for the tests with S3 S3_DRIVER_LOGLEVEL=info # set to debug for diagnosis S3_BUCKET_NAME=s3-bucket AWS_SHARED_CREDENTIALS_FILE=${{ github.workspace }}/.aws/credentials AWS_CONFIG_FILE=${{ github.workspace }}/.aws/configuration + + # Set environment variables for the tests with Azure + AZURE_STORAGE_CONNECTION_STRING='${{ secrets.AZURE_CONNECTION_STRING }}' + CLOUD_BLOB_URI_PREFIX=${{ vars.CLOUD_BLOB_URI_PREFIX }} + CLOUD_FILE_URI_PREFIX=${{ vars.CLOUD_FILE_URI_PREFIX }} + # Persist environment variables for subsequent steps echo "GCS_BUCKET_NAME=${GCS_BUCKET_NAME}" >> "$GITHUB_ENV" echo "GCS_DRIVER_LOGLEVEL=${GCS_DRIVER_LOGLEVEL}" >> "$GITHUB_ENV" @@ -153,6 +161,9 @@ jobs: echo "S3_BUCKET_NAME=${S3_BUCKET_NAME}" >> "$GITHUB_ENV" echo "AWS_SHARED_CREDENTIALS_FILE=${AWS_SHARED_CREDENTIALS_FILE}" >> "$GITHUB_ENV" echo "AWS_CONFIG_FILE=${AWS_CONFIG_FILE}" >> "$GITHUB_ENV" + echo "AZURE_STORAGE_CONNECTION_STRING=${AZURE_STORAGE_CONNECTION_STRING}" >> "$GITHUB_ENV" + echo "CLOUD_BLOB_URI_PREFIX=${CLOUD_BLOB_URI_PREFIX}" >> "$GITHUB_ENV" + echo "CLOUD_FILE_URI_PREFIX=${CLOUD_FILE_URI_PREFIX}" >> "$GITHUB_ENV" - name: Authenticate to GCP using "Workload Identity Federation" if: env.SKIP_EXPENSIVE_TESTS != 'true' # For integration tests on GCS we use a real Google account diff --git a/CHANGELOG.md b/CHANGELOG.md index 7b3768d0..fa18a324 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ ### Added - (`sklearn`) `keep_selected_variables_only` parameter to the predictors (`KhiopsClassifier` and `KhiopsRegressor`) +- (General) Support for Azure storage ### Changed - (`core`) Rename `variable_part_dimensions` to `inner_variable_dimensions` in Coclustering results. diff --git a/khiops/core/internals/filesystems.py b/khiops/core/internals/filesystems.py index 2698844c..afa46dfc 100644 --- a/khiops/core/internals/filesystems.py +++ b/khiops/core/internals/filesystems.py @@ -38,6 +38,18 @@ except ImportError as import_error: gcs_import_error = import_error +# Import azure packages if available +# Delay an ImportError raising to an instantiation of the resource +try: + from azure.core import credentials, utils + + # pylint: disable=redefined-outer-name + from azure.storage import blob, fileshare + + azure_import_error = None +except ImportError as import_error: + azure_import_error = import_error + # pylint: enable=invalid-name ###################### @@ -76,6 +88,7 @@ def create_resource(uri_or_path): - ``file`` or empty: Local filesystem resource - ``s3``: Amazon S3 resource - ``gs``: Google Cloud Storage resource + - ``https``: Azure Storage resource (files or blobs) Returns ------- @@ -94,6 +107,14 @@ def create_resource(uri_or_path): return AmazonS3Resource(uri_or_path) elif uri_info.scheme == "gs": return GoogleCloudStorageResource(uri_or_path) + elif uri_info.scheme == "https": + # Create the corresponding instance of Azure storage resource + # based on a well-known name pattern of the uri netloc + if AzureStorageResourceMixin.is_netloc_of_file_share(uri_info.netloc): + return AzureStorageFileResource(uri_or_path) + else: + # Assume we have a netloc of a blob + return AzureStorageBlobResource(uri_or_path) elif scheme == "file": # Reject URI if authority is not empty if uri_info.netloc: @@ -503,7 +524,7 @@ def create_parent(self): class GoogleCloudStorageResource(FilesystemResource): """Google Cloud Storage Resource - By default it reads the configuration from standard location. + By default, it reads the configuration from standard location. """ def __init__(self, uri): @@ -733,3 +754,358 @@ def create_parent(self): # pylint: enable=no-member + + +class AzureStorageResourceMixin: + """Azure compatible Storage Resource Mixin + + Among all the Azure storages, Khiops supports only (via its specific driver) : + - Files + - Blobs (Binary Large Objects) + + For shared Files, the URI pattern of a resource is the following : + https://.file.core.windows.net//... + <0 to n folder name(s)>/ + The first name after the netloc is the "share name" not a simple "folder name". + + For blobs, the URI pattern of a resource is the following : + https://.blob.core.windows.net//... + <0 to n virtual folder name(s)>/ + The "virtual folder names" are part of the blob name but can help simulate + a folder hierarchy. + The first name after the netloc is the "container name" not a simple + "virtual folder name". + + By default, this resource reads the configuration from standard location + (environment variables for the moment) + """ + + def __init__(self, uri): + """ + Azure Storage Resource initializer common to Files and Blobs + """ + + # Stop initialization if Azure modules are not available + if azure_import_error is not None: + warnings.warn( + "Could not import azure modules. " + "Make sure you have installed the azure.core, " + "azure.storage.fileshare and azure.storage.blob packages to " + "access Azure Storage files." + ) + raise azure_import_error + + super().__init__(uri) + + # Create the authentication object using the connection string + connection_string = os.environ.get("AZURE_STORAGE_CONNECTION_STRING") + mappings = utils.parse_connection_string(connection_string) + creds = credentials.AzureNamedKeyCredential( + mappings.get("accountname"), mappings.get("accountkey") + ) + + # Allow the extraction of the "share name" (for file shares) + # or the "container name" (for blobs) from the URL. + # These names are always located at the first position in the path. + parts = __class__._splitall(self.uri_info.path) + + if len(parts) < 3: + # parts[0] contains '/', + # parts[1] contains either the "share name" or the "container name" + # parts[2:] contains the remaining of the path + raise ValueError( + f"Invalid Azure Resource URI '{uri}'. " + "The allowed patterns are: \n" + "- File : https://.file.core.windows.net/" + "/<0 to n folder name(s)>/\n" + "- Blob : https://.blob.core.windows.net/" + "/<0 to n virtual folder name(s)>/" + ) + + if __class__.is_netloc_of_file_share(self.uri_info.netloc): + # Warning : parts[0] contains '/' + share_url = f"{self.uri_info.scheme}://{self.uri_info.netloc}/{parts[1]}" + # Instantiate a `ShareClient` and attach it to the current instance. + # Later, instances of `ShareFileClient` or `ShareDirectoryClient` + # will be created. + self.azure_share_client = fileshare.ShareClient.from_share_url( + share_url=share_url, credential=creds + ) + else: + # For blobs : most of the time, there is no need to distinguish + # between `container_url` + # and the remaining parts of the path because the `BlobClient` knows + # how to handle with absolute paths. + # But for blobs listing only, a `ContainerClient` is needed. + # Therefore, the `container_url` must still be built. + # Warning : parts[0] contains '/' + container_url = ( + f"{self.uri_info.scheme}://" f"{self.uri_info.netloc}/{parts[1]}" + ) + + # Instantiate a `ContainerClient` and attach it to the current instance. + self.azure_blob_container_client = blob.ContainerClient.from_container_url( + container_url=container_url, credential=creds + ) + + # Instantiate a `BlobClient` and attach it to the current instance. + self.azure_blob_client = blob.BlobClient.from_blob_url( + blob_url=self.uri_info.geturl(), credential=creds + ) + + # This attribute will mostly be used in a "Shared File" context, + # for blobs the client will manage with absolute paths + # except when listing blobs of a "virtual folder" + self.relative_remaining_path = "/".join(parts[2:]) + + def create_child(self, file_name): + return create_resource(child_uri_info(self.uri_info, file_name).geturl()) + + def create_parent(self, file_name): # pylint: disable=unused-argument + return create_resource(parent_uri_info(self.uri_info).geturl()) + + @staticmethod + def _splitall(path): + """ + Build a list of path parts from a path as a str + """ + allparts = [] + while 1: + parts = os.path.split(path) + if parts[0] == path: # sentinel for absolute paths + allparts.insert(0, parts[0]) + break + elif parts[1] == path: # sentinel for relative paths + allparts.insert(0, parts[1]) + break + else: + path = parts[0] + allparts.insert(0, parts[1]) + return allparts + + @staticmethod + def is_netloc_of_file_share(netloc): + return netloc.endswith(".file.core.windows.net") + + +class AzureStorageFileResource(AzureStorageResourceMixin, FilesystemResource): + """Azure compatible Storage File Resource + + As a prerequisite, the remote storage account and "file share" on Azure + MUST have been created by the user. + + """ + + def write(self, data): + """ + Notes: + In order to be consistent with the other storage implementations, + if the parent directories are missing they are created one by one. + """ + relative_folders_hierarchy = os.path.dirname(self.relative_remaining_path) + self._create_the_whole_folders_hierarchy(relative_folders_hierarchy) + + file_share_client = self.azure_share_client.get_file_client( + self.relative_remaining_path + ) + file_share_client.upload_file(data) + + def exists(self): + """ + Check if the target resource exists (file or directory) + """ + file_share_client = self.azure_share_client.get_file_client( + self.relative_remaining_path + ) + directory_share_client = self.azure_share_client.get_file_client( + self.relative_remaining_path + ) + # Both clients are required because + # - the check against `file_share_client` is ``False`` + # if the target is a directory. + # - the check against `directory_share_client` is ``False`` + # if the target is a file. + return file_share_client.exists() or directory_share_client.exists() + + def remove(self): + """ + Remove the target resource either it is a file or a directory. + If an error occurs an exception is raised. + """ + file_share_client = self.azure_share_client.get_file_client( + self.relative_remaining_path + ) + delete_file_error = None + delete_directory_error = None + try: + file_share_client.delete_file() + # If the deletion of the file succeeds then return immediately + return + except Exception as delete_error: # pylint: disable=broad-exception-caught + delete_file_error = delete_error + + directory_share_client = self.azure_share_client.get_directory_client( + self.relative_remaining_path + ) + try: + directory_share_client.delete_directory() + # If the deletion of the directory succeeds then return immediately + return + except Exception as delete_error: # pylint: disable=broad-exception-caught + delete_directory_error = delete_error + + raise delete_directory_error or delete_file_error + + def copy_from_local(self, local_path): + """ + Notes: + In order to be consistent with the other storage implementations, + if the parent directories are missing they are created one by one. + """ + relative_folders_hierarchy = os.path.dirname(self.relative_remaining_path) + self._create_the_whole_folders_hierarchy(relative_folders_hierarchy) + + file_share_client = self.azure_share_client.get_file_client( + self.relative_remaining_path + ) + with open(local_path) as input_file: + file_share_client.upload_file(input_file.read()) + + def copy_to_local(self, local_path): + file_share_client = self.azure_share_client.get_file_client( + self.relative_remaining_path + ) + with open(local_path, "wb") as output_file: + data = file_share_client.download_file() + data.readinto(output_file) + + def list_dir(self): + """ + List the files (not the directories) of the current directory + Notes: + This is not a recursive listing operation + """ + directory_share_client = self.azure_share_client.get_directory_client( + self.relative_remaining_path + ) + return [ + item["name"] + for item in directory_share_client.list_directories_and_files() + if not item["is_directory"] + ] + + def make_dir(self): + """ + Notes: + To avoid any exception while attempting to create an existing directory, + that would occur otherwise, its existence is first checked + """ + directory_share_client = self.azure_share_client.get_directory_client( + self.relative_remaining_path + ) + if not directory_share_client.exists(): + self.azure_share_client.create_directory(self.relative_remaining_path) + + def read(self, size=None): + file_share_client = self.azure_share_client.get_file_client( + self.relative_remaining_path + ) + return file_share_client.download_file(length=size).readall() + + def _create_the_whole_folders_hierarchy(self, relative_folders_hierarchy): + """ + Create the whole folders hierarchy in checking and + creating each parent if needed + """ + parts = relative_folders_hierarchy.split("/") + current_hierarchy = "" + for current_folder in parts: + current_hierarchy += current_folder + "/" + directory_share_client = self.azure_share_client.get_directory_client( + current_hierarchy + ) + if not directory_share_client.exists(): + self.azure_share_client.create_directory(current_hierarchy) + + +class AzureStorageBlobResource(AzureStorageResourceMixin, FilesystemResource): + """Azure compatible Storage Blob Resource + + As a prerequisite, the remote storage account and the blob "container" on Azure + MUST have been created by the user. + + """ + + def read(self, size=None): + return self.azure_blob_client.download_blob(length=size).readall() + + def write(self, data): + """ + Notes: + in order to be consistent with the other drivers, + this method will overwrite the destination blob if it exists. + This is not the default behavior. + """ + self.azure_blob_client.upload_blob(data, overwrite=True) + + def exists(self): + """ + Notes: + as the virtual folder names are part of the blob name, + there is no such test like checking a virtual folder existence. + Only a blob existence is checked here. + """ + return self.azure_blob_client.exists() + + def remove(self): + """ + Notes: + as the virtual folder names are part of the blob name, + there is no such action like removing a virtual folder. + Only a blob removal is done here. + """ + self.azure_blob_client.delete_blob() + + def copy_from_local(self, local_path): + """ + Notes: + in order to be consistent with the other drivers, + this method will overwrite the destination blob if it exists. + This is not the default behavior. + Moreover, if the virtual folders hierarchy does not exist + it is created automatically as it is part of the blob name. + """ + with open(local_path) as input_file: + self.azure_blob_client.upload_blob(input_file.read(), overwrite=True) + + def copy_to_local(self, local_path): + with open(local_path, "wb") as output_file: + data = self.azure_blob_client.download_blob() + data.readinto(output_file) + + def list_dir(self): + """ + Notes: + by default, the sdk will list all the blobs belonging to the container. + An extract filter is then required to simulate a directory listing. + Moreover, the virtual folders hierarchy must be deleted from the result. + """ + len_virtual_folder_hierarchy = len(self.relative_remaining_path) + return [ + item[len_virtual_folder_hierarchy:] + for item in self.azure_blob_container_client.list_blob_names( + # Keep only the blobs belonging to the "virtual folder" + name_starts_with=self.relative_remaining_path, + ) + ] + + def make_dir(self): + warnings.warn( + "'make_dir' is a non-operation on Azure Storage for Blobs. " + "See the documentation at " + "https://learn.microsoft.com/en-us/rest/api/storageservices/" + "operations-on-containers and " + "https://learn.microsoft.com/en-us/rest/api/storageservices/" + "naming-and-referencing-containers--blobs--and-metadata#blob-names " + "(a virtual hierarchy can be created in naming blobs)" + ) diff --git a/packaging/docker/khiopspydev/Dockerfile.debian b/packaging/docker/khiopspydev/Dockerfile.debian index 0b9672b0..b4aaad61 100644 --- a/packaging/docker/khiopspydev/Dockerfile.debian +++ b/packaging/docker/khiopspydev/Dockerfile.debian @@ -49,6 +49,7 @@ RUN true \ ARG PYTHON_VERSIONS ARG KHIOPS_GCS_DRIVER_REVISION ARG KHIOPS_S3_DRIVER_REVISION +ARG KHIOPS_AZURE_DRIVER_REVISION # Install Conda packages # Use `rc` label for alpha or RC khiops-core pre-releases @@ -74,6 +75,8 @@ RUN true \ $CONDA install -y -n py${version}_conda \ khiops-driver-s3==${KHIOPS_S3_DRIVER_REVISION} \ khiops-driver-gcs==${KHIOPS_GCS_DRIVER_REVISION}; \ + # hardcoded version because the latest version is not released on conda-forge \ + khiops-driver-azure==0.0.5; \ done; \ # Install Khiops from a different major version in a dedicated conda environment \ # The python interpreter version of the base environment is used as no specific version is given \ @@ -94,9 +97,11 @@ RUN true \ # Force the installation of the debian 12 versions (bookworm) && wget -O khiops-gcs.deb https://github.com/KhiopsML/khiopsdriver-gcs/releases/download/${KHIOPS_GCS_DRIVER_REVISION}/khiops-driver-gcs_${KHIOPS_GCS_DRIVER_REVISION}-1-bookworm.amd64.deb \ && wget -O khiops-s3.deb https://github.com/KhiopsML/khiopsdriver-s3/releases/download/${KHIOPS_S3_DRIVER_REVISION}/khiops-driver-s3_${KHIOPS_S3_DRIVER_REVISION}-1-bookworm.amd64.deb \ - && (dpkg -i --force-all khiops-gcs.deb khiops-s3.deb || true) \ + # There is a typo in the latest release tag. Until it is fixed, we need to use the hardcoded folder name + && wget -O khiops-azure.deb https://github.com/KhiopsML/khiopsdriver-azure/releases/download/0.0.7/khiops-driver-azure_${KHIOPS_AZURE_DRIVER_REVISION}-1-bookworm.amd64.deb \ + && (dpkg -i --force-all khiops-gcs.deb khiops-s3.deb khiops-azure.deb || true) \ && apt-get -f -y install \ - && rm -f khiops-gcs.deb khiops-s3.deb \ + && rm -f khiops-gcs.deb khiops-s3.deb khiops-azure.deb \ && true FROM ghcr.io/khiopsml/khiops-server:${SERVER_REVISION} AS server diff --git a/packaging/docker/khiopspydev/Dockerfile.ubuntu b/packaging/docker/khiopspydev/Dockerfile.ubuntu index 48b811da..51d22018 100644 --- a/packaging/docker/khiopspydev/Dockerfile.ubuntu +++ b/packaging/docker/khiopspydev/Dockerfile.ubuntu @@ -47,6 +47,7 @@ RUN true \ ARG PYTHON_VERSIONS ARG KHIOPS_GCS_DRIVER_REVISION ARG KHIOPS_S3_DRIVER_REVISION +ARG KHIOPS_AZURE_DRIVER_REVISION # Install Conda packages # Use `rc` label for alpha or RC khiops-core pre-releases @@ -72,6 +73,8 @@ RUN true \ $CONDA install -y -n py${version}_conda \ khiops-driver-s3==${KHIOPS_S3_DRIVER_REVISION} \ khiops-driver-gcs==${KHIOPS_GCS_DRIVER_REVISION}; \ + # hardcoded version because the latest version is not released on conda-forge \ + khiops-driver-azure==0.0.5; \ done; \ # Install Khiops from a different major version in a dedicated conda environment \ # The python interpreter version of the base environment is used as no specific version is given \ @@ -91,9 +94,11 @@ RUN true \ && if [ -f /etc/os-release ]; then . /etc/os-release; fi \ && wget -O khiops-gcs.deb https://github.com/KhiopsML/khiopsdriver-gcs/releases/download/${KHIOPS_GCS_DRIVER_REVISION}/khiops-driver-gcs_${KHIOPS_GCS_DRIVER_REVISION}-1-${VERSION_CODENAME}.amd64.deb \ && wget -O khiops-s3.deb https://github.com/KhiopsML/khiopsdriver-s3/releases/download/${KHIOPS_S3_DRIVER_REVISION}/khiops-driver-s3_${KHIOPS_S3_DRIVER_REVISION}-1-${VERSION_CODENAME}.amd64.deb \ - && (dpkg -i --force-all khiops-gcs.deb khiops-s3.deb || true) \ + # There is a typo in the latest release tag. Until it is fixed, we need to use the hardcoded folder name + && wget -O khiops-azure.deb https://github.com/KhiopsML/khiopsdriver-azure/releases/download/0.0.7/khiops-driver-azure_${KHIOPS_AZURE_DRIVER_REVISION}-1-${VERSION_CODENAME}.amd64.deb \ + && (dpkg -i --force-all khiops-gcs.deb khiops-s3.deb khiops-azure.deb || true) \ && apt-get -f -y install \ - && rm -f khiops-gcs.deb khiops-s3.deb \ + && rm -f khiops-gcs.deb khiops-s3.deb khiops-azure.deb \ && true FROM ghcr.io/khiopsml/khiops-server:${SERVER_REVISION} AS server diff --git a/pyproject.toml b/pyproject.toml index b6956fbe..2ba38551 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -113,6 +113,9 @@ dependencies = [ Homepage = "https://khiops.org" [project.optional-dependencies] +# Warning : the following dependencies are also extracted for other installation tools. +# However Conda does not fully support PEP508 requirements specification, +# namely the "~=" operator. Use instead version ranges. s3 = [ # do not use the latest version, to avoid undesired breaking changes "boto3>=1.17.39,<=1.35.69", @@ -120,6 +123,14 @@ s3 = [ gcs = [ "google-cloud-storage>=1.37.0", ] +azure = [ + "azure-core>=1.39.0,<2.0.0", + "azure-storage-blob>=12.28.0,<13.0.0", + # the latest conda-forge package version of azure-storage-file-share (12.10.1) + # is far behind the one available on PyPI (12.24.0) + # lacking essential methods (`ShareFileClient.exists` was introduced only in 12.16.0 for example) + "azure-storage-file-share>=12.10.1,<13.0.0", +] [tool.setuptools.packages.find] include = ["khiops", "khiops.*"] diff --git a/tests/test_remote_access.py b/tests/test_remote_access.py index 93b4e985..7b20f0b2 100644 --- a/tests/test_remote_access.py +++ b/tests/test_remote_access.py @@ -54,6 +54,30 @@ def docker_runner_config_exists(): ) +def azure_config_for_files_exists(): + # Notes : + # `AZURE_STORAGE_CONNECTION_STRING` is a string composed of + # the storage account name and account key (amount other technical details) + # `CLOUD_FILE_URI_PREFIX` follows this pattern (for files) : + # .file.core.windows.net + return ( + "AZURE_STORAGE_CONNECTION_STRING" in os.environ + and "CLOUD_FILE_URI_PREFIX" in os.environ + ) + + +def azure_config_for_blobs_exists(): + # Notes : + # `AZURE_STORAGE_CONNECTION_STRING` is a string composed of + # the storage account name and account key (amount other technical details) + # `CLOUD_BLOB_URI_PREFIX` follows this pattern (for blobs) : + # .blob.core.windows.net + return ( + "AZURE_STORAGE_CONNECTION_STRING" in os.environ + and "CLOUD_BLOB_URI_PREFIX" in os.environ + ) + + class KhiopsRemoteAccessTestsContainer: """Container class to allow unittest.TestCase inheritance""" @@ -61,12 +85,19 @@ class KhiopsRemoteAccessTests(unittest.TestCase, KhiopsTestHelper): """Generic class to test remote filesystems and Khiops runners""" @classmethod - def init_remote_bucket(cls, bucket_name=None, proto=None): - # create the remote root_temp_dir - remote_resource = fs.create_resource( - f"{proto}://{bucket_name}/khiops-cicd/tmp" - ) - remote_resource.make_dir() + def init_remote_storage(cls, bucket_name_or_storage_prefix=None, proto=None): + # create the remote parent directories for clarity's sake + # even it is not required as all the remote storage implementations + # create the missing folders if needed + for remote_path in ( + # root_temp_dir + f"{proto}://{bucket_name_or_storage_prefix}/khiops-cicd/tmp/", + # samples dir + f"{proto}://{bucket_name_or_storage_prefix}/khiops-cicd/samples/", + ): + remote_resource = fs.create_resource(remote_path) + # This action should always be idempotent if the folder already exists + remote_resource.make_dir() # copy to /samples each file for file in ( @@ -77,13 +108,23 @@ def init_remote_bucket(cls, bucket_name=None, proto=None): "SpliceJunction/SpliceJunction.kdic", ): fs.copy_from_local( - f"{proto}://{bucket_name}/khiops-cicd/samples/{file}", + f"{proto}://{bucket_name_or_storage_prefix}/khiops-cicd/" + f"samples/{file}", os.path.join(kh.get_samples_dir(), file), ) + # symmetric call to ensure the upload was OK + try: + os.remove("/tmp/dummy") + except: # pylint: disable=broad-exception-caught + pass fs.copy_to_local( - f"{proto}://{bucket_name}/khiops-cicd/samples/{file}", "/tmp/dummy" + f"{proto}://{bucket_name_or_storage_prefix}/khiops-cicd/" + f"samples/{file}", + "/tmp/dummy", ) + # We cannot use `unittest.TestCase.assertTrue` in a class method + assert os.path.isfile("/tmp/dummy") def results_dir_root(self): """To be overridden by descendants if needed @@ -100,10 +141,6 @@ def remote_access_test_case(self): """To be overridden by descendants""" return "" - def should_skip_in_a_conda_env(self): - """To be overridden by descendants""" - return True - def print_test_title(self): print(f"\n Remote System: {self.remote_access_test_case()}") @@ -142,11 +179,6 @@ def setUp(self): KhiopsTestHelper.skip_expensive_test(self) self.skip_if_no_config() - if self.is_in_a_conda_env() and self.should_skip_in_a_conda_env(): - self.skipTest( - f"Remote test case {self.remote_access_test_case()} " - "in a conda environment is currently skipped" - ) self.print_test_title() # Save the runner that can be modified by the test @@ -293,7 +325,9 @@ def setUpClass(cls): runner = kh.get_runner() bucket_name = os.environ["S3_BUCKET_NAME"] - cls.init_remote_bucket(bucket_name=bucket_name, proto="s3") + cls.init_remote_storage( + bucket_name_or_storage_prefix=bucket_name, proto="s3" + ) runner.samples_dir = f"s3://{bucket_name}/khiops-cicd/samples" resources_directory = KhiopsTestHelper.get_resources_dir() @@ -312,11 +346,6 @@ def tearDownClass(cls): if s3_config_exists(): kh.get_runner().__init__() - def should_skip_in_a_conda_env(self): - # The S3 driver is now released for conda too. - # No need to skip the tests any longer in a conda environment - return False - def config_exists(self): return s3_config_exists() @@ -336,7 +365,9 @@ def setUpClass(cls): runner = kh.get_runner() bucket_name = os.environ["GCS_BUCKET_NAME"] - cls.init_remote_bucket(bucket_name=bucket_name, proto="gs") + cls.init_remote_storage( + bucket_name_or_storage_prefix=bucket_name, proto="gs" + ) runner.samples_dir = f"gs://{bucket_name}/khiops-cicd/samples" resources_directory = KhiopsTestHelper.get_resources_dir() @@ -355,11 +386,6 @@ def tearDownClass(cls): if gcs_config_exists(): kh.get_runner().__init__() - def should_skip_in_a_conda_env(self): - # The GCS driver is now released for conda too. - # No need to skip the tests any longer in a conda environment - return False - def config_exists(self): return gcs_config_exists() @@ -367,6 +393,86 @@ def remote_access_test_case(self): return "GCS" +class KhiopsAzureRemoteFileTests( + KhiopsRemoteAccessTestsContainer.KhiopsRemoteAccessTests +): + """Integration tests with Azure Storage filesystems""" + + @classmethod + def setUpClass(cls): + """Sets up remote directories in runner""" + if azure_config_for_files_exists(): + runner = kh.get_runner() + cloud_file_uri_prefix = os.environ["CLOUD_FILE_URI_PREFIX"] + + cls.init_remote_storage( + bucket_name_or_storage_prefix=cloud_file_uri_prefix, proto="https" + ) + + runner.samples_dir = f"https://{cloud_file_uri_prefix}/khiops-cicd/samples" + resources_directory = KhiopsTestHelper.get_resources_dir() + + # WARNING : khiops temp files cannot be remote + cls._khiops_temp_dir = f"{resources_directory}/tmp/khiops-cicd" + + # root_temp_dir + # (where the log file is saved by default when using `kh`) + # can be remote + runner.root_temp_dir = f"https://{cloud_file_uri_prefix}/khiops-cicd/tmp" + + @classmethod + def tearDownClass(cls): + """Sets back the runner defaults""" + if azure_config_for_files_exists(): + kh.get_runner().__init__() + + def config_exists(self): + return azure_config_for_files_exists() + + def remote_access_test_case(self): + return "Azure Files" + + +class KhiopsAzureRemoteBlobTests( + KhiopsRemoteAccessTestsContainer.KhiopsRemoteAccessTests +): + """Integration tests with Azure Storage filesystems : Blobs""" + + @classmethod + def setUpClass(cls): + """Sets up remote directories in runner""" + if azure_config_for_blobs_exists(): + runner = kh.get_runner() + cloud_file_uri_prefix = os.environ["CLOUD_BLOB_URI_PREFIX"] + + cls.init_remote_storage( + bucket_name_or_storage_prefix=cloud_file_uri_prefix, proto="https" + ) + + runner.samples_dir = f"https://{cloud_file_uri_prefix}/khiops-cicd/samples" + resources_directory = KhiopsTestHelper.get_resources_dir() + + # WARNING : khiops temp files cannot be remote + cls._khiops_temp_dir = f"{resources_directory}/tmp/khiops-cicd" + + # root_temp_dir + # (where the log file is saved by default when using `kh`) + # can be remote + runner.root_temp_dir = f"https://{cloud_file_uri_prefix}/khiops-cicd/tmp" + + @classmethod + def tearDownClass(cls): + """Sets back the runner defaults""" + if azure_config_for_blobs_exists(): + kh.get_runner().__init__() + + def config_exists(self): + return azure_config_for_blobs_exists() + + def remote_access_test_case(self): + return "Azure Blobs" + + class KhiopsDockerRunnerTests(KhiopsRemoteAccessTestsContainer.KhiopsRemoteAccessTests): """Integration tests with the Docker runner service""" @@ -462,11 +568,6 @@ def tearDownClass(cls): def config_exists(self): return docker_runner_config_exists() - def should_skip_in_a_conda_env(self): - # Tests using a docker runner should never be skipped - # even in a conda environment - return False - def remote_access_test_case(self): return "KhiopsDockerRunner"