Skip to content

Commit d565fb9

Browse files
committed
Eliminate assumption that asset keys are usable as filename #820
1 parent c6825eb commit d565fb9

5 files changed

Lines changed: 239 additions & 77 deletions

File tree

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
88
## [Unreleased]
99

1010
### Added
11+
1112
- `DataCube.resample_spatial()` now supports parameterized `resolution` and `projection` arguments. ([#897](https://github.com/Open-EO/openeo-python-client/issues/897))
13+
- Sanitize asset download filenames, instead of blindly using the asset key as filename. ([#820](https://github.com/Open-EO/openeo-python-client/issues/820))
1214

1315
### Changed
1416

docs/batch_jobs.rst

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -378,11 +378,11 @@ If you know that there is just a single result file, you can also download it di
378378
This will fail however if there are multiple assets in the job result
379379
(like in the metadata example above).
380380
In that case you can still download a single by specifying which one you
381-
want to download with the ``name`` argument:
381+
want to download with the ``key`` argument:
382382
383383
.. code-block:: python
384384
385-
results.download_file("data/out/result.tiff", name="res002.tiff")
385+
results.download_file("data/out/result.tiff", key="res002.tiff")
386386
387387
388388
Fine-grained asset downloads
@@ -396,9 +396,14 @@ with :py:meth:`~openeo.rest.job.ResultAsset.download`, like this:
396396
.. code-block:: python
397397
398398
for asset in results.get_assets():
399-
if asset.metadata["type"].startswith("image/tiff"):
400-
asset.download("data/out/result-v2-" + asset.name)
399+
if asset.media_type.startswith("image/tiff"):
400+
asset.download("data/out/result-v2-" + asset.key)
401401
402+
.. note::
403+
The ``key`` of an asset is not guaranteed to be directly usable as filename safe,
404+
so while the snippet above glosses over that aspect,
405+
make sure to properly sanitize the filename or
406+
at least check the behavior of the backend you are working with.
402407
403408
Directly load batch job results
404409
===============================

openeo/internal/warnings.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,13 @@ class UserDeprecationWarning(Warning):
2727
pass
2828

2929

30+
def user_deprecation_warning(message: str, stacklevel: int = 3):
31+
"""
32+
Helper to compactly raise a `UserDeprecationWarning` with proper stacklevel.
33+
"""
34+
warnings.warn(message, category=UserDeprecationWarning, stacklevel=stacklevel)
35+
36+
3037
def test_warnings(stacklevel=1):
3138
"""Trigger some warnings (for test contexts)."""
3239
for warning in [UserWarning, DeprecationWarning, UserDeprecationWarning]:

openeo/rest/job.py

Lines changed: 113 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,18 @@
33
import datetime
44
import json
55
import logging
6+
import re
67
import time
78
import typing
89
from pathlib import Path
910
from typing import Dict, List, Optional, Union
11+
from urllib.parse import urlparse
1012

1113
import requests
1214

1315
from openeo.internal.documentation import openeo_endpoint
1416
from openeo.internal.jupyter import VisualDict, render_component, render_error
15-
from openeo.internal.warnings import deprecated, legacy_alias
17+
from openeo.internal.warnings import deprecated, legacy_alias, user_deprecation_warning
1618
from openeo.rest import (
1719
DEFAULT_DOWNLOAD_CHUNK_SIZE,
1820
DEFAULT_DOWNLOAD_RANGE_SIZE,
@@ -393,29 +395,90 @@ class RESTJob(BatchJob):
393395
"""
394396

395397

398+
FILENAME_UNSAFE_REGEX = re.compile(r"[^a-zA-Z0-9_.-]+")
399+
400+
401+
def _sanitize_filename(s: str, replacement: str = "") -> str:
402+
"""
403+
Sanitize a filename (strip/replace risky characters)
404+
so that it can be safely used as a filename.
405+
"""
406+
s = str(s).strip()
407+
return FILENAME_UNSAFE_REGEX.sub(replacement, s)
408+
409+
410+
_MEDIA_TYPE_EXTENSION_MAP = {
411+
"image/tiff": ".tiff",
412+
"image/tiff; application=geotiff": ".tiff",
413+
"image/tiff; application=geotiff; profile=cloud-optimized": ".tiff",
414+
"application/x-netcdf": ".nc",
415+
"image/png": ".png",
416+
"application/json": ".json",
417+
"application/geo+json": ".json",
418+
"text/csv": ".csv",
419+
"x-gis/x-shapefile": ".shp",
420+
"application/geopackage+sqlite3": ".gpkg",
421+
"application/parquet; profile=geo": ".parquet",
422+
"application/zip+zarr": ".zarr",
423+
"application/zip": ".zip",
424+
}
425+
426+
396427
class ResultAsset:
397428
"""
398429
Result asset of a batch job (e.g. a GeoTIFF or JSON file)
399430
400431
.. versionadded:: 0.4.10
401432
"""
402433

403-
def __init__(self, job: BatchJob, name: str, href: str, metadata: dict):
404-
self.job = job
434+
__slots__ = ("job", "key", "href", "media_type", "metadata")
405435

406-
self.name = name
407-
"""Asset name as advertised by the backend."""
436+
def __init__(self, job: BatchJob, key: str, href: str, metadata: dict):
437+
# TODO: the owning job is actually only used to obtain a Connection for downloading. Eliminate BatchJob as requirement
438+
self.job = job
439+
self.key = key
408440

441+
# URL to the downloadable asset
409442
self.href = href
410-
"""Download URL of the asset."""
411443

444+
self.media_type = metadata.get("type")
445+
446+
# Asset metadata provided by the backend, possibly containing keys "roles", "title", "description".
412447
self.metadata = metadata
413-
"""Asset metadata provided by the backend, possibly containing keys "type" (for media type), "roles", "title", "description"."""
414448

415449
def __repr__(self):
416-
return "<ResultAsset {n!r} (type {t}) at {h!r}>".format(
417-
n=self.name, t=self.metadata.get("type", "unknown"), h=self.href
418-
)
450+
return f"<ResultAsset {self.key!r} (media type {self.media_type}) at {self.href!r}>"
451+
452+
@property
453+
def name(self):
454+
# TODO: remove this once users got enough time to migrate
455+
user_deprecation_warning("`ResultAsset.name` is deprecated and will be removed, use `ResultAsset.key` instead")
456+
return self.key
457+
458+
def _make_filename(self) -> str:
459+
"""
460+
Produce a filename for downloading the asset to
461+
as fallback when user did not provide something,
462+
based on: asset key (which is not guaranteed to consist of filename-safe characters)
463+
and filename in href (if any)
464+
"""
465+
466+
if re.fullmatch(r"^[a-zA-Z0-9_.-]+\.[a-zA-Z0-9]{1,10}$", self.key):
467+
# Legacy mode: asset key already looks like a filename
468+
return self.key
469+
470+
# Build filename from key, href's path (if any)
471+
# and guess extension from media type if necessary
472+
sanitized_key = _sanitize_filename(self.key)
473+
href_path = urlparse(str(self.href)).path
474+
href_basename = _sanitize_filename(Path(href_path).name)
475+
filename = f"{sanitized_key}-{href_basename}"
476+
477+
if not re.fullmatch(r".*\.[a-zA-Z0-9]{1,10}$", filename):
478+
# Extension seems missing, do media type based guess (best effort)
479+
if extension := _MEDIA_TYPE_EXTENSION_MAP.get(self.media_type):
480+
filename += extension
481+
return filename
419482

420483
def download(
421484
self,
@@ -427,16 +490,18 @@ def download(
427490
"""
428491
Download asset to given location
429492
430-
:param target: download target path. Can be an existing folder
431-
(in which case the filename advertised by backend will be used)
432-
or full file name. By default, the working directory will be used.
493+
:param target: target path to download to.
494+
Can be a path to file, or to an existing folder
495+
(in which case the filename will be constructed
496+
in best-effort fashion, based on available metadata)
497+
By default, the working directory will be used.
433498
:param chunk_size: chunk size for streaming response.
434499
"""
435500
target = Path(target or Path.cwd())
436501
if target.is_dir():
437-
target = target / self.name
502+
target = target / self._make_filename()
438503
ensure_dir(target.parent)
439-
logger.info("Downloading Job result asset {n!r} from {h!s} to {t!s}".format(n=self.name, h=self.href, t=target))
504+
logger.info(f"Downloading job result asset {self.key!r} from {self.href!s} to {target!s}")
440505
self._download_to_file(url=self.href, target=target, chunk_size=chunk_size, range_size=range_size)
441506
return target
442507

@@ -445,7 +510,7 @@ def _get_response(self, stream=True) -> requests.Response:
445510

446511
def load_json(self) -> dict:
447512
"""Load asset in memory and parse as JSON."""
448-
if not (self.name.lower().endswith(".json") or self.metadata.get("type") == "application/json"):
513+
if self.media_type not in {"application/json", "application/geo+json"}:
449514
logger.warning("Asset might not be JSON")
450515
return self._get_response().json()
451516

@@ -559,50 +624,65 @@ def get_assets(self) -> List[ResultAsset]:
559624
if not assets:
560625
logger.warning("No assets found in job result metadata.")
561626
return [
562-
ResultAsset(job=self._job, name=name, href=asset["href"], metadata=asset) for name, asset in assets.items()
627+
ResultAsset(job=self._job, key=key, href=asset["href"], metadata=asset) for key, asset in assets.items()
563628
]
564629

565-
def get_asset(self, name: str = None) -> ResultAsset:
630+
def get_asset(self, key: Optional[str] = None, *, name: Optional[str] = None) -> ResultAsset:
566631
"""
567-
Get single asset by name or without name if there is only one.
632+
Get single asset by asset key or without key if there is only one.
568633
"""
569634
# TODO: also support getting a single asset by type or role?
635+
if name:
636+
# TODO: remove this legacy `name` support when users got enough time to migrate
637+
user_deprecation_warning("Argument `name` is deprecated and will be removed, use `key` instead.")
638+
key = name
639+
del name
640+
570641
assets = self.get_assets()
571642
if len(assets) == 0:
572643
raise OpenEoClientException("No assets in result.")
573-
if name is None:
644+
if key is None:
574645
if len(assets) == 1:
575646
return assets[0]
576647
else:
577648
raise MultipleAssetException(
578-
"Multiple result assets for job {j}: {a}".format(j=self._job.job_id, a=[a.name for a in assets])
649+
"Multiple result assets for job {j}: {a}".format(j=self._job.job_id, a=[a.key for a in assets])
579650
)
580651
else:
581652
try:
582-
return next(a for a in assets if a.name == name)
653+
return next(a for a in assets if a.key == key)
583654
except StopIteration:
584-
raise OpenEoClientException("No asset {n!r} in: {a}".format(n=name, a=[a.name for a in assets]))
655+
raise OpenEoClientException("No asset {k!r} in: {a}".format(k=key, a=[a.key for a in assets]))
585656

586657
def download_file(
587658
self,
588-
target: Union[Path, str] = None,
589-
name: str = None,
659+
target: Union[Path, str, None] = None,
660+
key: Optional[str] = None,
590661
*,
591-
chunk_size=DEFAULT_DOWNLOAD_CHUNK_SIZE,
662+
chunk_size: int = DEFAULT_DOWNLOAD_CHUNK_SIZE,
592663
range_size: int = DEFAULT_DOWNLOAD_RANGE_SIZE,
664+
name: Optional[str] = None,
593665
) -> Path:
594666
"""
595667
Download single asset. Can be used when there is only one asset in the
596-
:py:class:`JobResults`, or when the desired asset name is given explicitly.
597-
598-
:param target: path to download to. Can be an existing directory
599-
(in which case the filename advertised by backend will be used)
600-
or full file name. By default, the working directory will be used.
601-
:param name: asset name to download (not required when there is only one asset)
668+
:py:class:`JobResults`, or when the desired asset key is given explicitly.
669+
670+
:param target: target path to download to.
671+
Can be a path to file, or to an existing folder
672+
(in which case the filename will be constructed
673+
in best-effort fashion, based on available metadata)
674+
By default, the working directory will be used.
675+
:param key: asset key to download (not required when there is only one asset)
602676
:return: path of downloaded asset
603677
"""
678+
if name:
679+
# TODO: remove this legacy `name` support when users got enough time to migrate
680+
user_deprecation_warning("Argument `name` is deprecated and will be removed, use `key` instead.")
681+
key = name
682+
del name
683+
604684
try:
605-
return self.get_asset(name=name).download(target=target, chunk_size=chunk_size, range_size=range_size)
685+
return self.get_asset(key=key).download(target=target, chunk_size=chunk_size, range_size=range_size)
606686
except MultipleAssetException:
607687
raise OpenEoClientException(
608688
"Can not use `download_file` with multiple assets. Use `download_files` instead."

0 commit comments

Comments
 (0)