33import datetime
44import json
55import logging
6+ import re
67import time
78import typing
89from pathlib import Path
910from typing import Dict , List , Optional , Union
11+ from urllib .parse import urlparse
1012
1113import requests
1214
1315from openeo .internal .documentation import openeo_endpoint
1416from openeo .internal .jupyter import VisualDict , render_component , render_error
15- from openeo .internal .warnings import deprecated , legacy_alias
17+ from openeo .internal .warnings import deprecated , legacy_alias , user_deprecation_warning
1618from openeo .rest import (
1719 DEFAULT_DOWNLOAD_CHUNK_SIZE ,
1820 DEFAULT_DOWNLOAD_RANGE_SIZE ,
@@ -393,29 +395,90 @@ class RESTJob(BatchJob):
393395 """
394396
395397
398+ FILENAME_UNSAFE_REGEX = re .compile (r"[^a-zA-Z0-9_.-]+" )
399+
400+
401+ def _sanitize_filename (s : str , replacement : str = "" ) -> str :
402+ """
403+ Sanitize a filename (strip/replace risky characters)
404+ so that it can be safely used as a filename.
405+ """
406+ s = str (s ).strip ()
407+ return FILENAME_UNSAFE_REGEX .sub (replacement , s )
408+
409+
410+ _MEDIA_TYPE_EXTENSION_MAP = {
411+ "image/tiff" : ".tiff" ,
412+ "image/tiff; application=geotiff" : ".tiff" ,
413+ "image/tiff; application=geotiff; profile=cloud-optimized" : ".tiff" ,
414+ "application/x-netcdf" : ".nc" ,
415+ "image/png" : ".png" ,
416+ "application/json" : ".json" ,
417+ "application/geo+json" : ".json" ,
418+ "text/csv" : ".csv" ,
419+ "x-gis/x-shapefile" : ".shp" ,
420+ "application/geopackage+sqlite3" : ".gpkg" ,
421+ "application/parquet; profile=geo" : ".parquet" ,
422+ "application/zip+zarr" : ".zarr" ,
423+ "application/zip" : ".zip" ,
424+ }
425+
426+
396427class ResultAsset :
397428 """
398429 Result asset of a batch job (e.g. a GeoTIFF or JSON file)
399430
400431 .. versionadded:: 0.4.10
401432 """
402433
403- def __init__ (self , job : BatchJob , name : str , href : str , metadata : dict ):
404- self .job = job
434+ __slots__ = ("job" , "key" , "href" , "media_type" , "metadata" )
405435
406- self .name = name
407- """Asset name as advertised by the backend."""
436+ def __init__ (self , job : BatchJob , key : str , href : str , metadata : dict ):
437+ # TODO: the owning job is actually only used to obtain a Connection for downloading. Eliminate BatchJob as requirement
438+ self .job = job
439+ self .key = key
408440
441+ # URL to the downloadable asset
409442 self .href = href
410- """Download URL of the asset."""
411443
444+ self .media_type = metadata .get ("type" )
445+
446+ # Asset metadata provided by the backend, possibly containing keys "roles", "title", "description".
412447 self .metadata = metadata
413- """Asset metadata provided by the backend, possibly containing keys "type" (for media type), "roles", "title", "description"."""
414448
415449 def __repr__ (self ):
416- return "<ResultAsset {n!r} (type {t}) at {h!r}>" .format (
417- n = self .name , t = self .metadata .get ("type" , "unknown" ), h = self .href
418- )
450+ return f"<ResultAsset { self .key !r} (media type { self .media_type } ) at { self .href !r} >"
451+
452+ @property
453+ def name (self ):
454+ # TODO: remove this once users got enough time to migrate
455+ user_deprecation_warning ("`ResultAsset.name` is deprecated and will be removed, use `ResultAsset.key` instead" )
456+ return self .key
457+
458+ def _make_filename (self ) -> str :
459+ """
460+ Produce a filename for downloading the asset to
461+ as fallback when user did not provide something,
462+ based on: asset key (which is not guaranteed to consist of filename-safe characters)
463+ and filename in href (if any)
464+ """
465+
466+ if re .fullmatch (r"^[a-zA-Z0-9_.-]+\.[a-zA-Z0-9]{1,10}$" , self .key ):
467+ # Legacy mode: asset key already looks like a filename
468+ return self .key
469+
470+ # Build filename from key, href's path (if any)
471+ # and guess extension from media type if necessary
472+ sanitized_key = _sanitize_filename (self .key )
473+ href_path = urlparse (str (self .href )).path
474+ href_basename = _sanitize_filename (Path (href_path ).name )
475+ filename = f"{ sanitized_key } -{ href_basename } "
476+
477+ if not re .fullmatch (r".*\.[a-zA-Z0-9]{1,10}$" , filename ):
478+ # Extension seems missing, do media type based guess (best effort)
479+ if extension := _MEDIA_TYPE_EXTENSION_MAP .get (self .media_type ):
480+ filename += extension
481+ return filename
419482
420483 def download (
421484 self ,
@@ -427,16 +490,18 @@ def download(
427490 """
428491 Download asset to given location
429492
430- :param target: download target path. Can be an existing folder
431- (in which case the filename advertised by backend will be used)
432- or full file name. By default, the working directory will be used.
493+ :param target: target path to download to.
494+ Can be a path to file, or to an existing folder
495+ (in which case the filename will be constructed
496+ in best-effort fashion, based on available metadata)
497+ By default, the working directory will be used.
433498 :param chunk_size: chunk size for streaming response.
434499 """
435500 target = Path (target or Path .cwd ())
436501 if target .is_dir ():
437- target = target / self .name
502+ target = target / self ._make_filename ()
438503 ensure_dir (target .parent )
439- logger .info ("Downloading Job result asset {n !r} from {h !s} to {t !s}" . format ( n = self . name , h = self . href , t = target ) )
504+ logger .info (f "Downloading job result asset { self . key !r} from { self . href !s} to { target !s} " )
440505 self ._download_to_file (url = self .href , target = target , chunk_size = chunk_size , range_size = range_size )
441506 return target
442507
@@ -445,7 +510,7 @@ def _get_response(self, stream=True) -> requests.Response:
445510
446511 def load_json (self ) -> dict :
447512 """Load asset in memory and parse as JSON."""
448- if not ( self .name . lower (). endswith ( ".json" ) or self . metadata . get ( "type" ) == "application/json" ) :
513+ if self .media_type not in { "application/json" , "application/geo+ json" } :
449514 logger .warning ("Asset might not be JSON" )
450515 return self ._get_response ().json ()
451516
@@ -559,50 +624,65 @@ def get_assets(self) -> List[ResultAsset]:
559624 if not assets :
560625 logger .warning ("No assets found in job result metadata." )
561626 return [
562- ResultAsset (job = self ._job , name = name , href = asset ["href" ], metadata = asset ) for name , asset in assets .items ()
627+ ResultAsset (job = self ._job , key = key , href = asset ["href" ], metadata = asset ) for key , asset in assets .items ()
563628 ]
564629
565- def get_asset (self , name : str = None ) -> ResultAsset :
630+ def get_asset (self , key : Optional [ str ] = None , * , name : Optional [ str ] = None ) -> ResultAsset :
566631 """
567- Get single asset by name or without name if there is only one.
632+ Get single asset by asset key or without key if there is only one.
568633 """
569634 # TODO: also support getting a single asset by type or role?
635+ if name :
636+ # TODO: remove this legacy `name` support when users got enough time to migrate
637+ user_deprecation_warning ("Argument `name` is deprecated and will be removed, use `key` instead." )
638+ key = name
639+ del name
640+
570641 assets = self .get_assets ()
571642 if len (assets ) == 0 :
572643 raise OpenEoClientException ("No assets in result." )
573- if name is None :
644+ if key is None :
574645 if len (assets ) == 1 :
575646 return assets [0 ]
576647 else :
577648 raise MultipleAssetException (
578- "Multiple result assets for job {j}: {a}" .format (j = self ._job .job_id , a = [a .name for a in assets ])
649+ "Multiple result assets for job {j}: {a}" .format (j = self ._job .job_id , a = [a .key for a in assets ])
579650 )
580651 else :
581652 try :
582- return next (a for a in assets if a .name == name )
653+ return next (a for a in assets if a .key == key )
583654 except StopIteration :
584- raise OpenEoClientException ("No asset {n !r} in: {a}" .format (n = name , a = [a .name for a in assets ]))
655+ raise OpenEoClientException ("No asset {k !r} in: {a}" .format (k = key , a = [a .key for a in assets ]))
585656
586657 def download_file (
587658 self ,
588- target : Union [Path , str ] = None ,
589- name : str = None ,
659+ target : Union [Path , str , None ] = None ,
660+ key : Optional [ str ] = None ,
590661 * ,
591- chunk_size = DEFAULT_DOWNLOAD_CHUNK_SIZE ,
662+ chunk_size : int = DEFAULT_DOWNLOAD_CHUNK_SIZE ,
592663 range_size : int = DEFAULT_DOWNLOAD_RANGE_SIZE ,
664+ name : Optional [str ] = None ,
593665 ) -> Path :
594666 """
595667 Download single asset. Can be used when there is only one asset in the
596- :py:class:`JobResults`, or when the desired asset name is given explicitly.
597-
598- :param target: path to download to. Can be an existing directory
599- (in which case the filename advertised by backend will be used)
600- or full file name. By default, the working directory will be used.
601- :param name: asset name to download (not required when there is only one asset)
668+ :py:class:`JobResults`, or when the desired asset key is given explicitly.
669+
670+ :param target: target path to download to.
671+ Can be a path to file, or to an existing folder
672+ (in which case the filename will be constructed
673+ in best-effort fashion, based on available metadata)
674+ By default, the working directory will be used.
675+ :param key: asset key to download (not required when there is only one asset)
602676 :return: path of downloaded asset
603677 """
678+ if name :
679+ # TODO: remove this legacy `name` support when users got enough time to migrate
680+ user_deprecation_warning ("Argument `name` is deprecated and will be removed, use `key` instead." )
681+ key = name
682+ del name
683+
604684 try :
605- return self .get_asset (name = name ).download (target = target , chunk_size = chunk_size , range_size = range_size )
685+ return self .get_asset (key = key ).download (target = target , chunk_size = chunk_size , range_size = range_size )
606686 except MultipleAssetException :
607687 raise OpenEoClientException (
608688 "Can not use `download_file` with multiple assets. Use `download_files` instead."
0 commit comments