Skip to content

Commit d4a4ad2

Browse files
authored
Merge pull request #295 from Duke-GCB/simplify-downloading
simplify downloading
2 parents 43d8b05 + 90bfcab commit d4a4ad2

16 files changed

Lines changed: 1167 additions & 1514 deletions

ddsc/config.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
FILE_EXCLUDE_REGEX_DEFAULT = '^\.DS_Store$|^\.ddsclient$|^\.\_'
2525
MAX_DEFAULT_WORKERS = 8
2626
GET_PAGE_SIZE_DEFAULT = 100 # fetch 100 items per page
27+
DEFAULT_FILE_DOWNLOAD_RETRIES = 5
2728

2829

2930
def get_user_config_filename():
@@ -74,6 +75,7 @@ class Config(object):
7475
FILE_EXCLUDE_REGEX = 'file_exclude_regex' # allows customization of which filenames will be uploaded
7576
GET_PAGE_SIZE = 'get_page_size' # page size used for GET pagination requests
7677
STORAGE_PROVIDER_ID = 'storage_provider_id' # setting to override the default storage provider
78+
FILE_DOWNLOAD_RETRIES = 'file_download_retries' # number of times to retry a failed file download
7779

7880
def __init__(self):
7981
self.values = {}
@@ -160,8 +162,7 @@ def download_workers(self):
160162
Return the number of parallel works to use when downloading a file.
161163
:return: int number of workers. Specify None or 1 to disable parallel downloading
162164
"""
163-
# Profiling download on different servers showed half the number of CPUs to be optimum for speed.
164-
default_workers = int(math.ceil(default_num_workers() / 2))
165+
default_workers = int(math.ceil(default_num_workers()))
165166
return self.values.get(Config.DOWNLOAD_WORKERS, default_workers)
166167

167168
@property
@@ -224,3 +225,11 @@ def storage_provider_id(self):
224225
:return: str: uuid of storage provider
225226
"""
226227
return self.values.get(Config.STORAGE_PROVIDER_ID, None)
228+
229+
@property
230+
def file_download_retries(self):
231+
"""
232+
Returns number of times to retry failed external file downloads
233+
:return: int: number of retries allowed before failure
234+
"""
235+
return self.values.get(Config.FILE_DOWNLOAD_RETRIES, DEFAULT_FILE_DOWNLOAD_RETRIES)

ddsc/core/d4s2.py

Lines changed: 18 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,12 @@
88
import tempfile
99
import requests
1010
from ddsc.core.upload import ProjectUpload
11-
from ddsc.core.download import ProjectDownload
11+
from ddsc.core.download import ProjectFileDownloader
1212
from ddsc.core.ddsapi import DataServiceAuth
1313
from ddsc.core.util import KindType
1414
from ddsc.versioncheck import get_internal_version_str
1515
from ddsc.core.remotestore import ProjectNameOrId, RemotePath
16+
from ddsc.sdk.client import Client
1617

1718
UNAUTHORIZED_MESSAGE = """
1819
ERROR: Your account does not have authorization for D4S2 (the deliver/share service).
@@ -242,6 +243,7 @@ def __init__(self, config, remote_store, print_func):
242243
:param print_func: func used to print output somewhere
243244
"""
244245
self.config = config
246+
self.client = Client(config)
245247
auth = DataServiceAuth(self.config)
246248
api_token = auth.get_auth()
247249
self.api = D4S2Api(config.d4s2_url, api_token)
@@ -345,24 +347,32 @@ def _copy_project(self, project, new_project_name, path_filter):
345347
if remote_project:
346348
raise ValueError("A project with name '{}' already exists.".format(new_project_name))
347349
activity = CopyActivity(self.remote_store.data_service, project, new_project_name)
348-
self._download_project(activity, project, temp_directory, path_filter)
350+
self._download_project(activity, project.id, temp_directory, path_filter)
349351
self._upload_project(activity, new_project_name, temp_directory)
350352
activity.finished()
351353
shutil.rmtree(temp_directory)
352354
return self.remote_store.fetch_remote_project(new_project_name_or_id, must_exist=True)
353355

354-
def _download_project(self, activity, project, temp_directory, path_filter):
356+
def _download_project(self, activity, project_id, temp_directory, path_filter):
355357
"""
356358
Download the project with project_name to temp_directory.
357359
:param activity: CopyActivity: info about the copy activity are downloading for
358-
:param project: remotestore.RemoteProject project to download
360+
:param project_id: uuid of the project to download
359361
:param temp_directory: str path to directory we can download into
360362
:param path_filter: PathFilter: filters what files are shared
361363
"""
364+
project = self.client.get_project_by_id(project_id)
362365
self.print_func("Downloading a copy of '{}'.".format(project.name))
363-
project_download = ProjectDownload(self.remote_store, project, temp_directory, path_filter,
364-
file_download_pre_processor=DownloadedFileRelations(activity))
365-
project_download.run()
366+
367+
downloader = ProjectFileDownloader(self.config, temp_directory, project, path_filter)
368+
downloader.run()
369+
370+
downloaded_file_relations = DownloadedFileRelations(activity)
371+
for remote_path, dds_file in project.get_path_to_files().items():
372+
if path_filter.include_path(remote_path):
373+
downloaded_file_relations.add(self.remote_store.data_service,
374+
remote_path,
375+
dds_file.current_version['id'])
366376

367377
def _upload_project(self, activity, project_name, temp_directory):
368378
"""
@@ -435,15 +445,10 @@ def __init__(self, activity):
435445
"""
436446
self.activity = activity
437447

438-
def run(self, data_service, project_file):
448+
def add(self, data_service, remote_path, file_version_id):
439449
"""
440450
Attach a remote file to activity with used relationship.
441-
:param data_service: DataServiceApi: service used to attach relationship
442-
:param project_file: ProjectFile: contains details about a file we will attach
443451
"""
444-
remote_path = project_file.path
445-
file_dict = data_service.get_file(project_file.id).json()
446-
file_version_id = file_dict['current_version']['id']
447452
data_service.create_used_relation(self.activity.id, KindType.file_str, file_version_id)
448453
self.activity.remote_path_to_file_version_id[remote_path] = file_version_id
449454

ddsc/core/ddsapi.py

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
To cancel this operation, press Ctrl+C.
1717
"""
1818
CONNECTION_RETRY_MESSAGE = "Connection failed. Retrying."
19+
DDS_TOTAL_PAGES_HEADER = 'x-total-pages'
20+
DDS_TOTAL_HEADER = 'x-total'
1921

2022

2123
def get_user_agent_str():
@@ -339,7 +341,7 @@ def _get_single_item(self, url_suffix, data, content_type=ContentType.json):
339341
return self._check_err(resp, url_suffix, data, allow_pagination=False)
340342

341343
@retry_connection_exceptions
342-
def _get_single_page(self, url_suffix, data, page_num):
344+
def _get_single_page(self, url_suffix, data, page_num, page_size=None):
343345
"""
344346
Send GET request to API at url_suffix with post_data adding page and per_page parameters to
345347
retrieve a single page. Page size is determined by config.page_size.
@@ -350,7 +352,9 @@ def _get_single_page(self, url_suffix, data, page_num):
350352
"""
351353
data_with_per_page = dict(data)
352354
data_with_per_page['page'] = page_num
353-
data_with_per_page['per_page'] = self._get_page_size()
355+
if not page_size:
356+
page_size = self._get_page_size()
357+
data_with_per_page['per_page'] = page_size
354358
(url, data_str, headers) = self._url_parts(url_suffix, data_with_per_page,
355359
content_type=ContentType.form)
356360
resp = self.http.get(url, headers=headers, params=data_str)
@@ -558,6 +562,26 @@ def get_project_files(self, project_id):
558562
url_prefix = "/projects/{}/files".format(project_id)
559563
return self._get_collection(url_prefix, {})
560564

565+
def get_project_files_generator(self, project_id, page_size):
566+
"""
567+
Send GET to /projects/{project_id}/files
568+
:param project_id: str uuid of the project
569+
:param page_size: int page size to fetch
570+
:return: generator that returns (DDS file download JSON, header metadata) pairs
571+
"""
572+
url_suffix = "/projects/{}/files".format(project_id)
573+
data = {}
574+
# process first page separately to read total pages
575+
response = self._get_single_page(url_suffix, data, page_size=page_size, page_num=1)
576+
total_pages = int(response.headers.get('x-total-pages'))
577+
for item in response.json()["results"]:
578+
yield item, response.headers
579+
# process the rest of the pages
580+
for page in range(2, total_pages + 1):
581+
response = self._get_single_page(url_suffix, data, page_size=page_size, page_num=page)
582+
for item in response.json()["results"]:
583+
yield item, response.headers
584+
561585
def get_folder_children(self, folder_id, name_contains):
562586
"""
563587
Send GET to /folders/{folder_id} filtering by a name.

0 commit comments

Comments
 (0)