-
Notifications
You must be signed in to change notification settings - Fork 35
Expand file tree
/
Copy pathdata_service_interface.py
More file actions
129 lines (111 loc) · 3.7 KB
/
data_service_interface.py
File metadata and controls
129 lines (111 loc) · 3.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
from abc import ABC, abstractmethod
from io import IOBase
from typing import Callable, Iterable, List, Optional, Sequence
from cdisc_rules_engine.models.dataset.dataset_interface import DatasetInterface
from cdisc_rules_engine.models.dataset_metadata import DatasetMetadata
from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata
from .cache_service_interface import CacheServiceInterface
class DataServiceInterface(ABC):
    """
    Interface that defines a set of methods
    that must be implemented by all services
    that download datasets from a certain storage.

    Every method below is abstract: a concrete data service has to
    override all of them before it can be instantiated.
    """

    # NOTE(review): presumably the dataset backend that a concrete service
    # produces -- confirm against the implementations.
    dataset_implementation: DatasetInterface

    @classmethod
    @abstractmethod
    def get_instance(
        cls, cache_service: CacheServiceInterface, config, **kwargs
    ) -> "DataServiceInterface":
        """
        Creates an instance of data service.

        :param cache_service: cache service handed to the new instance.
        :param config: configuration object handed to the new instance.
        :param kwargs: implementation-specific options.
        :return: a data service instance.
        """

    @abstractmethod
    def get_datasets(self) -> List[SDTMDatasetMetadata]:
        """
        Gets a list of datasets, each described by its metadata.
        """

    @abstractmethod
    def get_dataset(self, dataset_name: str, **params) -> DatasetInterface:
        """
        Gets the dataset with the given name from the underlying storage.
        """

    @abstractmethod
    def get_dataset_metadata(self, dataset_name: str, **kwargs) -> DatasetInterface:
        """
        Gets dataset metadata and returns it as DatasetInterface.
        """

    @abstractmethod
    def get_raw_dataset_metadata(
        self, dataset_name: str, **kwargs
    ) -> SDTMDatasetMetadata:
        """
        Gets dataset metadata and returns it as SDTMDatasetMetadata instance.
        """

    @abstractmethod
    def get_variables_metadata(self, dataset_name: str, **params) -> DatasetInterface:
        """
        Gets variables metadata of a dataset.
        """

    @abstractmethod
    def get_dataset_by_type(
        self, dataset_name: str, dataset_type: str, **params
    ) -> DatasetInterface:
        """
        Generic function to return dataset based on the type.
        dataset_type param can be: contents, metadata, variables_metadata.
        """

    @abstractmethod
    def concat_split_datasets(
        self,
        func_to_call: Callable,
        datasets_metadata: Iterable[DatasetMetadata],
        **kwargs,
    ):
        """
        Accepts metadata of the parts of a split dataset,
        downloads all of them and merges into a single DataFrame.

        :param func_to_call: callable used while loading the parts; how it
            is invoked is up to the implementation.
        :param datasets_metadata: metadata of every part of the split dataset.
        :param kwargs: implementation-specific options.
        """

    @abstractmethod
    def get_define_xml_contents(self, dataset_name: str) -> bytes:
        """
        Returns contents of define.xml file as bytes.
        """

    @abstractmethod
    def has_all_files(self, prefix: str, file_names: List[str]) -> bool:
        """
        Checks if all files exist.
        """

    @abstractmethod
    def get_file_matching_pattern(self, prefix: str, pattern: str) -> Optional[str]:
        """
        Returns the path to the file if one matches the pattern given, otherwise
        returns None.
        """

    @abstractmethod
    def read_data(self, file_path: str) -> IOBase:
        """
        Reads byte data from the given path and returns it
        as a file-like (IOBase) object.
        """

    @abstractmethod
    def get_dataset_class(
        self,
        dataset: DatasetInterface,
        file_path: str,
        datasets: Iterable[SDTMDatasetMetadata],
        dataset_metadata: SDTMDatasetMetadata,
    ) -> Optional[str]:
        """
        Returns dataset class based on its contents.
        The return type allows None when no class is available.
        """

    @abstractmethod
    def to_parquet(self, file_path: str) -> str:
        """
        Converts a given file_path to parquet. Returns path to new file.
        """

    @staticmethod
    @abstractmethod
    def is_valid_data(dataset_paths: Sequence[str]) -> bool:
        """
        Checks if the data within the given dataset paths belong in this data service.
        """